Make line endings consistently LFtags/v0.3.22^2
| @@ -1,133 +1,133 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef AMAX | |||
| #ifdef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define AMAX BLASFUNC(dzamax) | |||
| #else | |||
| #define AMAX BLASFUNC(scamax) | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| #define AMAX BLASFUNC(damax) | |||
| #else | |||
| #define AMAX BLASFUNC(samax) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]) | |||
| { | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x = 1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1, timeg; | |||
| argc--; | |||
| argv++; | |||
| if (argc > 0) | |||
| { | |||
| from = atol(*argv); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if (argc > 0) | |||
| { | |||
| to = MAX(atol(*argv), from); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if (argc > 0) | |||
| { | |||
| step = atol(*argv); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) | |||
| loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) | |||
| inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); | |||
| if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) | |||
| { | |||
| fprintf(stderr, "Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for (m = from; m <= to; m += step) | |||
| { | |||
| timeg = 0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l = 0; l < loops; l++) | |||
| { | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) | |||
| { | |||
| x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| AMAX(&m, x, &inc_x); | |||
| end(); | |||
| timeg += getsec(); | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef AMAX | |||
| #ifdef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define AMAX BLASFUNC(dzamax) | |||
| #else | |||
| #define AMAX BLASFUNC(scamax) | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| #define AMAX BLASFUNC(damax) | |||
| #else | |||
| #define AMAX BLASFUNC(samax) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]) | |||
| { | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x = 1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1, timeg; | |||
| argc--; | |||
| argv++; | |||
| if (argc > 0) | |||
| { | |||
| from = atol(*argv); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if (argc > 0) | |||
| { | |||
| to = MAX(atol(*argv), from); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if (argc > 0) | |||
| { | |||
| step = atol(*argv); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) | |||
| loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) | |||
| inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); | |||
| if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) | |||
| { | |||
| fprintf(stderr, "Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for (m = from; m <= to; m += step) | |||
| { | |||
| timeg = 0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l = 0; l < loops; l++) | |||
| { | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) | |||
| { | |||
| x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| AMAX(&m, x, &inc_x); | |||
| end(); | |||
| timeg += getsec(); | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,137 +1,137 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef AMIN | |||
| #ifdef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define AMIN BLASFUNC(dzamin) | |||
| #else | |||
| #define AMIN BLASFUNC(scamin) | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| #define AMIN BLASFUNC(damin) | |||
| #else | |||
| #define AMIN BLASFUNC(samin) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]) | |||
| { | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x = 1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1, timeg; | |||
| argc--; | |||
| argv++; | |||
| if (argc > 0) | |||
| { | |||
| from = atol(*argv); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if (argc > 0) | |||
| { | |||
| to = MAX(atol(*argv), from); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if (argc > 0) | |||
| { | |||
| step = atol(*argv); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) | |||
| loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) | |||
| inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); | |||
| if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) | |||
| { | |||
| fprintf(stderr, "Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for (m = from; m <= to; m += step) | |||
| { | |||
| timeg = 0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l = 0; l < loops; l++) | |||
| { | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) | |||
| { | |||
| x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| AMIN(&m, x, &inc_x); | |||
| end(); | |||
| timeg += getsec(); | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef AMIN | |||
| #ifdef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define AMIN BLASFUNC(dzamin) | |||
| #else | |||
| #define AMIN BLASFUNC(scamin) | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| #define AMIN BLASFUNC(damin) | |||
| #else | |||
| #define AMIN BLASFUNC(samin) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]) | |||
| { | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x = 1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1, timeg; | |||
| argc--; | |||
| argv++; | |||
| if (argc > 0) | |||
| { | |||
| from = atol(*argv); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if (argc > 0) | |||
| { | |||
| to = MAX(atol(*argv), from); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if (argc > 0) | |||
| { | |||
| step = atol(*argv); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) | |||
| loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) | |||
| inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); | |||
| if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) | |||
| { | |||
| fprintf(stderr, "Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for (m = from; m <= to; m += step) | |||
| { | |||
| timeg = 0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l = 0; l < loops; l++) | |||
| { | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) | |||
| { | |||
| x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| AMIN(&m, x, &inc_x); | |||
| end(); | |||
| timeg += getsec(); | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,134 +1,134 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef HBMV | |||
| #ifdef DOUBLE | |||
| #define HBMV BLASFUNC(zhbmv) | |||
| #else | |||
| #define HBMV BLASFUNC(chbmv) | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *x, *y; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| FLOAT beta [] = {0.0, 0.0}; | |||
| blasint k = 1; | |||
| char uplo='L'; | |||
| blasint m, i, j; | |||
| blasint inc_x=1, inc_y=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); | |||
| if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; | |||
| if ((p = getenv("OPENBLAS_K"))) k = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' k = %d Inc_x = %d Inc_y = %d Loops = %d\n", | |||
| from, to, step, uplo, k, inc_x, inc_y, loops); | |||
| if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) { | |||
| timeg=0; | |||
| fprintf(stderr, " %6dx%d : ", (int)m, (int)m); | |||
| for(j = 0; j < m; j++) { | |||
| for(i = 0; i < m * COMPSIZE; i++) { | |||
| a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| } | |||
| for (l = 0; l < loops; l++) { | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); | |||
| end(); | |||
| timeg += getsec(); | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)(2 * k + 1) * (double)m / timeg * 1.e-6); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef HBMV | |||
| #ifdef DOUBLE | |||
| #define HBMV BLASFUNC(zhbmv) | |||
| #else | |||
| #define HBMV BLASFUNC(chbmv) | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *x, *y; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| FLOAT beta [] = {0.0, 0.0}; | |||
| blasint k = 1; | |||
| char uplo='L'; | |||
| blasint m, i, j; | |||
| blasint inc_x=1, inc_y=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); | |||
| if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; | |||
| if ((p = getenv("OPENBLAS_K"))) k = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' k = %d Inc_x = %d Inc_y = %d Loops = %d\n", | |||
| from, to, step, uplo, k, inc_x, inc_y, loops); | |||
| if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) { | |||
| timeg=0; | |||
| fprintf(stderr, " %6dx%d : ", (int)m, (int)m); | |||
| for(j = 0; j < m; j++) { | |||
| for(i = 0; i < m * COMPSIZE; i++) { | |||
| a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| } | |||
| for (l = 0; l < loops; l++) { | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); | |||
| end(); | |||
| timeg += getsec(); | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)(2 * k + 1) * (double)m / timeg * 1.e-6); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,133 +1,133 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef HPMV | |||
| #ifdef DOUBLE | |||
| #define HPMV BLASFUNC(zhpmv) | |||
| #else | |||
| #define HPMV BLASFUNC(chpmv) | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *x, *y; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| FLOAT beta [] = {1.0, 1.0}; | |||
| char uplo='L'; | |||
| blasint m, i, j; | |||
| blasint inc_x=1, inc_y=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); | |||
| if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); | |||
| if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) { | |||
| timeg=0; | |||
| fprintf(stderr, " %6dx%d : ", (int)m, (int)m); | |||
| for(j = 0; j < m; j++) { | |||
| for(i = 0; i < m * COMPSIZE; i++) { | |||
| a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| } | |||
| for (l = 0; l < loops; l++) { | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| HPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef HPMV | |||
| #ifdef DOUBLE | |||
| #define HPMV BLASFUNC(zhpmv) | |||
| #else | |||
| #define HPMV BLASFUNC(chpmv) | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *x, *y; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| FLOAT beta [] = {1.0, 1.0}; | |||
| char uplo='L'; | |||
| blasint m, i, j; | |||
| blasint inc_x=1, inc_y=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); | |||
| if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); | |||
| if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) { | |||
| timeg=0; | |||
| fprintf(stderr, " %6dx%d : ", (int)m, (int)m); | |||
| for(j = 0; j < m; j++) { | |||
| for(i = 0; i < m * COMPSIZE; i++) { | |||
| a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| } | |||
| for (l = 0; l < loops; l++) { | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| HPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,120 +1,120 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef IAMIN | |||
| #ifdef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define IAMIN BLASFUNC(izamin) | |||
| #else | |||
| #define IAMIN BLASFUNC(icamin) | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| #define IAMIN BLASFUNC(idamin) | |||
| #else | |||
| #define IAMIN BLASFUNC(isamin) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| IAMIN (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef IAMIN | |||
| #ifdef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define IAMIN BLASFUNC(izamin) | |||
| #else | |||
| #define IAMIN BLASFUNC(icamin) | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| #define IAMIN BLASFUNC(idamin) | |||
| #else | |||
| #define IAMIN BLASFUNC(isamin) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| IAMIN (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,114 +1,114 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef IMAX | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define IMAX BLASFUNC(idmax) | |||
| #else | |||
| #define IMAX BLASFUNC(ismax) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| IMAX (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef IMAX | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define IMAX BLASFUNC(idmax) | |||
| #else | |||
| #define IMAX BLASFUNC(ismax) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| IMAX (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,114 +1,114 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef IMIN | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define IMIN BLASFUNC(idmin) | |||
| #else | |||
| #define IMIN BLASFUNC(ismin) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| IMIN (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef IMIN | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define IMIN BLASFUNC(idmin) | |||
| #else | |||
| #define IMIN BLASFUNC(ismin) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| IMIN (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,113 +1,113 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef NAMAX | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define NAMAX BLASFUNC(dmax) | |||
| #else | |||
| #define NAMAX BLASFUNC(smax) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| NAMAX (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef NAMAX | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define NAMAX BLASFUNC(dmax) | |||
| #else | |||
| #define NAMAX BLASFUNC(smax) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| NAMAX (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,113 +1,113 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef NAMIN | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define NAMIN BLASFUNC(dmin) | |||
| #else | |||
| #define NAMIN BLASFUNC(smin) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| NAMIN (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef NAMIN | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define NAMIN BLASFUNC(dmin) | |||
| #else | |||
| #define NAMIN BLASFUNC(smin) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *x; | |||
| blasint m, i; | |||
| blasint inc_x=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| NAMIN (&m, x, &inc_x); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,138 +1,138 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |||
| GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF | |||
| THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef ROTM | |||
| #ifdef DOUBLE | |||
| #define ROTM BLASFUNC(drotm) | |||
| #else | |||
| #define ROTM BLASFUNC(srotm) | |||
| #endif | |||
| int main(int argc, char *argv[]) | |||
| { | |||
| FLOAT *x, *y; | |||
| // FLOAT result; | |||
| blasint m, i; | |||
| blasint inc_x = 1, inc_y = 1; | |||
| FLOAT param[5] = {1, 2.0, 3.0, 4.0, 5.0}; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1, timeg; | |||
| argc--; | |||
| argv++; | |||
| if (argc > 0) { | |||
| from = atol(*argv); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if (argc > 0) { | |||
| to = MAX(atol(*argv), from); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if (argc > 0) { | |||
| step = atol(*argv); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) | |||
| loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) | |||
| inc_x = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCY"))) | |||
| inc_y = atoi(p); | |||
| fprintf( | |||
| stderr, | |||
| "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", | |||
| from, to, step, inc_x, inc_y, loops); | |||
| if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == | |||
| NULL) { | |||
| fprintf(stderr, "Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| if ((y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == | |||
| NULL) { | |||
| fprintf(stderr, "Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for (m = from; m <= to; m += step) { | |||
| timeg = 0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { | |||
| x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; | |||
| } | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { | |||
| y[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; | |||
| } | |||
| for (l = 0; l < loops; l++) { | |||
| begin(); | |||
| ROTM(&m, x, &inc_x, y, &inc_y, param); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR | |||
| CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE | |||
| GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |||
| HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |||
| LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF | |||
| THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef ROTM | |||
| #ifdef DOUBLE | |||
| #define ROTM BLASFUNC(drotm) | |||
| #else | |||
| #define ROTM BLASFUNC(srotm) | |||
| #endif | |||
| int main(int argc, char *argv[]) | |||
| { | |||
| FLOAT *x, *y; | |||
| // FLOAT result; | |||
| blasint m, i; | |||
| blasint inc_x = 1, inc_y = 1; | |||
| FLOAT param[5] = {1, 2.0, 3.0, 4.0, 5.0}; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1, timeg; | |||
| argc--; | |||
| argv++; | |||
| if (argc > 0) { | |||
| from = atol(*argv); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if (argc > 0) { | |||
| to = MAX(atol(*argv), from); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if (argc > 0) { | |||
| step = atol(*argv); | |||
| argc--; | |||
| argv++; | |||
| } | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) | |||
| loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) | |||
| inc_x = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCY"))) | |||
| inc_y = atoi(p); | |||
| fprintf( | |||
| stderr, | |||
| "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", | |||
| from, to, step, inc_x, inc_y, loops); | |||
| if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == | |||
| NULL) { | |||
| fprintf(stderr, "Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| if ((y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == | |||
| NULL) { | |||
| fprintf(stderr, "Out of Memory!!\n"); | |||
| exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for (m = from; m <= to; m += step) { | |||
| timeg = 0; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) { | |||
| x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; | |||
| } | |||
| for (i = 0; i < m * COMPSIZE * abs(inc_y); i++) { | |||
| y[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; | |||
| } | |||
| for (l = 0; l < loops; l++) { | |||
| begin(); | |||
| ROTM(&m, x, &inc_x, y, &inc_y, param); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -1,146 +1,146 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef SPMV | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define SPMV BLASFUNC(dspmv) | |||
| #else | |||
| #define SPMV BLASFUNC(sspmv) | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| #define SPMV BLASFUNC(zspmv) | |||
| #else | |||
| #define SPMV BLASFUNC(cspmv) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *x, *y; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| FLOAT beta [] = {1.0, 1.0}; | |||
| char uplo='L'; | |||
| blasint m, i, j; | |||
| blasint inc_x=1,inc_y=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); | |||
| if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); | |||
| if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6dx%d : ", (int)m,(int)m); | |||
| for(j = 0; j < m; j++){ | |||
| for(i = 0; i < m * COMPSIZE; i++){ | |||
| a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| } | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| SPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "bench.h" | |||
| #undef SPMV | |||
| #ifndef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define SPMV BLASFUNC(dspmv) | |||
| #else | |||
| #define SPMV BLASFUNC(sspmv) | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| #define SPMV BLASFUNC(zspmv) | |||
| #else | |||
| #define SPMV BLASFUNC(cspmv) | |||
| #endif | |||
| #endif | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *x, *y; | |||
| FLOAT alpha[] = {1.0, 1.0}; | |||
| FLOAT beta [] = {1.0, 1.0}; | |||
| char uplo='L'; | |||
| blasint m, i, j; | |||
| blasint inc_x=1,inc_y=1; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); | |||
| if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,uplo,inc_x,inc_y,loops); | |||
| if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| #ifdef __linux | |||
| srandom(getpid()); | |||
| #endif | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6dx%d : ", (int)m,(int)m); | |||
| for(j = 0; j < m; j++){ | |||
| for(i = 0; i < m * COMPSIZE; i++){ | |||
| a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| } | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| begin(); | |||
| SPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += time1; | |||
| } | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / timeg * 1.e-6); | |||
| } | |||
| return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -1,333 +1,333 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define M x0 | |||
| #define N x1 | |||
| #define A00 x2 | |||
| #define LDA x3 | |||
| #define B00 x4 | |||
| #define A01 x5 | |||
| #define A02 x6 | |||
| #define A03 x7 | |||
| #define A04 x8 | |||
| #define I x9 | |||
| #define J x10 | |||
| #define TEMP1 x11 | |||
| #define TEMP2 x12 | |||
| #define A_PREFETCH 2560 | |||
| /************************************************************************************** | |||
| * Macro definitions | |||
| **************************************************************************************/ | |||
| .macro SAVE_REGS | |||
| add sp, sp, #-(11 * 16) | |||
| stp d8, d9, [sp, #(0 * 16)] | |||
| stp d10, d11, [sp, #(1 * 16)] | |||
| stp d12, d13, [sp, #(2 * 16)] | |||
| stp d14, d15, [sp, #(3 * 16)] | |||
| stp d16, d17, [sp, #(4 * 16)] | |||
| stp x18, x19, [sp, #(5 * 16)] | |||
| stp x20, x21, [sp, #(6 * 16)] | |||
| stp x22, x23, [sp, #(7 * 16)] | |||
| stp x24, x25, [sp, #(8 * 16)] | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| .endm | |||
| .macro RESTORE_REGS | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| ldp d12, d13, [sp, #(2 * 16)] | |||
| ldp d14, d15, [sp, #(3 * 16)] | |||
| ldp d16, d17, [sp, #(4 * 16)] | |||
| ldp x18, x19, [sp, #(5 * 16)] | |||
| ldp x20, x21, [sp, #(6 * 16)] | |||
| ldp x22, x23, [sp, #(7 * 16)] | |||
| ldp x24, x25, [sp, #(8 * 16)] | |||
| ldp x26, x27, [sp, #(9 * 16)] | |||
| ldr x28, [sp, #(10 * 16)] | |||
| add sp, sp, #(11*16) | |||
| .endm | |||
| .macro COPY4x4 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr q0, [A01], #16 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v9.s[0], v0.s[1] | |||
| ins v10.s[0], v0.s[2] | |||
| ins v11.s[0], v0.s[3] | |||
| ldr q1, [A02], #16 | |||
| ins v8.s[1], v1.s[0] | |||
| ins v9.s[1], v1.s[1] | |||
| ins v10.s[1], v1.s[2] | |||
| ins v11.s[1], v1.s[3] | |||
| ldr q2, [A03], #16 | |||
| ins v8.s[2], v2.s[0] | |||
| ins v9.s[2], v2.s[1] | |||
| ins v10.s[2], v2.s[2] | |||
| ins v11.s[2], v2.s[3] | |||
| ldr q3, [A04], #16 | |||
| ins v8.s[3], v3.s[0] | |||
| ins v9.s[3], v3.s[1] | |||
| ins v10.s[3], v3.s[2] | |||
| ins v11.s[3], v3.s[3] | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00] | |||
| add B00, B00, #64 | |||
| .endm | |||
| .macro COPY1x4 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr s0, [A01], #4 | |||
| ldr s1, [A02], #4 | |||
| ldr s2, [A03], #4 | |||
| ldr s3, [A04], #4 | |||
| stp s0, s1, [B00] | |||
| add B00, B00, #8 | |||
| stp s2, s3, [B00] | |||
| add B00, B00, #8 | |||
| .endm | |||
| .macro COPY4x2 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr q0, [A01], #16 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v9.s[0], v0.s[1] | |||
| ins v10.s[0], v0.s[2] | |||
| ins v11.s[0], v0.s[3] | |||
| ldr q1, [A02], #16 | |||
| ins v8.s[1], v1.s[0] | |||
| ins v9.s[1], v1.s[1] | |||
| ins v10.s[1], v1.s[2] | |||
| ins v11.s[1], v1.s[3] | |||
| st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00] | |||
| add B00, B00, #32 | |||
| .endm | |||
| .macro COPY1x2 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr s0, [A01], #4 | |||
| ldr s1, [A02], #4 | |||
| stp s0, s1, [B00] | |||
| add B00, B00, #8 | |||
| .endm | |||
| .macro COPY4x1 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr q0, [A01], #16 | |||
| str q0, [B00], #16 | |||
| .endm | |||
| .macro COPY1x1 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr s0, [A01], #4 | |||
| str s0, [B00], #4 | |||
| .endm | |||
| /************************************************************************************** | |||
| * End of macro definitions | |||
| **************************************************************************************/ | |||
| PROLOGUE | |||
| .align 5 | |||
| SAVE_REGS | |||
| lsl LDA, LDA, #2 // LDA = LDA * SIZE | |||
| .Ldgemm_ncopy_L4_BEGIN: | |||
| asr J, N, #2 // J = N / 4 | |||
| cmp J, #0 | |||
| ble .Ldgemm_ncopy_L2_BEGIN | |||
| .align 5 | |||
| .Ldgemm_ncopy_L4_M4_BEGIN: | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A03, A02, LDA | |||
| add A04, A03, LDA | |||
| add A00, A04, LDA | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L4_M4_40 | |||
| .align 5 | |||
| .Ldgemm_ncopy_L4_M4_20: | |||
| COPY4x4 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L4_M4_20 | |||
| .Ldgemm_ncopy_L4_M4_40: | |||
| and I, M , #3 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L4_M4_END | |||
| .align 5 | |||
| .Ldgemm_ncopy_L4_M4_60: | |||
| COPY1x4 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L4_M4_60 | |||
| .Ldgemm_ncopy_L4_M4_END: | |||
| subs J , J, #1 // j-- | |||
| bne .Ldgemm_ncopy_L4_M4_BEGIN | |||
| /*********************************************************************************************/ | |||
| .Ldgemm_ncopy_L2_BEGIN: | |||
| tst N, #3 | |||
| ble .Ldgemm_ncopy_L999 | |||
| tst N, #2 | |||
| ble .Ldgemm_ncopy_L1_BEGIN | |||
| .Ldgemm_ncopy_L2_M4_BEGIN: | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A00, A02, LDA | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L2_M4_40 | |||
| .align 5 | |||
| .Ldgemm_ncopy_L2_M4_20: | |||
| COPY4x2 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L2_M4_20 | |||
| .Ldgemm_ncopy_L2_M4_40: | |||
| and I, M , #3 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L2_M4_END | |||
| .align 5 | |||
| .Ldgemm_ncopy_L2_M4_60: | |||
| COPY1x2 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L2_M4_60 | |||
| .Ldgemm_ncopy_L2_M4_END: | |||
| /*********************************************************************************************/ | |||
| .Ldgemm_ncopy_L1_BEGIN: | |||
| tst N, #1 | |||
| ble .Ldgemm_ncopy_L999 | |||
| .Ldgemm_ncopy_L1_M4_BEGIN: | |||
| mov A01, A00 | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L1_M4_40 | |||
| .align 5 | |||
| .Ldgemm_ncopy_L1_M4_20: | |||
| COPY4x1 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L1_M4_20 | |||
| .Ldgemm_ncopy_L1_M4_40: | |||
| and I, M , #3 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L1_M4_END | |||
| .align 5 | |||
| .Ldgemm_ncopy_L1_M4_60: | |||
| COPY1x1 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L1_M4_60 | |||
| .Ldgemm_ncopy_L1_M4_END: | |||
| .Ldgemm_ncopy_L999: | |||
| mov x0, #0 | |||
| RESTORE_REGS | |||
| ret | |||
| EPILOGUE | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define M x0 | |||
| #define N x1 | |||
| #define A00 x2 | |||
| #define LDA x3 | |||
| #define B00 x4 | |||
| #define A01 x5 | |||
| #define A02 x6 | |||
| #define A03 x7 | |||
| #define A04 x8 | |||
| #define I x9 | |||
| #define J x10 | |||
| #define TEMP1 x11 | |||
| #define TEMP2 x12 | |||
| #define A_PREFETCH 2560 | |||
| /************************************************************************************** | |||
| * Macro definitions | |||
| **************************************************************************************/ | |||
| .macro SAVE_REGS | |||
| add sp, sp, #-(11 * 16) | |||
| stp d8, d9, [sp, #(0 * 16)] | |||
| stp d10, d11, [sp, #(1 * 16)] | |||
| stp d12, d13, [sp, #(2 * 16)] | |||
| stp d14, d15, [sp, #(3 * 16)] | |||
| stp d16, d17, [sp, #(4 * 16)] | |||
| stp x18, x19, [sp, #(5 * 16)] | |||
| stp x20, x21, [sp, #(6 * 16)] | |||
| stp x22, x23, [sp, #(7 * 16)] | |||
| stp x24, x25, [sp, #(8 * 16)] | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| .endm | |||
| .macro RESTORE_REGS | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| ldp d12, d13, [sp, #(2 * 16)] | |||
| ldp d14, d15, [sp, #(3 * 16)] | |||
| ldp d16, d17, [sp, #(4 * 16)] | |||
| ldp x18, x19, [sp, #(5 * 16)] | |||
| ldp x20, x21, [sp, #(6 * 16)] | |||
| ldp x22, x23, [sp, #(7 * 16)] | |||
| ldp x24, x25, [sp, #(8 * 16)] | |||
| ldp x26, x27, [sp, #(9 * 16)] | |||
| ldr x28, [sp, #(10 * 16)] | |||
| add sp, sp, #(11*16) | |||
| .endm | |||
| .macro COPY4x4 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr q0, [A01], #16 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v9.s[0], v0.s[1] | |||
| ins v10.s[0], v0.s[2] | |||
| ins v11.s[0], v0.s[3] | |||
| ldr q1, [A02], #16 | |||
| ins v8.s[1], v1.s[0] | |||
| ins v9.s[1], v1.s[1] | |||
| ins v10.s[1], v1.s[2] | |||
| ins v11.s[1], v1.s[3] | |||
| ldr q2, [A03], #16 | |||
| ins v8.s[2], v2.s[0] | |||
| ins v9.s[2], v2.s[1] | |||
| ins v10.s[2], v2.s[2] | |||
| ins v11.s[2], v2.s[3] | |||
| ldr q3, [A04], #16 | |||
| ins v8.s[3], v3.s[0] | |||
| ins v9.s[3], v3.s[1] | |||
| ins v10.s[3], v3.s[2] | |||
| ins v11.s[3], v3.s[3] | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00] | |||
| add B00, B00, #64 | |||
| .endm | |||
| .macro COPY1x4 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr s0, [A01], #4 | |||
| ldr s1, [A02], #4 | |||
| ldr s2, [A03], #4 | |||
| ldr s3, [A04], #4 | |||
| stp s0, s1, [B00] | |||
| add B00, B00, #8 | |||
| stp s2, s3, [B00] | |||
| add B00, B00, #8 | |||
| .endm | |||
| .macro COPY4x2 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr q0, [A01], #16 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v9.s[0], v0.s[1] | |||
| ins v10.s[0], v0.s[2] | |||
| ins v11.s[0], v0.s[3] | |||
| ldr q1, [A02], #16 | |||
| ins v8.s[1], v1.s[0] | |||
| ins v9.s[1], v1.s[1] | |||
| ins v10.s[1], v1.s[2] | |||
| ins v11.s[1], v1.s[3] | |||
| st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00] | |||
| add B00, B00, #32 | |||
| .endm | |||
| .macro COPY1x2 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr s0, [A01], #4 | |||
| ldr s1, [A02], #4 | |||
| stp s0, s1, [B00] | |||
| add B00, B00, #8 | |||
| .endm | |||
| .macro COPY4x1 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr q0, [A01], #16 | |||
| str q0, [B00], #16 | |||
| .endm | |||
| .macro COPY1x1 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr s0, [A01], #4 | |||
| str s0, [B00], #4 | |||
| .endm | |||
| /************************************************************************************** | |||
| * End of macro definitions | |||
| **************************************************************************************/ | |||
| PROLOGUE | |||
| .align 5 | |||
| SAVE_REGS | |||
| lsl LDA, LDA, #2 // LDA = LDA * SIZE | |||
| .Ldgemm_ncopy_L4_BEGIN: | |||
| asr J, N, #2 // J = N / 4 | |||
| cmp J, #0 | |||
| ble .Ldgemm_ncopy_L2_BEGIN | |||
| .align 5 | |||
| .Ldgemm_ncopy_L4_M4_BEGIN: | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A03, A02, LDA | |||
| add A04, A03, LDA | |||
| add A00, A04, LDA | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L4_M4_40 | |||
| .align 5 | |||
| .Ldgemm_ncopy_L4_M4_20: | |||
| COPY4x4 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L4_M4_20 | |||
| .Ldgemm_ncopy_L4_M4_40: | |||
| and I, M , #3 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L4_M4_END | |||
| .align 5 | |||
| .Ldgemm_ncopy_L4_M4_60: | |||
| COPY1x4 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L4_M4_60 | |||
| .Ldgemm_ncopy_L4_M4_END: | |||
| subs J , J, #1 // j-- | |||
| bne .Ldgemm_ncopy_L4_M4_BEGIN | |||
| /*********************************************************************************************/ | |||
| .Ldgemm_ncopy_L2_BEGIN: | |||
| tst N, #3 | |||
| ble .Ldgemm_ncopy_L999 | |||
| tst N, #2 | |||
| ble .Ldgemm_ncopy_L1_BEGIN | |||
| .Ldgemm_ncopy_L2_M4_BEGIN: | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A00, A02, LDA | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L2_M4_40 | |||
| .align 5 | |||
| .Ldgemm_ncopy_L2_M4_20: | |||
| COPY4x2 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L2_M4_20 | |||
| .Ldgemm_ncopy_L2_M4_40: | |||
| and I, M , #3 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L2_M4_END | |||
| .align 5 | |||
| .Ldgemm_ncopy_L2_M4_60: | |||
| COPY1x2 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L2_M4_60 | |||
| .Ldgemm_ncopy_L2_M4_END: | |||
| /*********************************************************************************************/ | |||
| .Ldgemm_ncopy_L1_BEGIN: | |||
| tst N, #1 | |||
| ble .Ldgemm_ncopy_L999 | |||
| .Ldgemm_ncopy_L1_M4_BEGIN: | |||
| mov A01, A00 | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L1_M4_40 | |||
| .align 5 | |||
| .Ldgemm_ncopy_L1_M4_20: | |||
| COPY4x1 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L1_M4_20 | |||
| .Ldgemm_ncopy_L1_M4_40: | |||
| and I, M , #3 | |||
| cmp I, #0 | |||
| ble .Ldgemm_ncopy_L1_M4_END | |||
| .align 5 | |||
| .Ldgemm_ncopy_L1_M4_60: | |||
| COPY1x1 | |||
| subs I , I , #1 | |||
| bne .Ldgemm_ncopy_L1_M4_60 | |||
| .Ldgemm_ncopy_L1_M4_END: | |||
| .Ldgemm_ncopy_L999: | |||
| mov x0, #0 | |||
| RESTORE_REGS | |||
| ret | |||
| EPILOGUE | |||
| @@ -1,293 +1,293 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * Abdelrauf(quickwritereader@gmail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define LOAD ld | |||
| #define STACKSIZE (512 ) | |||
| #define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||
| #define M r3 | |||
| #define N r4 | |||
| #define K r5 | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r6 | |||
| #define OFFSET r7 | |||
| #define alpha_r vs19 | |||
| #define alpha_i vs20 | |||
| #define save_permute_1 vs21 | |||
| #define permute_mask vs22 | |||
| #define o0 0 | |||
| #define T1 r11 | |||
| #define T2 r12 | |||
| #define T3 r14 | |||
| #define T4 r15 | |||
| #define T5 r16 | |||
| #define T6 r17 | |||
| #define L r18 | |||
| #define T7 r19 | |||
| #define T8 r20 | |||
| #define TEMP_REG r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define T9 r27 | |||
| #define T10 r28 | |||
| #define PRE r29 | |||
| #define T12 r30 | |||
| #define T13 r31 | |||
| #include "cgemm_macros_power9.S" | |||
| .equ perm_const1, 0x0405060700010203 | |||
| .equ perm_const2, 0x0c0d0e0f08090a0b | |||
| .equ save_permute_12, 0x0c0d0e0f1c1d1e1f | |||
| .equ save_permute_11, 0x0405060714151617 | |||
| #ifndef NEEDPARAM | |||
| PROLOGUE | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE | |||
| mflr r0 | |||
| stfd f14, 0(SP) | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| std r31, 144(SP) | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| stxv vs52, 288(SP) | |||
| stxv vs53, 304(SP) | |||
| stxv vs54, 320(SP) | |||
| stxv vs55, 336(SP) | |||
| stxv vs56, 352(SP) | |||
| stxv vs57, 368(SP) | |||
| stxv vs58, 384(SP) | |||
| stxv vs59, 400(SP) | |||
| stxv vs60, 416(SP) | |||
| stxv vs61, 432(SP) | |||
| stxv vs62, 448(SP) | |||
| stxv vs63, 464(SP) | |||
| std r0, FLINK_SAVE(SP) | |||
| ld LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #ifdef TRMMKERNEL | |||
| ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||
| #endif | |||
| slwi LDC, LDC, ZBASE_SHIFT | |||
| /*alpha is stored in f1. convert to single and splat*/ | |||
| xscvdpspn alpha_r,vs1 | |||
| xscvdpspn alpha_i,vs2 | |||
| xxspltw alpha_r,alpha_r,0 | |||
| xxspltw alpha_i,alpha_i,0 | |||
| /*load reverse permute mask for big endian | |||
| uint128 = 0xc0d0e0f08090a0b0405060700010203 | |||
| */ | |||
| lis T2, perm_const2@highest | |||
| lis T1, perm_const1@highest | |||
| lis T3, save_permute_12@highest | |||
| lis T4, save_permute_11@highest | |||
| ori T2, T2, perm_const2@higher | |||
| ori T1, T1, perm_const1@higher | |||
| ori T3, T3, save_permute_12@higher | |||
| ori T4, T4, save_permute_11@higher | |||
| rldicr T2, T2, 32, 31 | |||
| rldicr T1, T1, 32, 31 | |||
| rldicr T3, T3, 32, 31 | |||
| rldicr T4, T4, 32, 31 | |||
| oris T2, T2, perm_const2@h | |||
| oris T1, T1, perm_const1@h | |||
| oris T3, T3, save_permute_12@h | |||
| oris T4, T4, save_permute_11@h | |||
| ori T2, T2, perm_const2@l | |||
| ori T1, T1, perm_const1@l | |||
| ori T3, T3, save_permute_12@l | |||
| ori T4, T4, save_permute_11@l | |||
| li r0,0 | |||
| li PRE,512 | |||
| #if defined(CC) || defined(CR) || defined(RC) || defined(RR) | |||
| /*negate for this case as we will use addition -1*(a+b) */ | |||
| xvnegsp alpha_r,alpha_r | |||
| xvnegsp alpha_i,alpha_i | |||
| #endif | |||
| mtvsrdd permute_mask,T2,T1 | |||
| mtvsrdd save_permute_1,T3,T4 | |||
| /*mask is reverse permute so we have to make it inner permute */ | |||
| xxpermdi permute_mask, permute_mask, permute_mask,2 | |||
| #include "cgemm_logic_power9.S" | |||
| .L999: | |||
| lfd f14, 0(SP) | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| ld r0, FLINK_SAVE(SP) | |||
| lxv vs52, 288(SP) | |||
| lxv vs53, 304(SP) | |||
| lxv vs54, 320(SP) | |||
| lxv vs55, 336(SP) | |||
| lxv vs56, 352(SP) | |||
| lxv vs57, 368(SP) | |||
| lxv vs58, 384(SP) | |||
| lxv vs59, 400(SP) | |||
| mtlr r0 | |||
| lxv vs60, 416(SP) | |||
| lxv vs61, 432(SP) | |||
| lxv vs62, 448(SP) | |||
| lxv vs63, 464(SP) | |||
| addi SP, SP, STACKSIZE | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * Abdelrauf(quickwritereader@gmail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define LOAD ld | |||
| #define STACKSIZE (512 ) | |||
| #define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||
| #define M r3 | |||
| #define N r4 | |||
| #define K r5 | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r6 | |||
| #define OFFSET r7 | |||
| #define alpha_r vs19 | |||
| #define alpha_i vs20 | |||
| #define save_permute_1 vs21 | |||
| #define permute_mask vs22 | |||
| #define o0 0 | |||
| #define T1 r11 | |||
| #define T2 r12 | |||
| #define T3 r14 | |||
| #define T4 r15 | |||
| #define T5 r16 | |||
| #define T6 r17 | |||
| #define L r18 | |||
| #define T7 r19 | |||
| #define T8 r20 | |||
| #define TEMP_REG r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define T9 r27 | |||
| #define T10 r28 | |||
| #define PRE r29 | |||
| #define T12 r30 | |||
| #define T13 r31 | |||
| #include "cgemm_macros_power9.S" | |||
| .equ perm_const1, 0x0405060700010203 | |||
| .equ perm_const2, 0x0c0d0e0f08090a0b | |||
| .equ save_permute_12, 0x0c0d0e0f1c1d1e1f | |||
| .equ save_permute_11, 0x0405060714151617 | |||
| #ifndef NEEDPARAM | |||
| PROLOGUE | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE | |||
| mflr r0 | |||
| stfd f14, 0(SP) | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| std r31, 144(SP) | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| stxv vs52, 288(SP) | |||
| stxv vs53, 304(SP) | |||
| stxv vs54, 320(SP) | |||
| stxv vs55, 336(SP) | |||
| stxv vs56, 352(SP) | |||
| stxv vs57, 368(SP) | |||
| stxv vs58, 384(SP) | |||
| stxv vs59, 400(SP) | |||
| stxv vs60, 416(SP) | |||
| stxv vs61, 432(SP) | |||
| stxv vs62, 448(SP) | |||
| stxv vs63, 464(SP) | |||
| std r0, FLINK_SAVE(SP) | |||
| ld LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #ifdef TRMMKERNEL | |||
| ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||
| #endif | |||
| slwi LDC, LDC, ZBASE_SHIFT | |||
| /*alpha is stored in f1. convert to single and splat*/ | |||
| xscvdpspn alpha_r,vs1 | |||
| xscvdpspn alpha_i,vs2 | |||
| xxspltw alpha_r,alpha_r,0 | |||
| xxspltw alpha_i,alpha_i,0 | |||
| /*load reverse permute mask for big endian | |||
| uint128 = 0xc0d0e0f08090a0b0405060700010203 | |||
| */ | |||
| lis T2, perm_const2@highest | |||
| lis T1, perm_const1@highest | |||
| lis T3, save_permute_12@highest | |||
| lis T4, save_permute_11@highest | |||
| ori T2, T2, perm_const2@higher | |||
| ori T1, T1, perm_const1@higher | |||
| ori T3, T3, save_permute_12@higher | |||
| ori T4, T4, save_permute_11@higher | |||
| rldicr T2, T2, 32, 31 | |||
| rldicr T1, T1, 32, 31 | |||
| rldicr T3, T3, 32, 31 | |||
| rldicr T4, T4, 32, 31 | |||
| oris T2, T2, perm_const2@h | |||
| oris T1, T1, perm_const1@h | |||
| oris T3, T3, save_permute_12@h | |||
| oris T4, T4, save_permute_11@h | |||
| ori T2, T2, perm_const2@l | |||
| ori T1, T1, perm_const1@l | |||
| ori T3, T3, save_permute_12@l | |||
| ori T4, T4, save_permute_11@l | |||
| li r0,0 | |||
| li PRE,512 | |||
| #if defined(CC) || defined(CR) || defined(RC) || defined(RR) | |||
| /*negate for this case as we will use addition -1*(a+b) */ | |||
| xvnegsp alpha_r,alpha_r | |||
| xvnegsp alpha_i,alpha_i | |||
| #endif | |||
| mtvsrdd permute_mask,T2,T1 | |||
| mtvsrdd save_permute_1,T3,T4 | |||
| /*mask is reverse permute so we have to make it inner permute */ | |||
| xxpermdi permute_mask, permute_mask, permute_mask,2 | |||
| #include "cgemm_logic_power9.S" | |||
| .L999: | |||
| lfd f14, 0(SP) | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| ld r0, FLINK_SAVE(SP) | |||
| lxv vs52, 288(SP) | |||
| lxv vs53, 304(SP) | |||
| lxv vs54, 320(SP) | |||
| lxv vs55, 336(SP) | |||
| lxv vs56, 352(SP) | |||
| lxv vs57, 368(SP) | |||
| lxv vs58, 384(SP) | |||
| lxv vs59, 400(SP) | |||
| mtlr r0 | |||
| lxv vs60, 416(SP) | |||
| lxv vs61, 432(SP) | |||
| lxv vs62, 448(SP) | |||
| lxv vs63, 464(SP) | |||
| addi SP, SP, STACKSIZE | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| @@ -1,233 +1,233 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2018, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| static void crot_kernel_8 (long n, float *x, float *y, float c, float s) | |||
| { | |||
| __vector float t0; | |||
| __vector float t1; | |||
| __vector float t2; | |||
| __vector float t3; | |||
| __vector float t4; | |||
| __vector float t5; | |||
| __vector float t6; | |||
| __vector float t7; | |||
| __asm__ | |||
| ( | |||
| "xscvdpspn 36, %x[cos] \n\t" // load c to all words | |||
| "xxspltw 36, 36, 0 \n\t" | |||
| "xscvdpspn 37, %x[sin] \n\t" // load s to all words | |||
| "xxspltw 37, 37, 0 \n\t" | |||
| "lxvd2x 32, 0, %[x_ptr] \n\t" // load x | |||
| "lxvd2x 33, %[i16], %[x_ptr] \n\t" | |||
| "lxvd2x 34, %[i32], %[x_ptr] \n\t" | |||
| "lxvd2x 35, %[i48], %[x_ptr] \n\t" | |||
| "lxvd2x 48, 0, %[y_ptr] \n\t" // load y | |||
| "lxvd2x 49, %[i16], %[y_ptr] \n\t" | |||
| "lxvd2x 50, %[i32], %[y_ptr] \n\t" | |||
| "lxvd2x 51, %[i48], %[y_ptr] \n\t" | |||
| "addi %[x_ptr], %[x_ptr], 64 \n\t" | |||
| "addi %[y_ptr], %[y_ptr], 64 \n\t" | |||
| "addic. %[temp_n], %[temp_n], -8 \n\t" | |||
| "ble two%= \n\t" | |||
| ".align 5 \n\t" | |||
| "one%=: \n\t" | |||
| "xvmulsp 40, 32, 36 \n\t" // c * x | |||
| "xvmulsp 41, 33, 36 \n\t" | |||
| "xvmulsp 42, 34, 36 \n\t" | |||
| "xvmulsp 43, 35, 36 \n\t" | |||
| "xvmulsp %x[x0], 48, 36 \n\t" // c * y | |||
| "xvmulsp %x[x2], 49, 36 \n\t" | |||
| "xvmulsp %x[x1], 50, 36 \n\t" | |||
| "xvmulsp %x[x3], 51, 36 \n\t" | |||
| "xvmulsp 44, 32, 37 \n\t" // s * x | |||
| "xvmulsp 45, 33, 37 \n\t" | |||
| "lxvd2x 32, 0, %[x_ptr] \n\t" // load x | |||
| "lxvd2x 33, %[i16], %[x_ptr] \n\t" | |||
| "xvmulsp 46, 34, 37 \n\t" | |||
| "xvmulsp 47, 35, 37 \n\t" | |||
| "lxvd2x 34, %[i32], %[x_ptr] \n\t" | |||
| "lxvd2x 35, %[i48], %[x_ptr] \n\t" | |||
| "xvmulsp %x[x4], 48, 37 \n\t" // s * y | |||
| "xvmulsp %x[x5], 49, 37 \n\t" | |||
| "lxvd2x 48, 0, %[y_ptr] \n\t" // load y | |||
| "lxvd2x 49, %[i16], %[y_ptr] \n\t" | |||
| "xvmulsp %x[x6], 50, 37 \n\t" | |||
| "xvmulsp %x[x7], 51, 37 \n\t" | |||
| "lxvd2x 50, %[i32], %[y_ptr] \n\t" | |||
| "lxvd2x 51, %[i48], %[y_ptr] \n\t" | |||
| "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y | |||
| "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y | |||
| "addi %[x_ptr], %[x_ptr], -64 \n\t" | |||
| "addi %[y_ptr], %[y_ptr], -64 \n\t" | |||
| "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y | |||
| "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y | |||
| "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x | |||
| "stxvd2x 40, 0, %[x_ptr] \n\t" // store x | |||
| "stxvd2x 41, %[i16], %[x_ptr] \n\t" | |||
| "stxvd2x 42, %[i32], %[x_ptr] \n\t" | |||
| "stxvd2x 43, %[i48], %[x_ptr] \n\t" | |||
| "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y | |||
| "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" | |||
| "addi %[x_ptr], %[x_ptr], 128 \n\t" | |||
| "addi %[y_ptr], %[y_ptr], 128 \n\t" | |||
| "addic. %[temp_n], %[temp_n], -8 \n\t" | |||
| "bgt one%= \n\t" | |||
| "two%=: \n\t" | |||
| "xvmulsp 40, 32, 36 \n\t" // c * x | |||
| "xvmulsp 41, 33, 36 \n\t" | |||
| "xvmulsp 42, 34, 36 \n\t" | |||
| "xvmulsp 43, 35, 36 \n\t" | |||
| "xvmulsp %x[x0], 48, 36 \n\t" // c * y | |||
| "xvmulsp %x[x2], 49, 36 \n\t" | |||
| "xvmulsp %x[x1], 50, 36 \n\t" | |||
| "xvmulsp %x[x3], 51, 36 \n\t" | |||
| "xvmulsp 44, 32, 37 \n\t" // s * x | |||
| "xvmulsp 45, 33, 37 \n\t" | |||
| "xvmulsp 46, 34, 37 \n\t" | |||
| "xvmulsp 47, 35, 37 \n\t" | |||
| "xvmulsp %x[x4], 48, 37 \n\t" // s * y | |||
| "xvmulsp %x[x5], 49, 37 \n\t" | |||
| "xvmulsp %x[x6], 50, 37 \n\t" | |||
| "xvmulsp %x[x7], 51, 37 \n\t" | |||
| "addi %[x_ptr], %[x_ptr], -64 \n\t" | |||
| "addi %[y_ptr], %[y_ptr], -64 \n\t" | |||
| "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y | |||
| "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y | |||
| "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y | |||
| "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y | |||
| "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x | |||
| "stxvd2x 40, 0, %[x_ptr] \n\t" // store x | |||
| "stxvd2x 41, %[i16], %[x_ptr] \n\t" | |||
| "stxvd2x 42, %[i32], %[x_ptr] \n\t" | |||
| "stxvd2x 43, %[i48], %[x_ptr] \n\t" | |||
| "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y | |||
| "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x3], %[i48], %[y_ptr] " | |||
| : | |||
| [mem_x] "+m" (*(float (*)[2*n])x), | |||
| [mem_y] "+m" (*(float (*)[2*n])y), | |||
| [temp_n] "+r" (n), | |||
| [x_ptr] "+&b" (x), | |||
| [y_ptr] "+&b" (y), | |||
| [x0] "=wa" (t0), | |||
| [x1] "=wa" (t2), | |||
| [x2] "=wa" (t1), | |||
| [x3] "=wa" (t3), | |||
| [x4] "=wa" (t4), | |||
| [x5] "=wa" (t5), | |||
| [x6] "=wa" (t6), | |||
| [x7] "=wa" (t7) | |||
| : | |||
| [cos] "f" (c), | |||
| [sin] "f" (s), | |||
| [i16] "b" (16), | |||
| [i32] "b" (32), | |||
| [i48] "b" (48) | |||
| : | |||
| "cr0", | |||
| "vs32","vs33","vs34","vs35","vs36","vs37", | |||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||
| "vs48","vs49","vs50","vs51" | |||
| ); | |||
| } | |||
| #endif | |||
| #endif | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| FLOAT temp[2]; | |||
| BLASLONG inc_x2; | |||
| BLASLONG inc_y2; | |||
| if ( n <= 0 ) return(0); | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| BLASLONG n1 = n & -8; | |||
| if ( n1 > 0 ) | |||
| { | |||
| crot_kernel_8(n1, x, y, c, s); | |||
| i=n1; | |||
| ix=2*n1; | |||
| } | |||
| #endif | |||
| while(i < n) | |||
| { | |||
| temp[0] = c*x[ix] + s*y[ix] ; | |||
| temp[1] = c*x[ix+1] + s*y[ix+1] ; | |||
| y[ix] = c*y[ix] - s*x[ix] ; | |||
| y[ix+1] = c*y[ix+1] - s*x[ix+1] ; | |||
| x[ix] = temp[0] ; | |||
| x[ix+1] = temp[1] ; | |||
| ix += 2 ; | |||
| i++ ; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| inc_x2 = 2 * inc_x ; | |||
| inc_y2 = 2 * inc_y ; | |||
| while(i < n) | |||
| { | |||
| temp[0] = c*x[ix] + s*y[iy] ; | |||
| temp[1] = c*x[ix+1] + s*y[iy+1] ; | |||
| y[iy] = c*y[iy] - s*x[ix] ; | |||
| y[iy+1] = c*y[iy+1] - s*x[ix+1] ; | |||
| x[ix] = temp[0] ; | |||
| x[ix+1] = temp[1] ; | |||
| ix += inc_x2 ; | |||
| iy += inc_y2 ; | |||
| i++ ; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2018, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| static void crot_kernel_8 (long n, float *x, float *y, float c, float s) | |||
| { | |||
| __vector float t0; | |||
| __vector float t1; | |||
| __vector float t2; | |||
| __vector float t3; | |||
| __vector float t4; | |||
| __vector float t5; | |||
| __vector float t6; | |||
| __vector float t7; | |||
| __asm__ | |||
| ( | |||
| "xscvdpspn 36, %x[cos] \n\t" // load c to all words | |||
| "xxspltw 36, 36, 0 \n\t" | |||
| "xscvdpspn 37, %x[sin] \n\t" // load s to all words | |||
| "xxspltw 37, 37, 0 \n\t" | |||
| "lxvd2x 32, 0, %[x_ptr] \n\t" // load x | |||
| "lxvd2x 33, %[i16], %[x_ptr] \n\t" | |||
| "lxvd2x 34, %[i32], %[x_ptr] \n\t" | |||
| "lxvd2x 35, %[i48], %[x_ptr] \n\t" | |||
| "lxvd2x 48, 0, %[y_ptr] \n\t" // load y | |||
| "lxvd2x 49, %[i16], %[y_ptr] \n\t" | |||
| "lxvd2x 50, %[i32], %[y_ptr] \n\t" | |||
| "lxvd2x 51, %[i48], %[y_ptr] \n\t" | |||
| "addi %[x_ptr], %[x_ptr], 64 \n\t" | |||
| "addi %[y_ptr], %[y_ptr], 64 \n\t" | |||
| "addic. %[temp_n], %[temp_n], -8 \n\t" | |||
| "ble two%= \n\t" | |||
| ".align 5 \n\t" | |||
| "one%=: \n\t" | |||
| "xvmulsp 40, 32, 36 \n\t" // c * x | |||
| "xvmulsp 41, 33, 36 \n\t" | |||
| "xvmulsp 42, 34, 36 \n\t" | |||
| "xvmulsp 43, 35, 36 \n\t" | |||
| "xvmulsp %x[x0], 48, 36 \n\t" // c * y | |||
| "xvmulsp %x[x2], 49, 36 \n\t" | |||
| "xvmulsp %x[x1], 50, 36 \n\t" | |||
| "xvmulsp %x[x3], 51, 36 \n\t" | |||
| "xvmulsp 44, 32, 37 \n\t" // s * x | |||
| "xvmulsp 45, 33, 37 \n\t" | |||
| "lxvd2x 32, 0, %[x_ptr] \n\t" // load x | |||
| "lxvd2x 33, %[i16], %[x_ptr] \n\t" | |||
| "xvmulsp 46, 34, 37 \n\t" | |||
| "xvmulsp 47, 35, 37 \n\t" | |||
| "lxvd2x 34, %[i32], %[x_ptr] \n\t" | |||
| "lxvd2x 35, %[i48], %[x_ptr] \n\t" | |||
| "xvmulsp %x[x4], 48, 37 \n\t" // s * y | |||
| "xvmulsp %x[x5], 49, 37 \n\t" | |||
| "lxvd2x 48, 0, %[y_ptr] \n\t" // load y | |||
| "lxvd2x 49, %[i16], %[y_ptr] \n\t" | |||
| "xvmulsp %x[x6], 50, 37 \n\t" | |||
| "xvmulsp %x[x7], 51, 37 \n\t" | |||
| "lxvd2x 50, %[i32], %[y_ptr] \n\t" | |||
| "lxvd2x 51, %[i48], %[y_ptr] \n\t" | |||
| "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y | |||
| "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y | |||
| "addi %[x_ptr], %[x_ptr], -64 \n\t" | |||
| "addi %[y_ptr], %[y_ptr], -64 \n\t" | |||
| "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y | |||
| "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y | |||
| "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x | |||
| "stxvd2x 40, 0, %[x_ptr] \n\t" // store x | |||
| "stxvd2x 41, %[i16], %[x_ptr] \n\t" | |||
| "stxvd2x 42, %[i32], %[x_ptr] \n\t" | |||
| "stxvd2x 43, %[i48], %[x_ptr] \n\t" | |||
| "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y | |||
| "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" | |||
| "addi %[x_ptr], %[x_ptr], 128 \n\t" | |||
| "addi %[y_ptr], %[y_ptr], 128 \n\t" | |||
| "addic. %[temp_n], %[temp_n], -8 \n\t" | |||
| "bgt one%= \n\t" | |||
| "two%=: \n\t" | |||
| "xvmulsp 40, 32, 36 \n\t" // c * x | |||
| "xvmulsp 41, 33, 36 \n\t" | |||
| "xvmulsp 42, 34, 36 \n\t" | |||
| "xvmulsp 43, 35, 36 \n\t" | |||
| "xvmulsp %x[x0], 48, 36 \n\t" // c * y | |||
| "xvmulsp %x[x2], 49, 36 \n\t" | |||
| "xvmulsp %x[x1], 50, 36 \n\t" | |||
| "xvmulsp %x[x3], 51, 36 \n\t" | |||
| "xvmulsp 44, 32, 37 \n\t" // s * x | |||
| "xvmulsp 45, 33, 37 \n\t" | |||
| "xvmulsp 46, 34, 37 \n\t" | |||
| "xvmulsp 47, 35, 37 \n\t" | |||
| "xvmulsp %x[x4], 48, 37 \n\t" // s * y | |||
| "xvmulsp %x[x5], 49, 37 \n\t" | |||
| "xvmulsp %x[x6], 50, 37 \n\t" | |||
| "xvmulsp %x[x7], 51, 37 \n\t" | |||
| "addi %[x_ptr], %[x_ptr], -64 \n\t" | |||
| "addi %[y_ptr], %[y_ptr], -64 \n\t" | |||
| "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y | |||
| "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y | |||
| "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y | |||
| "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y | |||
| "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x | |||
| "stxvd2x 40, 0, %[x_ptr] \n\t" // store x | |||
| "stxvd2x 41, %[i16], %[x_ptr] \n\t" | |||
| "stxvd2x 42, %[i32], %[x_ptr] \n\t" | |||
| "stxvd2x 43, %[i48], %[x_ptr] \n\t" | |||
| "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y | |||
| "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x3], %[i48], %[y_ptr] " | |||
| : | |||
| [mem_x] "+m" (*(float (*)[2*n])x), | |||
| [mem_y] "+m" (*(float (*)[2*n])y), | |||
| [temp_n] "+r" (n), | |||
| [x_ptr] "+&b" (x), | |||
| [y_ptr] "+&b" (y), | |||
| [x0] "=wa" (t0), | |||
| [x1] "=wa" (t2), | |||
| [x2] "=wa" (t1), | |||
| [x3] "=wa" (t3), | |||
| [x4] "=wa" (t4), | |||
| [x5] "=wa" (t5), | |||
| [x6] "=wa" (t6), | |||
| [x7] "=wa" (t7) | |||
| : | |||
| [cos] "f" (c), | |||
| [sin] "f" (s), | |||
| [i16] "b" (16), | |||
| [i32] "b" (32), | |||
| [i48] "b" (48) | |||
| : | |||
| "cr0", | |||
| "vs32","vs33","vs34","vs35","vs36","vs37", | |||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||
| "vs48","vs49","vs50","vs51" | |||
| ); | |||
| } | |||
| #endif | |||
| #endif | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| FLOAT temp[2]; | |||
| BLASLONG inc_x2; | |||
| BLASLONG inc_y2; | |||
| if ( n <= 0 ) return(0); | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| BLASLONG n1 = n & -8; | |||
| if ( n1 > 0 ) | |||
| { | |||
| crot_kernel_8(n1, x, y, c, s); | |||
| i=n1; | |||
| ix=2*n1; | |||
| } | |||
| #endif | |||
| while(i < n) | |||
| { | |||
| temp[0] = c*x[ix] + s*y[ix] ; | |||
| temp[1] = c*x[ix+1] + s*y[ix+1] ; | |||
| y[ix] = c*y[ix] - s*x[ix] ; | |||
| y[ix+1] = c*y[ix+1] - s*x[ix+1] ; | |||
| x[ix] = temp[0] ; | |||
| x[ix+1] = temp[1] ; | |||
| ix += 2 ; | |||
| i++ ; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| inc_x2 = 2 * inc_x ; | |||
| inc_y2 = 2 * inc_y ; | |||
| while(i < n) | |||
| { | |||
| temp[0] = c*x[ix] + s*y[iy] ; | |||
| temp[1] = c*x[ix+1] + s*y[iy+1] ; | |||
| y[iy] = c*y[iy] - s*x[ix] ; | |||
| y[iy+1] = c*y[iy+1] - s*x[ix+1] ; | |||
| x[ix] = temp[0] ; | |||
| x[ix+1] = temp[1] ; | |||
| ix += inc_x2 ; | |||
| iy += inc_y2 ; | |||
| i++ ; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -1,249 +1,249 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define LOAD ld | |||
| #define STACKSIZE (512 ) | |||
| #define ALPHA_SP (296+192)(SP) | |||
| #define FZERO (304+192)(SP) | |||
| #define M r3 | |||
| #define N r4 | |||
| #define K r5 | |||
| #define A r7 | |||
| #define B r8 | |||
| #define C r9 | |||
| #define LDC r10 | |||
| #define OFFSET r6 | |||
| #define alpha_r vs18 | |||
| #define o0 0 | |||
| #define T4 r12 | |||
| #define T3 r11 | |||
| #define C4 r14 | |||
| #define o8 r15 | |||
| #define o24 r16 | |||
| #define C2 r17 | |||
| #define L r18 | |||
| #define T1 r19 | |||
| #define C3 r20 | |||
| #define TEMP_REG r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define o16 r27 | |||
| #define o32 r28 | |||
| #define o48 r29 | |||
| #define PRE r30 | |||
| #define T2 r31 | |||
| #include "dgemm_macros_power9.S" | |||
| #ifndef NEEDPARAM | |||
| PROLOGUE | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE | |||
| li r0, 0 | |||
| stfd f14, 0(SP) | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| std r31, 144(SP) | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| stxv vs52, 288(SP) | |||
| stxv vs53, 304(SP) | |||
| stxv vs54, 320(SP) | |||
| stxv vs55, 336(SP) | |||
| stxv vs56, 352(SP) | |||
| stxv vs57, 368(SP) | |||
| stxv vs58, 384(SP) | |||
| stxv vs59, 400(SP) | |||
| stxv vs60, 416(SP) | |||
| stxv vs61, 432(SP) | |||
| stxv vs62, 448(SP) | |||
| stxv vs63, 464(SP) | |||
| stfd f1, ALPHA_SP | |||
| stw r0, FZERO | |||
| slwi LDC, LDC, BASE_SHIFT | |||
| #if defined(TRMMKERNEL) | |||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #endif | |||
| cmpwi cr0, M, 0 | |||
| ble .L999_H1 | |||
| cmpwi cr0, N, 0 | |||
| ble .L999_H1 | |||
| cmpwi cr0, K, 0 | |||
| ble .L999_H1 | |||
| addi T1, SP, 296+192 | |||
| li PRE, 384 | |||
| li o8 , 8 | |||
| li o16, 16 | |||
| li o24, 24 | |||
| li o32, 32 | |||
| li o48, 48 | |||
| lxvdsx alpha_r, 0, T1 | |||
| #include "dgemm_logic_power9.S" | |||
| .L999: | |||
| addi r3, 0, 0 | |||
| lfd f14, 0(SP) | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| lxv vs52, 288(SP) | |||
| lxv vs53, 304(SP) | |||
| lxv vs54, 320(SP) | |||
| lxv vs55, 336(SP) | |||
| lxv vs56, 352(SP) | |||
| lxv vs57, 368(SP) | |||
| lxv vs58, 384(SP) | |||
| lxv vs59, 400(SP) | |||
| lxv vs60, 416(SP) | |||
| lxv vs61, 432(SP) | |||
| lxv vs62, 448(SP) | |||
| lxv vs63, 464(SP) | |||
| addi SP, SP, STACKSIZE | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define LOAD ld | |||
| #define STACKSIZE (512 ) | |||
| #define ALPHA_SP (296+192)(SP) | |||
| #define FZERO (304+192)(SP) | |||
| #define M r3 | |||
| #define N r4 | |||
| #define K r5 | |||
| #define A r7 | |||
| #define B r8 | |||
| #define C r9 | |||
| #define LDC r10 | |||
| #define OFFSET r6 | |||
| #define alpha_r vs18 | |||
| #define o0 0 | |||
| #define T4 r12 | |||
| #define T3 r11 | |||
| #define C4 r14 | |||
| #define o8 r15 | |||
| #define o24 r16 | |||
| #define C2 r17 | |||
| #define L r18 | |||
| #define T1 r19 | |||
| #define C3 r20 | |||
| #define TEMP_REG r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define o16 r27 | |||
| #define o32 r28 | |||
| #define o48 r29 | |||
| #define PRE r30 | |||
| #define T2 r31 | |||
| #include "dgemm_macros_power9.S" | |||
| #ifndef NEEDPARAM | |||
| PROLOGUE | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE | |||
| li r0, 0 | |||
| stfd f14, 0(SP) | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| std r31, 144(SP) | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| stxv vs52, 288(SP) | |||
| stxv vs53, 304(SP) | |||
| stxv vs54, 320(SP) | |||
| stxv vs55, 336(SP) | |||
| stxv vs56, 352(SP) | |||
| stxv vs57, 368(SP) | |||
| stxv vs58, 384(SP) | |||
| stxv vs59, 400(SP) | |||
| stxv vs60, 416(SP) | |||
| stxv vs61, 432(SP) | |||
| stxv vs62, 448(SP) | |||
| stxv vs63, 464(SP) | |||
| stfd f1, ALPHA_SP | |||
| stw r0, FZERO | |||
| slwi LDC, LDC, BASE_SHIFT | |||
| #if defined(TRMMKERNEL) | |||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #endif | |||
| cmpwi cr0, M, 0 | |||
| ble .L999_H1 | |||
| cmpwi cr0, N, 0 | |||
| ble .L999_H1 | |||
| cmpwi cr0, K, 0 | |||
| ble .L999_H1 | |||
| addi T1, SP, 296+192 | |||
| li PRE, 384 | |||
| li o8 , 8 | |||
| li o16, 16 | |||
| li o24, 24 | |||
| li o32, 32 | |||
| li o48, 48 | |||
| lxvdsx alpha_r, 0, T1 | |||
| #include "dgemm_logic_power9.S" | |||
| .L999: | |||
| addi r3, 0, 0 | |||
| lfd f14, 0(SP) | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| lxv vs52, 288(SP) | |||
| lxv vs53, 304(SP) | |||
| lxv vs54, 320(SP) | |||
| lxv vs55, 336(SP) | |||
| lxv vs56, 352(SP) | |||
| lxv vs57, 368(SP) | |||
| lxv vs58, 384(SP) | |||
| lxv vs59, 400(SP) | |||
| lxv vs60, 416(SP) | |||
| lxv vs61, 432(SP) | |||
| lxv vs62, 448(SP) | |||
| lxv vs63, 464(SP) | |||
| addi SP, SP, STACKSIZE | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| @@ -1,328 +1,328 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <altivec.h> | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) | |||
| #define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc output a little faster code | |||
| #if !defined(USE_MASK_PERMUTATIONS) | |||
| static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){ | |||
| __vector float result; | |||
| __asm__ ( | |||
| "vmrgew %0,%1,%2;\n" | |||
| : "=v" (result) | |||
| : "v" (a), | |||
| "v" (b) | |||
| : ); | |||
| return result; | |||
| } | |||
| static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){ | |||
| __vector float result; | |||
| __asm__ ( | |||
| "vmrgow %0,%1,%2;\n" | |||
| : "=v" (result) | |||
| : "v" (a), | |||
| "v" (b) | |||
| : ); | |||
| return result; | |||
| } | |||
| #endif | |||
| /** | |||
| * Find maximum index | |||
| * Warning: requirements n>0 and n % 32 == 0 | |||
| * @param n | |||
| * @param x pointer to the vector | |||
| * @param maxf (out) maximum absolute value .( only for output ) | |||
| * @return index | |||
| */ | |||
| static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||
| BLASLONG index; | |||
| BLASLONG i=0; | |||
| #if defined(USE_MASK_PERMUTATIONS) | |||
| register __vector unsigned int static_index0 = {0,1,2,3}; | |||
| #else | |||
| register __vector unsigned int static_index0 = {2,0,3,1}; | |||
| #endif | |||
| register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register | |||
| register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
| register __vector unsigned int static_index1=static_index0 +temp0; | |||
| register __vector unsigned int static_index2=static_index0 +temp1; | |||
| register __vector unsigned int static_index3=static_index1 +temp1; | |||
| temp0=vec_xor(temp0,temp0); | |||
| temp1=temp1 <<1 ; //{16,16,16,16} | |||
| register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} | |||
| register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} | |||
| register __vector float quadruple_values={0,0,0,0}; | |||
| register __vector float * v_ptrx=(__vector float *)x; | |||
| #if defined(USE_MASK_PERMUTATIONS) | |||
| register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; | |||
| register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; | |||
| #endif | |||
| for(; i<n; i+=32 ){ | |||
| //absolute temporary complex vectors | |||
| register __vector float v0=vec_abs(v_ptrx[0]); | |||
| register __vector float v1=vec_abs(v_ptrx[1]); | |||
| register __vector float v2=vec_abs(v_ptrx[2]); | |||
| register __vector float v3=vec_abs(v_ptrx[3]); | |||
| register __vector float v4=vec_abs(v_ptrx[4]); | |||
| register __vector float v5=vec_abs(v_ptrx[5]); | |||
| register __vector float v6=vec_abs(v_ptrx[6]); | |||
| register __vector float v7=vec_abs(v_ptrx[7]); | |||
| //pack complex real and imaginary parts together to sum real+image | |||
| #if defined(USE_MASK_PERMUTATIONS) | |||
| register __vector float t1=vec_perm(v0,v1,real_pack_mask); | |||
| register __vector float ti=vec_perm(v0,v1,image_pack_mask); | |||
| v0=t1+ti; //sum quadruple real with quadruple image | |||
| register __vector float t2=vec_perm(v2,v3,real_pack_mask); | |||
| register __vector float ti2=vec_perm(v2,v3,image_pack_mask); | |||
| v1=t2+ti2; | |||
| t1=vec_perm(v4,v5,real_pack_mask); | |||
| ti=vec_perm(v4,v5,image_pack_mask); | |||
| v2=t1+ti; //sum | |||
| t2=vec_perm(v6,v7,real_pack_mask); | |||
| ti2=vec_perm(v6,v7,image_pack_mask); | |||
| v3=t2+ti2; | |||
| #else | |||
| register __vector float t1=mvec_mergee(v0,v1); | |||
| register __vector float ti=mvec_mergeo(v0,v1); | |||
| v0=t1+ti; //sum quadruple real with quadruple image | |||
| register __vector float t2= mvec_mergee(v2,v3); | |||
| register __vector float ti2=mvec_mergeo(v2,v3); | |||
| v1=t2+ti2; | |||
| t1=mvec_mergee(v4,v5); | |||
| ti=mvec_mergeo(v4,v5); | |||
| v2=t1+ti; //sum | |||
| t2=mvec_mergee(v6,v7); | |||
| ti2=mvec_mergeo(v6,v7); | |||
| v3=t2+ti2; | |||
| #endif | |||
| // now we have 16 summed elements . lets compare them | |||
| v_ptrx+=8; | |||
| register __vector bool int r1=vec_cmpgt(v1,v0); | |||
| register __vector bool int r2=vec_cmpgt(v3,v2); | |||
| register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r1); | |||
| v0=vec_sel(v0,v1,r1); | |||
| register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r2); | |||
| v1=vec_sel(v2,v3,r2); | |||
| //final cmp and select index and value for first 16 values | |||
| r1=vec_cmpgt(v1,v0); | |||
| register __vector unsigned int indf0 = vec_sel(ind2,ind3,r1); | |||
| register __vector float vf0= vec_sel(v0,v1,r1); | |||
| //absolute temporary complex vectors | |||
| v0=vec_abs(v_ptrx[0]); | |||
| v1=vec_abs(v_ptrx[1]); | |||
| v2=vec_abs(v_ptrx[2]); | |||
| v3=vec_abs(v_ptrx[3]); | |||
| v4=vec_abs(v_ptrx[4]); | |||
| v5=vec_abs(v_ptrx[5]); | |||
| v6=vec_abs(v_ptrx[6]); | |||
| v7=vec_abs(v_ptrx[7]); | |||
| //pack complex real and imaginary parts together to sum real+image | |||
| #if defined(USE_MASK_PERMUTATIONS) | |||
| t1=vec_perm(v0,v1,real_pack_mask); | |||
| ti=vec_perm(v0,v1,image_pack_mask); | |||
| v0=t1+ti; //sum quadruple real with quadruple image | |||
| t2=vec_perm(v2,v3,real_pack_mask); | |||
| ti2=vec_perm(v2,v3,image_pack_mask); | |||
| v1=t2+ti2; | |||
| t1=vec_perm(v4,v5,real_pack_mask); | |||
| ti=vec_perm(v4,v5,image_pack_mask); | |||
| v2=t1+ti; //sum | |||
| t2=vec_perm(v6,v7,real_pack_mask); | |||
| ti2=vec_perm(v6,v7,image_pack_mask); | |||
| v3=t2+ti2; | |||
| #else | |||
| t1=mvec_mergee(v0,v1); | |||
| ti=mvec_mergeo(v0,v1); | |||
| v0=t1+ti; //sum quadruple real with quadruple image | |||
| t2=mvec_mergee(v2,v3); | |||
| ti2=mvec_mergeo(v2,v3); | |||
| v1=t2+ti2; | |||
| t1=mvec_mergee(v4,v5); | |||
| ti=mvec_mergeo(v4,v5); | |||
| v2=t1+ti; //sum | |||
| t2=mvec_mergee(v6,v7); | |||
| ti2=mvec_mergeo(v6,v7); | |||
| v3=t2+ti2; | |||
| #endif | |||
| // now we have 16 summed elements {from 16 to 31} . lets compare them | |||
| v_ptrx+=8; | |||
| r1=vec_cmpgt(v1,v0); | |||
| r2=vec_cmpgt(v3,v2); | |||
| ind2= vec_sel(static_index0,static_index1,r1); | |||
| v0=vec_sel(v0,v1,r1); | |||
| ind3= vec_sel(static_index2,static_index3,r2); | |||
| v1=vec_sel(v2,v3,r2); | |||
| //final cmp and select index and value for the second 16 values | |||
| r1=vec_cmpgt(v1,v0); | |||
| register __vector unsigned int indv0 = vec_sel(ind2,ind3,r1); | |||
| register __vector float vv0= vec_sel(v0,v1,r1); | |||
| indv0+=temp1; //make index from 16->31 | |||
| //find final quadruple from 32 elements | |||
| r2=vec_cmpgt(vv0,vf0); | |||
| ind2 = vec_sel( indf0,indv0,r2); | |||
| vv0= vec_sel(vf0,vv0,r2); | |||
| //get asbolute index | |||
| ind2+=temp0; | |||
| //compare with old quadruple and update | |||
| r1=vec_cmpgt(vv0,quadruple_values); | |||
| quadruple_indices = vec_sel( quadruple_indices,ind2,r1); | |||
| quadruple_values= vec_sel(quadruple_values,vv0,r1); | |||
| temp0+=temp_add; | |||
| } | |||
| //now we have to chose from 4 values and 4 different indices | |||
| // we will compare pairwise if pairs are exactly the same we will choose minimum between index | |||
| // otherwise we will assign index of the maximum value | |||
| float a1,a2,a3,a4; | |||
| unsigned int i1,i2,i3,i4; | |||
| a1=vec_extract(quadruple_values,0); | |||
| a2=vec_extract(quadruple_values,1); | |||
| a3=vec_extract(quadruple_values,2); | |||
| a4=vec_extract(quadruple_values,3); | |||
| i1=vec_extract(quadruple_indices,0); | |||
| i2=vec_extract(quadruple_indices,1); | |||
| i3=vec_extract(quadruple_indices,2); | |||
| i4=vec_extract(quadruple_indices,3); | |||
| if(a1==a2){ | |||
| index=i1>i2?i2:i1; | |||
| }else if(a2>a1){ | |||
| index=i2; | |||
| a1=a2; | |||
| }else{ | |||
| index= i1; | |||
| } | |||
| if(a4==a3){ | |||
| i1=i3>i4?i4:i3; | |||
| }else if(a4>a3){ | |||
| i1=i4; | |||
| a3=a4; | |||
| }else{ | |||
| i1= i3; | |||
| } | |||
| if(a1==a3){ | |||
| index=i1>index?index:i1; | |||
| *maxf=a1; | |||
| }else if(a3>a1){ | |||
| index=i1; | |||
| *maxf=a3; | |||
| }else{ | |||
| *maxf=a1; | |||
| } | |||
| return index; | |||
| } | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i = 0; | |||
| BLASLONG ix = 0; | |||
| FLOAT maxf = 0; | |||
| BLASLONG max = 0; | |||
| BLASLONG inc_x2; | |||
| if (n <= 0 || inc_x <= 0) return(max); | |||
| if (inc_x == 1) { | |||
| BLASLONG n1 = n & -32; | |||
| if (n1 > 0) { | |||
| max = ciamax_kernel_32(n1, x, &maxf); | |||
| i = n1; | |||
| ix = n1 << 1; | |||
| } | |||
| while(i < n) | |||
| { | |||
| if( CABS1(x,ix) > maxf ) | |||
| { | |||
| max = i; | |||
| maxf = CABS1(x,ix); | |||
| } | |||
| ix += 2; | |||
| i++; | |||
| } | |||
| return (max + 1); | |||
| } else { | |||
| inc_x2 = 2 * inc_x; | |||
| maxf = CABS1(x,0); | |||
| ix += inc_x2; | |||
| i++; | |||
| while(i < n) | |||
| { | |||
| if( CABS1(x,ix) > maxf ) | |||
| { | |||
| max = i; | |||
| maxf = CABS1(x,ix); | |||
| } | |||
| ix += inc_x2; | |||
| i++; | |||
| } | |||
| return (max + 1); | |||
| } | |||
| } | |||
| /*************************************************************************** | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <altivec.h> | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) | |||
| #define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc output a little faster code | |||
| #if !defined(USE_MASK_PERMUTATIONS) | |||
| static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){ | |||
| __vector float result; | |||
| __asm__ ( | |||
| "vmrgew %0,%1,%2;\n" | |||
| : "=v" (result) | |||
| : "v" (a), | |||
| "v" (b) | |||
| : ); | |||
| return result; | |||
| } | |||
| static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){ | |||
| __vector float result; | |||
| __asm__ ( | |||
| "vmrgow %0,%1,%2;\n" | |||
| : "=v" (result) | |||
| : "v" (a), | |||
| "v" (b) | |||
| : ); | |||
| return result; | |||
| } | |||
| #endif | |||
| /** | |||
| * Find maximum index | |||
| * Warning: requirements n>0 and n % 32 == 0 | |||
| * @param n | |||
| * @param x pointer to the vector | |||
| * @param maxf (out) maximum absolute value .( only for output ) | |||
| * @return index | |||
| */ | |||
| static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||
| BLASLONG index; | |||
| BLASLONG i=0; | |||
| #if defined(USE_MASK_PERMUTATIONS) | |||
| register __vector unsigned int static_index0 = {0,1,2,3}; | |||
| #else | |||
| register __vector unsigned int static_index0 = {2,0,3,1}; | |||
| #endif | |||
| register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register | |||
| register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
| register __vector unsigned int static_index1=static_index0 +temp0; | |||
| register __vector unsigned int static_index2=static_index0 +temp1; | |||
| register __vector unsigned int static_index3=static_index1 +temp1; | |||
| temp0=vec_xor(temp0,temp0); | |||
| temp1=temp1 <<1 ; //{16,16,16,16} | |||
| register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} | |||
| register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} | |||
| register __vector float quadruple_values={0,0,0,0}; | |||
| register __vector float * v_ptrx=(__vector float *)x; | |||
| #if defined(USE_MASK_PERMUTATIONS) | |||
| register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; | |||
| register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; | |||
| #endif | |||
| for(; i<n; i+=32 ){ | |||
| //absolute temporary complex vectors | |||
| register __vector float v0=vec_abs(v_ptrx[0]); | |||
| register __vector float v1=vec_abs(v_ptrx[1]); | |||
| register __vector float v2=vec_abs(v_ptrx[2]); | |||
| register __vector float v3=vec_abs(v_ptrx[3]); | |||
| register __vector float v4=vec_abs(v_ptrx[4]); | |||
| register __vector float v5=vec_abs(v_ptrx[5]); | |||
| register __vector float v6=vec_abs(v_ptrx[6]); | |||
| register __vector float v7=vec_abs(v_ptrx[7]); | |||
| //pack complex real and imaginary parts together to sum real+image | |||
| #if defined(USE_MASK_PERMUTATIONS) | |||
| register __vector float t1=vec_perm(v0,v1,real_pack_mask); | |||
| register __vector float ti=vec_perm(v0,v1,image_pack_mask); | |||
| v0=t1+ti; //sum quadruple real with quadruple image | |||
| register __vector float t2=vec_perm(v2,v3,real_pack_mask); | |||
| register __vector float ti2=vec_perm(v2,v3,image_pack_mask); | |||
| v1=t2+ti2; | |||
| t1=vec_perm(v4,v5,real_pack_mask); | |||
| ti=vec_perm(v4,v5,image_pack_mask); | |||
| v2=t1+ti; //sum | |||
| t2=vec_perm(v6,v7,real_pack_mask); | |||
| ti2=vec_perm(v6,v7,image_pack_mask); | |||
| v3=t2+ti2; | |||
| #else | |||
| register __vector float t1=mvec_mergee(v0,v1); | |||
| register __vector float ti=mvec_mergeo(v0,v1); | |||
| v0=t1+ti; //sum quadruple real with quadruple image | |||
| register __vector float t2= mvec_mergee(v2,v3); | |||
| register __vector float ti2=mvec_mergeo(v2,v3); | |||
| v1=t2+ti2; | |||
| t1=mvec_mergee(v4,v5); | |||
| ti=mvec_mergeo(v4,v5); | |||
| v2=t1+ti; //sum | |||
| t2=mvec_mergee(v6,v7); | |||
| ti2=mvec_mergeo(v6,v7); | |||
| v3=t2+ti2; | |||
| #endif | |||
| // now we have 16 summed elements . lets compare them | |||
| v_ptrx+=8; | |||
| register __vector bool int r1=vec_cmpgt(v1,v0); | |||
| register __vector bool int r2=vec_cmpgt(v3,v2); | |||
| register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r1); | |||
| v0=vec_sel(v0,v1,r1); | |||
| register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r2); | |||
| v1=vec_sel(v2,v3,r2); | |||
| //final cmp and select index and value for first 16 values | |||
| r1=vec_cmpgt(v1,v0); | |||
| register __vector unsigned int indf0 = vec_sel(ind2,ind3,r1); | |||
| register __vector float vf0= vec_sel(v0,v1,r1); | |||
| //absolute temporary complex vectors | |||
| v0=vec_abs(v_ptrx[0]); | |||
| v1=vec_abs(v_ptrx[1]); | |||
| v2=vec_abs(v_ptrx[2]); | |||
| v3=vec_abs(v_ptrx[3]); | |||
| v4=vec_abs(v_ptrx[4]); | |||
| v5=vec_abs(v_ptrx[5]); | |||
| v6=vec_abs(v_ptrx[6]); | |||
| v7=vec_abs(v_ptrx[7]); | |||
| //pack complex real and imaginary parts together to sum real+image | |||
| #if defined(USE_MASK_PERMUTATIONS) | |||
| t1=vec_perm(v0,v1,real_pack_mask); | |||
| ti=vec_perm(v0,v1,image_pack_mask); | |||
| v0=t1+ti; //sum quadruple real with quadruple image | |||
| t2=vec_perm(v2,v3,real_pack_mask); | |||
| ti2=vec_perm(v2,v3,image_pack_mask); | |||
| v1=t2+ti2; | |||
| t1=vec_perm(v4,v5,real_pack_mask); | |||
| ti=vec_perm(v4,v5,image_pack_mask); | |||
| v2=t1+ti; //sum | |||
| t2=vec_perm(v6,v7,real_pack_mask); | |||
| ti2=vec_perm(v6,v7,image_pack_mask); | |||
| v3=t2+ti2; | |||
| #else | |||
| t1=mvec_mergee(v0,v1); | |||
| ti=mvec_mergeo(v0,v1); | |||
| v0=t1+ti; //sum quadruple real with quadruple image | |||
| t2=mvec_mergee(v2,v3); | |||
| ti2=mvec_mergeo(v2,v3); | |||
| v1=t2+ti2; | |||
| t1=mvec_mergee(v4,v5); | |||
| ti=mvec_mergeo(v4,v5); | |||
| v2=t1+ti; //sum | |||
| t2=mvec_mergee(v6,v7); | |||
| ti2=mvec_mergeo(v6,v7); | |||
| v3=t2+ti2; | |||
| #endif | |||
| // now we have 16 summed elements {from 16 to 31} . lets compare them | |||
| v_ptrx+=8; | |||
| r1=vec_cmpgt(v1,v0); | |||
| r2=vec_cmpgt(v3,v2); | |||
| ind2= vec_sel(static_index0,static_index1,r1); | |||
| v0=vec_sel(v0,v1,r1); | |||
| ind3= vec_sel(static_index2,static_index3,r2); | |||
| v1=vec_sel(v2,v3,r2); | |||
| //final cmp and select index and value for the second 16 values | |||
| r1=vec_cmpgt(v1,v0); | |||
| register __vector unsigned int indv0 = vec_sel(ind2,ind3,r1); | |||
| register __vector float vv0= vec_sel(v0,v1,r1); | |||
| indv0+=temp1; //make index from 16->31 | |||
| //find final quadruple from 32 elements | |||
| r2=vec_cmpgt(vv0,vf0); | |||
| ind2 = vec_sel( indf0,indv0,r2); | |||
| vv0= vec_sel(vf0,vv0,r2); | |||
| //get asbolute index | |||
| ind2+=temp0; | |||
| //compare with old quadruple and update | |||
| r1=vec_cmpgt(vv0,quadruple_values); | |||
| quadruple_indices = vec_sel( quadruple_indices,ind2,r1); | |||
| quadruple_values= vec_sel(quadruple_values,vv0,r1); | |||
| temp0+=temp_add; | |||
| } | |||
| //now we have to chose from 4 values and 4 different indices | |||
| // we will compare pairwise if pairs are exactly the same we will choose minimum between index | |||
| // otherwise we will assign index of the maximum value | |||
| float a1,a2,a3,a4; | |||
| unsigned int i1,i2,i3,i4; | |||
| a1=vec_extract(quadruple_values,0); | |||
| a2=vec_extract(quadruple_values,1); | |||
| a3=vec_extract(quadruple_values,2); | |||
| a4=vec_extract(quadruple_values,3); | |||
| i1=vec_extract(quadruple_indices,0); | |||
| i2=vec_extract(quadruple_indices,1); | |||
| i3=vec_extract(quadruple_indices,2); | |||
| i4=vec_extract(quadruple_indices,3); | |||
| if(a1==a2){ | |||
| index=i1>i2?i2:i1; | |||
| }else if(a2>a1){ | |||
| index=i2; | |||
| a1=a2; | |||
| }else{ | |||
| index= i1; | |||
| } | |||
| if(a4==a3){ | |||
| i1=i3>i4?i4:i3; | |||
| }else if(a4>a3){ | |||
| i1=i4; | |||
| a3=a4; | |||
| }else{ | |||
| i1= i3; | |||
| } | |||
| if(a1==a3){ | |||
| index=i1>index?index:i1; | |||
| *maxf=a1; | |||
| }else if(a3>a1){ | |||
| index=i1; | |||
| *maxf=a3; | |||
| }else{ | |||
| *maxf=a1; | |||
| } | |||
| return index; | |||
| } | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i = 0; | |||
| BLASLONG ix = 0; | |||
| FLOAT maxf = 0; | |||
| BLASLONG max = 0; | |||
| BLASLONG inc_x2; | |||
| if (n <= 0 || inc_x <= 0) return(max); | |||
| if (inc_x == 1) { | |||
| BLASLONG n1 = n & -32; | |||
| if (n1 > 0) { | |||
| max = ciamax_kernel_32(n1, x, &maxf); | |||
| i = n1; | |||
| ix = n1 << 1; | |||
| } | |||
| while(i < n) | |||
| { | |||
| if( CABS1(x,ix) > maxf ) | |||
| { | |||
| max = i; | |||
| maxf = CABS1(x,ix); | |||
| } | |||
| ix += 2; | |||
| i++; | |||
| } | |||
| return (max + 1); | |||
| } else { | |||
| inc_x2 = 2 * inc_x; | |||
| maxf = CABS1(x,0); | |||
| ix += inc_x2; | |||
| i++; | |||
| while(i < n) | |||
| { | |||
| if( CABS1(x,ix) > maxf ) | |||
| { | |||
| max = i; | |||
| maxf = CABS1(x,ix); | |||
| } | |||
| ix += inc_x2; | |||
| i++; | |||
| } | |||
| return (max + 1); | |||
| } | |||
| } | |||
| @@ -1,266 +1,266 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <altivec.h> | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) | |||
| /** | |||
| * Find minimum index | |||
| * Warning: requirements n>0 and n % 32 == 0 | |||
| * @param n | |||
| * @param x pointer to the vector | |||
| * @param minf (out) minimum absolute value .( only for output ) | |||
| * @return index | |||
| */ | |||
| static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
| BLASLONG index; | |||
| BLASLONG i=0; | |||
| register __vector unsigned int static_index0 = {0,1,2,3}; | |||
| register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register | |||
| register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
| register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; | |||
| register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; | |||
| register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; | |||
| temp0=vec_xor(temp0,temp0); | |||
| temp1=temp1 <<1 ; //{16,16,16,16} | |||
| register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} | |||
| register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} | |||
| float first_min=CABS1(x,0); | |||
| register __vector float quadruple_values={first_min,first_min,first_min,first_min}; | |||
| register __vector float * v_ptrx=(__vector float *)x; | |||
| register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; | |||
| register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; | |||
| for(; i<n; i+=32){ | |||
| //absolute temporary complex vectors | |||
| register __vector float v0=vec_abs(v_ptrx[0]); | |||
| register __vector float v1=vec_abs(v_ptrx[1]); | |||
| register __vector float v2=vec_abs(v_ptrx[2]); | |||
| register __vector float v3=vec_abs(v_ptrx[3]); | |||
| register __vector float v4=vec_abs(v_ptrx[4]); | |||
| register __vector float v5=vec_abs(v_ptrx[5]); | |||
| register __vector float v6=vec_abs(v_ptrx[6]); | |||
| register __vector float v7=vec_abs(v_ptrx[7]); | |||
| //pack complex real and imaginary parts together to sum real+image | |||
| register __vector float t1=vec_perm(v0,v1,real_pack_mask); | |||
| register __vector float ti=vec_perm(v0,v1,image_pack_mask); | |||
| v0=t1+ti; //sum quadruple real with quadruple image | |||
| register __vector float t2=vec_perm(v2,v3,real_pack_mask); | |||
| register __vector float ti2=vec_perm(v2,v3,image_pack_mask); | |||
| v1=t2+ti2; | |||
| t1=vec_perm(v4,v5,real_pack_mask); | |||
| ti=vec_perm(v4,v5,image_pack_mask); | |||
| v2=t1+ti; //sum | |||
| t2=vec_perm(v6,v7,real_pack_mask); | |||
| ti2=vec_perm(v6,v7,image_pack_mask); | |||
| v3=t2+ti2; | |||
| // now we have 16 summed elements . lets compare them | |||
| v_ptrx+=8; | |||
| register __vector bool int r1=vec_cmpgt(v0,v1); | |||
| register __vector bool int r2=vec_cmpgt(v2,v3); | |||
| register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r1); | |||
| v0=vec_sel(v0,v1,r1); | |||
| register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r2); | |||
| v1=vec_sel(v2,v3,r2); | |||
| //final cmp and select index and value for first 16 values | |||
| r1=vec_cmpgt(v0,v1); | |||
| register __vector unsigned int indf0 = vec_sel(ind2,ind3,r1); | |||
| register __vector float vf0= vec_sel(v0,v1,r1); | |||
| //absolute temporary complex vectors | |||
| v0=vec_abs(v_ptrx[0]); | |||
| v1=vec_abs(v_ptrx[1]); | |||
| v2=vec_abs(v_ptrx[2]); | |||
| v3=vec_abs(v_ptrx[3]); | |||
| v4=vec_abs(v_ptrx[4]); | |||
| v5=vec_abs(v_ptrx[5]); | |||
| v6=vec_abs(v_ptrx[6]); | |||
| v7=vec_abs(v_ptrx[7]); | |||
| //pack complex real and imaginary parts together to sum real+image | |||
| t1=vec_perm(v0,v1,real_pack_mask); | |||
| ti=vec_perm(v0,v1,image_pack_mask); | |||
| v0=t1+ti; //sum quadruple real with quadruple image | |||
| t2=vec_perm(v2,v3,real_pack_mask); | |||
| ti2=vec_perm(v2,v3,image_pack_mask); | |||
| v1=t2+ti2; | |||
| t1=vec_perm(v4,v5,real_pack_mask); | |||
| ti=vec_perm(v4,v5,image_pack_mask); | |||
| v2=t1+ti; //sum | |||
| t2=vec_perm(v6,v7,real_pack_mask); | |||
| ti2=vec_perm(v6,v7,image_pack_mask); | |||
| v3=t2+ti2; | |||
| // now we have 16 summed elements {from 16 to 31} . lets compare them | |||
| v_ptrx+=8; | |||
| r1=vec_cmpgt(v0,v1); | |||
| r2=vec_cmpgt(v2,v3); | |||
| ind2= vec_sel(static_index0,static_index1,r1); | |||
| v0=vec_sel(v0,v1,r1); | |||
| ind3= vec_sel(static_index2,static_index3,r2); | |||
| v1=vec_sel(v2,v3,r2); | |||
| //final cmp and select index and value for the second 16 values | |||
| r1=vec_cmpgt(v0,v1); | |||
| register __vector unsigned int indv0 = vec_sel(ind2,ind3,r1); | |||
| register __vector float vv0= vec_sel(v0,v1,r1); | |||
| indv0+=temp1; //make index from 16->31 | |||
| //find final quadruple from 32 elements | |||
| r2=vec_cmpgt(vf0,vv0); | |||
| ind2 = vec_sel( indf0,indv0,r2); | |||
| vv0= vec_sel(vf0,vv0,r2); | |||
| //get asbolute index | |||
| ind2+=temp0; | |||
| //compare with old quadruple and update | |||
| r1=vec_cmpgt(quadruple_values,vv0); | |||
| quadruple_indices = vec_sel( quadruple_indices,ind2,r1); | |||
| quadruple_values= vec_sel(quadruple_values,vv0,r1); | |||
| temp0+=temp_add; | |||
| } | |||
| //now we have to chose from 4 values and 4 different indices | |||
| // we will compare pairwise if pairs are exactly the same we will choose minimum between index | |||
| // otherwise we will assign index of the minimum value | |||
| float a1,a2,a3,a4; | |||
| unsigned int i1,i2,i3,i4; | |||
| a1=vec_extract(quadruple_values,0); | |||
| a2=vec_extract(quadruple_values,1); | |||
| a3=vec_extract(quadruple_values,2); | |||
| a4=vec_extract(quadruple_values,3); | |||
| i1=vec_extract(quadruple_indices,0); | |||
| i2=vec_extract(quadruple_indices,1); | |||
| i3=vec_extract(quadruple_indices,2); | |||
| i4=vec_extract(quadruple_indices,3); | |||
| if(a1==a2){ | |||
| index=i1>i2?i2:i1; | |||
| }else if(a2<a1){ | |||
| index=i2; | |||
| a1=a2; | |||
| }else{ | |||
| index= i1; | |||
| } | |||
| if(a4==a3){ | |||
| i1=i3>i4?i4:i3; | |||
| }else if(a4<a3){ | |||
| i1=i4; | |||
| a3=a4; | |||
| }else{ | |||
| i1= i3; | |||
| } | |||
| if(a1==a3){ | |||
| index=i1>index?index:i1; | |||
| *minf=a1; | |||
| }else if(a3<a1){ | |||
| index=i1; | |||
| *minf=a3; | |||
| }else{ | |||
| *minf=a1; | |||
| } | |||
| return index; | |||
| } | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0; | |||
| FLOAT minf; | |||
| BLASLONG min=0; | |||
| BLASLONG inc_x2; | |||
| if (n <= 0 || inc_x <= 0) return(min); | |||
| if (inc_x == 1) { | |||
| minf = CABS1(x,0); //index will not be incremented | |||
| BLASLONG n1 = n & -32; | |||
| if (n1 > 0) { | |||
| min = ciamin_kernel_32(n1, x, &minf); | |||
| i = n1; | |||
| ix = n1 << 1; | |||
| } | |||
| while(i < n) | |||
| { | |||
| if( CABS1(x,ix) < minf ) | |||
| { | |||
| min = i; | |||
| minf = CABS1(x,ix); | |||
| } | |||
| ix += 2; | |||
| i++; | |||
| } | |||
| return (min + 1); | |||
| } else { | |||
| inc_x2 = 2 * inc_x; | |||
| minf = CABS1(x,0); | |||
| ix += inc_x2; | |||
| i++; | |||
| while(i < n) | |||
| { | |||
| if( CABS1(x,ix) < minf ) | |||
| { | |||
| min = i; | |||
| minf = CABS1(x,ix); | |||
| } | |||
| ix += inc_x2; | |||
| i++; | |||
| } | |||
| return (min + 1); | |||
| } | |||
| } | |||
| /*************************************************************************** | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <altivec.h> | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) | |||
| /** | |||
| * Find minimum index | |||
| * Warning: requirements n>0 and n % 32 == 0 | |||
| * @param n | |||
| * @param x pointer to the vector | |||
| * @param minf (out) minimum absolute value .( only for output ) | |||
| * @return index | |||
| */ | |||
| static BLASLONG ciamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
| BLASLONG index; | |||
| BLASLONG i=0; | |||
| register __vector unsigned int static_index0 = {0,1,2,3}; | |||
| register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register | |||
| register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
| register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; | |||
| register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; | |||
| register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; | |||
| temp0=vec_xor(temp0,temp0); | |||
| temp1=temp1 <<1 ; //{16,16,16,16} | |||
| register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} | |||
| register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} | |||
| float first_min=CABS1(x,0); | |||
| register __vector float quadruple_values={first_min,first_min,first_min,first_min}; | |||
| register __vector float * v_ptrx=(__vector float *)x; | |||
| register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; | |||
| register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; | |||
| for(; i<n; i+=32){ | |||
| //absolute temporary complex vectors | |||
| register __vector float v0=vec_abs(v_ptrx[0]); | |||
| register __vector float v1=vec_abs(v_ptrx[1]); | |||
| register __vector float v2=vec_abs(v_ptrx[2]); | |||
| register __vector float v3=vec_abs(v_ptrx[3]); | |||
| register __vector float v4=vec_abs(v_ptrx[4]); | |||
| register __vector float v5=vec_abs(v_ptrx[5]); | |||
| register __vector float v6=vec_abs(v_ptrx[6]); | |||
| register __vector float v7=vec_abs(v_ptrx[7]); | |||
| //pack complex real and imaginary parts together to sum real+image | |||
| register __vector float t1=vec_perm(v0,v1,real_pack_mask); | |||
| register __vector float ti=vec_perm(v0,v1,image_pack_mask); | |||
| v0=t1+ti; //sum quadruple real with quadruple image | |||
| register __vector float t2=vec_perm(v2,v3,real_pack_mask); | |||
| register __vector float ti2=vec_perm(v2,v3,image_pack_mask); | |||
| v1=t2+ti2; | |||
| t1=vec_perm(v4,v5,real_pack_mask); | |||
| ti=vec_perm(v4,v5,image_pack_mask); | |||
| v2=t1+ti; //sum | |||
| t2=vec_perm(v6,v7,real_pack_mask); | |||
| ti2=vec_perm(v6,v7,image_pack_mask); | |||
| v3=t2+ti2; | |||
| // now we have 16 summed elements . lets compare them | |||
| v_ptrx+=8; | |||
| register __vector bool int r1=vec_cmpgt(v0,v1); | |||
| register __vector bool int r2=vec_cmpgt(v2,v3); | |||
| register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r1); | |||
| v0=vec_sel(v0,v1,r1); | |||
| register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r2); | |||
| v1=vec_sel(v2,v3,r2); | |||
| //final cmp and select index and value for first 16 values | |||
| r1=vec_cmpgt(v0,v1); | |||
| register __vector unsigned int indf0 = vec_sel(ind2,ind3,r1); | |||
| register __vector float vf0= vec_sel(v0,v1,r1); | |||
| //absolute temporary complex vectors | |||
| v0=vec_abs(v_ptrx[0]); | |||
| v1=vec_abs(v_ptrx[1]); | |||
| v2=vec_abs(v_ptrx[2]); | |||
| v3=vec_abs(v_ptrx[3]); | |||
| v4=vec_abs(v_ptrx[4]); | |||
| v5=vec_abs(v_ptrx[5]); | |||
| v6=vec_abs(v_ptrx[6]); | |||
| v7=vec_abs(v_ptrx[7]); | |||
| //pack complex real and imaginary parts together to sum real+image | |||
| t1=vec_perm(v0,v1,real_pack_mask); | |||
| ti=vec_perm(v0,v1,image_pack_mask); | |||
| v0=t1+ti; //sum quadruple real with quadruple image | |||
| t2=vec_perm(v2,v3,real_pack_mask); | |||
| ti2=vec_perm(v2,v3,image_pack_mask); | |||
| v1=t2+ti2; | |||
| t1=vec_perm(v4,v5,real_pack_mask); | |||
| ti=vec_perm(v4,v5,image_pack_mask); | |||
| v2=t1+ti; //sum | |||
| t2=vec_perm(v6,v7,real_pack_mask); | |||
| ti2=vec_perm(v6,v7,image_pack_mask); | |||
| v3=t2+ti2; | |||
| // now we have 16 summed elements {from 16 to 31} . lets compare them | |||
| v_ptrx+=8; | |||
| r1=vec_cmpgt(v0,v1); | |||
| r2=vec_cmpgt(v2,v3); | |||
| ind2= vec_sel(static_index0,static_index1,r1); | |||
| v0=vec_sel(v0,v1,r1); | |||
| ind3= vec_sel(static_index2,static_index3,r2); | |||
| v1=vec_sel(v2,v3,r2); | |||
| //final cmp and select index and value for the second 16 values | |||
| r1=vec_cmpgt(v0,v1); | |||
| register __vector unsigned int indv0 = vec_sel(ind2,ind3,r1); | |||
| register __vector float vv0= vec_sel(v0,v1,r1); | |||
| indv0+=temp1; //make index from 16->31 | |||
| //find final quadruple from 32 elements | |||
| r2=vec_cmpgt(vf0,vv0); | |||
| ind2 = vec_sel( indf0,indv0,r2); | |||
| vv0= vec_sel(vf0,vv0,r2); | |||
| //get asbolute index | |||
| ind2+=temp0; | |||
| //compare with old quadruple and update | |||
| r1=vec_cmpgt(quadruple_values,vv0); | |||
| quadruple_indices = vec_sel( quadruple_indices,ind2,r1); | |||
| quadruple_values= vec_sel(quadruple_values,vv0,r1); | |||
| temp0+=temp_add; | |||
| } | |||
| //now we have to chose from 4 values and 4 different indices | |||
| // we will compare pairwise if pairs are exactly the same we will choose minimum between index | |||
| // otherwise we will assign index of the minimum value | |||
| float a1,a2,a3,a4; | |||
| unsigned int i1,i2,i3,i4; | |||
| a1=vec_extract(quadruple_values,0); | |||
| a2=vec_extract(quadruple_values,1); | |||
| a3=vec_extract(quadruple_values,2); | |||
| a4=vec_extract(quadruple_values,3); | |||
| i1=vec_extract(quadruple_indices,0); | |||
| i2=vec_extract(quadruple_indices,1); | |||
| i3=vec_extract(quadruple_indices,2); | |||
| i4=vec_extract(quadruple_indices,3); | |||
| if(a1==a2){ | |||
| index=i1>i2?i2:i1; | |||
| }else if(a2<a1){ | |||
| index=i2; | |||
| a1=a2; | |||
| }else{ | |||
| index= i1; | |||
| } | |||
| if(a4==a3){ | |||
| i1=i3>i4?i4:i3; | |||
| }else if(a4<a3){ | |||
| i1=i4; | |||
| a3=a4; | |||
| }else{ | |||
| i1= i3; | |||
| } | |||
| if(a1==a3){ | |||
| index=i1>index?index:i1; | |||
| *minf=a1; | |||
| }else if(a3<a1){ | |||
| index=i1; | |||
| *minf=a3; | |||
| }else{ | |||
| *minf=a1; | |||
| } | |||
| return index; | |||
| } | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0; | |||
| FLOAT minf; | |||
| BLASLONG min=0; | |||
| BLASLONG inc_x2; | |||
| if (n <= 0 || inc_x <= 0) return(min); | |||
| if (inc_x == 1) { | |||
| minf = CABS1(x,0); //index will not be incremented | |||
| BLASLONG n1 = n & -32; | |||
| if (n1 > 0) { | |||
| min = ciamin_kernel_32(n1, x, &minf); | |||
| i = n1; | |||
| ix = n1 << 1; | |||
| } | |||
| while(i < n) | |||
| { | |||
| if( CABS1(x,ix) < minf ) | |||
| { | |||
| min = i; | |||
| minf = CABS1(x,ix); | |||
| } | |||
| ix += 2; | |||
| i++; | |||
| } | |||
| return (min + 1); | |||
| } else { | |||
| inc_x2 = 2 * inc_x; | |||
| minf = CABS1(x,0); | |||
| ix += inc_x2; | |||
| i++; | |||
| while(i < n) | |||
| { | |||
| if( CABS1(x,ix) < minf ) | |||
| { | |||
| min = i; | |||
| minf = CABS1(x,ix); | |||
| } | |||
| ix += inc_x2; | |||
| i++; | |||
| } | |||
| return (min + 1); | |||
| } | |||
| } | |||
| @@ -1,288 +1,288 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <altivec.h> | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| /** | |||
| * Find maximum index | |||
| * Warning: requirements n>0 and n % 64 == 0 | |||
| * @param n | |||
| * @param x pointer to the vector | |||
| * @param maxf (out) maximum absolute value .( only for output ) | |||
| * @return index | |||
| */ | |||
| static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||
| BLASLONG index; | |||
| BLASLONG i=0; | |||
| register __vector unsigned int static_index0 = {0,1,2,3}; | |||
| register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register | |||
| register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
| register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; | |||
| register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; | |||
| register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; | |||
| temp0=vec_xor(temp0,temp0); | |||
| temp1=temp1 <<1 ; //{16,16,16,16} | |||
| register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} | |||
| register __vector float quadruple_values={0,0,0,0}; | |||
| register __vector float * v_ptrx=(__vector float *)x; | |||
| for(; i<n; i+=64){ | |||
| //absolute temporary vectors | |||
| register __vector float v0=vec_abs(v_ptrx[0]); | |||
| register __vector float v1=vec_abs(v_ptrx[1]); | |||
| register __vector float v2=vec_abs(v_ptrx[2]); | |||
| register __vector float v3=vec_abs(v_ptrx[3]); | |||
| register __vector float v4=vec_abs(v_ptrx[4]); | |||
| register __vector float v5=vec_abs(v_ptrx[5]); | |||
| register __vector float v6=vec_abs(v_ptrx[6]); | |||
| register __vector float v7=vec_abs(v_ptrx[7]); | |||
| //cmp quadruple pairs | |||
| register __vector bool int r1=vec_cmpgt(v1,v0); | |||
| register __vector bool int r2=vec_cmpgt(v3,v2); | |||
| register __vector bool int r3=vec_cmpgt(v5,v4); | |||
| register __vector bool int r4=vec_cmpgt(v7,v6); | |||
| //select | |||
| register __vector unsigned int ind0_first= vec_sel(static_index0,static_index1,r1); | |||
| register __vector float vf0= vec_sel(v0,v1,r1); | |||
| register __vector unsigned int ind1= vec_sel(static_index2,static_index3,r2); | |||
| register __vector float vf1= vec_sel(v2,v3,r2); | |||
| register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r3); | |||
| v0=vec_sel(v4,v5,r3); | |||
| register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r4); | |||
| v1=vec_sel(v6,v7,r4); | |||
| // cmp selected | |||
| r1=vec_cmpgt(vf1,vf0); | |||
| r2=vec_cmpgt(v1,v0); | |||
| v_ptrx+=8; | |||
| //select from above | |||
| ind0_first= vec_sel(ind0_first,ind1,r1); | |||
| vf0= vec_sel(vf0,vf1,r1) ; | |||
| ind2= vec_sel(ind2,ind3,r2); | |||
| vf1= vec_sel(v0,v1,r2); | |||
| //second indices actually should be within [16,31] so ind2+16 | |||
| ind2 +=temp1; | |||
| //final cmp and select index and value for the first 32 values | |||
| r1=vec_cmpgt(vf1,vf0); | |||
| ind0_first = vec_sel(ind0_first,ind2,r1); | |||
| vf0= vec_sel(vf0,vf1,r1); | |||
| ind0_first+=temp0; //get absolute index | |||
| temp0+=temp1; | |||
| temp0+=temp1; //temp0+32 | |||
| //second part of 32 | |||
| // absolute temporary vectors | |||
| v0=vec_abs(v_ptrx[0]); | |||
| v1=vec_abs(v_ptrx[1]); | |||
| v2=vec_abs(v_ptrx[2]); | |||
| v3=vec_abs(v_ptrx[3]); | |||
| v4=vec_abs(v_ptrx[4]); | |||
| v5=vec_abs(v_ptrx[5]); | |||
| v6=vec_abs(v_ptrx[6]); | |||
| v7=vec_abs(v_ptrx[7]); | |||
| //cmp quadruple pairs | |||
| r1=vec_cmpgt(v1,v0); | |||
| r2=vec_cmpgt(v3,v2); | |||
| r3=vec_cmpgt(v5,v4); | |||
| r4=vec_cmpgt(v7,v6); | |||
| //select | |||
| register __vector unsigned int ind0_second= vec_sel(static_index0,static_index1,r1); | |||
| register __vector float vv0= vec_sel(v0,v1,r1); | |||
| ind1= vec_sel(static_index2,static_index3,r2); | |||
| register __vector float vv1= vec_sel(v2,v3,r2); | |||
| ind2= vec_sel(static_index0,static_index1,r3); | |||
| v0=vec_sel(v4,v5,r3); | |||
| ind3= vec_sel(static_index2,static_index3,r4); | |||
| v1=vec_sel(v6,v7,r4); | |||
| // cmp selected | |||
| r1=vec_cmpgt(vv1,vv0); | |||
| r2=vec_cmpgt(v1,v0); | |||
| v_ptrx+=8; | |||
| //select from above | |||
| ind0_second= vec_sel(ind0_second,ind1,r1); | |||
| vv0= vec_sel(vv0,vv1,r1) ; | |||
| ind2= vec_sel(ind2,ind3,r2); | |||
| vv1= vec_sel(v0,v1,r2) ; | |||
| //second indices actually should be within [16,31] so ind2+16 | |||
| ind2 +=temp1; | |||
| //final cmp and select index and value for the second 32 values | |||
| r1=vec_cmpgt(vv1,vv0); | |||
| ind0_second = vec_sel(ind0_second,ind2,r1); | |||
| vv0= vec_sel(vv0,vv1,r1); | |||
| ind0_second+=temp0; //get absolute index | |||
| //find final quadruple from 64 elements | |||
| r2=vec_cmpgt(vv0,vf0); | |||
| ind2 = vec_sel( ind0_first,ind0_second,r2); | |||
| vv0= vec_sel(vf0,vv0,r2); | |||
| //compare with old quadruple and update | |||
| r3=vec_cmpgt(vv0,quadruple_values); | |||
| quadruple_indices = vec_sel( quadruple_indices,ind2,r3); | |||
| quadruple_values= vec_sel(quadruple_values,vv0,r3); | |||
| temp0+=temp1; | |||
| temp0+=temp1; //temp0+32 | |||
| } | |||
| //now we have to chose from 4 values and 4 different indices | |||
| // we will compare pairwise if pairs are exactly the same we will choose minimum between index | |||
| // otherwise we will assign index of the maximum value | |||
| float a1,a2,a3,a4; | |||
| unsigned int i1,i2,i3,i4; | |||
| a1=vec_extract(quadruple_values,0); | |||
| a2=vec_extract(quadruple_values,1); | |||
| a3=vec_extract(quadruple_values,2); | |||
| a4=vec_extract(quadruple_values,3); | |||
| i1=vec_extract(quadruple_indices,0); | |||
| i2=vec_extract(quadruple_indices,1); | |||
| i3=vec_extract(quadruple_indices,2); | |||
| i4=vec_extract(quadruple_indices,3); | |||
| if(a1==a2){ | |||
| index=i1>i2?i2:i1; | |||
| }else if(a2>a1){ | |||
| index=i2; | |||
| a1=a2; | |||
| }else{ | |||
| index= i1; | |||
| } | |||
| if(a4==a3){ | |||
| i1=i3>i4?i4:i3; | |||
| }else if(a4>a3){ | |||
| i1=i4; | |||
| a3=a4; | |||
| }else{ | |||
| i1= i3; | |||
| } | |||
| if(a1==a3){ | |||
| index=i1>index?index:i1; | |||
| *maxf=a1; | |||
| }else if(a3>a1){ | |||
| index=i1; | |||
| *maxf=a3; | |||
| }else{ | |||
| *maxf=a1; | |||
| } | |||
| return index; | |||
| } | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| BLASLONG i = 0; | |||
| BLASLONG j = 0; | |||
| FLOAT maxf = 0.0; | |||
| BLASLONG max = 0; | |||
| if (n <= 0 || inc_x <= 0) return (max); | |||
| if (inc_x == 1) { | |||
| BLASLONG n1 = n & -64; | |||
| if (n1 > 0) { | |||
| max = siamax_kernel_64(n1, x, &maxf); | |||
| i = n1; | |||
| } | |||
| while (i < n) { | |||
| if (ABS(x[i]) > maxf) { | |||
| max = i; | |||
| maxf = ABS(x[i]); | |||
| } | |||
| i++; | |||
| } | |||
| return (max + 1); | |||
| } else { | |||
| BLASLONG n1 = n & -4; | |||
| while (j < n1) { | |||
| if (ABS(x[i]) > maxf) { | |||
| max = j; | |||
| maxf = ABS(x[i]); | |||
| } | |||
| if (ABS(x[i + inc_x]) > maxf) { | |||
| max = j + 1; | |||
| maxf = ABS(x[i + inc_x]); | |||
| } | |||
| if (ABS(x[i + 2 * inc_x]) > maxf) { | |||
| max = j + 2; | |||
| maxf = ABS(x[i + 2 * inc_x]); | |||
| } | |||
| if (ABS(x[i + 3 * inc_x]) > maxf) { | |||
| max = j + 3; | |||
| maxf = ABS(x[i + 3 * inc_x]); | |||
| } | |||
| i += inc_x * 4; | |||
| j += 4; | |||
| } | |||
| while (j < n) { | |||
| if (ABS(x[i]) > maxf) { | |||
| max = j; | |||
| maxf = ABS(x[i]); | |||
| } | |||
| i += inc_x; | |||
| j++; | |||
| } | |||
| return (max + 1); | |||
| } | |||
| } | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <altivec.h> | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| /** | |||
| * Find maximum index | |||
| * Warning: requirements n>0 and n % 64 == 0 | |||
| * @param n | |||
| * @param x pointer to the vector | |||
| * @param maxf (out) maximum absolute value .( only for output ) | |||
| * @return index | |||
| */ | |||
| static BLASLONG siamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *maxf) { | |||
| BLASLONG index; | |||
| BLASLONG i=0; | |||
| register __vector unsigned int static_index0 = {0,1,2,3}; | |||
| register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register | |||
| register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
| register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; | |||
| register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; | |||
| register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; | |||
| temp0=vec_xor(temp0,temp0); | |||
| temp1=temp1 <<1 ; //{16,16,16,16} | |||
| register __vector unsigned int quadruple_indices=temp0;//{0,0,0,0} | |||
| register __vector float quadruple_values={0,0,0,0}; | |||
| register __vector float * v_ptrx=(__vector float *)x; | |||
| for(; i<n; i+=64){ | |||
| //absolute temporary vectors | |||
| register __vector float v0=vec_abs(v_ptrx[0]); | |||
| register __vector float v1=vec_abs(v_ptrx[1]); | |||
| register __vector float v2=vec_abs(v_ptrx[2]); | |||
| register __vector float v3=vec_abs(v_ptrx[3]); | |||
| register __vector float v4=vec_abs(v_ptrx[4]); | |||
| register __vector float v5=vec_abs(v_ptrx[5]); | |||
| register __vector float v6=vec_abs(v_ptrx[6]); | |||
| register __vector float v7=vec_abs(v_ptrx[7]); | |||
| //cmp quadruple pairs | |||
| register __vector bool int r1=vec_cmpgt(v1,v0); | |||
| register __vector bool int r2=vec_cmpgt(v3,v2); | |||
| register __vector bool int r3=vec_cmpgt(v5,v4); | |||
| register __vector bool int r4=vec_cmpgt(v7,v6); | |||
| //select | |||
| register __vector unsigned int ind0_first= vec_sel(static_index0,static_index1,r1); | |||
| register __vector float vf0= vec_sel(v0,v1,r1); | |||
| register __vector unsigned int ind1= vec_sel(static_index2,static_index3,r2); | |||
| register __vector float vf1= vec_sel(v2,v3,r2); | |||
| register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r3); | |||
| v0=vec_sel(v4,v5,r3); | |||
| register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r4); | |||
| v1=vec_sel(v6,v7,r4); | |||
| // cmp selected | |||
| r1=vec_cmpgt(vf1,vf0); | |||
| r2=vec_cmpgt(v1,v0); | |||
| v_ptrx+=8; | |||
| //select from above | |||
| ind0_first= vec_sel(ind0_first,ind1,r1); | |||
| vf0= vec_sel(vf0,vf1,r1) ; | |||
| ind2= vec_sel(ind2,ind3,r2); | |||
| vf1= vec_sel(v0,v1,r2); | |||
| //second indices actually should be within [16,31] so ind2+16 | |||
| ind2 +=temp1; | |||
| //final cmp and select index and value for the first 32 values | |||
| r1=vec_cmpgt(vf1,vf0); | |||
| ind0_first = vec_sel(ind0_first,ind2,r1); | |||
| vf0= vec_sel(vf0,vf1,r1); | |||
| ind0_first+=temp0; //get absolute index | |||
| temp0+=temp1; | |||
| temp0+=temp1; //temp0+32 | |||
| //second part of 32 | |||
| // absolute temporary vectors | |||
| v0=vec_abs(v_ptrx[0]); | |||
| v1=vec_abs(v_ptrx[1]); | |||
| v2=vec_abs(v_ptrx[2]); | |||
| v3=vec_abs(v_ptrx[3]); | |||
| v4=vec_abs(v_ptrx[4]); | |||
| v5=vec_abs(v_ptrx[5]); | |||
| v6=vec_abs(v_ptrx[6]); | |||
| v7=vec_abs(v_ptrx[7]); | |||
| //cmp quadruple pairs | |||
| r1=vec_cmpgt(v1,v0); | |||
| r2=vec_cmpgt(v3,v2); | |||
| r3=vec_cmpgt(v5,v4); | |||
| r4=vec_cmpgt(v7,v6); | |||
| //select | |||
| register __vector unsigned int ind0_second= vec_sel(static_index0,static_index1,r1); | |||
| register __vector float vv0= vec_sel(v0,v1,r1); | |||
| ind1= vec_sel(static_index2,static_index3,r2); | |||
| register __vector float vv1= vec_sel(v2,v3,r2); | |||
| ind2= vec_sel(static_index0,static_index1,r3); | |||
| v0=vec_sel(v4,v5,r3); | |||
| ind3= vec_sel(static_index2,static_index3,r4); | |||
| v1=vec_sel(v6,v7,r4); | |||
| // cmp selected | |||
| r1=vec_cmpgt(vv1,vv0); | |||
| r2=vec_cmpgt(v1,v0); | |||
| v_ptrx+=8; | |||
| //select from above | |||
| ind0_second= vec_sel(ind0_second,ind1,r1); | |||
| vv0= vec_sel(vv0,vv1,r1) ; | |||
| ind2= vec_sel(ind2,ind3,r2); | |||
| vv1= vec_sel(v0,v1,r2) ; | |||
| //second indices actually should be within [16,31] so ind2+16 | |||
| ind2 +=temp1; | |||
| //final cmp and select index and value for the second 32 values | |||
| r1=vec_cmpgt(vv1,vv0); | |||
| ind0_second = vec_sel(ind0_second,ind2,r1); | |||
| vv0= vec_sel(vv0,vv1,r1); | |||
| ind0_second+=temp0; //get absolute index | |||
| //find final quadruple from 64 elements | |||
| r2=vec_cmpgt(vv0,vf0); | |||
| ind2 = vec_sel( ind0_first,ind0_second,r2); | |||
| vv0= vec_sel(vf0,vv0,r2); | |||
| //compare with old quadruple and update | |||
| r3=vec_cmpgt(vv0,quadruple_values); | |||
| quadruple_indices = vec_sel( quadruple_indices,ind2,r3); | |||
| quadruple_values= vec_sel(quadruple_values,vv0,r3); | |||
| temp0+=temp1; | |||
| temp0+=temp1; //temp0+32 | |||
| } | |||
| //now we have to chose from 4 values and 4 different indices | |||
| // we will compare pairwise if pairs are exactly the same we will choose minimum between index | |||
| // otherwise we will assign index of the maximum value | |||
| float a1,a2,a3,a4; | |||
| unsigned int i1,i2,i3,i4; | |||
| a1=vec_extract(quadruple_values,0); | |||
| a2=vec_extract(quadruple_values,1); | |||
| a3=vec_extract(quadruple_values,2); | |||
| a4=vec_extract(quadruple_values,3); | |||
| i1=vec_extract(quadruple_indices,0); | |||
| i2=vec_extract(quadruple_indices,1); | |||
| i3=vec_extract(quadruple_indices,2); | |||
| i4=vec_extract(quadruple_indices,3); | |||
| if(a1==a2){ | |||
| index=i1>i2?i2:i1; | |||
| }else if(a2>a1){ | |||
| index=i2; | |||
| a1=a2; | |||
| }else{ | |||
| index= i1; | |||
| } | |||
| if(a4==a3){ | |||
| i1=i3>i4?i4:i3; | |||
| }else if(a4>a3){ | |||
| i1=i4; | |||
| a3=a4; | |||
| }else{ | |||
| i1= i3; | |||
| } | |||
| if(a1==a3){ | |||
| index=i1>index?index:i1; | |||
| *maxf=a1; | |||
| }else if(a3>a1){ | |||
| index=i1; | |||
| *maxf=a3; | |||
| }else{ | |||
| *maxf=a1; | |||
| } | |||
| return index; | |||
| } | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| BLASLONG i = 0; | |||
| BLASLONG j = 0; | |||
| FLOAT maxf = 0.0; | |||
| BLASLONG max = 0; | |||
| if (n <= 0 || inc_x <= 0) return (max); | |||
| if (inc_x == 1) { | |||
| BLASLONG n1 = n & -64; | |||
| if (n1 > 0) { | |||
| max = siamax_kernel_64(n1, x, &maxf); | |||
| i = n1; | |||
| } | |||
| while (i < n) { | |||
| if (ABS(x[i]) > maxf) { | |||
| max = i; | |||
| maxf = ABS(x[i]); | |||
| } | |||
| i++; | |||
| } | |||
| return (max + 1); | |||
| } else { | |||
| BLASLONG n1 = n & -4; | |||
| while (j < n1) { | |||
| if (ABS(x[i]) > maxf) { | |||
| max = j; | |||
| maxf = ABS(x[i]); | |||
| } | |||
| if (ABS(x[i + inc_x]) > maxf) { | |||
| max = j + 1; | |||
| maxf = ABS(x[i + inc_x]); | |||
| } | |||
| if (ABS(x[i + 2 * inc_x]) > maxf) { | |||
| max = j + 2; | |||
| maxf = ABS(x[i + 2 * inc_x]); | |||
| } | |||
| if (ABS(x[i + 3 * inc_x]) > maxf) { | |||
| max = j + 3; | |||
| maxf = ABS(x[i + 3 * inc_x]); | |||
| } | |||
| i += inc_x * 4; | |||
| j += 4; | |||
| } | |||
| while (j < n) { | |||
| if (ABS(x[i]) > maxf) { | |||
| max = j; | |||
| maxf = ABS(x[i]); | |||
| } | |||
| i += inc_x; | |||
| j++; | |||
| } | |||
| return (max + 1); | |||
| } | |||
| } | |||
| @@ -1,288 +1,288 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <altivec.h> | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| /** | |||
| * Find minimum index | |||
| * Warning: requirements n>0 and n % 64 == 0 | |||
| * @param n | |||
| * @param x pointer to the vector | |||
| * @param minf (out) minimum absolute value .( only for output ) | |||
| * @return index | |||
| */ | |||
| static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
| BLASLONG index; | |||
| BLASLONG i=0; | |||
| register __vector unsigned int static_index0 = {0,1,2,3}; | |||
| register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register | |||
| register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
| register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; | |||
| register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; | |||
| register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; | |||
| temp0=vec_xor(temp0,temp0); | |||
| temp1=temp1 <<1 ; //{16,16,16,16} | |||
| register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3}; | |||
| register __vector float * v_ptrx=(__vector float *)x; | |||
| register __vector float quadruple_values=vec_abs(v_ptrx[0]); | |||
| for(; i<n; i+=64){ | |||
| //absolute temporary vectors | |||
| register __vector float v0=vec_abs(v_ptrx[0]); | |||
| register __vector float v1=vec_abs(v_ptrx[1]); | |||
| register __vector float v2=vec_abs(v_ptrx[2]); | |||
| register __vector float v3=vec_abs(v_ptrx[3]); | |||
| register __vector float v4=vec_abs(v_ptrx[4]); | |||
| register __vector float v5=vec_abs(v_ptrx[5]); | |||
| register __vector float v6=vec_abs(v_ptrx[6]); | |||
| register __vector float v7=vec_abs(v_ptrx[7]); | |||
| //cmp quadruple pairs | |||
| register __vector bool int r1=vec_cmpgt(v0,v1); | |||
| register __vector bool int r2=vec_cmpgt(v2,v3); | |||
| register __vector bool int r3=vec_cmpgt(v4,v5); | |||
| register __vector bool int r4=vec_cmpgt(v6,v7); | |||
| //select | |||
| register __vector unsigned int ind0_first= vec_sel(static_index0,static_index1,r1); | |||
| register __vector float vf0= vec_sel(v0,v1,r1); | |||
| register __vector unsigned int ind1= vec_sel(static_index2,static_index3,r2); | |||
| register __vector float vf1= vec_sel(v2,v3,r2); | |||
| register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r3); | |||
| v0=vec_sel(v4,v5,r3); | |||
| register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r4); | |||
| v1=vec_sel(v6,v7,r4); | |||
| // cmp selected | |||
| r1=vec_cmpgt(vf0,vf1); | |||
| r2=vec_cmpgt(v0,v1); | |||
| v_ptrx+=8; | |||
| //select from above | |||
| ind0_first= vec_sel(ind0_first,ind1,r1); | |||
| vf0= vec_sel(vf0,vf1,r1) ; | |||
| ind2= vec_sel(ind2,ind3,r2); | |||
| vf1= vec_sel(v0,v1,r2); | |||
| //second indices actually should be within [16,31] so ind2+16 | |||
| ind2 +=temp1; | |||
| //final cmp and select index and value for the first 32 values | |||
| r1=vec_cmpgt(vf0,vf1); | |||
| ind0_first = vec_sel(ind0_first,ind2,r1); | |||
| vf0= vec_sel(vf0,vf1,r1); | |||
| ind0_first+=temp0; //get absolute index | |||
| temp0+=temp1; | |||
| temp0+=temp1; //temp0+32 | |||
| //second part of 32 | |||
| // absolute temporary vectors | |||
| v0=vec_abs(v_ptrx[0]); | |||
| v1=vec_abs(v_ptrx[1]); | |||
| v2=vec_abs(v_ptrx[2]); | |||
| v3=vec_abs(v_ptrx[3]); | |||
| v4=vec_abs(v_ptrx[4]); | |||
| v5=vec_abs(v_ptrx[5]); | |||
| v6=vec_abs(v_ptrx[6]); | |||
| v7=vec_abs(v_ptrx[7]); | |||
| //cmp quadruple pairs | |||
| r1=vec_cmpgt(v0,v1); | |||
| r2=vec_cmpgt(v2,v3); | |||
| r3=vec_cmpgt(v4,v5); | |||
| r4=vec_cmpgt(v6,v7); | |||
| //select | |||
| register __vector unsigned int ind0_second= vec_sel(static_index0,static_index1,r1); | |||
| register __vector float vv0= vec_sel(v0,v1,r1); | |||
| ind1= vec_sel(static_index2,static_index3,r2); | |||
| register __vector float vv1= vec_sel(v2,v3,r2); | |||
| ind2= vec_sel(static_index0,static_index1,r3); | |||
| v0=vec_sel(v4,v5,r3); | |||
| ind3= vec_sel(static_index2,static_index3,r4); | |||
| v1=vec_sel(v6,v7,r4); | |||
| // cmp selected | |||
| r1=vec_cmpgt(vv0,vv1); | |||
| r2=vec_cmpgt(v0,v1); | |||
| v_ptrx+=8; | |||
| //select from above | |||
| ind0_second= vec_sel(ind0_second,ind1,r1); | |||
| vv0= vec_sel(vv0,vv1,r1) ; | |||
| ind2= vec_sel(ind2,ind3,r2); | |||
| vv1= vec_sel(v0,v1,r2) ; | |||
| //second indices actually should be within [16,31] so ind2+16 | |||
| ind2 +=temp1; | |||
| //final cmp and select index and value for the second 32 values | |||
| r1=vec_cmpgt(vv0,vv1); | |||
| ind0_second = vec_sel(ind0_second,ind2,r1); | |||
| vv0= vec_sel(vv0,vv1,r1); | |||
| ind0_second+=temp0; //get absolute index | |||
| //find final quadruple from 64 elements | |||
| r2=vec_cmpgt(vf0,vv0); | |||
| ind2 = vec_sel( ind0_first,ind0_second,r2); | |||
| vv0= vec_sel(vf0,vv0,r2); | |||
| //compare with old quadruple and update | |||
| r3=vec_cmpgt( quadruple_values,vv0); | |||
| quadruple_indices = vec_sel( quadruple_indices,ind2,r3); | |||
| quadruple_values= vec_sel(quadruple_values,vv0,r3); | |||
| temp0+=temp1; | |||
| temp0+=temp1; //temp0+32 | |||
| } | |||
| //now we have to chose from 4 values and 4 different indices | |||
| // we will compare pairwise if pairs are exactly the same we will choose minimum between index | |||
| // otherwise we will assign index of the minimum value | |||
| float a1,a2,a3,a4; | |||
| unsigned int i1,i2,i3,i4; | |||
| a1=vec_extract(quadruple_values,0); | |||
| a2=vec_extract(quadruple_values,1); | |||
| a3=vec_extract(quadruple_values,2); | |||
| a4=vec_extract(quadruple_values,3); | |||
| i1=vec_extract(quadruple_indices,0); | |||
| i2=vec_extract(quadruple_indices,1); | |||
| i3=vec_extract(quadruple_indices,2); | |||
| i4=vec_extract(quadruple_indices,3); | |||
| if(a1==a2){ | |||
| index=i1>i2?i2:i1; | |||
| }else if(a2<a1){ | |||
| index=i2; | |||
| a1=a2; | |||
| }else{ | |||
| index= i1; | |||
| } | |||
| if(a4==a3){ | |||
| i1=i3>i4?i4:i3; | |||
| }else if(a4<a3){ | |||
| i1=i4; | |||
| a3=a4; | |||
| }else{ | |||
| i1= i3; | |||
| } | |||
| if(a1==a3){ | |||
| index=i1>index?index:i1; | |||
| *minf=a1; | |||
| }else if(a3<a1){ | |||
| index=i1; | |||
| *minf=a3; | |||
| }else{ | |||
| *minf=a1; | |||
| } | |||
| return index; | |||
| } | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| BLASLONG i = 0; | |||
| BLASLONG j = 0; | |||
| BLASLONG min = 0; | |||
| FLOAT minf = 0.0; | |||
| if (n <= 0 || inc_x <= 0) return (min); | |||
| minf = ABS(x[0]); //index's not incremented | |||
| if (inc_x == 1) { | |||
| BLASLONG n1 = n & -64; | |||
| if (n1 > 0) { | |||
| min = siamin_kernel_64(n1, x, &minf); | |||
| i = n1; | |||
| } | |||
| while (i < n) { | |||
| if (ABS(x[i]) < minf) { | |||
| min = i; | |||
| minf = ABS(x[i]); | |||
| } | |||
| i++; | |||
| } | |||
| return (min + 1); | |||
| } else { | |||
| BLASLONG n1 = n & -4; | |||
| while (j < n1) { | |||
| if (ABS(x[i]) < minf) { | |||
| min = j; | |||
| minf = ABS(x[i]); | |||
| } | |||
| if (ABS(x[i + inc_x]) < minf) { | |||
| min = j + 1; | |||
| minf = ABS(x[i + inc_x]); | |||
| } | |||
| if (ABS(x[i + 2 * inc_x]) < minf) { | |||
| min = j + 2; | |||
| minf = ABS(x[i + 2 * inc_x]); | |||
| } | |||
| if (ABS(x[i + 3 * inc_x]) < minf) { | |||
| min = j + 3; | |||
| minf = ABS(x[i + 3 * inc_x]); | |||
| } | |||
| i += inc_x * 4; | |||
| j += 4; | |||
| } | |||
| while (j < n) { | |||
| if (ABS(x[i]) < minf) { | |||
| min = j; | |||
| minf = ABS(x[i]); | |||
| } | |||
| i += inc_x; | |||
| j++; | |||
| } | |||
| return (min + 1); | |||
| } | |||
| } | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #include <altivec.h> | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| /** | |||
| * Find minimum index | |||
| * Warning: requirements n>0 and n % 64 == 0 | |||
| * @param n | |||
| * @param x pointer to the vector | |||
| * @param minf (out) minimum absolute value .( only for output ) | |||
| * @return index | |||
| */ | |||
| static BLASLONG siamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *minf) { | |||
| BLASLONG index; | |||
| BLASLONG i=0; | |||
| register __vector unsigned int static_index0 = {0,1,2,3}; | |||
| register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register | |||
| register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} | |||
| register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; | |||
| register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; | |||
| register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; | |||
| temp0=vec_xor(temp0,temp0); | |||
| temp1=temp1 <<1 ; //{16,16,16,16} | |||
| register __vector unsigned int quadruple_indices=static_index0;//{0,1,2,3}; | |||
| register __vector float * v_ptrx=(__vector float *)x; | |||
| register __vector float quadruple_values=vec_abs(v_ptrx[0]); | |||
| for(; i<n; i+=64){ | |||
| //absolute temporary vectors | |||
| register __vector float v0=vec_abs(v_ptrx[0]); | |||
| register __vector float v1=vec_abs(v_ptrx[1]); | |||
| register __vector float v2=vec_abs(v_ptrx[2]); | |||
| register __vector float v3=vec_abs(v_ptrx[3]); | |||
| register __vector float v4=vec_abs(v_ptrx[4]); | |||
| register __vector float v5=vec_abs(v_ptrx[5]); | |||
| register __vector float v6=vec_abs(v_ptrx[6]); | |||
| register __vector float v7=vec_abs(v_ptrx[7]); | |||
| //cmp quadruple pairs | |||
| register __vector bool int r1=vec_cmpgt(v0,v1); | |||
| register __vector bool int r2=vec_cmpgt(v2,v3); | |||
| register __vector bool int r3=vec_cmpgt(v4,v5); | |||
| register __vector bool int r4=vec_cmpgt(v6,v7); | |||
| //select | |||
| register __vector unsigned int ind0_first= vec_sel(static_index0,static_index1,r1); | |||
| register __vector float vf0= vec_sel(v0,v1,r1); | |||
| register __vector unsigned int ind1= vec_sel(static_index2,static_index3,r2); | |||
| register __vector float vf1= vec_sel(v2,v3,r2); | |||
| register __vector unsigned int ind2= vec_sel(static_index0,static_index1,r3); | |||
| v0=vec_sel(v4,v5,r3); | |||
| register __vector unsigned int ind3= vec_sel(static_index2,static_index3,r4); | |||
| v1=vec_sel(v6,v7,r4); | |||
| // cmp selected | |||
| r1=vec_cmpgt(vf0,vf1); | |||
| r2=vec_cmpgt(v0,v1); | |||
| v_ptrx+=8; | |||
| //select from above | |||
| ind0_first= vec_sel(ind0_first,ind1,r1); | |||
| vf0= vec_sel(vf0,vf1,r1) ; | |||
| ind2= vec_sel(ind2,ind3,r2); | |||
| vf1= vec_sel(v0,v1,r2); | |||
| //second indices actually should be within [16,31] so ind2+16 | |||
| ind2 +=temp1; | |||
| //final cmp and select index and value for the first 32 values | |||
| r1=vec_cmpgt(vf0,vf1); | |||
| ind0_first = vec_sel(ind0_first,ind2,r1); | |||
| vf0= vec_sel(vf0,vf1,r1); | |||
| ind0_first+=temp0; //get absolute index | |||
| temp0+=temp1; | |||
| temp0+=temp1; //temp0+32 | |||
| //second part of 32 | |||
| // absolute temporary vectors | |||
| v0=vec_abs(v_ptrx[0]); | |||
| v1=vec_abs(v_ptrx[1]); | |||
| v2=vec_abs(v_ptrx[2]); | |||
| v3=vec_abs(v_ptrx[3]); | |||
| v4=vec_abs(v_ptrx[4]); | |||
| v5=vec_abs(v_ptrx[5]); | |||
| v6=vec_abs(v_ptrx[6]); | |||
| v7=vec_abs(v_ptrx[7]); | |||
| //cmp quadruple pairs | |||
| r1=vec_cmpgt(v0,v1); | |||
| r2=vec_cmpgt(v2,v3); | |||
| r3=vec_cmpgt(v4,v5); | |||
| r4=vec_cmpgt(v6,v7); | |||
| //select | |||
| register __vector unsigned int ind0_second= vec_sel(static_index0,static_index1,r1); | |||
| register __vector float vv0= vec_sel(v0,v1,r1); | |||
| ind1= vec_sel(static_index2,static_index3,r2); | |||
| register __vector float vv1= vec_sel(v2,v3,r2); | |||
| ind2= vec_sel(static_index0,static_index1,r3); | |||
| v0=vec_sel(v4,v5,r3); | |||
| ind3= vec_sel(static_index2,static_index3,r4); | |||
| v1=vec_sel(v6,v7,r4); | |||
| // cmp selected | |||
| r1=vec_cmpgt(vv0,vv1); | |||
| r2=vec_cmpgt(v0,v1); | |||
| v_ptrx+=8; | |||
| //select from above | |||
| ind0_second= vec_sel(ind0_second,ind1,r1); | |||
| vv0= vec_sel(vv0,vv1,r1) ; | |||
| ind2= vec_sel(ind2,ind3,r2); | |||
| vv1= vec_sel(v0,v1,r2) ; | |||
| //second indices actually should be within [16,31] so ind2+16 | |||
| ind2 +=temp1; | |||
| //final cmp and select index and value for the second 32 values | |||
| r1=vec_cmpgt(vv0,vv1); | |||
| ind0_second = vec_sel(ind0_second,ind2,r1); | |||
| vv0= vec_sel(vv0,vv1,r1); | |||
| ind0_second+=temp0; //get absolute index | |||
| //find final quadruple from 64 elements | |||
| r2=vec_cmpgt(vf0,vv0); | |||
| ind2 = vec_sel( ind0_first,ind0_second,r2); | |||
| vv0= vec_sel(vf0,vv0,r2); | |||
| //compare with old quadruple and update | |||
| r3=vec_cmpgt( quadruple_values,vv0); | |||
| quadruple_indices = vec_sel( quadruple_indices,ind2,r3); | |||
| quadruple_values= vec_sel(quadruple_values,vv0,r3); | |||
| temp0+=temp1; | |||
| temp0+=temp1; //temp0+32 | |||
| } | |||
| //now we have to chose from 4 values and 4 different indices | |||
| // we will compare pairwise if pairs are exactly the same we will choose minimum between index | |||
| // otherwise we will assign index of the minimum value | |||
| float a1,a2,a3,a4; | |||
| unsigned int i1,i2,i3,i4; | |||
| a1=vec_extract(quadruple_values,0); | |||
| a2=vec_extract(quadruple_values,1); | |||
| a3=vec_extract(quadruple_values,2); | |||
| a4=vec_extract(quadruple_values,3); | |||
| i1=vec_extract(quadruple_indices,0); | |||
| i2=vec_extract(quadruple_indices,1); | |||
| i3=vec_extract(quadruple_indices,2); | |||
| i4=vec_extract(quadruple_indices,3); | |||
| if(a1==a2){ | |||
| index=i1>i2?i2:i1; | |||
| }else if(a2<a1){ | |||
| index=i2; | |||
| a1=a2; | |||
| }else{ | |||
| index= i1; | |||
| } | |||
| if(a4==a3){ | |||
| i1=i3>i4?i4:i3; | |||
| }else if(a4<a3){ | |||
| i1=i4; | |||
| a3=a4; | |||
| }else{ | |||
| i1= i3; | |||
| } | |||
| if(a1==a3){ | |||
| index=i1>index?index:i1; | |||
| *minf=a1; | |||
| }else if(a3<a1){ | |||
| index=i1; | |||
| *minf=a3; | |||
| }else{ | |||
| *minf=a1; | |||
| } | |||
| return index; | |||
| } | |||
| BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { | |||
| BLASLONG i = 0; | |||
| BLASLONG j = 0; | |||
| BLASLONG min = 0; | |||
| FLOAT minf = 0.0; | |||
| if (n <= 0 || inc_x <= 0) return (min); | |||
| minf = ABS(x[0]); //index's not incremented | |||
| if (inc_x == 1) { | |||
| BLASLONG n1 = n & -64; | |||
| if (n1 > 0) { | |||
| min = siamin_kernel_64(n1, x, &minf); | |||
| i = n1; | |||
| } | |||
| while (i < n) { | |||
| if (ABS(x[i]) < minf) { | |||
| min = i; | |||
| minf = ABS(x[i]); | |||
| } | |||
| i++; | |||
| } | |||
| return (min + 1); | |||
| } else { | |||
| BLASLONG n1 = n & -4; | |||
| while (j < n1) { | |||
| if (ABS(x[i]) < minf) { | |||
| min = j; | |||
| minf = ABS(x[i]); | |||
| } | |||
| if (ABS(x[i + inc_x]) < minf) { | |||
| min = j + 1; | |||
| minf = ABS(x[i + inc_x]); | |||
| } | |||
| if (ABS(x[i + 2 * inc_x]) < minf) { | |||
| min = j + 2; | |||
| minf = ABS(x[i + 2 * inc_x]); | |||
| } | |||
| if (ABS(x[i + 3 * inc_x]) < minf) { | |||
| min = j + 3; | |||
| minf = ABS(x[i + 3 * inc_x]); | |||
| } | |||
| i += inc_x * 4; | |||
| j += 4; | |||
| } | |||
| while (j < n) { | |||
| if (ABS(x[i]) < minf) { | |||
| min = j; | |||
| minf = ABS(x[i]); | |||
| } | |||
| i += inc_x; | |||
| j++; | |||
| } | |||
| return (min + 1); | |||
| } | |||
| } | |||
| @@ -1,272 +1,272 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define LOAD ld | |||
| #define STACKSIZE (512 ) | |||
| #define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||
| #define M r3 | |||
| #define N r4 | |||
| #define K r5 | |||
| #define A r7 | |||
| #define B r8 | |||
| #define C r9 | |||
| #define LDC r10 | |||
| #define OFFSET r6 | |||
| #define alpha_r vs20 | |||
| #define save_permute_1 vs21 | |||
| #define save_permute_2 vs22 | |||
| #define permute_mask vs23 | |||
| #define o0 0 | |||
| #define T1 r11 | |||
| #define T2 r12 | |||
| #define T3 r14 | |||
| #define T4 r15 | |||
| #define T5 r16 | |||
| #define T6 r17 | |||
| #define L r18 | |||
| #define T7 r19 | |||
| #define T8 r20 | |||
| #define TEMP_REG r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define T9 r27 | |||
| #define T10 r28 | |||
| #define T11 r29 | |||
| #define T12 r30 | |||
| #define T13 r31 | |||
| #include "sgemm_macros_power9.S" | |||
| .equ perm_const1, 0x0405060700010203 | |||
| .equ perm_const2, 0x0c0d0e0f08090a0b | |||
| .equ save_permute_11, 0x1415161718191a1b | |||
| .equ save_permute_12, 0x0405060708090a0b | |||
| .equ save_permute_21, 0x101112131c1d1e1f | |||
| .equ save_permute_22, 0x000102030c0d0e0f | |||
| #ifndef NEEDPARAM | |||
| PROLOGUE | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE | |||
| mflr r0 | |||
| stfd f14, 0(SP) | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| std r31, 144(SP) | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| stxv vs52, 288(SP) | |||
| stxv vs53, 304(SP) | |||
| stxv vs54, 320(SP) | |||
| stxv vs55, 336(SP) | |||
| stxv vs56, 352(SP) | |||
| stxv vs57, 368(SP) | |||
| stxv vs58, 384(SP) | |||
| stxv vs59, 400(SP) | |||
| stxv vs60, 416(SP) | |||
| stxv vs61, 432(SP) | |||
| stxv vs62, 448(SP) | |||
| stxv vs63, 464(SP) | |||
| std r0, FLINK_SAVE(SP) | |||
| #if defined(TRMMKERNEL) | |||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #endif | |||
| slwi LDC, LDC, 2 | |||
| /*alpha is stored in f1. convert to single and splat*/ | |||
| xscvdpspn alpha_r,vs1 | |||
| xxspltw alpha_r,alpha_r,0 | |||
| /*load reverse permute mask for big endian | |||
| uint128 = 0xc0d0e0f08090a0b0405060700010203 | |||
| */ | |||
| lis T2, perm_const2@highest | |||
| lis T1, perm_const1@highest | |||
| lis T3, save_permute_12@highest | |||
| lis T4, save_permute_11@highest | |||
| lis T5, save_permute_22@highest | |||
| lis T6, save_permute_21@highest | |||
| ori T2, T2, perm_const2@higher | |||
| ori T1, T1, perm_const1@higher | |||
| ori T3, T3, save_permute_12@higher | |||
| ori T4, T4, save_permute_11@higher | |||
| ori T5, T5, save_permute_22@higher | |||
| ori T6, T6, save_permute_21@higher | |||
| rldicr T2, T2, 32, 31 | |||
| rldicr T1, T1, 32, 31 | |||
| rldicr T3, T3, 32, 31 | |||
| rldicr T4, T4, 32, 31 | |||
| rldicr T5, T5, 32, 31 | |||
| rldicr T6, T6, 32, 31 | |||
| oris T2, T2, perm_const2@h | |||
| oris T1, T1, perm_const1@h | |||
| oris T3, T3, save_permute_12@h | |||
| oris T4, T4, save_permute_11@h | |||
| oris T5, T5, save_permute_22@h | |||
| oris T6, T6, save_permute_21@h | |||
| ori T2, T2, perm_const2@l | |||
| ori T1, T1, perm_const1@l | |||
| ori T3, T3, save_permute_12@l | |||
| ori T4, T4, save_permute_11@l | |||
| ori T5, T5, save_permute_22@l | |||
| ori T6, T6, save_permute_21@l | |||
| li r0,0 | |||
| mtvsrdd permute_mask,T2,T1 | |||
| mtvsrdd save_permute_1,T3,T4 | |||
| mtvsrdd save_permute_2,T5,T6 | |||
| #include "sgemm_logic_power9.S" | |||
| .L999: | |||
| lfd f14, 0(SP) | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| ld r0, FLINK_SAVE(SP) | |||
| lxv vs52, 288(SP) | |||
| lxv vs53, 304(SP) | |||
| lxv vs54, 320(SP) | |||
| lxv vs55, 336(SP) | |||
| lxv vs56, 352(SP) | |||
| lxv vs57, 368(SP) | |||
| lxv vs58, 384(SP) | |||
| lxv vs59, 400(SP) | |||
| mtlr r0 | |||
| lxv vs60, 416(SP) | |||
| lxv vs61, 432(SP) | |||
| lxv vs62, 448(SP) | |||
| lxv vs63, 464(SP) | |||
| addi SP, SP, STACKSIZE | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define LOAD ld | |||
| #define STACKSIZE (512 ) | |||
| #define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||
| #define M r3 | |||
| #define N r4 | |||
| #define K r5 | |||
| #define A r7 | |||
| #define B r8 | |||
| #define C r9 | |||
| #define LDC r10 | |||
| #define OFFSET r6 | |||
| #define alpha_r vs20 | |||
| #define save_permute_1 vs21 | |||
| #define save_permute_2 vs22 | |||
| #define permute_mask vs23 | |||
| #define o0 0 | |||
| #define T1 r11 | |||
| #define T2 r12 | |||
| #define T3 r14 | |||
| #define T4 r15 | |||
| #define T5 r16 | |||
| #define T6 r17 | |||
| #define L r18 | |||
| #define T7 r19 | |||
| #define T8 r20 | |||
| #define TEMP_REG r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define T9 r27 | |||
| #define T10 r28 | |||
| #define T11 r29 | |||
| #define T12 r30 | |||
| #define T13 r31 | |||
| #include "sgemm_macros_power9.S" | |||
| .equ perm_const1, 0x0405060700010203 | |||
| .equ perm_const2, 0x0c0d0e0f08090a0b | |||
| .equ save_permute_11, 0x1415161718191a1b | |||
| .equ save_permute_12, 0x0405060708090a0b | |||
| .equ save_permute_21, 0x101112131c1d1e1f | |||
| .equ save_permute_22, 0x000102030c0d0e0f | |||
| #ifndef NEEDPARAM | |||
| PROLOGUE | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE | |||
| mflr r0 | |||
| stfd f14, 0(SP) | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| std r31, 144(SP) | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| stxv vs52, 288(SP) | |||
| stxv vs53, 304(SP) | |||
| stxv vs54, 320(SP) | |||
| stxv vs55, 336(SP) | |||
| stxv vs56, 352(SP) | |||
| stxv vs57, 368(SP) | |||
| stxv vs58, 384(SP) | |||
| stxv vs59, 400(SP) | |||
| stxv vs60, 416(SP) | |||
| stxv vs61, 432(SP) | |||
| stxv vs62, 448(SP) | |||
| stxv vs63, 464(SP) | |||
| std r0, FLINK_SAVE(SP) | |||
| #if defined(TRMMKERNEL) | |||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #endif | |||
| slwi LDC, LDC, 2 | |||
| /*alpha is stored in f1. convert to single and splat*/ | |||
| xscvdpspn alpha_r,vs1 | |||
| xxspltw alpha_r,alpha_r,0 | |||
| /*load reverse permute mask for big endian | |||
| uint128 = 0xc0d0e0f08090a0b0405060700010203 | |||
| */ | |||
| lis T2, perm_const2@highest | |||
| lis T1, perm_const1@highest | |||
| lis T3, save_permute_12@highest | |||
| lis T4, save_permute_11@highest | |||
| lis T5, save_permute_22@highest | |||
| lis T6, save_permute_21@highest | |||
| ori T2, T2, perm_const2@higher | |||
| ori T1, T1, perm_const1@higher | |||
| ori T3, T3, save_permute_12@higher | |||
| ori T4, T4, save_permute_11@higher | |||
| ori T5, T5, save_permute_22@higher | |||
| ori T6, T6, save_permute_21@higher | |||
| rldicr T2, T2, 32, 31 | |||
| rldicr T1, T1, 32, 31 | |||
| rldicr T3, T3, 32, 31 | |||
| rldicr T4, T4, 32, 31 | |||
| rldicr T5, T5, 32, 31 | |||
| rldicr T6, T6, 32, 31 | |||
| oris T2, T2, perm_const2@h | |||
| oris T1, T1, perm_const1@h | |||
| oris T3, T3, save_permute_12@h | |||
| oris T4, T4, save_permute_11@h | |||
| oris T5, T5, save_permute_22@h | |||
| oris T6, T6, save_permute_21@h | |||
| ori T2, T2, perm_const2@l | |||
| ori T1, T1, perm_const1@l | |||
| ori T3, T3, save_permute_12@l | |||
| ori T4, T4, save_permute_11@l | |||
| ori T5, T5, save_permute_22@l | |||
| ori T6, T6, save_permute_21@l | |||
| li r0,0 | |||
| mtvsrdd permute_mask,T2,T1 | |||
| mtvsrdd save_permute_1,T3,T4 | |||
| mtvsrdd save_permute_2,T5,T6 | |||
| #include "sgemm_logic_power9.S" | |||
| .L999: | |||
| lfd f14, 0(SP) | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| ld r0, FLINK_SAVE(SP) | |||
| lxv vs52, 288(SP) | |||
| lxv vs53, 304(SP) | |||
| lxv vs54, 320(SP) | |||
| lxv vs55, 336(SP) | |||
| lxv vs56, 352(SP) | |||
| lxv vs57, 368(SP) | |||
| lxv vs58, 384(SP) | |||
| lxv vs59, 400(SP) | |||
| mtlr r0 | |||
| lxv vs60, 416(SP) | |||
| lxv vs61, 432(SP) | |||
| lxv vs62, 448(SP) | |||
| lxv vs63, 464(SP) | |||
| addi SP, SP, STACKSIZE | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| @@ -1,470 +1,470 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #if !defined(__VEC__) || !defined(__ALTIVEC__) | |||
| #include "../arm/gemv_n.c" | |||
| #else | |||
| #include "common.h" | |||
| #define NBMAX 4096 | |||
| static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; | |||
| FLOAT x0,x1,x2,x3,x4,x5,x6,x7; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| a2 = ap[2]; | |||
| a3 = ap[3]; | |||
| b0 = a0 + lda4 ; | |||
| b1 = a1 + lda4 ; | |||
| b2 = a2 + lda4 ; | |||
| b3 = a3 + lda4 ; | |||
| x0 = xo[0] * *alpha; | |||
| x1 = xo[1] * *alpha; | |||
| x2 = xo[2] * *alpha; | |||
| x3 = xo[3] * *alpha; | |||
| x4 = xo[4] * *alpha; | |||
| x5 = xo[5] * *alpha; | |||
| x6 = xo[6] * *alpha; | |||
| x7 = xo[7] * *alpha; | |||
| __vector float* va0 = (__vector float*)a0; | |||
| __vector float* va1 = (__vector float*)a1; | |||
| __vector float* va2 = (__vector float*)a2; | |||
| __vector float* va3 = (__vector float*)a3; | |||
| __vector float* vb0 = (__vector float*)b0; | |||
| __vector float* vb1 = (__vector float*)b1; | |||
| __vector float* vb2 = (__vector float*)b2; | |||
| __vector float* vb3 = (__vector float*)b3; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; | |||
| __vector float v_x1 = {x1,x1,x1,x1}; | |||
| __vector float v_x2 = {x2,x2,x2,x2}; | |||
| __vector float v_x3 = {x3,x3,x3,x3}; | |||
| __vector float v_x4 = {x4,x4,x4,x4}; | |||
| __vector float v_x5 = {x5,x5,x5,x5}; | |||
| __vector float v_x6 = {x6,x6,x6,x6}; | |||
| __vector float v_x7 = {x7,x7,x7,x7}; | |||
| __vector float* v_y =(__vector float*)y; | |||
| for ( i=0; i< n/4; i++) | |||
| { | |||
| register __vector float vy=v_y[i]; | |||
| vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; | |||
| vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ; | |||
| v_y[i] =vy; | |||
| } | |||
| } | |||
| static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT x0,x1,x2,x3; | |||
| x0 = xo[0] * *alpha; | |||
| x1 = xo[1] * *alpha; | |||
| x2 = xo[2] * *alpha; | |||
| x3 = xo[3] * *alpha; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; | |||
| __vector float v_x1 = {x1,x1,x1,x1}; | |||
| __vector float v_x2 = {x2,x2,x2,x2}; | |||
| __vector float v_x3 = {x3,x3,x3,x3}; | |||
| __vector float* v_y =(__vector float*)y; | |||
| __vector float* va0 = (__vector float*)ap[0]; | |||
| __vector float* va1 = (__vector float*)ap[1]; | |||
| __vector float* va2 = (__vector float*)ap[2]; | |||
| __vector float* va3 = (__vector float*)ap[3]; | |||
| for ( i=0; i< n/4; i++ ) | |||
| { | |||
| register __vector float vy=v_y[i]; | |||
| vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; | |||
| v_y[i] =vy; | |||
| } | |||
| } | |||
| static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT x0,x1; | |||
| x0 = x[0] * *alpha; | |||
| x1 = x[1] * *alpha; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; | |||
| __vector float v_x1 = {x1,x1,x1,x1}; | |||
| __vector float* v_y =(__vector float*)y; | |||
| __vector float* va0 = (__vector float*)ap[0]; | |||
| __vector float* va1 = (__vector float*)ap[1]; | |||
| for ( i=0; i< n/4; i++ ) | |||
| { | |||
| v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; | |||
| } | |||
| } | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT x0 ; | |||
| x0 = x[0] * *alpha; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; | |||
| __vector float* v_y =(__vector float*)y; | |||
| __vector float* va0 = (__vector float*)ap; | |||
| for ( i=0; i< n/4; i++ ) | |||
| { | |||
| v_y[i] += v_x0 * va0[i] ; | |||
| } | |||
| } | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) | |||
| { | |||
| BLASLONG i; | |||
| for ( i=0; i<n; i++ ){ | |||
| *dest += *src; | |||
| src++; | |||
| dest += inc_dest; | |||
| } | |||
| return; | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| FLOAT *ap[4]; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| BLASLONG lda4 = lda << 2; | |||
| BLASLONG lda8 = lda << 3; | |||
| FLOAT xbuffer[8] __attribute__((aligned(16))); | |||
| FLOAT *ybuffer; | |||
| if ( m < 1 ) return(0); | |||
| if ( n < 1 ) return(0); | |||
| ybuffer = buffer; | |||
| if ( inc_x == 1 ) | |||
| { | |||
| n1 = n >> 3 ; | |||
| n2 = n & 7 ; | |||
| } | |||
| else | |||
| { | |||
| n1 = n >> 2 ; | |||
| n2 = n & 3 ; | |||
| } | |||
| m3 = m & 3 ; | |||
| m1 = m & -4 ; | |||
| m2 = (m & (NBMAX-1)) - m3 ; | |||
| y_ptr = y; | |||
| BLASLONG NB = NBMAX; | |||
| while ( NB == NBMAX ) | |||
| { | |||
| m1 -= NB; | |||
| if ( m1 < 0) | |||
| { | |||
| if ( m2 == 0 ) break; | |||
| NB = m2; | |||
| } | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| ap[0] = a_ptr; | |||
| ap[1] = a_ptr + lda; | |||
| ap[2] = ap[1] + lda; | |||
| ap[3] = ap[2] + lda; | |||
| if ( inc_y != 1 ) | |||
| memset(ybuffer,0,NB*4); | |||
| else | |||
| ybuffer = y_ptr; | |||
| if ( inc_x == 1 ) | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); | |||
| ap[0] += lda8; | |||
| ap[1] += lda8; | |||
| ap[2] += lda8; | |||
| ap[3] += lda8; | |||
| a_ptr += lda8; | |||
| x_ptr += 8; | |||
| } | |||
| if ( n2 & 4 ) | |||
| { | |||
| sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| x_ptr += 4; | |||
| } | |||
| if ( n2 & 2 ) | |||
| { | |||
| sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); | |||
| a_ptr += lda*2; | |||
| x_ptr += 2; | |||
| } | |||
| if ( n2 & 1 ) | |||
| { | |||
| sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); | |||
| a_ptr += lda; | |||
| x_ptr += 1; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[1] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[2] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[3] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| } | |||
| for( i = 0; i < n2 ; i++) | |||
| { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| a += NB; | |||
| if ( inc_y != 1 ) | |||
| { | |||
| add_y(NB,ybuffer,y_ptr,inc_y); | |||
| y_ptr += NB * inc_y; | |||
| } | |||
| else | |||
| y_ptr += NB ; | |||
| } | |||
| if ( m3 == 0 ) return(0); | |||
| if ( m3 == 3 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| FLOAT temp2 = 0.0; | |||
| if ( lda == 3 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < ( n & -4 ); i+=4 ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; | |||
| temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||
| temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; | |||
| temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; | |||
| temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; | |||
| a_ptr += 12; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += 3; | |||
| x_ptr ++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp2; | |||
| return(0); | |||
| } | |||
| if ( m3 == 2 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| if ( lda == 2 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < (n & -4) ; i+=4 ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; | |||
| temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; | |||
| a_ptr += 8; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += 2; | |||
| x_ptr ++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| return(0); | |||
| } | |||
| if ( m3 == 1 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp = 0.0; | |||
| if ( lda == 1 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < (n & -4); i+=4 ) | |||
| { | |||
| temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp += a_ptr[i] * x_ptr[i]; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp += a_ptr[0] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp; | |||
| return(0); | |||
| } | |||
| return(0); | |||
| } | |||
| #endif | |||
| /*************************************************************************** | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #if !defined(__VEC__) || !defined(__ALTIVEC__) | |||
| #include "../arm/gemv_n.c" | |||
| #else | |||
| #include "common.h" | |||
| #define NBMAX 4096 | |||
| static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; | |||
| FLOAT x0,x1,x2,x3,x4,x5,x6,x7; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| a2 = ap[2]; | |||
| a3 = ap[3]; | |||
| b0 = a0 + lda4 ; | |||
| b1 = a1 + lda4 ; | |||
| b2 = a2 + lda4 ; | |||
| b3 = a3 + lda4 ; | |||
| x0 = xo[0] * *alpha; | |||
| x1 = xo[1] * *alpha; | |||
| x2 = xo[2] * *alpha; | |||
| x3 = xo[3] * *alpha; | |||
| x4 = xo[4] * *alpha; | |||
| x5 = xo[5] * *alpha; | |||
| x6 = xo[6] * *alpha; | |||
| x7 = xo[7] * *alpha; | |||
| __vector float* va0 = (__vector float*)a0; | |||
| __vector float* va1 = (__vector float*)a1; | |||
| __vector float* va2 = (__vector float*)a2; | |||
| __vector float* va3 = (__vector float*)a3; | |||
| __vector float* vb0 = (__vector float*)b0; | |||
| __vector float* vb1 = (__vector float*)b1; | |||
| __vector float* vb2 = (__vector float*)b2; | |||
| __vector float* vb3 = (__vector float*)b3; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; | |||
| __vector float v_x1 = {x1,x1,x1,x1}; | |||
| __vector float v_x2 = {x2,x2,x2,x2}; | |||
| __vector float v_x3 = {x3,x3,x3,x3}; | |||
| __vector float v_x4 = {x4,x4,x4,x4}; | |||
| __vector float v_x5 = {x5,x5,x5,x5}; | |||
| __vector float v_x6 = {x6,x6,x6,x6}; | |||
| __vector float v_x7 = {x7,x7,x7,x7}; | |||
| __vector float* v_y =(__vector float*)y; | |||
| for ( i=0; i< n/4; i++) | |||
| { | |||
| register __vector float vy=v_y[i]; | |||
| vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; | |||
| vy += v_x4 * vb0[i] + v_x5 * vb1[i] + v_x6 * vb2[i] + v_x7 * vb3[i] ; | |||
| v_y[i] =vy; | |||
| } | |||
| } | |||
| static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT x0,x1,x2,x3; | |||
| x0 = xo[0] * *alpha; | |||
| x1 = xo[1] * *alpha; | |||
| x2 = xo[2] * *alpha; | |||
| x3 = xo[3] * *alpha; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; | |||
| __vector float v_x1 = {x1,x1,x1,x1}; | |||
| __vector float v_x2 = {x2,x2,x2,x2}; | |||
| __vector float v_x3 = {x3,x3,x3,x3}; | |||
| __vector float* v_y =(__vector float*)y; | |||
| __vector float* va0 = (__vector float*)ap[0]; | |||
| __vector float* va1 = (__vector float*)ap[1]; | |||
| __vector float* va2 = (__vector float*)ap[2]; | |||
| __vector float* va3 = (__vector float*)ap[3]; | |||
| for ( i=0; i< n/4; i++ ) | |||
| { | |||
| register __vector float vy=v_y[i]; | |||
| vy += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; | |||
| v_y[i] =vy; | |||
| } | |||
| } | |||
| static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT x0,x1; | |||
| x0 = x[0] * *alpha; | |||
| x1 = x[1] * *alpha; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; | |||
| __vector float v_x1 = {x1,x1,x1,x1}; | |||
| __vector float* v_y =(__vector float*)y; | |||
| __vector float* va0 = (__vector float*)ap[0]; | |||
| __vector float* va1 = (__vector float*)ap[1]; | |||
| for ( i=0; i< n/4; i++ ) | |||
| { | |||
| v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; | |||
| } | |||
| } | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT x0 ; | |||
| x0 = x[0] * *alpha; | |||
| __vector float v_x0 = {x0,x0,x0,x0}; | |||
| __vector float* v_y =(__vector float*)y; | |||
| __vector float* va0 = (__vector float*)ap; | |||
| for ( i=0; i< n/4; i++ ) | |||
| { | |||
| v_y[i] += v_x0 * va0[i] ; | |||
| } | |||
| } | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) | |||
| { | |||
| BLASLONG i; | |||
| for ( i=0; i<n; i++ ){ | |||
| *dest += *src; | |||
| src++; | |||
| dest += inc_dest; | |||
| } | |||
| return; | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| FLOAT *ap[4]; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| BLASLONG lda4 = lda << 2; | |||
| BLASLONG lda8 = lda << 3; | |||
| FLOAT xbuffer[8] __attribute__((aligned(16))); | |||
| FLOAT *ybuffer; | |||
| if ( m < 1 ) return(0); | |||
| if ( n < 1 ) return(0); | |||
| ybuffer = buffer; | |||
| if ( inc_x == 1 ) | |||
| { | |||
| n1 = n >> 3 ; | |||
| n2 = n & 7 ; | |||
| } | |||
| else | |||
| { | |||
| n1 = n >> 2 ; | |||
| n2 = n & 3 ; | |||
| } | |||
| m3 = m & 3 ; | |||
| m1 = m & -4 ; | |||
| m2 = (m & (NBMAX-1)) - m3 ; | |||
| y_ptr = y; | |||
| BLASLONG NB = NBMAX; | |||
| while ( NB == NBMAX ) | |||
| { | |||
| m1 -= NB; | |||
| if ( m1 < 0) | |||
| { | |||
| if ( m2 == 0 ) break; | |||
| NB = m2; | |||
| } | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| ap[0] = a_ptr; | |||
| ap[1] = a_ptr + lda; | |||
| ap[2] = ap[1] + lda; | |||
| ap[3] = ap[2] + lda; | |||
| if ( inc_y != 1 ) | |||
| memset(ybuffer,0,NB*4); | |||
| else | |||
| ybuffer = y_ptr; | |||
| if ( inc_x == 1 ) | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); | |||
| ap[0] += lda8; | |||
| ap[1] += lda8; | |||
| ap[2] += lda8; | |||
| ap[3] += lda8; | |||
| a_ptr += lda8; | |||
| x_ptr += 8; | |||
| } | |||
| if ( n2 & 4 ) | |||
| { | |||
| sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| x_ptr += 4; | |||
| } | |||
| if ( n2 & 2 ) | |||
| { | |||
| sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); | |||
| a_ptr += lda*2; | |||
| x_ptr += 2; | |||
| } | |||
| if ( n2 & 1 ) | |||
| { | |||
| sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); | |||
| a_ptr += lda; | |||
| x_ptr += 1; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[1] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[2] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[3] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| } | |||
| for( i = 0; i < n2 ; i++) | |||
| { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| a += NB; | |||
| if ( inc_y != 1 ) | |||
| { | |||
| add_y(NB,ybuffer,y_ptr,inc_y); | |||
| y_ptr += NB * inc_y; | |||
| } | |||
| else | |||
| y_ptr += NB ; | |||
| } | |||
| if ( m3 == 0 ) return(0); | |||
| if ( m3 == 3 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| FLOAT temp2 = 0.0; | |||
| if ( lda == 3 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < ( n & -4 ); i+=4 ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; | |||
| temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||
| temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; | |||
| temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; | |||
| temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; | |||
| a_ptr += 12; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += 3; | |||
| x_ptr ++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp2; | |||
| return(0); | |||
| } | |||
| if ( m3 == 2 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| if ( lda == 2 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < (n & -4) ; i+=4 ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; | |||
| temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; | |||
| a_ptr += 8; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += 2; | |||
| x_ptr ++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| return(0); | |||
| } | |||
| if ( m3 == 1 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp = 0.0; | |||
| if ( lda == 1 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < (n & -4); i+=4 ) | |||
| { | |||
| temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp += a_ptr[i] * x_ptr[i]; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp += a_ptr[0] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp; | |||
| return(0); | |||
| } | |||
| return(0); | |||
| } | |||
| #endif | |||
| @@ -1,484 +1,484 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #if !defined(__VEC__) || !defined(__ALTIVEC__) | |||
| #include "../arm/gemv_t.c" | |||
| #else | |||
| #include "common.h" | |||
| #define NBMAX 2048 | |||
| #include <altivec.h> | |||
| static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
| BLASLONG i; | |||
| FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; | |||
| __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; | |||
| register __vector float temp0 = {0,0,0,0}; | |||
| register __vector float temp1 = {0,0,0,0}; | |||
| register __vector float temp2 = {0,0,0,0}; | |||
| register __vector float temp3 = {0,0,0,0}; | |||
| register __vector float temp4 = {0,0,0,0}; | |||
| register __vector float temp5 = {0,0,0,0}; | |||
| register __vector float temp6 = {0,0,0,0}; | |||
| register __vector float temp7 = {0,0,0,0}; | |||
| a0 = ap; | |||
| a1 = ap + lda; | |||
| a2 = a1 + lda; | |||
| a3 = a2 + lda; | |||
| a4 = a3 + lda; | |||
| a5 = a4 + lda; | |||
| a6 = a5 + lda; | |||
| a7 = a6 + lda; | |||
| va0 = (__vector float*) a0; | |||
| va1 = (__vector float*) a1; | |||
| va2 = (__vector float*) a2; | |||
| va3 = (__vector float*) a3; | |||
| va4 = (__vector float*) a4; | |||
| va5 = (__vector float*) a5; | |||
| va6 = (__vector float*) a6; | |||
| va7 = (__vector float*) a7; | |||
| v_x = (__vector float*) x; | |||
| for (i = 0; i < n/4; i ++) { | |||
| temp0 += v_x[i] * va0[i]; | |||
| temp1 += v_x[i] * va1[i]; | |||
| temp2 += v_x[i] * va2[i]; | |||
| temp3 += v_x[i] * va3[i]; | |||
| temp4 += v_x[i] * va4[i]; | |||
| temp5 += v_x[i] * va5[i]; | |||
| temp6 += v_x[i] * va6[i]; | |||
| temp7 += v_x[i] * va7[i]; | |||
| } | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); | |||
| y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); | |||
| y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); | |||
| y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); | |||
| y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); | |||
| y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); | |||
| y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); | |||
| y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); | |||
| } | |||
| static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
| BLASLONG i = 0; | |||
| FLOAT *a0, *a1, *a2, *a3; | |||
| a0 = ap; | |||
| a1 = ap + lda; | |||
| a2 = a1 + lda; | |||
| a3 = a2 + lda; | |||
| __vector float* va0 = (__vector float*) a0; | |||
| __vector float* va1 = (__vector float*) a1; | |||
| __vector float* va2 = (__vector float*) a2; | |||
| __vector float* va3 = (__vector float*) a3; | |||
| __vector float* v_x = (__vector float*) x; | |||
| register __vector float temp0 = {0,0,0,0}; | |||
| register __vector float temp1 = {0,0,0,0}; | |||
| register __vector float temp2 = {0,0,0,0}; | |||
| register __vector float temp3 = {0,0,0,0}; | |||
| for (i = 0; i < n / 4; i ++) { | |||
| temp0 += v_x[i] * va0[i]; | |||
| temp1 += v_x[i] * va1[i]; | |||
| temp2 += v_x[i] * va2[i]; | |||
| temp3 += v_x[i] * va3[i]; | |||
| } | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); | |||
| y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); | |||
| y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); | |||
| y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); | |||
| } | |||
| static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { | |||
| BLASLONG i; | |||
| FLOAT *a0, *a1; | |||
| a0 = ap; | |||
| a1 = ap + lda; | |||
| __vector float* va0 = (__vector float*) a0; | |||
| __vector float* va1 = (__vector float*) a1; | |||
| __vector float* v_x = (__vector float*) x; | |||
| __vector float temp0 = {0,0,0,0}; | |||
| __vector float temp1 = {0,0,0,0}; | |||
| for (i = 0; i < n / 4; i ++) { | |||
| temp0 += v_x[i] * va0[i]; | |||
| temp1 += v_x[i] * va1[i]; | |||
| } | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); | |||
| y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); | |||
| } | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
| BLASLONG i; | |||
| FLOAT *a0; | |||
| a0 = ap; | |||
| __vector float* va0 = (__vector float*) a0; | |||
| __vector float* v_x = (__vector float*) x; | |||
| __vector float temp0 = {0,0,0,0}; | |||
| for (i = 0; i < n / 4; i ++) { | |||
| temp0 += v_x[i] * va0[i] ; | |||
| } | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); | |||
| } | |||
| static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { | |||
| BLASLONG i; | |||
| for (i = 0; i < n; i++) { | |||
| *dest++ = *src; | |||
| src += inc_src; | |||
| } | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { | |||
| BLASLONG i; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| FLOAT ybuffer[8] __attribute__((aligned(16))); | |||
| FLOAT *xbuffer; | |||
| if (m < 1) return (0); | |||
| if (n < 1) return (0); | |||
| xbuffer = buffer; | |||
| n1 = n >> 3; | |||
| n2 = n & 7; | |||
| m3 = m & 3; | |||
| m1 = m - m3; | |||
| m2 = (m & (NBMAX - 1)) - m3; | |||
| BLASLONG NB = NBMAX; | |||
| while (NB == NBMAX) { | |||
| m1 -= NB; | |||
| if (m1 < 0) { | |||
| if (m2 == 0) break; | |||
| NB = m2; | |||
| } | |||
| y_ptr = y; | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| if (inc_x != 1) | |||
| copy_x(NB, x_ptr, xbuffer, inc_x); | |||
| else | |||
| xbuffer = x_ptr; | |||
| BLASLONG lda8 = lda << 3; | |||
| if (inc_y == 1) { | |||
| for (i = 0; i < n1; i++) { | |||
| sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); | |||
| y_ptr += 8; | |||
| a_ptr += lda8; | |||
| } | |||
| } else { | |||
| for (i = 0; i < n1; i++) { | |||
| ybuffer[0] = 0; | |||
| ybuffer[1] = 0; | |||
| ybuffer[2] = 0; | |||
| ybuffer[3] = 0; | |||
| ybuffer[4] = 0; | |||
| ybuffer[5] = 0; | |||
| ybuffer[6] = 0; | |||
| ybuffer[7] = 0; | |||
| sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); | |||
| *y_ptr += ybuffer[0]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[1]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[2]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[3]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[4]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[5]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[6]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[7]; | |||
| y_ptr += inc_y; | |||
| a_ptr += lda8; | |||
| } | |||
| } | |||
| if (n2 & 4) { | |||
| ybuffer[0] = 0; | |||
| ybuffer[1] = 0; | |||
| ybuffer[2] = 0; | |||
| ybuffer[3] = 0; | |||
| sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); | |||
| a_ptr += lda<<2; | |||
| *y_ptr += ybuffer[0]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[1]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[2]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[3]; | |||
| y_ptr += inc_y; | |||
| } | |||
| if (n2 & 2) { | |||
| sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); | |||
| a_ptr += lda << 1; | |||
| y_ptr += 2 * inc_y; | |||
| } | |||
| if (n2 & 1) { | |||
| sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); | |||
| a_ptr += lda; | |||
| y_ptr += inc_y; | |||
| } | |||
| a += NB; | |||
| x += NB * inc_x; | |||
| } | |||
| if (m3 == 0) return (0); | |||
| x_ptr = x; | |||
| a_ptr = a; | |||
| if (m3 == 3) { | |||
| FLOAT xtemp0 = *x_ptr * alpha; | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp1 = *x_ptr * alpha; | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp2 = *x_ptr * alpha; | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if (lda == 3 && inc_y == 1) { | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; | |||
| y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; | |||
| y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; | |||
| y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; | |||
| aj += 12; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; | |||
| aj += 3; | |||
| } | |||
| } else { | |||
| if (inc_y == 1) { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; | |||
| y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; | |||
| y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; | |||
| aj += lda4; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| aj += lda; | |||
| } | |||
| } else { | |||
| for (j = 0; j < n; j++) { | |||
| *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| if (m3 == 2) { | |||
| FLOAT xtemp0 = *x_ptr * alpha; | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp1 = *x_ptr * alpha; | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if (lda == 2 && inc_y == 1) { | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; | |||
| y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; | |||
| y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; | |||
| y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; | |||
| aj += 8; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; | |||
| aj += 2; | |||
| } | |||
| } else { | |||
| if (inc_y == 1) { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; | |||
| y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; | |||
| y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; | |||
| y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; | |||
| aj += lda4; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; | |||
| aj += lda; | |||
| } | |||
| } else { | |||
| for (j = 0; j < n; j++) { | |||
| *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| FLOAT xtemp = *x_ptr * alpha; | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if (lda == 1 && inc_y == 1) { | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += aj[j] * xtemp; | |||
| y_ptr[j + 1] += aj[j + 1] * xtemp; | |||
| y_ptr[j + 2] += aj[j + 2] * xtemp; | |||
| y_ptr[j + 3] += aj[j + 3] * xtemp; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += aj[j] * xtemp; | |||
| } | |||
| } else { | |||
| if (inc_y == 1) { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += *aj * xtemp; | |||
| y_ptr[j + 1] += *(aj + lda) * xtemp; | |||
| y_ptr[j + 2] += *(aj + lda2) * xtemp; | |||
| y_ptr[j + 3] += *(aj + lda3) * xtemp; | |||
| aj += lda4; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += *aj * xtemp; | |||
| aj += lda; | |||
| } | |||
| } else { | |||
| for (j = 0; j < n; j++) { | |||
| *y_ptr += *aj * xtemp; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| #endif | |||
| /*************************************************************************** | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #if !defined(__VEC__) || !defined(__ALTIVEC__) | |||
| #include "../arm/gemv_t.c" | |||
| #else | |||
| #include "common.h" | |||
| #define NBMAX 2048 | |||
| #include <altivec.h> | |||
| static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
| BLASLONG i; | |||
| FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; | |||
| __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; | |||
| register __vector float temp0 = {0,0,0,0}; | |||
| register __vector float temp1 = {0,0,0,0}; | |||
| register __vector float temp2 = {0,0,0,0}; | |||
| register __vector float temp3 = {0,0,0,0}; | |||
| register __vector float temp4 = {0,0,0,0}; | |||
| register __vector float temp5 = {0,0,0,0}; | |||
| register __vector float temp6 = {0,0,0,0}; | |||
| register __vector float temp7 = {0,0,0,0}; | |||
| a0 = ap; | |||
| a1 = ap + lda; | |||
| a2 = a1 + lda; | |||
| a3 = a2 + lda; | |||
| a4 = a3 + lda; | |||
| a5 = a4 + lda; | |||
| a6 = a5 + lda; | |||
| a7 = a6 + lda; | |||
| va0 = (__vector float*) a0; | |||
| va1 = (__vector float*) a1; | |||
| va2 = (__vector float*) a2; | |||
| va3 = (__vector float*) a3; | |||
| va4 = (__vector float*) a4; | |||
| va5 = (__vector float*) a5; | |||
| va6 = (__vector float*) a6; | |||
| va7 = (__vector float*) a7; | |||
| v_x = (__vector float*) x; | |||
| for (i = 0; i < n/4; i ++) { | |||
| temp0 += v_x[i] * va0[i]; | |||
| temp1 += v_x[i] * va1[i]; | |||
| temp2 += v_x[i] * va2[i]; | |||
| temp3 += v_x[i] * va3[i]; | |||
| temp4 += v_x[i] * va4[i]; | |||
| temp5 += v_x[i] * va5[i]; | |||
| temp6 += v_x[i] * va6[i]; | |||
| temp7 += v_x[i] * va7[i]; | |||
| } | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); | |||
| y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); | |||
| y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); | |||
| y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); | |||
| y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); | |||
| y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); | |||
| y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); | |||
| y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); | |||
| } | |||
| static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
| BLASLONG i = 0; | |||
| FLOAT *a0, *a1, *a2, *a3; | |||
| a0 = ap; | |||
| a1 = ap + lda; | |||
| a2 = a1 + lda; | |||
| a3 = a2 + lda; | |||
| __vector float* va0 = (__vector float*) a0; | |||
| __vector float* va1 = (__vector float*) a1; | |||
| __vector float* va2 = (__vector float*) a2; | |||
| __vector float* va3 = (__vector float*) a3; | |||
| __vector float* v_x = (__vector float*) x; | |||
| register __vector float temp0 = {0,0,0,0}; | |||
| register __vector float temp1 = {0,0,0,0}; | |||
| register __vector float temp2 = {0,0,0,0}; | |||
| register __vector float temp3 = {0,0,0,0}; | |||
| for (i = 0; i < n / 4; i ++) { | |||
| temp0 += v_x[i] * va0[i]; | |||
| temp1 += v_x[i] * va1[i]; | |||
| temp2 += v_x[i] * va2[i]; | |||
| temp3 += v_x[i] * va3[i]; | |||
| } | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); | |||
| y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); | |||
| y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); | |||
| y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); | |||
| } | |||
| static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { | |||
| BLASLONG i; | |||
| FLOAT *a0, *a1; | |||
| a0 = ap; | |||
| a1 = ap + lda; | |||
| __vector float* va0 = (__vector float*) a0; | |||
| __vector float* va1 = (__vector float*) a1; | |||
| __vector float* v_x = (__vector float*) x; | |||
| __vector float temp0 = {0,0,0,0}; | |||
| __vector float temp1 = {0,0,0,0}; | |||
| for (i = 0; i < n / 4; i ++) { | |||
| temp0 += v_x[i] * va0[i]; | |||
| temp1 += v_x[i] * va1[i]; | |||
| } | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); | |||
| y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); | |||
| } | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
| BLASLONG i; | |||
| FLOAT *a0; | |||
| a0 = ap; | |||
| __vector float* va0 = (__vector float*) a0; | |||
| __vector float* v_x = (__vector float*) x; | |||
| __vector float temp0 = {0,0,0,0}; | |||
| for (i = 0; i < n / 4; i ++) { | |||
| temp0 += v_x[i] * va0[i] ; | |||
| } | |||
| y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); | |||
| } | |||
| static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { | |||
| BLASLONG i; | |||
| for (i = 0; i < n; i++) { | |||
| *dest++ = *src; | |||
| src += inc_src; | |||
| } | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { | |||
| BLASLONG i; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| FLOAT ybuffer[8] __attribute__((aligned(16))); | |||
| FLOAT *xbuffer; | |||
| if (m < 1) return (0); | |||
| if (n < 1) return (0); | |||
| xbuffer = buffer; | |||
| n1 = n >> 3; | |||
| n2 = n & 7; | |||
| m3 = m & 3; | |||
| m1 = m - m3; | |||
| m2 = (m & (NBMAX - 1)) - m3; | |||
| BLASLONG NB = NBMAX; | |||
| while (NB == NBMAX) { | |||
| m1 -= NB; | |||
| if (m1 < 0) { | |||
| if (m2 == 0) break; | |||
| NB = m2; | |||
| } | |||
| y_ptr = y; | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| if (inc_x != 1) | |||
| copy_x(NB, x_ptr, xbuffer, inc_x); | |||
| else | |||
| xbuffer = x_ptr; | |||
| BLASLONG lda8 = lda << 3; | |||
| if (inc_y == 1) { | |||
| for (i = 0; i < n1; i++) { | |||
| sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); | |||
| y_ptr += 8; | |||
| a_ptr += lda8; | |||
| } | |||
| } else { | |||
| for (i = 0; i < n1; i++) { | |||
| ybuffer[0] = 0; | |||
| ybuffer[1] = 0; | |||
| ybuffer[2] = 0; | |||
| ybuffer[3] = 0; | |||
| ybuffer[4] = 0; | |||
| ybuffer[5] = 0; | |||
| ybuffer[6] = 0; | |||
| ybuffer[7] = 0; | |||
| sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); | |||
| *y_ptr += ybuffer[0]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[1]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[2]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[3]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[4]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[5]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[6]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[7]; | |||
| y_ptr += inc_y; | |||
| a_ptr += lda8; | |||
| } | |||
| } | |||
| if (n2 & 4) { | |||
| ybuffer[0] = 0; | |||
| ybuffer[1] = 0; | |||
| ybuffer[2] = 0; | |||
| ybuffer[3] = 0; | |||
| sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); | |||
| a_ptr += lda<<2; | |||
| *y_ptr += ybuffer[0]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[1]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[2]; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[3]; | |||
| y_ptr += inc_y; | |||
| } | |||
| if (n2 & 2) { | |||
| sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); | |||
| a_ptr += lda << 1; | |||
| y_ptr += 2 * inc_y; | |||
| } | |||
| if (n2 & 1) { | |||
| sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); | |||
| a_ptr += lda; | |||
| y_ptr += inc_y; | |||
| } | |||
| a += NB; | |||
| x += NB * inc_x; | |||
| } | |||
| if (m3 == 0) return (0); | |||
| x_ptr = x; | |||
| a_ptr = a; | |||
| if (m3 == 3) { | |||
| FLOAT xtemp0 = *x_ptr * alpha; | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp1 = *x_ptr * alpha; | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp2 = *x_ptr * alpha; | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if (lda == 3 && inc_y == 1) { | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; | |||
| y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; | |||
| y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; | |||
| y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; | |||
| aj += 12; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; | |||
| aj += 3; | |||
| } | |||
| } else { | |||
| if (inc_y == 1) { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; | |||
| y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; | |||
| y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; | |||
| aj += lda4; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| aj += lda; | |||
| } | |||
| } else { | |||
| for (j = 0; j < n; j++) { | |||
| *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| if (m3 == 2) { | |||
| FLOAT xtemp0 = *x_ptr * alpha; | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp1 = *x_ptr * alpha; | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if (lda == 2 && inc_y == 1) { | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; | |||
| y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; | |||
| y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; | |||
| y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; | |||
| aj += 8; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; | |||
| aj += 2; | |||
| } | |||
| } else { | |||
| if (inc_y == 1) { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; | |||
| y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; | |||
| y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; | |||
| y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; | |||
| aj += lda4; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; | |||
| aj += lda; | |||
| } | |||
| } else { | |||
| for (j = 0; j < n; j++) { | |||
| *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| FLOAT xtemp = *x_ptr * alpha; | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if (lda == 1 && inc_y == 1) { | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += aj[j] * xtemp; | |||
| y_ptr[j + 1] += aj[j + 1] * xtemp; | |||
| y_ptr[j + 2] += aj[j + 2] * xtemp; | |||
| y_ptr[j + 3] += aj[j + 3] * xtemp; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += aj[j] * xtemp; | |||
| } | |||
| } else { | |||
| if (inc_y == 1) { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for (j = 0; j < (n & -4); j += 4) { | |||
| y_ptr[j] += *aj * xtemp; | |||
| y_ptr[j + 1] += *(aj + lda) * xtemp; | |||
| y_ptr[j + 2] += *(aj + lda2) * xtemp; | |||
| y_ptr[j + 3] += *(aj + lda3) * xtemp; | |||
| aj += lda4; | |||
| } | |||
| for (; j < n; j++) { | |||
| y_ptr[j] += *aj * xtemp; | |||
| aj += lda; | |||
| } | |||
| } else { | |||
| for (j = 0; j < n; j++) { | |||
| *y_ptr += *aj * xtemp; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return (0); | |||
| } | |||
| #endif | |||
| @@ -1,245 +1,245 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define LOAD ld | |||
| #define STACKSIZE 512 | |||
| #define FZERO 312+192(SP) | |||
| #define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||
| #define M r3 | |||
| #define N r4 | |||
| #define K r5 | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r6 | |||
| #define OFFSET r7 | |||
| #define o0 0 | |||
| #define alpha_r vs30 | |||
| #define alpha_i vs31 | |||
| #define VECSAVE r11 | |||
| #define FRAMEPOINTER r12 | |||
| #define T10 r14 | |||
| #define L r15 | |||
| #define T8 r16 | |||
| #define T5 r17 | |||
| #define T2 r19 | |||
| #define TEMP_REG r20 | |||
| #define T6 r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define T7 r27 | |||
| #define T3 r28 | |||
| #define T4 r29 | |||
| #define PRE r30 | |||
| #define T1 r31 | |||
| #ifndef NEEDPARAM | |||
| PROLOGUE | |||
| PROFCODE | |||
| mr FRAMEPOINTER, SP | |||
| addi SP, SP, -STACKSIZE | |||
| mflr r0 | |||
| stfd f14, 0(SP) | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| xxspltd alpha_r,vs1,0 /*copy from register f1 */ | |||
| xxspltd alpha_i,vs2,0 /*copy from register f2 */ | |||
| std r31, 144(SP) | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| stxv vs52, 288(SP) | |||
| stxv vs53, 304(SP) | |||
| stxv vs54, 320(SP) | |||
| stxv vs55, 336(SP) | |||
| stxv vs56, 352(SP) | |||
| stxv vs57, 368(SP) | |||
| stxv vs58, 384(SP) | |||
| stxv vs59, 400(SP) | |||
| stxv vs60, 416(SP) | |||
| stxv vs61, 432(SP) | |||
| stxv vs62, 448(SP) | |||
| stxv vs63, 464(SP) | |||
| std r0, FLINK_SAVE(SP) | |||
| #if defined(linux) || defined(__FreeBSD__) | |||
| ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) | |||
| #endif | |||
| #ifdef TRMMKERNEL | |||
| #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) | |||
| ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) | |||
| #endif | |||
| #endif | |||
| #include "zgemm_macros_power9.S" | |||
| slwi LDC, LDC, ZBASE_SHIFT | |||
| li PRE, 512 | |||
| li r0, 0 | |||
| #if defined(CC) || defined(CR) || defined(RC) || defined(RR) | |||
| /*negate for this case as we will use addition -1*(a+b) */ | |||
| xvnegdp alpha_r,alpha_r | |||
| xvnegdp alpha_i,alpha_i | |||
| #endif | |||
| .align 4 | |||
| #include "zgemm_logic_power9.S" | |||
| L999: | |||
| lfd f14, 0(SP) | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| ld r0, FLINK_SAVE(SP) | |||
| lxv vs52, 288(SP) | |||
| lxv vs53, 304(SP) | |||
| lxv vs54, 320(SP) | |||
| lxv vs55, 336(SP) | |||
| lxv vs56, 352(SP) | |||
| lxv vs57, 368(SP) | |||
| lxv vs58, 384(SP) | |||
| lxv vs59, 400(SP) | |||
| mtlr r0 | |||
| lxv vs60, 416(SP) | |||
| lxv vs61, 432(SP) | |||
| lxv vs62, 448(SP) | |||
| lxv vs63, 464(SP) | |||
| addi SP, SP, STACKSIZE | |||
| blr | |||
| EPILOGUE | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define LOAD ld | |||
| #define STACKSIZE 512 | |||
| #define FZERO 312+192(SP) | |||
| #define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||
| #define M r3 | |||
| #define N r4 | |||
| #define K r5 | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r6 | |||
| #define OFFSET r7 | |||
| #define o0 0 | |||
| #define alpha_r vs30 | |||
| #define alpha_i vs31 | |||
| #define VECSAVE r11 | |||
| #define FRAMEPOINTER r12 | |||
| #define T10 r14 | |||
| #define L r15 | |||
| #define T8 r16 | |||
| #define T5 r17 | |||
| #define T2 r19 | |||
| #define TEMP_REG r20 | |||
| #define T6 r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define T7 r27 | |||
| #define T3 r28 | |||
| #define T4 r29 | |||
| #define PRE r30 | |||
| #define T1 r31 | |||
| #ifndef NEEDPARAM | |||
| PROLOGUE | |||
| PROFCODE | |||
| mr FRAMEPOINTER, SP | |||
| addi SP, SP, -STACKSIZE | |||
| mflr r0 | |||
| stfd f14, 0(SP) | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| xxspltd alpha_r,vs1,0 /*copy from register f1 */ | |||
| xxspltd alpha_i,vs2,0 /*copy from register f2 */ | |||
| std r31, 144(SP) | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| stxv vs52, 288(SP) | |||
| stxv vs53, 304(SP) | |||
| stxv vs54, 320(SP) | |||
| stxv vs55, 336(SP) | |||
| stxv vs56, 352(SP) | |||
| stxv vs57, 368(SP) | |||
| stxv vs58, 384(SP) | |||
| stxv vs59, 400(SP) | |||
| stxv vs60, 416(SP) | |||
| stxv vs61, 432(SP) | |||
| stxv vs62, 448(SP) | |||
| stxv vs63, 464(SP) | |||
| std r0, FLINK_SAVE(SP) | |||
| #if defined(linux) || defined(__FreeBSD__) | |||
| ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) | |||
| #endif | |||
| #ifdef TRMMKERNEL | |||
| #if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) | |||
| ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) | |||
| #endif | |||
| #endif | |||
| #include "zgemm_macros_power9.S" | |||
| slwi LDC, LDC, ZBASE_SHIFT | |||
| li PRE, 512 | |||
| li r0, 0 | |||
| #if defined(CC) || defined(CR) || defined(RC) || defined(RR) | |||
| /*negate for this case as we will use addition -1*(a+b) */ | |||
| xvnegdp alpha_r,alpha_r | |||
| xvnegdp alpha_i,alpha_i | |||
| #endif | |||
| .align 4 | |||
| #include "zgemm_logic_power9.S" | |||
| L999: | |||
| lfd f14, 0(SP) | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| ld r0, FLINK_SAVE(SP) | |||
| lxv vs52, 288(SP) | |||
| lxv vs53, 304(SP) | |||
| lxv vs54, 320(SP) | |||
| lxv vs55, 336(SP) | |||
| lxv vs56, 352(SP) | |||
| lxv vs57, 368(SP) | |||
| lxv vs58, 384(SP) | |||
| lxv vs59, 400(SP) | |||
| mtlr r0 | |||
| lxv vs60, 416(SP) | |||
| lxv vs61, 432(SP) | |||
| lxv vs62, 448(SP) | |||
| lxv vs63, 464(SP) | |||
| addi SP, SP, STACKSIZE | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| @@ -1,279 +1,279 @@ | |||
| #include "common.h" | |||
| #include <stdint.h> | |||
| #include "strsm_kernel_8x4_haswell_R_common.h" | |||
| #define SOLVE_RN_m8n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "movq %2,%3; addq $32,%2;"\ | |||
| SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1)\ | |||
| SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1)\ | |||
| SAVE_SOLUTION_m8n2(4,5,0)\ | |||
| SOLVE_leri_m8n2(40,6,7,%1)\ | |||
| SOLVE_ri_m8n2(56,6,7,%1)\ | |||
| SAVE_SOLUTION_m8n2(6,7,64) | |||
| #define SOLVE_RN_m8n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "movq %2,%3; addq $32,%2;"\ | |||
| SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4)\ | |||
| SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m8n2(4,5,0)\ | |||
| SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4)\ | |||
| SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m8n2(6,7,64)\ | |||
| SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4)\ | |||
| SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m8n2(8,9,128)\ | |||
| SOLVE_leri_m8n2(104,10,11,%1,%%r12,4)\ | |||
| SOLVE_ri_m8n2(120,10,11,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m8n2(10,11,192) | |||
| #define SOLVE_RN_m8n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "movq %2,%3; addq $32,%2;"\ | |||
| SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4) SUBTRACT_m8n2(0,12,13,%1,%%r12,8) SUBTRACT_m8n2(8,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4) SUBTRACT_m8n2(16,12,13,%1,%%r12,8) SUBTRACT_m8n2(24,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(4,5,0)\ | |||
| SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4) SUBTRACT_m8n2(32,12,13,%1,%%r12,8) SUBTRACT_m8n2(40,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4) SUBTRACT_m8n2(48,12,13,%1,%%r12,8) SUBTRACT_m8n2(56,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(6,7,64)\ | |||
| SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4) SUBTRACT_m8n2(64,12,13,%1,%%r12,8) SUBTRACT_m8n2(72,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4) SUBTRACT_m8n2(80,12,13,%1,%%r12,8) SUBTRACT_m8n2(88,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(8,9,128)\ | |||
| SOLVE_leri_m8n2(104,10,11,%1,%%r12,4) SUBTRACT_m8n2(96,12,13,%1,%%r12,8) SUBTRACT_m8n2(104,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(120,10,11,%1,%%r12,4) SUBTRACT_m8n2(112,12,13,%1,%%r12,8) SUBTRACT_m8n2(120,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(10,11,192)\ | |||
| SOLVE_leri_m8n2(128,12,13,%1,%%r12,8) SUBTRACT_m8n2(136,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(144,12,13,%1,%%r12,8) SUBTRACT_m8n2(152,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(12,13,256)\ | |||
| SOLVE_leri_m8n2(168,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(184,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(14,15,320) | |||
| #define SOLVE_RN_m4n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "movq %2,%3; addq $16,%2;"\ | |||
| SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1)\ | |||
| SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1)\ | |||
| SAVE_SOLUTION_m4n2(4,0)\ | |||
| SOLVE_leri_m4n2(40,5,%1)\ | |||
| SOLVE_ri_m4n2(56,5,%1)\ | |||
| SAVE_SOLUTION_m4n2(5,32) | |||
| #define SOLVE_RN_m4n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "movq %2,%3; addq $16,%2;"\ | |||
| SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4)\ | |||
| SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m4n2(4,0)\ | |||
| SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4)\ | |||
| SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m4n2(5,32)\ | |||
| SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4)\ | |||
| SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m4n2(6,64)\ | |||
| SOLVE_leri_m4n2(104,7,%1,%%r12,4)\ | |||
| SOLVE_ri_m4n2(120,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m4n2(7,96) | |||
| #define SOLVE_RN_m4n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "movq %2,%3; addq $16,%2;"\ | |||
| SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4) SUBTRACT_m4n2(0,8,%1,%%r12,8) SUBTRACT_m4n2(8,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4) SUBTRACT_m4n2(16,8,%1,%%r12,8) SUBTRACT_m4n2(24,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(4,0)\ | |||
| SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4) SUBTRACT_m4n2(32,8,%1,%%r12,8) SUBTRACT_m4n2(40,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4) SUBTRACT_m4n2(48,8,%1,%%r12,8) SUBTRACT_m4n2(56,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(5,32)\ | |||
| SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4) SUBTRACT_m4n2(64,8,%1,%%r12,8) SUBTRACT_m4n2(72,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4) SUBTRACT_m4n2(80,8,%1,%%r12,8) SUBTRACT_m4n2(88,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(6,64)\ | |||
| SOLVE_leri_m4n2(104,7,%1,%%r12,4) SUBTRACT_m4n2(96,8,%1,%%r12,8) SUBTRACT_m4n2(104,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(120,7,%1,%%r12,4) SUBTRACT_m4n2(112,8,%1,%%r12,8) SUBTRACT_m4n2(120,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(7,96)\ | |||
| SOLVE_leri_m4n2(128,8,%1,%%r12,8) SUBTRACT_m4n2(136,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(144,8,%1,%%r12,8) SUBTRACT_m4n2(152,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(8,128)\ | |||
| SOLVE_leri_m4n2(168,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(184,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(9,160) | |||
| #define SOLVE_RN_m2n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "movq %2,%3; addq $8,%2;"\ | |||
| SOLVE_col1_ltor_m2n4(0,4,5,%1)\ | |||
| SOLVE_col2_ltor_m2n4(16,4,5,%1)\ | |||
| SOLVE_col3_ltor_m2n4(32,4,5,%1)\ | |||
| SOLVE_col4_ltor_m2n4(48,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(4,5,0) | |||
| #define SOLVE_RN_m2n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "movq %2,%3; addq $8,%2;"\ | |||
| SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4)\ | |||
| SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4)\ | |||
| SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4)\ | |||
| SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m2n4(4,5,0)\ | |||
| SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4)\ | |||
| SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4)\ | |||
| SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4)\ | |||
| SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m2n4(6,7,32) | |||
| #define SOLVE_RN_m2n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "movq %2,%3; addq $8,%2;"\ | |||
| SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4) SUBTRACT_m2n4(0,8,9,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4) SUBTRACT_m2n4(16,8,9,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4) SUBTRACT_m2n4(32,8,9,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4) SUBTRACT_m2n4(48,8,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m2n4(4,5,0)\ | |||
| SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4) SUBTRACT_m2n4(64,8,9,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4) SUBTRACT_m2n4(80,8,9,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4) SUBTRACT_m2n4(96,8,9,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4) SUBTRACT_m2n4(112,8,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m2n4(6,7,32)\ | |||
| SOLVE_col1_ltor_m2n4(128,8,9,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m2n4(144,8,9,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m2n4(160,8,9,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m2n4(176,8,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m2n4(8,9,64) | |||
| #define SOLVE_RN_m1n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "movq %2,%3; addq $4,%2;"\ | |||
| SOLVE_col1_ltor_m1n4(0,4,%1)\ | |||
| SOLVE_col2_ltor_m1n4(16,4,%1)\ | |||
| SOLVE_col3_ltor_m1n4(32,4,%1)\ | |||
| SOLVE_col4_ltor_m1n4(48,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(4,0) | |||
| #define SOLVE_RN_m1n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "movq %2,%3; addq $4,%2;"\ | |||
| SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4)\ | |||
| SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4)\ | |||
| SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4)\ | |||
| SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m1n4(4,0)\ | |||
| SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4)\ | |||
| SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4)\ | |||
| SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4)\ | |||
| SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m1n4(5,16) | |||
| #define SOLVE_RN_m1n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "movq %2,%3; addq $4,%2;"\ | |||
| SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4) SUBTRACT_m1n4(0,6,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4) SUBTRACT_m1n4(16,6,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4) SUBTRACT_m1n4(32,6,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4) SUBTRACT_m1n4(48,6,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m1n4(4,0)\ | |||
| SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4) SUBTRACT_m1n4(64,6,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4) SUBTRACT_m1n4(80,6,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4) SUBTRACT_m1n4(96,6,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4) SUBTRACT_m1n4(112,6,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m1n4(5,16)\ | |||
| SOLVE_col1_ltor_m1n4(128,6,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m1n4(144,6,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m1n4(160,6,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m1n4(176,6,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m1n4(6,32) | |||
| #define GEMM_RN_SIMPLE(mdim,ndim) \ | |||
| "movq %%r15,%0; leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ | |||
| "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ | |||
| "1"#mdim""#ndim"1:\n\t"\ | |||
| GEMM_KERNEL_k1m##mdim##n##ndim "addq $16,%1; addq $"#mdim"*4,%0; decq %5; jnz 1"#mdim""#ndim"1b;"\ | |||
| "1"#mdim""#ndim"2:\n\t" | |||
| #define GEMM_RN_m8n4 GEMM_RN_SIMPLE(8,4) | |||
| #define GEMM_RN_m8n8 GEMM_RN_SIMPLE(8,8) | |||
| #define GEMM_RN_m8n12 \ | |||
| "movq %%r15,%0; leaq (%%r15,%%r12,8),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ | |||
| "cmpq $8,%5; jb 18122f;"\ | |||
| "18121:\n\t"\ | |||
| GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ | |||
| "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ | |||
| "18122:\n\t"\ | |||
| "testq %5,%5; jz 18124f;"\ | |||
| "18123:\n\t"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1; decq %5; jnz 18123b;"\ | |||
| "18124:\n\t" | |||
| #define GEMM_RN_m4n4 GEMM_RN_SIMPLE(4,4) | |||
| #define GEMM_RN_m4n8 GEMM_RN_SIMPLE(4,8) | |||
| #define GEMM_RN_m4n12 GEMM_RN_SIMPLE(4,12) | |||
| #define GEMM_RN_m2n4 GEMM_RN_SIMPLE(2,4) | |||
| #define GEMM_RN_m2n8 GEMM_RN_SIMPLE(2,8) | |||
| #define GEMM_RN_m2n12 GEMM_RN_SIMPLE(2,12) | |||
| #define GEMM_RN_m1n4 GEMM_RN_SIMPLE(1,4) | |||
| #define GEMM_RN_m1n8 GEMM_RN_SIMPLE(1,8) | |||
| #define GEMM_RN_m1n12 GEMM_RN_SIMPLE(1,12) | |||
| #define COMPUTE(ndim) {\ | |||
| __asm__ __volatile__(\ | |||
| "movq %0,%%r15; movq %1,%%r14; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %10,%%r11;"\ | |||
| "cmpq $8,%%r11; jb "#ndim"772f;"\ | |||
| #ndim"771:\n\t"\ | |||
| GEMM_RN_m8n##ndim SOLVE_RN_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ | |||
| #ndim"772:\n\t"\ | |||
| "testq $4,%%r11; jz "#ndim"773f;"\ | |||
| GEMM_RN_m4n##ndim SOLVE_RN_m4n##ndim "subq $4,%%r11;"\ | |||
| #ndim"773:\n\t"\ | |||
| "testq $2,%%r11; jz "#ndim"774f;"\ | |||
| GEMM_RN_m2n##ndim SOLVE_RN_m2n##ndim "subq $2,%%r11;"\ | |||
| #ndim"774:\n\t"\ | |||
| "testq $1,%%r11; jz "#ndim"775f;"\ | |||
| GEMM_RN_m1n##ndim SOLVE_RN_m1n##ndim "subq $1,%%r11;"\ | |||
| #ndim"775:\n\t"\ | |||
| "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ | |||
| :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ | |||
| :"r11","r12","r13","r14","r15","cc","memory",\ | |||
| "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ | |||
| a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ldc * ndim - M; OFF += ndim;\ | |||
| } | |||
| static void solve_RN(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| FLOAT a0, b0; | |||
| int i, j, k; | |||
| for (i=0; i<n; i++) { | |||
| b0 = b[i*n+i]; | |||
| for (j=0; j<m; j++) { | |||
| a0 = c[i*ldc+j] * b0; | |||
| a[i*m+j] = c[i*ldc+j] = a0; | |||
| for (k=i+1; k<n; k++) c[k*ldc+j] -= a0 * b[i*n+k]; | |||
| } | |||
| } | |||
| } | |||
| static void COMPUTE_EDGE_1_nchunk(BLASLONG m, BLASLONG n, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG k, BLASLONG offset) { | |||
| BLASLONG m_count = m, kk = offset; FLOAT *a_ptr = sa, *c_ptr = C; | |||
| for(;m_count>7;m_count-=8){ | |||
| if(kk>0) GEMM_KERNEL_N(8,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); | |||
| solve_RN(8,n,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); | |||
| a_ptr += k * 8; c_ptr += 8; | |||
| } | |||
| for(;m_count>3;m_count-=4){ | |||
| if(kk>0) GEMM_KERNEL_N(4,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); | |||
| solve_RN(4,n,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); | |||
| a_ptr += k * 4; c_ptr += 4; | |||
| } | |||
| for(;m_count>1;m_count-=2){ | |||
| if(kk>0) GEMM_KERNEL_N(2,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); | |||
| solve_RN(2,n,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); | |||
| a_ptr += k * 2; c_ptr += 2; | |||
| } | |||
| if(m_count>0){ | |||
| if(kk>0) GEMM_KERNEL_N(1,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); | |||
| solve_RN(1,n,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); | |||
| a_ptr += k * 1; c_ptr += 1; | |||
| } | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ | |||
| float *a_ptr = sa, *b_ptr = sb, *c_ptr = C, *c_tmp = C; | |||
| float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; | |||
| float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; | |||
| uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)-offset, k_cnt = 0; | |||
| BLASLONG n_count = n; | |||
| for(;n_count>11;n_count-=12) COMPUTE(12) | |||
| for(;n_count>7;n_count-=8) COMPUTE(8) | |||
| for(;n_count>3;n_count-=4) COMPUTE(4) | |||
| for(;n_count>1;n_count-=2) { COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); b_ptr += 2*k; c_ptr += ldc*2; OFF+=2;} | |||
| if(n_count>0) COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); | |||
| return 0; | |||
| } | |||
| #include "common.h" | |||
| #include <stdint.h> | |||
| #include "strsm_kernel_8x4_haswell_R_common.h" | |||
| #define SOLVE_RN_m8n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "movq %2,%3; addq $32,%2;"\ | |||
| SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1)\ | |||
| SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1)\ | |||
| SAVE_SOLUTION_m8n2(4,5,0)\ | |||
| SOLVE_leri_m8n2(40,6,7,%1)\ | |||
| SOLVE_ri_m8n2(56,6,7,%1)\ | |||
| SAVE_SOLUTION_m8n2(6,7,64) | |||
| #define SOLVE_RN_m8n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "movq %2,%3; addq $32,%2;"\ | |||
| SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4)\ | |||
| SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m8n2(4,5,0)\ | |||
| SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4)\ | |||
| SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m8n2(6,7,64)\ | |||
| SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4)\ | |||
| SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m8n2(8,9,128)\ | |||
| SOLVE_leri_m8n2(104,10,11,%1,%%r12,4)\ | |||
| SOLVE_ri_m8n2(120,10,11,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m8n2(10,11,192) | |||
| #define SOLVE_RN_m8n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "movq %2,%3; addq $32,%2;"\ | |||
| SOLVE_leri_m8n2(0,4,5,%1) SUBTRACT_m8n2(8,6,7,%1) SUBTRACT_m8n2(0,8,9,%1,%%r12,4) SUBTRACT_m8n2(8,10,11,%1,%%r12,4) SUBTRACT_m8n2(0,12,13,%1,%%r12,8) SUBTRACT_m8n2(8,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(16,4,5,%1) SUBTRACT_m8n2(24,6,7,%1) SUBTRACT_m8n2(16,8,9,%1,%%r12,4) SUBTRACT_m8n2(24,10,11,%1,%%r12,4) SUBTRACT_m8n2(16,12,13,%1,%%r12,8) SUBTRACT_m8n2(24,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(4,5,0)\ | |||
| SOLVE_leri_m8n2(40,6,7,%1) SUBTRACT_m8n2(32,8,9,%1,%%r12,4) SUBTRACT_m8n2(40,10,11,%1,%%r12,4) SUBTRACT_m8n2(32,12,13,%1,%%r12,8) SUBTRACT_m8n2(40,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(56,6,7,%1) SUBTRACT_m8n2(48,8,9,%1,%%r12,4) SUBTRACT_m8n2(56,10,11,%1,%%r12,4) SUBTRACT_m8n2(48,12,13,%1,%%r12,8) SUBTRACT_m8n2(56,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(6,7,64)\ | |||
| SOLVE_leri_m8n2(64,8,9,%1,%%r12,4) SUBTRACT_m8n2(72,10,11,%1,%%r12,4) SUBTRACT_m8n2(64,12,13,%1,%%r12,8) SUBTRACT_m8n2(72,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(80,8,9,%1,%%r12,4) SUBTRACT_m8n2(88,10,11,%1,%%r12,4) SUBTRACT_m8n2(80,12,13,%1,%%r12,8) SUBTRACT_m8n2(88,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(8,9,128)\ | |||
| SOLVE_leri_m8n2(104,10,11,%1,%%r12,4) SUBTRACT_m8n2(96,12,13,%1,%%r12,8) SUBTRACT_m8n2(104,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(120,10,11,%1,%%r12,4) SUBTRACT_m8n2(112,12,13,%1,%%r12,8) SUBTRACT_m8n2(120,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(10,11,192)\ | |||
| SOLVE_leri_m8n2(128,12,13,%1,%%r12,8) SUBTRACT_m8n2(136,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(144,12,13,%1,%%r12,8) SUBTRACT_m8n2(152,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(12,13,256)\ | |||
| SOLVE_leri_m8n2(168,14,15,%1,%%r12,8)\ | |||
| SOLVE_ri_m8n2(184,14,15,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m8n2(14,15,320) | |||
| #define SOLVE_RN_m4n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "movq %2,%3; addq $16,%2;"\ | |||
| SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1)\ | |||
| SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1)\ | |||
| SAVE_SOLUTION_m4n2(4,0)\ | |||
| SOLVE_leri_m4n2(40,5,%1)\ | |||
| SOLVE_ri_m4n2(56,5,%1)\ | |||
| SAVE_SOLUTION_m4n2(5,32) | |||
| #define SOLVE_RN_m4n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "movq %2,%3; addq $16,%2;"\ | |||
| SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4)\ | |||
| SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m4n2(4,0)\ | |||
| SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4)\ | |||
| SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m4n2(5,32)\ | |||
| SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4)\ | |||
| SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m4n2(6,64)\ | |||
| SOLVE_leri_m4n2(104,7,%1,%%r12,4)\ | |||
| SOLVE_ri_m4n2(120,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m4n2(7,96) | |||
| #define SOLVE_RN_m4n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "movq %2,%3; addq $16,%2;"\ | |||
| SOLVE_leri_m4n2(0,4,%1) SUBTRACT_m4n2(8,5,%1) SUBTRACT_m4n2(0,6,%1,%%r12,4) SUBTRACT_m4n2(8,7,%1,%%r12,4) SUBTRACT_m4n2(0,8,%1,%%r12,8) SUBTRACT_m4n2(8,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(16,4,%1) SUBTRACT_m4n2(24,5,%1) SUBTRACT_m4n2(16,6,%1,%%r12,4) SUBTRACT_m4n2(24,7,%1,%%r12,4) SUBTRACT_m4n2(16,8,%1,%%r12,8) SUBTRACT_m4n2(24,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(4,0)\ | |||
| SOLVE_leri_m4n2(40,5,%1) SUBTRACT_m4n2(32,6,%1,%%r12,4) SUBTRACT_m4n2(40,7,%1,%%r12,4) SUBTRACT_m4n2(32,8,%1,%%r12,8) SUBTRACT_m4n2(40,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(56,5,%1) SUBTRACT_m4n2(48,6,%1,%%r12,4) SUBTRACT_m4n2(56,7,%1,%%r12,4) SUBTRACT_m4n2(48,8,%1,%%r12,8) SUBTRACT_m4n2(56,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(5,32)\ | |||
| SOLVE_leri_m4n2(64,6,%1,%%r12,4) SUBTRACT_m4n2(72,7,%1,%%r12,4) SUBTRACT_m4n2(64,8,%1,%%r12,8) SUBTRACT_m4n2(72,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(80,6,%1,%%r12,4) SUBTRACT_m4n2(88,7,%1,%%r12,4) SUBTRACT_m4n2(80,8,%1,%%r12,8) SUBTRACT_m4n2(88,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(6,64)\ | |||
| SOLVE_leri_m4n2(104,7,%1,%%r12,4) SUBTRACT_m4n2(96,8,%1,%%r12,8) SUBTRACT_m4n2(104,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(120,7,%1,%%r12,4) SUBTRACT_m4n2(112,8,%1,%%r12,8) SUBTRACT_m4n2(120,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(7,96)\ | |||
| SOLVE_leri_m4n2(128,8,%1,%%r12,8) SUBTRACT_m4n2(136,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(144,8,%1,%%r12,8) SUBTRACT_m4n2(152,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(8,128)\ | |||
| SOLVE_leri_m4n2(168,9,%1,%%r12,8)\ | |||
| SOLVE_ri_m4n2(184,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m4n2(9,160) | |||
| #define SOLVE_RN_m2n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "movq %2,%3; addq $8,%2;"\ | |||
| SOLVE_col1_ltor_m2n4(0,4,5,%1)\ | |||
| SOLVE_col2_ltor_m2n4(16,4,5,%1)\ | |||
| SOLVE_col3_ltor_m2n4(32,4,5,%1)\ | |||
| SOLVE_col4_ltor_m2n4(48,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(4,5,0) | |||
| #define SOLVE_RN_m2n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "movq %2,%3; addq $8,%2;"\ | |||
| SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4)\ | |||
| SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4)\ | |||
| SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4)\ | |||
| SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m2n4(4,5,0)\ | |||
| SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4)\ | |||
| SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4)\ | |||
| SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4)\ | |||
| SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m2n4(6,7,32) | |||
| #define SOLVE_RN_m2n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "movq %2,%3; addq $8,%2;"\ | |||
| SOLVE_col1_ltor_m2n4(0,4,5,%1) SUBTRACT_m2n4(0,6,7,%1,%%r12,4) SUBTRACT_m2n4(0,8,9,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m2n4(16,4,5,%1) SUBTRACT_m2n4(16,6,7,%1,%%r12,4) SUBTRACT_m2n4(16,8,9,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m2n4(32,4,5,%1) SUBTRACT_m2n4(32,6,7,%1,%%r12,4) SUBTRACT_m2n4(32,8,9,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m2n4(48,4,5,%1) SUBTRACT_m2n4(48,6,7,%1,%%r12,4) SUBTRACT_m2n4(48,8,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m2n4(4,5,0)\ | |||
| SOLVE_col1_ltor_m2n4(64,6,7,%1,%%r12,4) SUBTRACT_m2n4(64,8,9,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m2n4(80,6,7,%1,%%r12,4) SUBTRACT_m2n4(80,8,9,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m2n4(96,6,7,%1,%%r12,4) SUBTRACT_m2n4(96,8,9,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m2n4(112,6,7,%1,%%r12,4) SUBTRACT_m2n4(112,8,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m2n4(6,7,32)\ | |||
| SOLVE_col1_ltor_m2n4(128,8,9,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m2n4(144,8,9,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m2n4(160,8,9,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m2n4(176,8,9,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m2n4(8,9,64) | |||
| #define SOLVE_RN_m1n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "movq %2,%3; addq $4,%2;"\ | |||
| SOLVE_col1_ltor_m1n4(0,4,%1)\ | |||
| SOLVE_col2_ltor_m1n4(16,4,%1)\ | |||
| SOLVE_col3_ltor_m1n4(32,4,%1)\ | |||
| SOLVE_col4_ltor_m1n4(48,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(4,0) | |||
| #define SOLVE_RN_m1n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "movq %2,%3; addq $4,%2;"\ | |||
| SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4)\ | |||
| SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4)\ | |||
| SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4)\ | |||
| SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m1n4(4,0)\ | |||
| SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4)\ | |||
| SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4)\ | |||
| SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4)\ | |||
| SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4)\ | |||
| SAVE_SOLUTION_m1n4(5,16) | |||
| #define SOLVE_RN_m1n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "movq %2,%3; addq $4,%2;"\ | |||
| SOLVE_col1_ltor_m1n4(0,4,%1) SUBTRACT_m1n4(0,5,%1,%%r12,4) SUBTRACT_m1n4(0,6,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m1n4(16,4,%1) SUBTRACT_m1n4(16,5,%1,%%r12,4) SUBTRACT_m1n4(16,6,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m1n4(32,4,%1) SUBTRACT_m1n4(32,5,%1,%%r12,4) SUBTRACT_m1n4(32,6,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m1n4(48,4,%1) SUBTRACT_m1n4(48,5,%1,%%r12,4) SUBTRACT_m1n4(48,6,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m1n4(4,0)\ | |||
| SOLVE_col1_ltor_m1n4(64,5,%1,%%r12,4) SUBTRACT_m1n4(64,6,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m1n4(80,5,%1,%%r12,4) SUBTRACT_m1n4(80,6,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m1n4(96,5,%1,%%r12,4) SUBTRACT_m1n4(96,6,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m1n4(112,5,%1,%%r12,4) SUBTRACT_m1n4(112,6,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m1n4(5,16)\ | |||
| SOLVE_col1_ltor_m1n4(128,6,%1,%%r12,8)\ | |||
| SOLVE_col2_ltor_m1n4(144,6,%1,%%r12,8)\ | |||
| SOLVE_col3_ltor_m1n4(160,6,%1,%%r12,8)\ | |||
| SOLVE_col4_ltor_m1n4(176,6,%1,%%r12,8)\ | |||
| SAVE_SOLUTION_m1n4(6,32) | |||
| #define GEMM_RN_SIMPLE(mdim,ndim) \ | |||
| "movq %%r15,%0; leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ | |||
| "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ | |||
| "1"#mdim""#ndim"1:\n\t"\ | |||
| GEMM_KERNEL_k1m##mdim##n##ndim "addq $16,%1; addq $"#mdim"*4,%0; decq %5; jnz 1"#mdim""#ndim"1b;"\ | |||
| "1"#mdim""#ndim"2:\n\t" | |||
| #define GEMM_RN_m8n4 GEMM_RN_SIMPLE(8,4) | |||
| #define GEMM_RN_m8n8 GEMM_RN_SIMPLE(8,8) | |||
| #define GEMM_RN_m8n12 \ | |||
| "movq %%r15,%0; leaq (%%r15,%%r12,8),%%r15; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ | |||
| "cmpq $8,%5; jb 18122f;"\ | |||
| "18121:\n\t"\ | |||
| GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "prefetcht0 384(%0); addq $32,%0; addq $16,%1;"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1;"\ | |||
| "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ | |||
| "18122:\n\t"\ | |||
| "testq %5,%5; jz 18124f;"\ | |||
| "18123:\n\t"\ | |||
| GEMM_KERNEL_k1m8n12 "addq $32,%0; addq $16,%1; decq %5; jnz 18123b;"\ | |||
| "18124:\n\t" | |||
| #define GEMM_RN_m4n4 GEMM_RN_SIMPLE(4,4) | |||
| #define GEMM_RN_m4n8 GEMM_RN_SIMPLE(4,8) | |||
| #define GEMM_RN_m4n12 GEMM_RN_SIMPLE(4,12) | |||
| #define GEMM_RN_m2n4 GEMM_RN_SIMPLE(2,4) | |||
| #define GEMM_RN_m2n8 GEMM_RN_SIMPLE(2,8) | |||
| #define GEMM_RN_m2n12 GEMM_RN_SIMPLE(2,12) | |||
| #define GEMM_RN_m1n4 GEMM_RN_SIMPLE(1,4) | |||
| #define GEMM_RN_m1n8 GEMM_RN_SIMPLE(1,8) | |||
| #define GEMM_RN_m1n12 GEMM_RN_SIMPLE(1,12) | |||
| #define COMPUTE(ndim) {\ | |||
| __asm__ __volatile__(\ | |||
| "movq %0,%%r15; movq %1,%%r14; movq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %10,%%r11;"\ | |||
| "cmpq $8,%%r11; jb "#ndim"772f;"\ | |||
| #ndim"771:\n\t"\ | |||
| GEMM_RN_m8n##ndim SOLVE_RN_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ | |||
| #ndim"772:\n\t"\ | |||
| "testq $4,%%r11; jz "#ndim"773f;"\ | |||
| GEMM_RN_m4n##ndim SOLVE_RN_m4n##ndim "subq $4,%%r11;"\ | |||
| #ndim"773:\n\t"\ | |||
| "testq $2,%%r11; jz "#ndim"774f;"\ | |||
| GEMM_RN_m2n##ndim SOLVE_RN_m2n##ndim "subq $2,%%r11;"\ | |||
| #ndim"774:\n\t"\ | |||
| "testq $1,%%r11; jz "#ndim"775f;"\ | |||
| GEMM_RN_m1n##ndim SOLVE_RN_m1n##ndim "subq $1,%%r11;"\ | |||
| #ndim"775:\n\t"\ | |||
| "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ | |||
| :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ | |||
| :"r11","r12","r13","r14","r15","cc","memory",\ | |||
| "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ | |||
| a_ptr -= M * K; b_ptr += ndim * K; c_ptr += ldc * ndim - M; OFF += ndim;\ | |||
| } | |||
| static void solve_RN(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { | |||
| FLOAT a0, b0; | |||
| int i, j, k; | |||
| for (i=0; i<n; i++) { | |||
| b0 = b[i*n+i]; | |||
| for (j=0; j<m; j++) { | |||
| a0 = c[i*ldc+j] * b0; | |||
| a[i*m+j] = c[i*ldc+j] = a0; | |||
| for (k=i+1; k<n; k++) c[k*ldc+j] -= a0 * b[i*n+k]; | |||
| } | |||
| } | |||
| } | |||
| static void COMPUTE_EDGE_1_nchunk(BLASLONG m, BLASLONG n, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG k, BLASLONG offset) { | |||
| BLASLONG m_count = m, kk = offset; FLOAT *a_ptr = sa, *c_ptr = C; | |||
| for(;m_count>7;m_count-=8){ | |||
| if(kk>0) GEMM_KERNEL_N(8,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); | |||
| solve_RN(8,n,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); | |||
| a_ptr += k * 8; c_ptr += 8; | |||
| } | |||
| for(;m_count>3;m_count-=4){ | |||
| if(kk>0) GEMM_KERNEL_N(4,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); | |||
| solve_RN(4,n,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); | |||
| a_ptr += k * 4; c_ptr += 4; | |||
| } | |||
| for(;m_count>1;m_count-=2){ | |||
| if(kk>0) GEMM_KERNEL_N(2,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); | |||
| solve_RN(2,n,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); | |||
| a_ptr += k * 2; c_ptr += 2; | |||
| } | |||
| if(m_count>0){ | |||
| if(kk>0) GEMM_KERNEL_N(1,n,kk,-1.0,a_ptr,sb,c_ptr,ldc); | |||
| solve_RN(1,n,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); | |||
| a_ptr += k * 1; c_ptr += 1; | |||
| } | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ | |||
| float *a_ptr = sa, *b_ptr = sb, *c_ptr = C, *c_tmp = C; | |||
| float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; | |||
| float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; | |||
| uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)-offset, k_cnt = 0; | |||
| BLASLONG n_count = n; | |||
| for(;n_count>11;n_count-=12) COMPUTE(12) | |||
| for(;n_count>7;n_count-=8) COMPUTE(8) | |||
| for(;n_count>3;n_count-=4) COMPUTE(4) | |||
| for(;n_count>1;n_count-=2) { COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); b_ptr += 2*k; c_ptr += ldc*2; OFF+=2;} | |||
| if(n_count>0) COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); | |||
| return 0; | |||
| } | |||
| @@ -1,281 +1,281 @@ | |||
| #include "common.h" | |||
| #include <stdint.h> | |||
| #include "strsm_kernel_8x4_haswell_R_common.h" | |||
| #define SOLVE_RT_m8n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ | |||
| SOLVE_rile_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ | |||
| SOLVE_le_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(6,7,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-48,4,5,%1)\ | |||
| SOLVE_le_m8n2(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(4,5,-128) | |||
| #define SOLVE_RT_m8n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ | |||
| SOLVE_rile_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ | |||
| SOLVE_le_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(10,11,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ | |||
| SOLVE_le_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(8,9,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ | |||
| SOLVE_le_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(6,7,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-112,4,5,%1)\ | |||
| SOLVE_le_m8n2(-128,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(4,5,-256) | |||
| #define SOLVE_RT_m8n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ | |||
| SOLVE_rile_m8n2(-8,14,15,%1,%%r12,8) SUBTRACT_m8n2(-16,12,13,%1,%%r12,8) SUBTRACT_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ | |||
| SOLVE_le_m8n2(-24,14,15,%1,%%r12,8) SUBTRACT_m8n2(-32,12,13,%1,%%r12,8) SUBTRACT_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(14,15,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-48,12,13,%1,%%r12,8) SUBTRACT_m8n2(-40,10,11,%1,%%r12,4) SUBTRACT_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ | |||
| SOLVE_le_m8n2(-64,12,13,%1,%%r12,8) SUBTRACT_m8n2(-56,10,11,%1,%%r12,4) SUBTRACT_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(12,13,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-72,10,11,%1,%%r12,4) SUBTRACT_m8n2(-80,8,9,%1,%%r12,4) SUBTRACT_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ | |||
| SOLVE_le_m8n2(-88,10,11,%1,%%r12,4) SUBTRACT_m8n2(-96,8,9,%1,%%r12,4) SUBTRACT_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(10,11,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-112,8,9,%1,%%r12,4) SUBTRACT_m8n2(-104,6,7,%1) SUBTRACT_m8n2(-112,4,5,%1)\ | |||
| SOLVE_le_m8n2(-128,8,9,%1,%%r12,4) SUBTRACT_m8n2(-120,6,7,%1) SUBTRACT_m8n2(-128,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(8,9,-256) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-136,6,7,%1) SUBTRACT_m8n2(-144,4,5,%1)\ | |||
| SOLVE_le_m8n2(-152,6,7,%1) SUBTRACT_m8n2(-160,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(6,7,-320) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-176,4,5,%1)\ | |||
| SOLVE_le_m8n2(-192,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(4,5,-384) | |||
| #define SOLVE_RT_m4n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ | |||
| SOLVE_rile_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ | |||
| SOLVE_le_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(5,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-48,4,%1)\ | |||
| SOLVE_le_m4n2(-64,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(4,-64) | |||
| #define SOLVE_RT_m4n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ | |||
| SOLVE_rile_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ | |||
| SOLVE_le_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(7,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ | |||
| SOLVE_le_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(6,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ | |||
| SOLVE_le_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(5,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-112,4,%1)\ | |||
| SOLVE_le_m4n2(-128,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(4,-128) | |||
| #define SOLVE_RT_m4n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ | |||
| SOLVE_rile_m4n2(-8,9,%1,%%r12,8) SUBTRACT_m4n2(-16,8,%1,%%r12,8) SUBTRACT_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ | |||
| SOLVE_le_m4n2(-24,9,%1,%%r12,8) SUBTRACT_m4n2(-32,8,%1,%%r12,8) SUBTRACT_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(9,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-48,8,%1,%%r12,8) SUBTRACT_m4n2(-40,7,%1,%%r12,4) SUBTRACT_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ | |||
| SOLVE_le_m4n2(-64,8,%1,%%r12,8) SUBTRACT_m4n2(-56,7,%1,%%r12,4) SUBTRACT_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(8,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-72,7,%1,%%r12,4) SUBTRACT_m4n2(-80,6,%1,%%r12,4) SUBTRACT_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ | |||
| SOLVE_le_m4n2(-88,7,%1,%%r12,4) SUBTRACT_m4n2(-96,6,%1,%%r12,4) SUBTRACT_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(7,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-112,6,%1,%%r12,4) SUBTRACT_m4n2(-104,5,%1) SUBTRACT_m4n2(-112,4,%1)\ | |||
| SOLVE_le_m4n2(-128,6,%1,%%r12,4) SUBTRACT_m4n2(-120,5,%1) SUBTRACT_m4n2(-128,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(6,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-136,5,%1) SUBTRACT_m4n2(-144,4,%1)\ | |||
| SOLVE_le_m4n2(-152,5,%1) SUBTRACT_m4n2(-160,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(5,-160) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-176,4,%1)\ | |||
| SOLVE_le_m4n2(-192,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(4,-192) | |||
| #define SOLVE_RT_m2n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ | |||
| SOLVE_col4_rtol_m2n4(-16,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-32,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-48,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(4,5,-32) | |||
| #define SOLVE_RT_m2n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ | |||
| SOLVE_col4_rtol_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(6,7,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m2n4(-80,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-96,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-112,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-128,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(4,5,-64) | |||
| #define SOLVE_RT_m2n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ | |||
| SOLVE_col4_rtol_m2n4(-16,8,9,%1,%%r12,8) SUBTRACT_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-32,8,9,%1,%%r12,8) SUBTRACT_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-48,8,9,%1,%%r12,8) SUBTRACT_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-64,8,9,%1,%%r12,8) SUBTRACT_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(8,9,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m2n4(-80,6,7,%1,%%r12,4) SUBTRACT_m2n4(-80,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-96,6,7,%1,%%r12,4) SUBTRACT_m2n4(-96,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-112,6,7,%1,%%r12,4) SUBTRACT_m2n4(-112,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-128,6,7,%1,%%r12,4) SUBTRACT_m2n4(-128,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(6,7,-64) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m2n4(-144,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-160,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-176,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-192,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(4,5,-96) | |||
| #define SOLVE_RT_m1n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ | |||
| SOLVE_col4_rtol_m1n4(-16,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-32,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-48,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-64,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(4,-16) | |||
| #define SOLVE_RT_m1n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ | |||
| SOLVE_col4_rtol_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(5,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m1n4(-80,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-96,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-112,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-128,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(4,-32) | |||
| #define SOLVE_RT_m1n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ | |||
| SOLVE_col4_rtol_m1n4(-16,6,%1,%%r12,8) SUBTRACT_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-32,6,%1,%%r12,8) SUBTRACT_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-48,6,%1,%%r12,8) SUBTRACT_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-64,6,%1,%%r12,8) SUBTRACT_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(6,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m1n4(-80,5,%1,%%r12,4) SUBTRACT_m1n4(-80,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-96,5,%1,%%r12,4) SUBTRACT_m1n4(-96,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-112,5,%1,%%r12,4) SUBTRACT_m1n4(-112,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-128,5,%1,%%r12,4) SUBTRACT_m1n4(-128,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(5,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m1n4(-144,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-160,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-176,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-192,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(4,-48) | |||
| /* r14 = b_tail, r15 = a_tail, r13 = k-kk */ | |||
| #define GEMM_RT_SIMPLE(mdim,ndim) \ | |||
| "leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ | |||
| "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ | |||
| "1"#mdim""#ndim"1:\n\t"\ | |||
| "subq $16,%1; subq $"#mdim"*4,%0;" GEMM_KERNEL_k1m##mdim##n##ndim "decq %5; jnz 1"#mdim""#ndim"1b;"\ | |||
| "1"#mdim""#ndim"2:\n\t" | |||
| #define GEMM_RT_m8n4 GEMM_RT_SIMPLE(8,4) | |||
| #define GEMM_RT_m8n8 GEMM_RT_SIMPLE(8,8) | |||
| #define GEMM_RT_m8n12 \ | |||
| "leaq (%%r15,%%r12,8),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ | |||
| "cmpq $8,%5; jb 18122f;"\ | |||
| "18121:\n\t"\ | |||
| "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ | |||
| "18122:\n\t"\ | |||
| "testq %5,%5; jz 18124f;"\ | |||
| "18123:\n\t"\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 "decq %5; jnz 18123b;"\ | |||
| "18124:\n\t" | |||
| #define GEMM_RT_m4n4 GEMM_RT_SIMPLE(4,4) | |||
| #define GEMM_RT_m4n8 GEMM_RT_SIMPLE(4,8) | |||
| #define GEMM_RT_m4n12 GEMM_RT_SIMPLE(4,12) | |||
| #define GEMM_RT_m2n4 GEMM_RT_SIMPLE(2,4) | |||
| #define GEMM_RT_m2n8 GEMM_RT_SIMPLE(2,8) | |||
| #define GEMM_RT_m2n12 GEMM_RT_SIMPLE(2,12) | |||
| #define GEMM_RT_m1n4 GEMM_RT_SIMPLE(1,4) | |||
| #define GEMM_RT_m1n8 GEMM_RT_SIMPLE(1,8) | |||
| #define GEMM_RT_m1n12 GEMM_RT_SIMPLE(1,12) | |||
| #define COMPUTE(ndim) {\ | |||
| b_ptr -= (ndim-4)*K; c_ptr -= ndim * ldc;\ | |||
| __asm__ __volatile__(\ | |||
| "movq %0,%%r15; movq %6,%%r13; subq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %1,%%r14; movq %10,%%r11;"\ | |||
| "cmpq $8,%%r11; jb "#ndim"772f;"\ | |||
| #ndim"771:\n\t"\ | |||
| GEMM_RT_m8n##ndim SOLVE_RT_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ | |||
| #ndim"772:\n\t"\ | |||
| "testq $4,%%r11; jz "#ndim"773f;"\ | |||
| GEMM_RT_m4n##ndim SOLVE_RT_m4n##ndim "subq $4,%%r11;"\ | |||
| #ndim"773:\n\t"\ | |||
| "testq $2,%%r11; jz "#ndim"774f;"\ | |||
| GEMM_RT_m2n##ndim SOLVE_RT_m2n##ndim "subq $2,%%r11;"\ | |||
| #ndim"774:\n\t"\ | |||
| "testq $1,%%r11; jz "#ndim"775f;"\ | |||
| GEMM_RT_m1n##ndim SOLVE_RT_m1n##ndim "subq $1,%%r11;"\ | |||
| #ndim"775:\n\t"\ | |||
| "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ | |||
| :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ | |||
| :"r11","r12","r13","r14","r15","cc","memory",\ | |||
| "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ | |||
| a_ptr -= M * K; b_ptr -= 4 * K; c_ptr -= M; OFF -= ndim;\ | |||
| } | |||
| static void solve_RT(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc){ | |||
| FLOAT a0, b0; | |||
| int i, j, k; | |||
| for (i=n-1;i>=0;i--) { | |||
| b0 = b[i*n+i]; | |||
| for (j=0;j<m;j++) { | |||
| a0 = c[i*ldc+j] * b0; | |||
| a[i*m+j] = c[i*ldc+j] = a0; | |||
| for (k=0;k<i;k++) c[k*ldc+j] -= a0 * b[i*n+k]; | |||
| } | |||
| } | |||
| } | |||
| static void COMPUTE_EDGE_1_nchunk(BLASLONG m, BLASLONG n, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG k, BLASLONG offset) { | |||
| BLASLONG m_count = m, kk = offset; FLOAT *a_ptr = sa, *c_ptr = C; | |||
| for(;m_count>7;m_count-=8){ | |||
| if(k-kk>0) GEMM_KERNEL_N(8,n,k-kk,-1.0,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); | |||
| solve_RT(8,n,a_ptr+(kk-n)*8,sb+(kk-n)*n,c_ptr,ldc); | |||
| a_ptr += k * 8; c_ptr += 8; | |||
| } | |||
| for(;m_count>3;m_count-=4){ | |||
| if(k-kk>0) GEMM_KERNEL_N(4,n,k-kk,-1.0,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); | |||
| solve_RT(4,n,a_ptr+(kk-n)*4,sb+(kk-n)*n,c_ptr,ldc); | |||
| a_ptr += k * 4; c_ptr += 4; | |||
| } | |||
| for(;m_count>1;m_count-=2){ | |||
| if(k-kk>0) GEMM_KERNEL_N(2,n,k-kk,-1.0,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); | |||
| solve_RT(2,n,a_ptr+(kk-n)*2,sb+(kk-n)*n,c_ptr,ldc); | |||
| a_ptr += k * 2; c_ptr += 2; | |||
| } | |||
| if(m_count>0){ | |||
| if(k-kk>0) GEMM_KERNEL_N(1,n,k-kk,-1.0,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); | |||
| solve_RT(1,n,a_ptr+(kk-n)*1,sb+(kk-n)*n,c_ptr,ldc); | |||
| a_ptr += k * 1; c_ptr += 1; | |||
| } | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ | |||
| float *a_ptr = sa, *b_ptr = sb+n*k, *c_ptr = C+n*ldc, *c_tmp = C; | |||
| float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; | |||
| float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; | |||
| uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)(n-offset), k_cnt = 0; | |||
| BLASLONG n_count = n; | |||
| if(n&1){b_ptr-=k; c_ptr-=ldc; COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF--; n_count--;} | |||
| if(n&2){b_ptr-=k*2; c_ptr-=ldc*2; COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF-=2; n_count-=2;} | |||
| for(;n_count>11;n_count-=12) COMPUTE(12) | |||
| for(;n_count>7;n_count-=8) COMPUTE(8) | |||
| for(;n_count>3;n_count-=4) COMPUTE(4) | |||
| return 0; | |||
| } | |||
| #include "common.h" | |||
| #include <stdint.h> | |||
| #include "strsm_kernel_8x4_haswell_R_common.h" | |||
| #define SOLVE_RT_m8n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ | |||
| SOLVE_rile_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ | |||
| SOLVE_le_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(6,7,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-48,4,5,%1)\ | |||
| SOLVE_le_m8n2(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(4,5,-128) | |||
| #define SOLVE_RT_m8n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ | |||
| SOLVE_rile_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ | |||
| SOLVE_le_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(10,11,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ | |||
| SOLVE_le_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(8,9,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ | |||
| SOLVE_le_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(6,7,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-112,4,5,%1)\ | |||
| SOLVE_le_m8n2(-128,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(4,5,-256) | |||
| #define SOLVE_RT_m8n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_8x4(4,5,6,7,63) GEMM_SUM_REORDER_8x4(8,9,10,11,63) GEMM_SUM_REORDER_8x4(12,13,14,15,63) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $32,%2;"\ | |||
| SOLVE_rile_m8n2(-8,14,15,%1,%%r12,8) SUBTRACT_m8n2(-16,12,13,%1,%%r12,8) SUBTRACT_m8n2(-8,10,11,%1,%%r12,4) SUBTRACT_m8n2(-16,8,9,%1,%%r12,4) SUBTRACT_m8n2(-8,6,7,%1) SUBTRACT_m8n2(-16,4,5,%1)\ | |||
| SOLVE_le_m8n2(-24,14,15,%1,%%r12,8) SUBTRACT_m8n2(-32,12,13,%1,%%r12,8) SUBTRACT_m8n2(-24,10,11,%1,%%r12,4) SUBTRACT_m8n2(-32,8,9,%1,%%r12,4) SUBTRACT_m8n2(-24,6,7,%1) SUBTRACT_m8n2(-32,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(14,15,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-48,12,13,%1,%%r12,8) SUBTRACT_m8n2(-40,10,11,%1,%%r12,4) SUBTRACT_m8n2(-48,8,9,%1,%%r12,4) SUBTRACT_m8n2(-40,6,7,%1) SUBTRACT_m8n2(-48,4,5,%1)\ | |||
| SOLVE_le_m8n2(-64,12,13,%1,%%r12,8) SUBTRACT_m8n2(-56,10,11,%1,%%r12,4) SUBTRACT_m8n2(-64,8,9,%1,%%r12,4) SUBTRACT_m8n2(-56,6,7,%1) SUBTRACT_m8n2(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(12,13,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-72,10,11,%1,%%r12,4) SUBTRACT_m8n2(-80,8,9,%1,%%r12,4) SUBTRACT_m8n2(-72,6,7,%1) SUBTRACT_m8n2(-80,4,5,%1)\ | |||
| SOLVE_le_m8n2(-88,10,11,%1,%%r12,4) SUBTRACT_m8n2(-96,8,9,%1,%%r12,4) SUBTRACT_m8n2(-88,6,7,%1) SUBTRACT_m8n2(-96,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(10,11,-192) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-112,8,9,%1,%%r12,4) SUBTRACT_m8n2(-104,6,7,%1) SUBTRACT_m8n2(-112,4,5,%1)\ | |||
| SOLVE_le_m8n2(-128,8,9,%1,%%r12,4) SUBTRACT_m8n2(-120,6,7,%1) SUBTRACT_m8n2(-128,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(8,9,-256) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-136,6,7,%1) SUBTRACT_m8n2(-144,4,5,%1)\ | |||
| SOLVE_le_m8n2(-152,6,7,%1) SUBTRACT_m8n2(-160,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(6,7,-320) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m8n2(-176,4,5,%1)\ | |||
| SOLVE_le_m8n2(-192,4,5,%1)\ | |||
| SAVE_SOLUTION_m8n2(4,5,-384) | |||
| #define SOLVE_RT_m4n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ | |||
| SOLVE_rile_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ | |||
| SOLVE_le_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(5,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-48,4,%1)\ | |||
| SOLVE_le_m4n2(-64,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(4,-64) | |||
| #define SOLVE_RT_m4n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ | |||
| SOLVE_rile_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ | |||
| SOLVE_le_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(7,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ | |||
| SOLVE_le_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(6,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ | |||
| SOLVE_le_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(5,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-112,4,%1)\ | |||
| SOLVE_le_m4n2(-128,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(4,-128) | |||
| #define SOLVE_RT_m4n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_4x4(4,5,6,7,4,5) GEMM_SUM_REORDER_4x4(8,9,10,11,6,7) GEMM_SUM_REORDER_4x4(12,13,14,15,8,9) "negq %4; leaq (%3,%4,2),%3; negq %4; addq $16,%2;"\ | |||
| SOLVE_rile_m4n2(-8,9,%1,%%r12,8) SUBTRACT_m4n2(-16,8,%1,%%r12,8) SUBTRACT_m4n2(-8,7,%1,%%r12,4) SUBTRACT_m4n2(-16,6,%1,%%r12,4) SUBTRACT_m4n2(-8,5,%1) SUBTRACT_m4n2(-16,4,%1)\ | |||
| SOLVE_le_m4n2(-24,9,%1,%%r12,8) SUBTRACT_m4n2(-32,8,%1,%%r12,8) SUBTRACT_m4n2(-24,7,%1,%%r12,4) SUBTRACT_m4n2(-32,6,%1,%%r12,4) SUBTRACT_m4n2(-24,5,%1) SUBTRACT_m4n2(-32,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(9,-32) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-48,8,%1,%%r12,8) SUBTRACT_m4n2(-40,7,%1,%%r12,4) SUBTRACT_m4n2(-48,6,%1,%%r12,4) SUBTRACT_m4n2(-40,5,%1) SUBTRACT_m4n2(-48,4,%1)\ | |||
| SOLVE_le_m4n2(-64,8,%1,%%r12,8) SUBTRACT_m4n2(-56,7,%1,%%r12,4) SUBTRACT_m4n2(-64,6,%1,%%r12,4) SUBTRACT_m4n2(-56,5,%1) SUBTRACT_m4n2(-64,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(8,-64) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-72,7,%1,%%r12,4) SUBTRACT_m4n2(-80,6,%1,%%r12,4) SUBTRACT_m4n2(-72,5,%1) SUBTRACT_m4n2(-80,4,%1)\ | |||
| SOLVE_le_m4n2(-88,7,%1,%%r12,4) SUBTRACT_m4n2(-96,6,%1,%%r12,4) SUBTRACT_m4n2(-88,5,%1) SUBTRACT_m4n2(-96,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(7,-96) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-112,6,%1,%%r12,4) SUBTRACT_m4n2(-104,5,%1) SUBTRACT_m4n2(-112,4,%1)\ | |||
| SOLVE_le_m4n2(-128,6,%1,%%r12,4) SUBTRACT_m4n2(-120,5,%1) SUBTRACT_m4n2(-128,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(6,-128) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-136,5,%1) SUBTRACT_m4n2(-144,4,%1)\ | |||
| SOLVE_le_m4n2(-152,5,%1) SUBTRACT_m4n2(-160,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(5,-160) "negq %4; leaq (%3,%4,4),%3; negq %4;"\ | |||
| SOLVE_rile_m4n2(-176,4,%1)\ | |||
| SOLVE_le_m4n2(-192,4,%1)\ | |||
| SAVE_SOLUTION_m4n2(4,-192) | |||
| #define SOLVE_RT_m2n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ | |||
| SOLVE_col4_rtol_m2n4(-16,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-32,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-48,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(4,5,-32) | |||
| #define SOLVE_RT_m2n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ | |||
| SOLVE_col4_rtol_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(6,7,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m2n4(-80,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-96,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-112,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-128,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(4,5,-64) | |||
| #define SOLVE_RT_m2n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_2x4(4,5) GEMM_SUM_REORDER_2x4(6,7) GEMM_SUM_REORDER_2x4(8,9) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $8,%2;"\ | |||
| SOLVE_col4_rtol_m2n4(-16,8,9,%1,%%r12,8) SUBTRACT_m2n4(-16,6,7,%1,%%r12,4) SUBTRACT_m2n4(-16,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-32,8,9,%1,%%r12,8) SUBTRACT_m2n4(-32,6,7,%1,%%r12,4) SUBTRACT_m2n4(-32,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-48,8,9,%1,%%r12,8) SUBTRACT_m2n4(-48,6,7,%1,%%r12,4) SUBTRACT_m2n4(-48,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-64,8,9,%1,%%r12,8) SUBTRACT_m2n4(-64,6,7,%1,%%r12,4) SUBTRACT_m2n4(-64,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(8,9,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m2n4(-80,6,7,%1,%%r12,4) SUBTRACT_m2n4(-80,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-96,6,7,%1,%%r12,4) SUBTRACT_m2n4(-96,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-112,6,7,%1,%%r12,4) SUBTRACT_m2n4(-112,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-128,6,7,%1,%%r12,4) SUBTRACT_m2n4(-128,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(6,7,-64) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m2n4(-144,4,5,%1)\ | |||
| SOLVE_col3_rtol_m2n4(-160,4,5,%1)\ | |||
| SOLVE_col2_rtol_m2n4(-176,4,5,%1)\ | |||
| SOLVE_col1_rtol_m2n4(-192,4,5,%1)\ | |||
| SAVE_SOLUTION_m2n4(4,5,-96) | |||
| #define SOLVE_RT_m1n4 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ | |||
| SOLVE_col4_rtol_m1n4(-16,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-32,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-48,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-64,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(4,-16) | |||
| #define SOLVE_RT_m1n8 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ | |||
| SOLVE_col4_rtol_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(5,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m1n4(-80,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-96,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-112,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-128,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(4,-32) | |||
| #define SOLVE_RT_m1n12 \ | |||
| "movq %2,%3;" GEMM_SUM_REORDER_1x4(4) GEMM_SUM_REORDER_1x4(5) GEMM_SUM_REORDER_1x4(6) "negq %4; leaq (%3,%4,4),%3; negq %4; addq $4,%2;"\ | |||
| SOLVE_col4_rtol_m1n4(-16,6,%1,%%r12,8) SUBTRACT_m1n4(-16,5,%1,%%r12,4) SUBTRACT_m1n4(-16,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-32,6,%1,%%r12,8) SUBTRACT_m1n4(-32,5,%1,%%r12,4) SUBTRACT_m1n4(-32,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-48,6,%1,%%r12,8) SUBTRACT_m1n4(-48,5,%1,%%r12,4) SUBTRACT_m1n4(-48,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-64,6,%1,%%r12,8) SUBTRACT_m1n4(-64,5,%1,%%r12,4) SUBTRACT_m1n4(-64,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(6,-16) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m1n4(-80,5,%1,%%r12,4) SUBTRACT_m1n4(-80,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-96,5,%1,%%r12,4) SUBTRACT_m1n4(-96,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-112,5,%1,%%r12,4) SUBTRACT_m1n4(-112,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-128,5,%1,%%r12,4) SUBTRACT_m1n4(-128,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(5,-32) "negq %4; leaq (%3,%4,8),%3; negq %4;"\ | |||
| SOLVE_col4_rtol_m1n4(-144,4,%1)\ | |||
| SOLVE_col3_rtol_m1n4(-160,4,%1)\ | |||
| SOLVE_col2_rtol_m1n4(-176,4,%1)\ | |||
| SOLVE_col1_rtol_m1n4(-192,4,%1)\ | |||
| SAVE_SOLUTION_m1n4(4,-48) | |||
| /* r14 = b_tail, r15 = a_tail, r13 = k-kk */ | |||
| #define GEMM_RT_SIMPLE(mdim,ndim) \ | |||
| "leaq (%%r15,%%r12,"#mdim"),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m##mdim##n##ndim\ | |||
| "testq %5,%5; jz 1"#mdim""#ndim"2f;"\ | |||
| "1"#mdim""#ndim"1:\n\t"\ | |||
| "subq $16,%1; subq $"#mdim"*4,%0;" GEMM_KERNEL_k1m##mdim##n##ndim "decq %5; jnz 1"#mdim""#ndim"1b;"\ | |||
| "1"#mdim""#ndim"2:\n\t" | |||
| #define GEMM_RT_m8n4 GEMM_RT_SIMPLE(8,4) | |||
| #define GEMM_RT_m8n8 GEMM_RT_SIMPLE(8,8) | |||
| #define GEMM_RT_m8n12 \ | |||
| "leaq (%%r15,%%r12,8),%%r15; movq %%r15,%0; movq %%r13,%5; movq %%r14,%1;" INIT_m8n12\ | |||
| "cmpq $8,%5; jb 18122f;"\ | |||
| "18121:\n\t"\ | |||
| "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "prefetcht0 -384(%0); subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12\ | |||
| "subq $8,%5; cmpq $8,%5; jnb 18121b;"\ | |||
| "18122:\n\t"\ | |||
| "testq %5,%5; jz 18124f;"\ | |||
| "18123:\n\t"\ | |||
| "subq $32,%0; subq $16,%1;" GEMM_KERNEL_k1m8n12 "decq %5; jnz 18123b;"\ | |||
| "18124:\n\t" | |||
| #define GEMM_RT_m4n4 GEMM_RT_SIMPLE(4,4) | |||
| #define GEMM_RT_m4n8 GEMM_RT_SIMPLE(4,8) | |||
| #define GEMM_RT_m4n12 GEMM_RT_SIMPLE(4,12) | |||
| #define GEMM_RT_m2n4 GEMM_RT_SIMPLE(2,4) | |||
| #define GEMM_RT_m2n8 GEMM_RT_SIMPLE(2,8) | |||
| #define GEMM_RT_m2n12 GEMM_RT_SIMPLE(2,12) | |||
| #define GEMM_RT_m1n4 GEMM_RT_SIMPLE(1,4) | |||
| #define GEMM_RT_m1n8 GEMM_RT_SIMPLE(1,8) | |||
| #define GEMM_RT_m1n12 GEMM_RT_SIMPLE(1,12) | |||
| #define COMPUTE(ndim) {\ | |||
| b_ptr -= (ndim-4)*K; c_ptr -= ndim * ldc;\ | |||
| __asm__ __volatile__(\ | |||
| "movq %0,%%r15; movq %6,%%r13; subq %7,%%r13; movq %6,%%r12; salq $2,%%r12; movq %1,%%r14; movq %10,%%r11;"\ | |||
| "cmpq $8,%%r11; jb "#ndim"772f;"\ | |||
| #ndim"771:\n\t"\ | |||
| GEMM_RT_m8n##ndim SOLVE_RT_m8n##ndim "subq $8,%%r11; cmpq $8,%%r11; jnb "#ndim"771b;"\ | |||
| #ndim"772:\n\t"\ | |||
| "testq $4,%%r11; jz "#ndim"773f;"\ | |||
| GEMM_RT_m4n##ndim SOLVE_RT_m4n##ndim "subq $4,%%r11;"\ | |||
| #ndim"773:\n\t"\ | |||
| "testq $2,%%r11; jz "#ndim"774f;"\ | |||
| GEMM_RT_m2n##ndim SOLVE_RT_m2n##ndim "subq $2,%%r11;"\ | |||
| #ndim"774:\n\t"\ | |||
| "testq $1,%%r11; jz "#ndim"775f;"\ | |||
| GEMM_RT_m1n##ndim SOLVE_RT_m1n##ndim "subq $1,%%r11;"\ | |||
| #ndim"775:\n\t"\ | |||
| "movq %%r15,%0; movq %%r14,%1; vzeroupper;"\ | |||
| :"+r"(a_ptr),"+r"(b_ptr),"+r"(c_ptr),"+r"(c_tmp),"+r"(ldc_bytes),"+r"(k_cnt):"m"(K),"m"(OFF),"m"(one[0]),"m"(zero[0]),"m"(M)\ | |||
| :"r11","r12","r13","r14","r15","cc","memory",\ | |||
| "xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","xmm8","xmm9","xmm10","xmm11","xmm12","xmm13","xmm14","xmm15");\ | |||
| a_ptr -= M * K; b_ptr -= 4 * K; c_ptr -= M; OFF -= ndim;\ | |||
| } | |||
| static void solve_RT(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc){ | |||
| FLOAT a0, b0; | |||
| int i, j, k; | |||
| for (i=n-1;i>=0;i--) { | |||
| b0 = b[i*n+i]; | |||
| for (j=0;j<m;j++) { | |||
| a0 = c[i*ldc+j] * b0; | |||
| a[i*m+j] = c[i*ldc+j] = a0; | |||
| for (k=0;k<i;k++) c[k*ldc+j] -= a0 * b[i*n+k]; | |||
| } | |||
| } | |||
| } | |||
| static void COMPUTE_EDGE_1_nchunk(BLASLONG m, BLASLONG n, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG k, BLASLONG offset) { | |||
| BLASLONG m_count = m, kk = offset; FLOAT *a_ptr = sa, *c_ptr = C; | |||
| for(;m_count>7;m_count-=8){ | |||
| if(k-kk>0) GEMM_KERNEL_N(8,n,k-kk,-1.0,a_ptr+kk*8,sb+kk*n,c_ptr,ldc); | |||
| solve_RT(8,n,a_ptr+(kk-n)*8,sb+(kk-n)*n,c_ptr,ldc); | |||
| a_ptr += k * 8; c_ptr += 8; | |||
| } | |||
| for(;m_count>3;m_count-=4){ | |||
| if(k-kk>0) GEMM_KERNEL_N(4,n,k-kk,-1.0,a_ptr+kk*4,sb+kk*n,c_ptr,ldc); | |||
| solve_RT(4,n,a_ptr+(kk-n)*4,sb+(kk-n)*n,c_ptr,ldc); | |||
| a_ptr += k * 4; c_ptr += 4; | |||
| } | |||
| for(;m_count>1;m_count-=2){ | |||
| if(k-kk>0) GEMM_KERNEL_N(2,n,k-kk,-1.0,a_ptr+kk*2,sb+kk*n,c_ptr,ldc); | |||
| solve_RT(2,n,a_ptr+(kk-n)*2,sb+(kk-n)*n,c_ptr,ldc); | |||
| a_ptr += k * 2; c_ptr += 2; | |||
| } | |||
| if(m_count>0){ | |||
| if(k-kk>0) GEMM_KERNEL_N(1,n,k-kk,-1.0,a_ptr+kk*1,sb+kk*n,c_ptr,ldc); | |||
| solve_RT(1,n,a_ptr+(kk-n)*1,sb+(kk-n)*n,c_ptr,ldc); | |||
| a_ptr += k * 1; c_ptr += 1; | |||
| } | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *sa, FLOAT *sb, FLOAT *C, BLASLONG ldc, BLASLONG offset){ | |||
| float *a_ptr = sa, *b_ptr = sb+n*k, *c_ptr = C+n*ldc, *c_tmp = C; | |||
| float one[8] = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}; | |||
| float zero[8] = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}; | |||
| uint64_t ldc_bytes = (uint64_t)ldc * sizeof(float), K = (uint64_t)k, M = (uint64_t)m, OFF = (uint64_t)(n-offset), k_cnt = 0; | |||
| BLASLONG n_count = n; | |||
| if(n&1){b_ptr-=k; c_ptr-=ldc; COMPUTE_EDGE_1_nchunk(m,1,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF--; n_count--;} | |||
| if(n&2){b_ptr-=k*2; c_ptr-=ldc*2; COMPUTE_EDGE_1_nchunk(m,2,a_ptr,b_ptr,c_ptr,ldc,k,OFF); OFF-=2; n_count-=2;} | |||
| for(;n_count>11;n_count-=12) COMPUTE(12) | |||
| for(;n_count>7;n_count-=8) COMPUTE(8) | |||
| for(;n_count>3;n_count-=4) COMPUTE(4) | |||
| return 0; | |||
| } | |||
| @@ -1,226 +1,226 @@ | |||
| /* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ | |||
| /* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ | |||
| /* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ | |||
| #define init_m8n4(c1,c2,c3,c4)\ | |||
| "vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2";"\ | |||
| "vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";" | |||
| #define INIT_m8n4 init_m8n4(4,5,6,7) | |||
| #define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11) | |||
| #define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15) | |||
| #define init_m4n4(c1,c2,c3,c4)\ | |||
| "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"\ | |||
| "vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";" | |||
| #define INIT_m4n4 init_m4n4(4,5,6,7) | |||
| #define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11) | |||
| #define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15) | |||
| #define init_m2n4(c1,c2)\ | |||
| "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";" | |||
| #define INIT_m2n4 init_m2n4(4,5) | |||
| #define INIT_m2n8 INIT_m2n4 init_m2n4(6,7) | |||
| #define INIT_m2n12 INIT_m2n8 init_m2n4(8,9) | |||
| #define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";" | |||
| #define INIT_m1n4 init_m1n4(4) | |||
| #define INIT_m1n8 INIT_m1n4 init_m1n4(5) | |||
| #define INIT_m1n12 INIT_m1n8 init_m1n4(6) | |||
| #define GEMM_KERNEL_k1m8n4 \ | |||
| "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\ | |||
| "vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\ | |||
| "vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;" | |||
| #define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\ | |||
| "vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\ | |||
| "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;" | |||
| #define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\ | |||
| "vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\ | |||
| "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;" | |||
| #define GEMM_KERNEL_k1m4n4 \ | |||
| "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2;"\ | |||
| "vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ | |||
| "vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" | |||
| #define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\ | |||
| "vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\ | |||
| "vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;" | |||
| #define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\ | |||
| "vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\ | |||
| "vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;" | |||
| #define GEMM_KERNEL_k1m2n4 \ | |||
| "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\ | |||
| "vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;" | |||
| #define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\ | |||
| "vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" | |||
| #define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\ | |||
| "vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;" | |||
| #define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;" | |||
| #define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;" | |||
| #define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;" | |||
| #define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\ | |||
| "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
| "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ | |||
| "vaddps %%ymm0,%%ymm"#c1",%%ymm"#c1"; vaddps %%ymm1,%%ymm"#c2",%%ymm"#c2";"\ | |||
| "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
| "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ | |||
| "vaddps %%ymm0,%%ymm"#c3",%%ymm"#c3"; vaddps %%ymm1,%%ymm"#c4",%%ymm"#c4";" | |||
| #define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\ | |||
| "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ | |||
| "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ | |||
| "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm1;"\ | |||
| "vaddps %%xmm0,%%xmm2,%%xmm"#c1"; vaddps %%xmm1,%%xmm3,%%xmm"#c2";"\ | |||
| "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ | |||
| "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ | |||
| "vunpcklpd %%xmm"#c4",%%xmm"#c3",%%xmm0; vunpckhpd %%xmm"#c4",%%xmm"#c3",%%xmm1;"\ | |||
| "vaddps %%xmm0,%%xmm2,%%xmm"#c3"; vaddps %%xmm1,%%xmm3,%%xmm"#c4";"\ | |||
| "vperm2f128 $2,%%ymm"#c1",%%ymm"#c2",%%ymm"#co1"; vperm2f128 $2,%%ymm"#c3",%%ymm"#c4",%%ymm"#co2";" | |||
| #define GEMM_SUM_REORDER_2x4(c1,c2)\ | |||
| "vmovsd (%3),%%xmm0; vmovhpd (%3,%4,1),%%xmm0,%%xmm0; leaq (%3,%4,2),%3; vpermilps $216,%%xmm0,%%xmm0;"\ | |||
| "vmovsd (%3),%%xmm1; vmovhpd (%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3; vpermilps $216,%%xmm1,%%xmm1;"\ | |||
| "vunpcklpd %%xmm1,%%xmm0,%%xmm2; vaddps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
| "vunpckhpd %%xmm1,%%xmm0,%%xmm3; vaddps %%xmm3,%%xmm"#c2",%%xmm"#c2";"\ | |||
| #define GEMM_SUM_REORDER_1x4(c1)\ | |||
| "vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ | |||
| "vinsertps $32,(%3),%%xmm1,%%xmm1; vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ | |||
| "vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_le_m4n2(b_off,c1,...)\ | |||
| "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ | |||
| "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ | |||
| "vmovsldup %%ymm"#c1",%%ymm1;" | |||
| #define SOLVE_le_m8n2(b_off,c1,c2,...)\ | |||
| "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ | |||
| "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ | |||
| "vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;" | |||
| #define SOLVE_leri_m4n2(b_off,c1,...) SOLVE_le_m4n2(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" | |||
| #define SOLVE_leri_m8n2(b_off,c1,c2,...) SOLVE_le_m8n2(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" | |||
| #define SOLVE_ri_m4n2(b_off,c1,...)\ | |||
| "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ | |||
| "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ | |||
| "vmovshdup %%ymm"#c1",%%ymm1;" | |||
| #define SOLVE_ri_m8n2(b_off,c1,c2,...)\ | |||
| "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ | |||
| "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ | |||
| "vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;" | |||
| #define SOLVE_rile_m4n2(b_off,c1,...) SOLVE_ri_m4n2(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" | |||
| #define SOLVE_rile_m8n2(b_off,c1,c2,...) SOLVE_ri_m8n2(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" | |||
| #define SOLVE_col1_rtol_m1n4(b_off,c1,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
| "vpermilps $0,%%xmm"#c1",%%xmm1;" | |||
| #define SOLVE_col1_rtol_m2n4(b_off,c1,c2,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ | |||
| "vpermilps $0,%%xmm"#c1",%%xmm1; vpermilps $0,%%xmm"#c2",%%xmm2;" | |||
| #define SOLVE_col1_ltor_m1n4(b_off,c1,...) SOLVE_col1_rtol_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col1_ltor_m2n4(b_off,c1,c2,...) SOLVE_col1_rtol_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SOLVE_col2_mul_m1n4(b_off,c1,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
| "vpermilps $85,%%xmm"#c1",%%xmm1;" | |||
| #define SOLVE_col2_mul_m2n4(b_off,c1,c2,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ | |||
| "vpermilps $85,%%xmm"#c1",%%xmm1; vpermilps $85,%%xmm"#c2",%%xmm2;" | |||
| #define SOLVE_col2_rtol_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col2_rtol_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SOLVE_col2_ltor_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col2_ltor_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SOLVE_col3_mul_m1n4(b_off,c1,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
| "vpermilps $170,%%xmm"#c1",%%xmm1;" | |||
| #define SOLVE_col3_mul_m2n4(b_off,c1,c2,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ | |||
| "vpermilps $170,%%xmm"#c1",%%xmm1; vpermilps $170,%%xmm"#c2",%%xmm2;" | |||
| #define SOLVE_col3_rtol_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col3_rtol_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SOLVE_col3_ltor_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col3_ltor_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SOLVE_col4_ltor_m1n4(b_off,c1,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
| "vpermilps $255,%%xmm"#c1",%%xmm1;" | |||
| #define SOLVE_col4_ltor_m2n4(b_off,c1,c2,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ | |||
| "vpermilps $255,%%xmm"#c1",%%xmm1; vpermilps $255,%%xmm"#c2",%%xmm2;" | |||
| #define SOLVE_col4_rtol_m1n4(b_off,c1,...) SOLVE_col4_ltor_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col4_rtol_m2n4(b_off,c1,c2,...) SOLVE_col4_ltor_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SUBTRACT_m4n2(b_off,c1,...) "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" | |||
| #define SUBTRACT_m8n2(b_off,c1,c2,...) SUBTRACT_m4n2(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" | |||
| #define SUBTRACT_m1n4(b_off,c1,...) "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SUBTRACT_m2n4(b_off,c1,c2,...) SUBTRACT_m1n4(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SAVE_SOLUTION_m8n2(c1,c2,a_off)\ | |||
| "vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ | |||
| "vunpcklpd %%ymm1,%%ymm0,%%ymm"#c1"; vunpckhpd %%ymm1,%%ymm0,%%ymm"#c2";"\ | |||
| "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%ymm"#c2","#a_off"+32(%0);"\ | |||
| "vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;" | |||
| #define SAVE_SOLUTION_m4n2(c1,a_off)\ | |||
| "vpermilps $216,%%ymm"#c1",%%ymm"#c1"; vpermpd $216,%%ymm"#c1",%%ymm"#c1";"\ | |||
| "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%xmm"#c1",(%3); vextractf128 $1,%%ymm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" | |||
| #define SAVE_SOLUTION_m2n4(c1,c2,a_off)\ | |||
| "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
| "vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"+16(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;" | |||
| #define SAVE_SOLUTION_m1n4(c1,a_off)\ | |||
| "vmovups %%xmm"#c1","#a_off"(%0); vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
| "vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" | |||
| /* r11 = m_counter, r12 = size_of_k_elements, r13 = kk, r14 = b_head, r15 = a_head */ | |||
| /* register i/o: %0 = a_ptr, %1 = b_ptr, %2 = c_ptr, %3 = c_tmp, %4 = ldc, %5 = k_counter */ | |||
| /* memory input: %6 = K, %7 = offset, %8 = {1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0}, %9 = {0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0}, %10 = M */ | |||
| #define init_m8n4(c1,c2,c3,c4)\ | |||
| "vpxor %%ymm"#c1",%%ymm"#c1",%%ymm"#c1"; vpxor %%ymm"#c2",%%ymm"#c2",%%ymm"#c2";"\ | |||
| "vpxor %%ymm"#c3",%%ymm"#c3",%%ymm"#c3"; vpxor %%ymm"#c4",%%ymm"#c4",%%ymm"#c4";" | |||
| #define INIT_m8n4 init_m8n4(4,5,6,7) | |||
| #define INIT_m8n8 INIT_m8n4 init_m8n4(8,9,10,11) | |||
| #define INIT_m8n12 INIT_m8n8 init_m8n4(12,13,14,15) | |||
| #define init_m4n4(c1,c2,c3,c4)\ | |||
| "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";"\ | |||
| "vpxor %%xmm"#c3",%%xmm"#c3",%%xmm"#c3"; vpxor %%xmm"#c4",%%xmm"#c4",%%xmm"#c4";" | |||
| #define INIT_m4n4 init_m4n4(4,5,6,7) | |||
| #define INIT_m4n8 INIT_m4n4 init_m4n4(8,9,10,11) | |||
| #define INIT_m4n12 INIT_m4n8 init_m4n4(12,13,14,15) | |||
| #define init_m2n4(c1,c2)\ | |||
| "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1"; vpxor %%xmm"#c2",%%xmm"#c2",%%xmm"#c2";" | |||
| #define INIT_m2n4 init_m2n4(4,5) | |||
| #define INIT_m2n8 INIT_m2n4 init_m2n4(6,7) | |||
| #define INIT_m2n12 INIT_m2n8 init_m2n4(8,9) | |||
| #define init_m1n4(c1) "vpxor %%xmm"#c1",%%xmm"#c1",%%xmm"#c1";" | |||
| #define INIT_m1n4 init_m1n4(4) | |||
| #define INIT_m1n8 INIT_m1n4 init_m1n4(5) | |||
| #define INIT_m1n12 INIT_m1n8 init_m1n4(6) | |||
| #define GEMM_KERNEL_k1m8n4 \ | |||
| "vmovsldup (%0),%%ymm1; vmovshdup (%0),%%ymm2;"\ | |||
| "vbroadcastsd (%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm4; vfnmadd231ps %%ymm3,%%ymm2,%%ymm5;"\ | |||
| "vbroadcastsd 8(%1),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm6; vfnmadd231ps %%ymm3,%%ymm2,%%ymm7;" | |||
| #define GEMM_KERNEL_k1m8n8 GEMM_KERNEL_k1m8n4\ | |||
| "vbroadcastsd (%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm8; vfnmadd231ps %%ymm3,%%ymm2,%%ymm9;"\ | |||
| "vbroadcastsd 8(%1,%%r12,4),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm10; vfnmadd231ps %%ymm3,%%ymm2,%%ymm11;" | |||
| #define GEMM_KERNEL_k1m8n12 GEMM_KERNEL_k1m8n8\ | |||
| "vbroadcastsd (%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm12; vfnmadd231ps %%ymm3,%%ymm2,%%ymm13;"\ | |||
| "vbroadcastsd 8(%1,%%r12,8),%%ymm3; vfnmadd231ps %%ymm3,%%ymm1,%%ymm14; vfnmadd231ps %%ymm3,%%ymm2,%%ymm15;" | |||
| #define GEMM_KERNEL_k1m4n4 \ | |||
| "vmovsldup (%0),%%xmm1; vmovshdup (%0),%%xmm2;"\ | |||
| "vmovddup (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;"\ | |||
| "vmovddup 8(%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" | |||
| #define GEMM_KERNEL_k1m4n8 GEMM_KERNEL_k1m4n4\ | |||
| "vmovddup (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;"\ | |||
| "vmovddup 8(%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm10; vfnmadd231ps %%xmm3,%%xmm2,%%xmm11;" | |||
| #define GEMM_KERNEL_k1m4n12 GEMM_KERNEL_k1m4n8\ | |||
| "vmovddup (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm12; vfnmadd231ps %%xmm3,%%xmm2,%%xmm13;"\ | |||
| "vmovddup 8(%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm14; vfnmadd231ps %%xmm3,%%xmm2,%%xmm15;" | |||
| #define GEMM_KERNEL_k1m2n4 \ | |||
| "vbroadcastss (%0),%%xmm1; vbroadcastss 4(%0),%%xmm2;"\ | |||
| "vmovups (%1),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm4; vfnmadd231ps %%xmm3,%%xmm2,%%xmm5;" | |||
| #define GEMM_KERNEL_k1m2n8 GEMM_KERNEL_k1m2n4\ | |||
| "vmovups (%1,%%r12,4),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm6; vfnmadd231ps %%xmm3,%%xmm2,%%xmm7;" | |||
| #define GEMM_KERNEL_k1m2n12 GEMM_KERNEL_k1m2n8\ | |||
| "vmovups (%1,%%r12,8),%%xmm3; vfnmadd231ps %%xmm3,%%xmm1,%%xmm8; vfnmadd231ps %%xmm3,%%xmm2,%%xmm9;" | |||
| #define GEMM_KERNEL_k1m1n4 "vbroadcastss (%0),%%xmm1; vfnmadd231ps (%1),%%xmm1,%%xmm4;" | |||
| #define GEMM_KERNEL_k1m1n8 GEMM_KERNEL_k1m1n4 "vfnmadd231ps (%1,%%r12,4),%%xmm1,%%xmm5;" | |||
| #define GEMM_KERNEL_k1m1n12 GEMM_KERNEL_k1m1n8 "vfnmadd231ps (%1,%%r12,8),%%xmm1,%%xmm6;" | |||
| #define GEMM_SUM_REORDER_8x4(c1,c2,c3,c4,prefpos)\ | |||
| "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
| "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ | |||
| "vaddps %%ymm0,%%ymm"#c1",%%ymm"#c1"; vaddps %%ymm1,%%ymm"#c2",%%ymm"#c2";"\ | |||
| "vmovups (%3),%%ymm0; vmovups (%3,%4,1),%%ymm1; prefetcht1 "#prefpos"(%3); prefetcht1 "#prefpos"(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
| "vunpcklps %%ymm1,%%ymm0,%%ymm2; vunpckhps %%ymm1,%%ymm0,%%ymm3; vunpcklpd %%ymm3,%%ymm2,%%ymm0; vunpckhpd %%ymm3,%%ymm2,%%ymm1;"\ | |||
| "vaddps %%ymm0,%%ymm"#c3",%%ymm"#c3"; vaddps %%ymm1,%%ymm"#c4",%%ymm"#c4";" | |||
| #define GEMM_SUM_REORDER_4x4(c1,c2,c3,c4,co1,co2)\ | |||
| "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ | |||
| "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ | |||
| "vunpcklpd %%xmm"#c2",%%xmm"#c1",%%xmm0; vunpckhpd %%xmm"#c2",%%xmm"#c1",%%xmm1;"\ | |||
| "vaddps %%xmm0,%%xmm2,%%xmm"#c1"; vaddps %%xmm1,%%xmm3,%%xmm"#c2";"\ | |||
| "vmovups (%3),%%xmm0; vmovups (%3,%4,1),%%xmm1; leaq (%3,%4,2),%3;"\ | |||
| "vunpcklps %%xmm1,%%xmm0,%%xmm2; vunpckhps %%xmm1,%%xmm0,%%xmm3;"\ | |||
| "vunpcklpd %%xmm"#c4",%%xmm"#c3",%%xmm0; vunpckhpd %%xmm"#c4",%%xmm"#c3",%%xmm1;"\ | |||
| "vaddps %%xmm0,%%xmm2,%%xmm"#c3"; vaddps %%xmm1,%%xmm3,%%xmm"#c4";"\ | |||
| "vperm2f128 $2,%%ymm"#c1",%%ymm"#c2",%%ymm"#co1"; vperm2f128 $2,%%ymm"#c3",%%ymm"#c4",%%ymm"#co2";" | |||
| #define GEMM_SUM_REORDER_2x4(c1,c2)\ | |||
| "vmovsd (%3),%%xmm0; vmovhpd (%3,%4,1),%%xmm0,%%xmm0; leaq (%3,%4,2),%3; vpermilps $216,%%xmm0,%%xmm0;"\ | |||
| "vmovsd (%3),%%xmm1; vmovhpd (%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3; vpermilps $216,%%xmm1,%%xmm1;"\ | |||
| "vunpcklpd %%xmm1,%%xmm0,%%xmm2; vaddps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
| "vunpckhpd %%xmm1,%%xmm0,%%xmm3; vaddps %%xmm3,%%xmm"#c2",%%xmm"#c2";"\ | |||
| #define GEMM_SUM_REORDER_1x4(c1)\ | |||
| "vmovss (%3),%%xmm1; vinsertps $16,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ | |||
| "vinsertps $32,(%3),%%xmm1,%%xmm1; vinsertps $48,(%3,%4,1),%%xmm1,%%xmm1; leaq (%3,%4,2),%3;"\ | |||
| "vaddps %%xmm"#c1",%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_le_m4n2(b_off,c1,...)\ | |||
| "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ | |||
| "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ | |||
| "vmovsldup %%ymm"#c1",%%ymm1;" | |||
| #define SOLVE_le_m8n2(b_off,c1,c2,...)\ | |||
| "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $170,%8,%%ymm0,%%ymm2;"\ | |||
| "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ | |||
| "vmovsldup %%ymm"#c1",%%ymm1; vmovsldup %%ymm"#c2",%%ymm2;" | |||
| #define SOLVE_leri_m4n2(b_off,c1,...) SOLVE_le_m4n2(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" | |||
| #define SOLVE_leri_m8n2(b_off,c1,c2,...) SOLVE_le_m8n2(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $85,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" | |||
| #define SOLVE_ri_m4n2(b_off,c1,...)\ | |||
| "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ | |||
| "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1";"\ | |||
| "vmovshdup %%ymm"#c1",%%ymm1;" | |||
| #define SOLVE_ri_m8n2(b_off,c1,c2,...)\ | |||
| "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vblendps $85,%8,%%ymm0,%%ymm2;"\ | |||
| "vmulps %%ymm2,%%ymm"#c1",%%ymm"#c1"; vmulps %%ymm2,%%ymm"#c2",%%ymm"#c2";"\ | |||
| "vmovshdup %%ymm"#c1",%%ymm1; vmovshdup %%ymm"#c2",%%ymm2;" | |||
| #define SOLVE_rile_m4n2(b_off,c1,...) SOLVE_ri_m4n2(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" | |||
| #define SOLVE_rile_m8n2(b_off,c1,c2,...) SOLVE_ri_m8n2(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $170,%9,%%ymm0,%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1"; vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" | |||
| #define SOLVE_col1_rtol_m1n4(b_off,c1,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
| "vpermilps $0,%%xmm"#c1",%%xmm1;" | |||
| #define SOLVE_col1_rtol_m2n4(b_off,c1,c2,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $14,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ | |||
| "vpermilps $0,%%xmm"#c1",%%xmm1; vpermilps $0,%%xmm"#c2",%%xmm2;" | |||
| #define SOLVE_col1_ltor_m1n4(b_off,c1,...) SOLVE_col1_rtol_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col1_ltor_m2n4(b_off,c1,c2,...) SOLVE_col1_rtol_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $1,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SOLVE_col2_mul_m1n4(b_off,c1,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
| "vpermilps $85,%%xmm"#c1",%%xmm1;" | |||
| #define SOLVE_col2_mul_m2n4(b_off,c1,c2,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $13,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ | |||
| "vpermilps $85,%%xmm"#c1",%%xmm1; vpermilps $85,%%xmm"#c2",%%xmm2;" | |||
| #define SOLVE_col2_rtol_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col2_rtol_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $14,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SOLVE_col2_ltor_m1n4(b_off,c1,...) SOLVE_col2_mul_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col2_ltor_m2n4(b_off,c1,c2,...) SOLVE_col2_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $3,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SOLVE_col3_mul_m1n4(b_off,c1,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
| "vpermilps $170,%%xmm"#c1",%%xmm1;" | |||
| #define SOLVE_col3_mul_m2n4(b_off,c1,c2,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $11,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ | |||
| "vpermilps $170,%%xmm"#c1",%%xmm1; vpermilps $170,%%xmm"#c2",%%xmm2;" | |||
| #define SOLVE_col3_rtol_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col3_rtol_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $12,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SOLVE_col3_ltor_m1n4(b_off,c1,...) SOLVE_col3_mul_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col3_ltor_m2n4(b_off,c1,c2,...) SOLVE_col3_mul_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $7,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SOLVE_col4_ltor_m1n4(b_off,c1,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1";"\ | |||
| "vpermilps $255,%%xmm"#c1",%%xmm1;" | |||
| #define SOLVE_col4_ltor_m2n4(b_off,c1,c2,...)\ | |||
| "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vblendps $7,%8,%%xmm0,%%xmm2;"\ | |||
| "vmulps %%xmm2,%%xmm"#c1",%%xmm"#c1"; vmulps %%xmm2,%%xmm"#c2",%%xmm"#c2";"\ | |||
| "vpermilps $255,%%xmm"#c1",%%xmm1; vpermilps $255,%%xmm"#c2",%%xmm2;" | |||
| #define SOLVE_col4_rtol_m1n4(b_off,c1,...) SOLVE_col4_ltor_m1n4(b_off,c1,__VA_ARGS__)\ | |||
| "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SOLVE_col4_rtol_m2n4(b_off,c1,c2,...) SOLVE_col4_ltor_m2n4(b_off,c1,c2,__VA_ARGS__)\ | |||
| "vblendps $8,%9,%%xmm0,%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1"; vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SUBTRACT_m4n2(b_off,c1,...) "vbroadcastsd "#b_off"("#__VA_ARGS__"),%%ymm0; vfnmadd231ps %%ymm0,%%ymm1,%%ymm"#c1";" | |||
| #define SUBTRACT_m8n2(b_off,c1,c2,...) SUBTRACT_m4n2(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%ymm0,%%ymm2,%%ymm"#c2";" | |||
| #define SUBTRACT_m1n4(b_off,c1,...) "vmovups "#b_off"("#__VA_ARGS__"),%%xmm0; vfnmadd231ps %%xmm0,%%xmm1,%%xmm"#c1";" | |||
| #define SUBTRACT_m2n4(b_off,c1,c2,...) SUBTRACT_m1n4(b_off,c1,__VA_ARGS__) "vfnmadd231ps %%xmm0,%%xmm2,%%xmm"#c2";" | |||
| #define SAVE_SOLUTION_m8n2(c1,c2,a_off)\ | |||
| "vunpcklps %%ymm"#c2",%%ymm"#c1",%%ymm0; vunpckhps %%ymm"#c2",%%ymm"#c1",%%ymm1;"\ | |||
| "vunpcklpd %%ymm1,%%ymm0,%%ymm"#c1"; vunpckhpd %%ymm1,%%ymm0,%%ymm"#c2";"\ | |||
| "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%ymm"#c2","#a_off"+32(%0);"\ | |||
| "vmovups %%ymm"#c1",(%3); vmovups %%ymm"#c2",(%3,%4,1); leaq (%3,%4,2),%3;" | |||
| #define SAVE_SOLUTION_m4n2(c1,a_off)\ | |||
| "vpermilps $216,%%ymm"#c1",%%ymm"#c1"; vpermpd $216,%%ymm"#c1",%%ymm"#c1";"\ | |||
| "vmovups %%ymm"#c1","#a_off"(%0); vmovups %%xmm"#c1",(%3); vextractf128 $1,%%ymm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" | |||
| #define SAVE_SOLUTION_m2n4(c1,c2,a_off)\ | |||
| "vunpcklps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
| "vunpckhps %%xmm"#c2",%%xmm"#c1",%%xmm0; vmovups %%xmm0,"#a_off"+16(%0); vmovsd %%xmm0,(%3); vmovhpd %%xmm0,(%3,%4,1); leaq (%3,%4,2),%3;" | |||
| #define SAVE_SOLUTION_m1n4(c1,a_off)\ | |||
| "vmovups %%xmm"#c1","#a_off"(%0); vmovss %%xmm"#c1",(%3); vextractps $1,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;"\ | |||
| "vextractps $2,%%xmm"#c1",(%3); vextractps $3,%%xmm"#c1",(%3,%4,1); leaq (%3,%4,2),%3;" | |||
| @@ -1,86 +1,86 @@ | |||
| include_directories(${PROJECT_SOURCE_DIR}) | |||
| include_directories(${PROJECT_BINARY_DIR}) | |||
| include_directories(${PROJECT_SOURCE_DIR}/relapack) | |||
| set(RELAFILES | |||
| clauum.c | |||
| ctrsyl_rec2.c | |||
| dsytrf.c | |||
| spbtrf.c | |||
| strsyl_rec2.c | |||
| zhetrf_rook_rec2.c | |||
| ztrsyl.c | |||
| cgbtrf.c | |||
| cpbtrf.c | |||
| ctrtri.c | |||
| dsytrf_rec2.c | |||
| spotrf.c | |||
| strtri.c | |||
| zlauum.c | |||
| ztrsyl_rec2.c | |||
| cgemmt.c | |||
| cpotrf.c | |||
| dgbtrf.c | |||
| dsytrf_rook.c | |||
| lapack_wrappers.c | |||
| ssygst.c | |||
| zgbtrf.c | |||
| zpbtrf.c | |||
| ztrtri.c | |||
| cgetrf.c | |||
| csytrf.c | |||
| dgemmt.c | |||
| dsytrf_rook_rec2.c | |||
| ssytrf.c | |||
| zgemmt.c | |||
| zpotrf.c | |||
| chegst.c | |||
| csytrf_rec2.c | |||
| dgetrf.c | |||
| dtgsyl.c | |||
| ssytrf_rec2.c | |||
| zgetrf.c | |||
| zsytrf.c | |||
| chetrf.c | |||
| csytrf_rook.c | |||
| dlauum.c | |||
| dtrsyl.c | |||
| sgbtrf.c | |||
| ssytrf_rook.c | |||
| zhegst.c | |||
| zsytrf_rec2.c | |||
| chetrf_rec2.c | |||
| csytrf_rook_rec2.c | |||
| dpbtrf.c | |||
| dtrsyl_rec2.c | |||
| sgemmt.c | |||
| ssytrf_rook_rec2.c | |||
| zhetrf.c | |||
| zsytrf_rook.c | |||
| chetrf_rook.c | |||
| ctgsyl.c | |||
| dpotrf.c | |||
| dtrtri.c | |||
| sgetrf.c | |||
| stgsyl.c | |||
| zhetrf_rec2.c | |||
| zsytrf_rook_rec2.c | |||
| chetrf_rook_rec2.c | |||
| ctrsyl.c | |||
| dsygst.c | |||
| f2c.c | |||
| slauum.c | |||
| strsyl.c | |||
| zhetrf_rook.c | |||
| ztgsyl.c | |||
| ) | |||
| # add relapack folder to the sources | |||
| set(RELA_SOURCES "") | |||
| foreach (RELA_FILE ${RELAFILES}) | |||
| list(APPEND RELA_SOURCES "${PROJECT_SOURCE_DIR}/relapack/src/${RELA_FILE}") | |||
| endforeach () | |||
| add_library(relapack_src OBJECT ${RELA_SOURCES}) | |||
| set_source_files_properties(${RELA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") | |||
| include_directories(${PROJECT_SOURCE_DIR}) | |||
| include_directories(${PROJECT_BINARY_DIR}) | |||
| include_directories(${PROJECT_SOURCE_DIR}/relapack) | |||
| set(RELAFILES | |||
| clauum.c | |||
| ctrsyl_rec2.c | |||
| dsytrf.c | |||
| spbtrf.c | |||
| strsyl_rec2.c | |||
| zhetrf_rook_rec2.c | |||
| ztrsyl.c | |||
| cgbtrf.c | |||
| cpbtrf.c | |||
| ctrtri.c | |||
| dsytrf_rec2.c | |||
| spotrf.c | |||
| strtri.c | |||
| zlauum.c | |||
| ztrsyl_rec2.c | |||
| cgemmt.c | |||
| cpotrf.c | |||
| dgbtrf.c | |||
| dsytrf_rook.c | |||
| lapack_wrappers.c | |||
| ssygst.c | |||
| zgbtrf.c | |||
| zpbtrf.c | |||
| ztrtri.c | |||
| cgetrf.c | |||
| csytrf.c | |||
| dgemmt.c | |||
| dsytrf_rook_rec2.c | |||
| ssytrf.c | |||
| zgemmt.c | |||
| zpotrf.c | |||
| chegst.c | |||
| csytrf_rec2.c | |||
| dgetrf.c | |||
| dtgsyl.c | |||
| ssytrf_rec2.c | |||
| zgetrf.c | |||
| zsytrf.c | |||
| chetrf.c | |||
| csytrf_rook.c | |||
| dlauum.c | |||
| dtrsyl.c | |||
| sgbtrf.c | |||
| ssytrf_rook.c | |||
| zhegst.c | |||
| zsytrf_rec2.c | |||
| chetrf_rec2.c | |||
| csytrf_rook_rec2.c | |||
| dpbtrf.c | |||
| dtrsyl_rec2.c | |||
| sgemmt.c | |||
| ssytrf_rook_rec2.c | |||
| zhetrf.c | |||
| zsytrf_rook.c | |||
| chetrf_rook.c | |||
| ctgsyl.c | |||
| dpotrf.c | |||
| dtrtri.c | |||
| sgetrf.c | |||
| stgsyl.c | |||
| zhetrf_rec2.c | |||
| zsytrf_rook_rec2.c | |||
| chetrf_rook_rec2.c | |||
| ctrsyl.c | |||
| dsygst.c | |||
| f2c.c | |||
| slauum.c | |||
| strsyl.c | |||
| zhetrf_rook.c | |||
| ztgsyl.c | |||
| ) | |||
| # add relapack folder to the sources | |||
| set(RELA_SOURCES "") | |||
| foreach (RELA_FILE ${RELAFILES}) | |||
| list(APPEND RELA_SOURCES "${PROJECT_SOURCE_DIR}/relapack/src/${RELA_FILE}") | |||
| endforeach () | |||
| add_library(relapack_src OBJECT ${RELA_SOURCES}) | |||
| set_source_files_properties(${RELA_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") | |||