Improvements to Aarch64 kernels (tags/v0.2.19^2)
| @@ -173,7 +173,9 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | |||
| sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ | |||
| spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ | |||
| ssymm.goto dsymm.goto csymm.goto zsymm.goto \ | |||
| smallscaling | |||
| smallscaling \ | |||
| isamax.goto idamax.goto icamax.goto izamax.goto \ | |||
| snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto | |||
| acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ | |||
| scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ | |||
| @@ -226,7 +228,9 @@ atlas :: slinpack.atlas dlinpack.atlas clinpack.atlas zlinpack.atlas \ | |||
| sgesv.atlas dgesv.atlas cgesv.atlas zgesv.atlas \ | |||
| sgetri.atlas dgetri.atlas cgetri.atlas zgetri.atlas \ | |||
| spotrf.atlas dpotrf.atlas cpotrf.atlas zpotrf.atlas \ | |||
| ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas | |||
| ssymm.atlas dsymm.atlas csymm.atlas zsymm.atlas \ | |||
| isamax.atlas idamax.atlas icamax.atlas izamax.atlas \ | |||
| snrm2.atlas dnrm2.atlas scnrm2.atlas dznrm2.atlas | |||
| mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ | |||
| scholesky.mkl dcholesky.mkl ccholesky.mkl zcholesky.mkl \ | |||
| @@ -1937,6 +1941,63 @@ zgemm3m.mkl : zgemm3m.$(SUFFIX) | |||
| zgemm3m.veclib : zgemm3m.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ############################################## ISAMAX ############################################## | |||
| isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| isamax.atlas : isamax.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ############################################## IDAMAX ############################################## | |||
| idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| idamax.atlas : idamax.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ############################################## ICAMAX ############################################## | |||
| icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| icamax.atlas : icamax.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ############################################## IZAMAX ############################################## | |||
| izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| izamax.atlas : izamax.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ############################################## SNRM2 ############################################## | |||
| snrm2.goto : snrm2.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| snrm2.atlas : snrm2.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ############################################## DNRM2 ############################################## | |||
| dnrm2.goto : dnrm2.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| dnrm2.atlas : dnrm2.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ############################################## SCNRM2 ############################################## | |||
| scnrm2.goto : scnrm2.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| scnrm2.atlas : scnrm2.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ############################################## DZNRM2 ############################################## | |||
| dznrm2.goto : dznrm2.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| dznrm2.atlas : dznrm2.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ################################################################################################### | |||
| slinpack.$(SUFFIX) : linpack.c | |||
| @@ -2243,6 +2304,33 @@ cgemm3m.$(SUFFIX) : gemm3m.c | |||
| zgemm3m.$(SUFFIX) : gemm3m.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| isamax.$(SUFFIX) : iamax.c | |||
| $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ | |||
| idamax.$(SUFFIX) : iamax.c | |||
| $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| icamax.$(SUFFIX) : iamax.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ | |||
| izamax.$(SUFFIX) : iamax.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| snrm2.$(SUFFIX) : nrm2.c | |||
| $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ | |||
| dnrm2.$(SUFFIX) : nrm2.c | |||
| $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| scnrm2.$(SUFFIX) : nrm2.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ | |||
| dznrm2.$(SUFFIX) : nrm2.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| smallscaling: smallscaling.c ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm -lpthread | |||
| @@ -183,9 +183,9 @@ int main(int argc, char *argv[]){ | |||
| timeg /= loops; | |||
| #ifdef COMPLEX | |||
| fprintf(stderr, " %10.2f MFlops\n", 4. * (double)m / timeg * 1.e-6); | |||
| fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 4. * (double)m / timeg * 1.e-6, timeg); | |||
| #else | |||
| fprintf(stderr, " %10.2f MFlops\n", 2. * (double)m / timeg * 1.e-6); | |||
| fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 2. * (double)m / timeg * 1.e-6, timeg); | |||
| #endif | |||
| } | |||
| @@ -190,8 +190,8 @@ int main(int argc, char *argv[]){ | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6); | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| @@ -190,8 +190,8 @@ int main(int argc, char *argv[]){ | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MBytes\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6); | |||
| " %10.2f MBytes %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| @@ -184,8 +184,8 @@ int main(int argc, char *argv[]){ | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6); | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| @@ -221,7 +221,7 @@ int main(int argc, char *argv[]){ | |||
| timeg /= loops; | |||
| fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6); | |||
| fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6, timeg); | |||
| } | |||
| } | |||
| @@ -258,7 +258,7 @@ int main(int argc, char *argv[]){ | |||
| timeg /= loops; | |||
| fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6); | |||
| fprintf(stderr, " %10.2f MFlops %10.6f sec\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6, timeg); | |||
| } | |||
| } | |||
| @@ -0,0 +1,190 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdio.h> | |||
| #include <stdlib.h> | |||
| #ifdef __CYGWIN32__ | |||
| #include <sys/time.h> | |||
| #endif | |||
| #include "common.h" | |||
| #undef IAMAX /* discard any earlier IAMAX definition so the choice below is the one in effect */ | |||
| #ifdef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define IAMAX BLASFUNC(izamax) /* COMPLEX and DOUBLE both defined */ | |||
| #else | |||
| #define IAMAX BLASFUNC(icamax) /* COMPLEX defined, DOUBLE not */ | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| #define IAMAX BLASFUNC(idamax) /* DOUBLE defined, COMPLEX not */ | |||
| #else | |||
| #define IAMAX BLASFUNC(isamax) /* neither COMPLEX nor DOUBLE defined */ | |||
| #endif | |||
| #endif | |||
| #if defined(__WIN32__) || defined(__WIN64__) | |||
| #ifndef DELTA_EPOCH_IN_MICROSECS | |||
| #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL | |||
| #endif | |||
| /* Windows stand-in for gettimeofday(), built only when __WIN32__ or __WIN64__ is defined. | |||
| /* tv : receives the current time as seconds/microseconds since the Unix epoch; may be NULL. | |||
| /* tz : accepted for signature compatibility and ignored. | |||
| /* Always returns 0. */ | |||
| int gettimeofday(struct timeval *tv, void *tz){ | |||
|   FILETIME ft; | |||
|   unsigned __int64 tmpres = 0; | |||
|   (void)tz; /* time-zone output is not supported by this shim */ | |||
|   if (NULL != tv) | |||
|   { | |||
|     GetSystemTimeAsFileTime(&ft); | |||
|     /* assemble the 64-bit FILETIME value from its two 32-bit halves */ | |||
|     tmpres |= ft.dwHighDateTime; | |||
|     tmpres <<= 32; | |||
|     tmpres |= ft.dwLowDateTime; | |||
|     /* FILETIME counts 100 ns ticks: divide by 10 for microseconds, then shift to the Unix epoch */ | |||
|     tmpres /= 10; | |||
|     tmpres -= DELTA_EPOCH_IN_MICROSECS; | |||
|     tv->tv_sec = (long)(tmpres / 1000000UL); | |||
|     tv->tv_usec = (long)(tmpres % 1000000UL); | |||
|   } | |||
|   return 0; | |||
| } | |||
| #endif | |||
| #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 | |||
| static void *huge_malloc(BLASLONG size){ /* returns memory attached from a SysV shared-memory segment requested with SHM_HUGETLB; exits with status 1 on failure. The enclosing #if ends in "&& 0", so this is currently compiled out. */ | |||
| int shmid; | |||
| void *address; | |||
| #ifndef SHM_HUGETLB | |||
| #define SHM_HUGETLB 04000 /* fallback used only when the included headers do not define it */ | |||
| #endif | |||
| if ((shmid =shmget(IPC_PRIVATE, /* private segment, not shared by key */ | |||
| (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), /* rounds up to a HUGE_PAGESIZE multiple (one extra page when size is already a multiple); assumes HUGE_PAGESIZE is a power of two - TODO confirm */ | |||
| SHM_HUGETLB | IPC_CREAT |0600)) < 0) { | |||
| printf( "Memory allocation failed(shmget).\n"); | |||
| exit(1); | |||
| } | |||
| address = shmat(shmid, NULL, SHM_RND); /* NULL address: the system chooses where to attach */ | |||
| if ((BLASLONG)address == -1){ /* shmat signals failure with (void *)-1 */ | |||
| printf( "Memory allocation failed(shmat).\n"); | |||
| exit(1); | |||
| } | |||
| shmctl(shmid, IPC_RMID, 0); /* mark the segment for removal now; the attached mapping is still returned to the caller */ | |||
| return address; | |||
| } | |||
| #define malloc huge_malloc | |||
| #endif | |||
| /* Benchmark driver for the BLAS index-of-max-magnitude routine selected by the IAMAX macro. | |||
|    Command line : [from [to [step]]] - vector lengths to time (defaults 1, 200, 1). | |||
|    Environment  : OPENBLAS_LOOPS - timed repetitions per length (default 1); | |||
|                   OPENBLAS_INCX  - stride passed to the routine (default 1). | |||
|    For each length the vector is refilled with values in [-0.5, 0.5], the call is timed | |||
|    with gettimeofday(), and the mean seconds per call are written to stderr. | |||
|    Returns 0; exits with status 1 if the work buffer cannot be allocated. */ | |||
| int main(int argc, char *argv[]){ | |||
|   FLOAT *x; | |||
|   blasint m, i; | |||
|   blasint inc_x=1; | |||
|   int loops = 1; | |||
|   int l; | |||
|   char *p; | |||
|   int from = 1; | |||
|   int to = 200; | |||
|   int step = 1; | |||
|   struct timeval start, stop; | |||
|   double time1,timeg; | |||
|   argc--;argv++; | |||
|   if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
|   if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
|   if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
|   if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
|   /* A step below 1 would never move m past `to`; a loop count below 1 would make the average a division by zero. */ | |||
|   if (step < 1) step = 1; | |||
|   if (loops < 1) loops = 1; | |||
|   /* inc_x is a blasint, which need not be int; cast it for %d just as m is cast below. */ | |||
|   fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,(int)inc_x,loops); | |||
|   if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
| #ifdef linux | |||
|   /* The fill loop draws from rand(), so seed that generator; srandom() only affects random(). */ | |||
|   srand((unsigned int)getpid()); | |||
| #endif | |||
|   fprintf(stderr, " SIZE Time\n"); | |||
|   for(m = from; m <= to; m += step) | |||
|   { | |||
|     timeg=0; | |||
|     fprintf(stderr, " %6d : ", (int)m); | |||
|     for (l=0; l<loops; l++) | |||
|     { | |||
|       /* Refill the input before every repetition; only the library call itself is timed. */ | |||
|       for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
|         x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|       gettimeofday( &start, (struct timezone *)0); | |||
|       IAMAX (&m, x, &inc_x); | |||
|       gettimeofday( &stop, (struct timezone *)0); | |||
|       time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
|       timeg += time1; | |||
|     } | |||
|     timeg /= loops; /* mean wall-clock seconds per call */ | |||
|     fprintf(stderr, " %10.6f secs\n", timeg); | |||
|   } | |||
|   return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -0,0 +1,190 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdio.h> | |||
| #include <stdlib.h> | |||
| #ifdef __CYGWIN32__ | |||
| #include <sys/time.h> | |||
| #endif | |||
| #include "common.h" | |||
| #undef NRM2 /* discard any earlier NRM2 definition so the choice below is the one in effect */ | |||
| #ifdef COMPLEX | |||
| #ifdef DOUBLE | |||
| #define NRM2 BLASFUNC(dznrm2) /* COMPLEX and DOUBLE both defined */ | |||
| #else | |||
| #define NRM2 BLASFUNC(scnrm2) /* COMPLEX defined, DOUBLE not */ | |||
| #endif | |||
| #else | |||
| #ifdef DOUBLE | |||
| #define NRM2 BLASFUNC(dnrm2) /* DOUBLE defined, COMPLEX not */ | |||
| #else | |||
| #define NRM2 BLASFUNC(snrm2) /* neither COMPLEX nor DOUBLE defined */ | |||
| #endif | |||
| #endif | |||
| #if defined(__WIN32__) || defined(__WIN64__) | |||
| #ifndef DELTA_EPOCH_IN_MICROSECS | |||
| #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL | |||
| #endif | |||
| /* Windows stand-in for gettimeofday(), built only when __WIN32__ or __WIN64__ is defined. | |||
| /* tv : receives the current time as seconds/microseconds since the Unix epoch; may be NULL. | |||
| /* tz : accepted for signature compatibility and ignored. | |||
| /* Always returns 0. */ | |||
| int gettimeofday(struct timeval *tv, void *tz){ | |||
|   FILETIME ft; | |||
|   unsigned __int64 tmpres = 0; | |||
|   (void)tz; /* time-zone output is not supported by this shim */ | |||
|   if (NULL != tv) | |||
|   { | |||
|     GetSystemTimeAsFileTime(&ft); | |||
|     /* assemble the 64-bit FILETIME value from its two 32-bit halves */ | |||
|     tmpres |= ft.dwHighDateTime; | |||
|     tmpres <<= 32; | |||
|     tmpres |= ft.dwLowDateTime; | |||
|     /* FILETIME counts 100 ns ticks: divide by 10 for microseconds, then shift to the Unix epoch */ | |||
|     tmpres /= 10; | |||
|     tmpres -= DELTA_EPOCH_IN_MICROSECS; | |||
|     tv->tv_sec = (long)(tmpres / 1000000UL); | |||
|     tv->tv_usec = (long)(tmpres % 1000000UL); | |||
|   } | |||
|   return 0; | |||
| } | |||
| #endif | |||
| #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 | |||
| static void *huge_malloc(BLASLONG size){ /* returns memory attached from a SysV shared-memory segment requested with SHM_HUGETLB; exits with status 1 on failure. The enclosing #if ends in "&& 0", so this is currently compiled out. */ | |||
| int shmid; | |||
| void *address; | |||
| #ifndef SHM_HUGETLB | |||
| #define SHM_HUGETLB 04000 /* fallback used only when the included headers do not define it */ | |||
| #endif | |||
| if ((shmid =shmget(IPC_PRIVATE, /* private segment, not shared by key */ | |||
| (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), /* rounds up to a HUGE_PAGESIZE multiple (one extra page when size is already a multiple); assumes HUGE_PAGESIZE is a power of two - TODO confirm */ | |||
| SHM_HUGETLB | IPC_CREAT |0600)) < 0) { | |||
| printf( "Memory allocation failed(shmget).\n"); | |||
| exit(1); | |||
| } | |||
| address = shmat(shmid, NULL, SHM_RND); /* NULL address: the system chooses where to attach */ | |||
| if ((BLASLONG)address == -1){ /* shmat signals failure with (void *)-1 */ | |||
| printf( "Memory allocation failed(shmat).\n"); | |||
| exit(1); | |||
| } | |||
| shmctl(shmid, IPC_RMID, 0); /* mark the segment for removal now; the attached mapping is still returned to the caller */ | |||
| return address; | |||
| } | |||
| #define malloc huge_malloc | |||
| #endif | |||
| /* Benchmark driver for the BLAS Euclidean-norm routine selected by the NRM2 macro. | |||
|    Command line : [from [to [step]]] - vector lengths to time (defaults 1, 200, 1). | |||
|    Environment  : OPENBLAS_LOOPS - timed repetitions per length (default 1); | |||
|                   OPENBLAS_INCX  - stride passed to the routine (default 1). | |||
|    For each length the vector is refilled with values in [-0.5, 0.5], the call is timed | |||
|    with gettimeofday(), and the mean seconds per call are written to stderr. | |||
|    Returns 0; exits with status 1 if the work buffer cannot be allocated. */ | |||
| int main(int argc, char *argv[]){ | |||
|   FLOAT *x; | |||
|   blasint m, i; | |||
|   blasint inc_x=1; | |||
|   int loops = 1; | |||
|   int l; | |||
|   char *p; | |||
|   int from = 1; | |||
|   int to = 200; | |||
|   int step = 1; | |||
|   struct timeval start, stop; | |||
|   double time1,timeg; | |||
|   argc--;argv++; | |||
|   if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
|   if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
|   if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
|   if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
|   /* A step below 1 would never move m past `to`; a loop count below 1 would make the average a division by zero. */ | |||
|   if (step < 1) step = 1; | |||
|   if (loops < 1) loops = 1; | |||
|   /* inc_x is a blasint, which need not be int; cast it for %d just as m is cast below. */ | |||
|   fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,(int)inc_x,loops); | |||
|   if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
| #ifdef linux | |||
|   /* The fill loop draws from rand(), so seed that generator; srandom() only affects random(). */ | |||
|   srand((unsigned int)getpid()); | |||
| #endif | |||
|   fprintf(stderr, " SIZE Time\n"); | |||
|   for(m = from; m <= to; m += step) | |||
|   { | |||
|     timeg=0; | |||
|     fprintf(stderr, " %6d : ", (int)m); | |||
|     for (l=0; l<loops; l++) | |||
|     { | |||
|       /* Refill the input before every repetition; only the library call itself is timed. */ | |||
|       for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
|         x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|       gettimeofday( &start, (struct timezone *)0); | |||
|       NRM2 (&m, x, &inc_x); | |||
|       gettimeofday( &stop, (struct timezone *)0); | |||
|       time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
|       timeg += time1; | |||
|     } | |||
|     timeg /= loops; /* mean wall-clock seconds per call */ | |||
|     fprintf(stderr, " %10.6f secs\n", timeg); | |||
|   } | |||
|   return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -186,8 +186,8 @@ int main(int argc, char *argv[]){ | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6); | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| @@ -189,9 +189,9 @@ int main(int argc, char *argv[]){ | |||
| timeg /= loops; | |||
| #ifdef COMPLEX | |||
| fprintf(stderr, " %10.2f MFlops\n", 6. * (double)m / timeg * 1.e-6); | |||
| fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 6. * (double)m / timeg * 1.e-6, timeg); | |||
| #else | |||
| fprintf(stderr, " %10.2f MFlops\n", 1. * (double)m / timeg * 1.e-6); | |||
| fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 1. * (double)m / timeg * 1.e-6, timeg); | |||
| #endif | |||
| } | |||
| @@ -190,8 +190,8 @@ int main(int argc, char *argv[]){ | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MBytes\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6); | |||
| " %10.2f MBytes %10.6f sec\n", | |||
| COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| @@ -191,8 +191,8 @@ int main(int argc, char *argv[]){ | |||
| gettimeofday( &start, (struct timezone *)0); | |||
| fprintf(stderr, | |||
| " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6); | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6, time1); | |||
| } | |||
| @@ -184,8 +184,8 @@ int main(int argc, char *argv[]){ | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6); | |||
| " %10.2f MFlops %10.6f sec\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m / timeg * 1.e-6, timeg); | |||
| } | |||
| @@ -58,43 +58,43 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| str TMPF, [Y], #SZ | |||
| #else | |||
| #if !defined(DOUBLE) | |||
| ld1 {v0.2s}, [X], #8 | |||
| st1 {v0.2s}, [Y], #8 | |||
| ldr d0, [X], #8 | |||
| str d0, [Y], #8 | |||
| #else | |||
| ld1 {v0.2d}, [X], #16 | |||
| st1 {v0.2d}, [Y], #16 | |||
| ldr q0, [X], #16 | |||
| str q0, [Y], #16 | |||
| #endif | |||
| #endif | |||
| .endm | |||
| .macro KERNEL_F4 | |||
| #if !defined(COMPLEX) | |||
| #if !defined(DOUBLE) | |||
| ld1 {v0.4s}, [X], #16 | |||
| st1 {v0.4s}, [Y], #16 | |||
| ldr q0, [X], #16 | |||
| str q0, [Y], #16 | |||
| #else // DOUBLE | |||
| ld1 {v0.4s}, [X], #16 | |||
| ld1 {v1.4s}, [X], #16 | |||
| st1 {v0.4s}, [Y], #16 | |||
| st1 {v1.4s}, [Y], #16 | |||
| ldr q0, [X], #16 | |||
| str q0, [Y], #16 | |||
| ldr q1, [X], #16 | |||
| str q1, [Y], #16 | |||
| #endif | |||
| #else // COMPLEX | |||
| #if !defined(DOUBLE) | |||
| ld1 {v0.4s}, [X], #16 | |||
| ld1 {v1.4s}, [X], #16 | |||
| st1 {v0.4s}, [Y], #16 | |||
| st1 {v1.4s}, [Y], #16 | |||
| ldr q0, [X], #16 | |||
| str q0, [Y], #16 | |||
| ldr q1, [X], #16 | |||
| str q1, [Y], #16 | |||
| #else // DOUBLE | |||
| ld1 {v0.4s}, [X], #16 | |||
| ld1 {v1.4s}, [X], #16 | |||
| ld1 {v2.4s}, [X], #16 | |||
| ld1 {v3.4s}, [X], #16 | |||
| st1 {v0.4s}, [Y], #16 | |||
| st1 {v1.4s}, [Y], #16 | |||
| st1 {v2.4s}, [Y], #16 | |||
| st1 {v3.4s}, [Y], #16 | |||
| ldr q0, [X], #16 | |||
| str q0, [Y], #16 | |||
| ldr q1, [X], #16 | |||
| str q1, [Y], #16 | |||
| ldr q2, [X], #16 | |||
| str q2, [Y], #16 | |||
| ldr q3, [X], #16 | |||
| str q3, [Y], #16 | |||
| #endif | |||
| #endif | |||
| @@ -46,19 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define pCRow0 x12 | |||
| #define pCRow1 x13 | |||
| #define pCRow2 x14 | |||
| #define pA x15 | |||
| #define temp x16 | |||
| #define tempOffset x17 | |||
| #define tempK x18 | |||
| #define pCRow3 x15 /* NOTE(review): pA, temp, tempOffset and tempK are defined twice in this listing (x15-x18 above, x16-x20 below); presumably the rendered diff shows removed and added lines together - confirm which set is live */ | |||
| #define pA x16 | |||
| #define alpha x17 /* general-purpose copy of alpha; SAVE8x4 moves it into alpha0 with fmov */ | |||
| #define temp x18 /* NOTE(review): x18 is the AAPCS64 platform register and is reserved on some operating systems - confirm it is free on every supported target */ | |||
| #define tempOffset x19 /* NOTE(review): x19 and x20 are callee-saved under AAPCS64; the save/restore code is outside this view - verify */ | |||
| #define tempK x20 | |||
| #define alpha0 d10 | |||
| #define alphaV0 v10.d[0] /* lane 0 of v10, the same register as alpha0, for by-element fmul */ | |||
| #define alpha1 d11 | |||
| #define alphaV1 v11.d[0] | |||
| #define alpha2 d14 | |||
| #define alphaV2 v14.d[0] | |||
| #define alpha3 d15 | |||
| #define alphaV3 v15.d[0] | |||
| #define A_PRE_SIZE 2560 /* byte offset added to pA in the prfm prefetches */ | |||
| #define B_PRE_SIZE 448 /* byte offset added to pB in the prfm prefetches */ | |||
| #define C_PRE_SIZE 128 /* byte offset added to the pCRow pointers in the prfm prefetches */ | |||
| // 00 origM | |||
| // 01 origN | |||
| @@ -101,14 +101,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //v05 pA1_2, pA1_3 | |||
| //v06 pA1_4, pA1_5 | |||
| //v07 pA1_6, pA1_7 | |||
| //v08 must save pB0_0, pB0_1 | |||
| //v09 must save pB0_2, pB0_3 | |||
| //v10 must save ALPHA0 | |||
| //v11 must save ALPHA1 | |||
| //v12 must save pB1_0, pB1_1 | |||
| //v13 must save pB1_2, pB1_3 | |||
| //v14 must save ALPHA2 | |||
| //v15 must save ALPHA3 | |||
| //v08 must save pB0_0 | |||
| //v09 must save pB0_1 | |||
| //v10 must save pB0_2 --> ALPHA0 | |||
| //v11 must save pB0_3 | |||
| //v12 must save pB1_0 | |||
| //v13 must save pB1_1 | |||
| //v14 must save pB1_2 | |||
| //v15 must save pB1_3 | |||
| //v16 must save C00, C01 | |||
| //v17 must save C02, C03 | |||
| //v18 C04, C05 | |||
| @@ -150,186 +150,249 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x4_I | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld1 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld1 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ldp q0, q1, [pA], #32 | |||
| ldp d8, d9, [pB], #16 | |||
| fmul v16.2d, v0.2d, v8.d[0] | |||
| fmul v20.2d, v0.2d, v9.d[0] | |||
| ldp d10, d11, [pB], #16 | |||
| fmul v17.2d, v1.2d, v8.d[0] | |||
| fmul v21.2d, v1.2d, v9.d[0] | |||
| ldp q2, q3, [pA], #32 | |||
| fmul v24.2d, v0.2d, v10.d[0] | |||
| fmul v28.2d, v0.2d, v11.d[0] | |||
| ldp q4, q5, [pA], #32 | |||
| fmul v25.2d, v1.2d, v10.d[0] | |||
| fmul v29.2d, v1.2d, v11.d[0] | |||
| ldp d12, d13, [pB], #16 | |||
| fmul v18.2d, v2.2d, v8.d[0] | |||
| fmul v19.2d, v3.2d, v8.d[0] | |||
| fmul v22.2d, v2.2d, v9.d[0] | |||
| fmul v20.2d, v0.2d, v8.d[1] | |||
| fmul v21.2d, v1.2d, v8.d[1] | |||
| fmul v22.2d, v2.2d, v8.d[1] | |||
| fmul v23.2d, v3.2d, v8.d[1] | |||
| ldp d14, d15, [pB], #16 | |||
| fmul v24.2d, v0.2d, v9.d[0] | |||
| fmul v25.2d, v1.2d, v9.d[0] | |||
| fmul v26.2d, v2.2d, v9.d[0] | |||
| fmul v27.2d, v3.2d, v9.d[0] | |||
| fmul v26.2d, v2.2d, v10.d[0] | |||
| fmul v30.2d, v2.2d, v11.d[0] | |||
| fmul v28.2d, v0.2d, v9.d[1] | |||
| fmul v29.2d, v1.2d, v9.d[1] | |||
| fmul v30.2d, v2.2d, v9.d[1] | |||
| fmul v31.2d, v3.2d, v9.d[1] | |||
| ldp q6, q7, [pA], #32 | |||
| ld1 {v4.2d, v5.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld1 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld1 {v6.2d, v7.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmul v19.2d, v3.2d, v8.d[0] | |||
| fmul v27.2d, v3.2d, v10.d[0] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmul v31.2d, v3.2d, v11.d[0] | |||
| fmul v23.2d, v3.2d, v9.d[0] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNEL8x4_M1 | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v9.d[0] | |||
| ldp q4, q5, [pA], #32 | |||
| fmla v24.2d, v0.2d, v10.d[0] | |||
| fmla v28.2d, v0.2d, v11.d[0] | |||
| ldp d12, d13, [pB], #16 | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| fmla v18.2d, v2.2d, v8.d[0] | |||
| fmla v19.2d, v3.2d, v8.d[0] | |||
| fmla v25.2d, v1.2d, v10.d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v21.2d, v1.2d, v8.d[1] | |||
| fmla v22.2d, v2.2d, v8.d[1] | |||
| fmla v23.2d, v3.2d, v8.d[1] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| fmla v24.2d, v0.2d, v9.d[0] | |||
| fmla v25.2d, v1.2d, v9.d[0] | |||
| fmla v26.2d, v2.2d, v9.d[0] | |||
| fmla v27.2d, v3.2d, v9.d[0] | |||
| fmla v21.2d, v1.2d, v9.d[0] | |||
| fmla v29.2d, v1.2d, v11.d[0] | |||
| fmla v28.2d, v0.2d, v9.d[1] | |||
| fmla v29.2d, v1.2d, v9.d[1] | |||
| fmla v30.2d, v2.2d, v9.d[1] | |||
| fmla v31.2d, v3.2d, v9.d[1] | |||
| ldp d14, d15, [pB], #16 | |||
| ld1 {v4.2d, v5.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld1 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld1 {v6.2d, v7.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmla v18.2d, v2.2d, v8.d[0] | |||
| fmla v22.2d, v2.2d, v9.d[0] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla v26.2d, v2.2d, v10.d[0] | |||
| fmla v30.2d, v2.2d, v11.d[0] | |||
| fmla v19.2d, v3.2d, v8.d[0] | |||
| fmla v23.2d, v3.2d, v9.d[0] | |||
| ldp q6, q7, [pA], #32 | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| fmla v27.2d, v3.2d, v10.d[0] | |||
| fmla v31.2d, v3.2d, v11.d[0] | |||
| .endm | |||
| .macro KERNEL8x4_M2 | |||
| fmla v16.2d, v4.2d, v12.d[0] | |||
| fmla v20.2d, v4.2d, v13.d[0] | |||
| fmla v24.2d, v4.2d, v14.d[0] | |||
| fmla v28.2d, v4.2d, v15.d[0] | |||
| ldp q0, q1, [pA], #32 | |||
| fmla v17.2d, v5.2d, v12.d[0] | |||
| fmla v25.2d, v5.2d, v14.d[0] | |||
| ldp d8, d9, [pB], #16 | |||
| fmla v21.2d, v5.2d, v13.d[0] | |||
| fmla v29.2d, v5.2d, v15.d[0] | |||
| ldp d10, d11, [pB], #16 | |||
| fmla v18.2d, v6.2d, v12.d[0] | |||
| fmla v19.2d, v7.2d, v12.d[0] | |||
| fmla v22.2d, v6.2d, v13.d[0] | |||
| fmla v20.2d, v4.2d, v12.d[1] | |||
| fmla v21.2d, v5.2d, v12.d[1] | |||
| fmla v22.2d, v6.2d, v12.d[1] | |||
| fmla v23.2d, v7.2d, v12.d[1] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla v24.2d, v4.2d, v13.d[0] | |||
| fmla v25.2d, v5.2d, v13.d[0] | |||
| fmla v26.2d, v6.2d, v13.d[0] | |||
| fmla v27.2d, v7.2d, v13.d[0] | |||
| fmla v26.2d, v6.2d, v14.d[0] | |||
| fmla v30.2d, v6.2d, v15.d[0] | |||
| fmla v28.2d, v4.2d, v13.d[1] | |||
| fmla v29.2d, v5.2d, v13.d[1] | |||
| fmla v30.2d, v6.2d, v13.d[1] | |||
| fmla v31.2d, v7.2d, v13.d[1] | |||
| fmla v19.2d, v7.2d, v12.d[0] | |||
| fmla v23.2d, v7.2d, v13.d[0] | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld1 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld1 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ldp q2, q3, [pA], #32 | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| fmla v27.2d, v7.2d, v14.d[0] | |||
| fmla v31.2d, v7.2d, v15.d[0] | |||
| .endm | |||
| .macro KERNEL8x4_E | |||
| fmla v16.2d, v4.2d, v12.d[0] | |||
| fmla v20.2d, v4.2d, v13.d[0] | |||
| fmla v24.2d, v4.2d, v14.d[0] | |||
| fmla v28.2d, v4.2d, v15.d[0] | |||
| fmla v17.2d, v5.2d, v12.d[0] | |||
| fmla v18.2d, v6.2d, v12.d[0] | |||
| fmla v19.2d, v7.2d, v12.d[0] | |||
| fmla v25.2d, v5.2d, v14.d[0] | |||
| fmla v21.2d, v5.2d, v13.d[0] | |||
| fmla v29.2d, v5.2d, v15.d[0] | |||
| fmla v20.2d, v4.2d, v12.d[1] | |||
| fmla v21.2d, v5.2d, v12.d[1] | |||
| fmla v22.2d, v6.2d, v12.d[1] | |||
| fmla v23.2d, v7.2d, v12.d[1] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla v24.2d, v4.2d, v13.d[0] | |||
| fmla v25.2d, v5.2d, v13.d[0] | |||
| fmla v26.2d, v6.2d, v13.d[0] | |||
| fmla v27.2d, v7.2d, v13.d[0] | |||
| fmla v18.2d, v6.2d, v12.d[0] | |||
| fmla v22.2d, v6.2d, v13.d[0] | |||
| fmla v26.2d, v6.2d, v14.d[0] | |||
| fmla v30.2d, v6.2d, v15.d[0] | |||
| fmla v28.2d, v4.2d, v13.d[1] | |||
| fmla v29.2d, v5.2d, v13.d[1] | |||
| fmla v30.2d, v6.2d, v13.d[1] | |||
| fmla v31.2d, v7.2d, v13.d[1] | |||
| fmla v19.2d, v7.2d, v12.d[0] | |||
| fmla v23.2d, v7.2d, v13.d[0] | |||
| fmla v27.2d, v7.2d, v14.d[0] | |||
| fmla v31.2d, v7.2d, v15.d[0] | |||
| .endm | |||
| .macro KERNEL8x4_SUB | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld1 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld1 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ldp q0, q1, [pA], #32 | |||
| ldp d8, d9, [pB], #16 | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v9.d[0] | |||
| ldp d10, d11, [pB], #16 | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| fmla v21.2d, v1.2d, v9.d[0] | |||
| ldp q2, q3, [pA], #32 | |||
| fmla v24.2d, v0.2d, v10.d[0] | |||
| fmla v28.2d, v0.2d, v11.d[0] | |||
| fmla v25.2d, v1.2d, v10.d[0] | |||
| fmla v29.2d, v1.2d, v11.d[0] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla v18.2d, v2.2d, v8.d[0] | |||
| fmla v19.2d, v3.2d, v8.d[0] | |||
| fmla v22.2d, v2.2d, v9.d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v21.2d, v1.2d, v8.d[1] | |||
| fmla v22.2d, v2.2d, v8.d[1] | |||
| fmla v23.2d, v3.2d, v8.d[1] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| fmla v24.2d, v0.2d, v9.d[0] | |||
| fmla v25.2d, v1.2d, v9.d[0] | |||
| fmla v26.2d, v2.2d, v9.d[0] | |||
| fmla v27.2d, v3.2d, v9.d[0] | |||
| fmla v26.2d, v2.2d, v10.d[0] | |||
| fmla v30.2d, v2.2d, v11.d[0] | |||
| fmla v28.2d, v0.2d, v9.d[1] | |||
| fmla v29.2d, v1.2d, v9.d[1] | |||
| fmla v30.2d, v2.2d, v9.d[1] | |||
| fmla v31.2d, v3.2d, v9.d[1] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla v19.2d, v3.2d, v8.d[0] | |||
| fmla v27.2d, v3.2d, v10.d[0] | |||
| fmla v31.2d, v3.2d, v11.d[0] | |||
| fmla v23.2d, v3.2d, v9.d[0] | |||
| .endm | |||
| .macro SAVE8x4 | |||
| add pCRow1, pCRow0, LDC | |||
| fmov alpha0, alpha | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| fmul v0.2d, v16.2d, alphaV0 | |||
| fmul v1.2d, v17.2d, alphaV1 | |||
| fmul v2.2d, v18.2d, alphaV2 | |||
| fmul v3.2d, v19.2d, alphaV3 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | |||
| fmul v1.2d, v17.2d, alphaV0 | |||
| stp q0, q1, [pCRow0] | |||
| add pCRow2, pCRow1, LDC | |||
| add pCRow0, pCRow0, #32 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| fmul v2.2d, v18.2d, alphaV0 | |||
| fmul v3.2d, v19.2d, alphaV0 | |||
| stp q2, q3, [pCRow0] | |||
| add pCRow0, pCRow0, #32 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| fmul v4.2d, v20.2d, alphaV0 | |||
| fmul v5.2d, v21.2d, alphaV1 | |||
| fmul v6.2d, v22.2d, alphaV2 | |||
| fmul v7.2d, v23.2d, alphaV3 | |||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | |||
| fmul v5.2d, v21.2d, alphaV0 | |||
| stp q4, q5, [pCRow1] | |||
| add pCRow1, pCRow2, LDC | |||
| add pCRow1, pCRow1, #32 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| fmul v6.2d, v22.2d, alphaV0 | |||
| fmul v7.2d, v23.2d, alphaV0 | |||
| stp q6, q7, [pCRow1] | |||
| add pCRow1, pCRow1, #32 | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| fmul v0.2d, v24.2d, alphaV0 | |||
| fmul v1.2d, v25.2d, alphaV1 | |||
| fmul v2.2d, v26.2d, alphaV2 | |||
| fmul v3.2d, v27.2d, alphaV3 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow2] | |||
| fmul v1.2d, v25.2d, alphaV0 | |||
| stp q0, q1, [pCRow2] | |||
| add pCRow2, pCRow2, #32 | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| fmul v2.2d, v26.2d, alphaV0 | |||
| fmul v3.2d, v27.2d, alphaV0 | |||
| stp q2, q3, [pCRow2] | |||
| add pCRow2, pCRow2, #32 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| fmul v4.2d, v28.2d, alphaV0 | |||
| fmul v5.2d, v29.2d, alphaV1 | |||
| fmul v6.2d, v30.2d, alphaV2 | |||
| fmul v7.2d, v31.2d, alphaV3 | |||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | |||
| fmul v5.2d, v29.2d, alphaV0 | |||
| stp q4, q5, [pCRow3] | |||
| add pCRow0, pCRow0, #64 | |||
| add pCRow3, pCRow3, #32 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| fmul v6.2d, v30.2d, alphaV0 | |||
| fmul v7.2d, v31.2d, alphaV0 | |||
| stp q6, q7, [pCRow3] | |||
| add pCRow3, pCRow3, #32 | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -365,26 +428,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x4 | |||
| fmov alpha0, alpha | |||
| fmul v8.2d, v16.2d, alphaV0 | |||
| fmul v9.2d, v17.2d, alphaV1 | |||
| fmul v9.2d, v17.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow0] | |||
| add pCRow1, pCRow0, LDC | |||
| fmul v12.2d, v20.2d, alphaV2 | |||
| fmul v13.2d, v21.2d, alphaV3 | |||
| fmul v12.2d, v20.2d, alphaV0 | |||
| fmul v13.2d, v21.2d, alphaV0 | |||
| st1 {v12.2d, v13.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, LDC | |||
| fmul v8.2d, v24.2d, alphaV0 | |||
| fmul v9.2d, v25.2d, alphaV1 | |||
| fmul v9.2d, v25.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow2] | |||
| add pCRow1, pCRow2, LDC | |||
| fmul v12.2d, v28.2d, alphaV2 | |||
| fmul v13.2d, v29.2d, alphaV3 | |||
| fmul v12.2d, v28.2d, alphaV0 | |||
| fmul v13.2d, v29.2d, alphaV0 | |||
| st1 {v12.2d, v13.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -413,22 +477,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x4 | |||
| fmov alpha0, alpha | |||
| fmul v8.2d, v16.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow0] | |||
| add pCRow1, pCRow0, LDC | |||
| fmul v12.2d, v20.2d, alphaV1 | |||
| fmul v12.2d, v20.2d, alphaV0 | |||
| st1 {v12.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, LDC | |||
| fmul v8.2d, v24.2d, alphaV2 | |||
| fmul v8.2d, v24.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow2] | |||
| add pCRow1, pCRow2, LDC | |||
| fmul v12.2d, v28.2d, alphaV3 | |||
| fmul v12.2d, v28.2d, alphaV0 | |||
| st1 {v12.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -453,6 +518,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x4 | |||
| fmov alpha0, alpha | |||
| add pCRow1, pCRow0, LDC | |||
| fmul v8.2d, v16.2d, alphaV0 | |||
| @@ -462,7 +529,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add pCRow2, pCRow1, LDC | |||
| add pCRow1, pCRow2, LDC | |||
| fmul v12.2d, v20.2d, alphaV1 | |||
| fmul v12.2d, v20.2d, alphaV0 | |||
| st1 {v12.d}[0], [pCRow2] | |||
| st1 {v12.d}[1], [pCRow1] | |||
| @@ -502,18 +569,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE8x2 | |||
| fmov alpha0, alpha | |||
| add pCRow1, pCRow0, LDC | |||
| fmul v0.2d, v16.2d, alphaV0 | |||
| fmul v1.2d, v17.2d, alphaV1 | |||
| fmul v2.2d, v18.2d, alphaV2 | |||
| fmul v3.2d, v19.2d, alphaV3 | |||
| fmul v1.2d, v17.2d, alphaV0 | |||
| fmul v2.2d, v18.2d, alphaV0 | |||
| fmul v3.2d, v19.2d, alphaV0 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | |||
| fmul v4.2d, v20.2d, alphaV0 | |||
| fmul v5.2d, v21.2d, alphaV1 | |||
| fmul v6.2d, v22.2d, alphaV2 | |||
| fmul v7.2d, v23.2d, alphaV3 | |||
| fmul v5.2d, v21.2d, alphaV0 | |||
| fmul v6.2d, v22.2d, alphaV0 | |||
| fmul v7.2d, v23.2d, alphaV0 | |||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #64 | |||
| @@ -541,14 +609,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x2 | |||
| fmov alpha0, alpha | |||
| fmul v8.2d, v16.2d, alphaV0 | |||
| fmul v9.2d, v17.2d, alphaV1 | |||
| fmul v9.2d, v17.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow0] | |||
| add pCRow1, pCRow0, LDC | |||
| fmul v12.2d, v20.2d, alphaV2 | |||
| fmul v13.2d, v21.2d, alphaV3 | |||
| fmul v12.2d, v20.2d, alphaV0 | |||
| fmul v13.2d, v21.2d, alphaV0 | |||
| st1 {v12.2d, v13.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -573,12 +642,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x2 | |||
| fmov alpha0, alpha | |||
| fmul v8.2d, v16.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow0] | |||
| add pCRow1 , pCRow0, LDC | |||
| fmul v12.2d, v20.2d, alphaV1 | |||
| fmul v12.2d, v20.2d, alphaV0 | |||
| st1 {v12.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -601,6 +671,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x2 | |||
| fmov alpha0, alpha | |||
| add pCRow1 , pCRow0, LDC | |||
| fmul v8.2d, v16.2d, alphaV0 | |||
| @@ -636,10 +707,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE8x1 | |||
| fmov alpha0, alpha | |||
| fmul v0.2d, v16.2d, alphaV0 | |||
| fmul v1.2d, v17.2d, alphaV1 | |||
| fmul v2.2d, v18.2d, alphaV2 | |||
| fmul v3.2d, v19.2d, alphaV3 | |||
| fmul v1.2d, v17.2d, alphaV0 | |||
| fmul v2.2d, v18.2d, alphaV0 | |||
| fmul v3.2d, v19.2d, alphaV0 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | |||
| add pCRow0, pCRow0, #64 | |||
| @@ -665,8 +737,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x1 | |||
| fmov alpha0, alpha | |||
| fmul v8.2d, v16.2d, alphaV0 | |||
| fmul v9.2d, v17.2d, alphaV1 | |||
| fmul v9.2d, v17.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow0] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -690,6 +763,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x1 | |||
| fmov alpha0, alpha | |||
| fmul v8.2d, v16.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow0] | |||
| @@ -713,6 +787,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x1 | |||
| fmov alpha0, alpha | |||
| fmul d8, d16, alpha0 | |||
| str d8, [pCRow0] | |||
| @@ -739,10 +814,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| fmov alpha0, d0 | |||
| fmov alpha1, d0 | |||
| fmov alpha2, d0 | |||
| fmov alpha3, d0 | |||
| prfm PLDL1KEEP, [origPB] | |||
| prfm PLDL1KEEP, [origPA] | |||
| fmov alpha, d0 | |||
| lsl LDC, LDC, #3 // ldc = ldc * 8 | |||
| @@ -759,8 +834,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /******************************************************************************/ | |||
| dtrmm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC // pCRow0 = C | |||
| add pC, pC, LDC, lsl #2 | |||
| mov pCRow0, pC | |||
| add pCRow1, pCRow0, LDC | |||
| add pCRow2, pCRow1, LDC | |||
| add pCRow3, pCRow2, LDC | |||
| add pC, pCRow3, LDC | |||
| #if defined(LEFT) | |||
| mov tempOffset, offset | |||
| @@ -774,6 +854,7 @@ dtrmm_kernel_L4_M8_BEGIN: | |||
| cmp counterI, #0 | |||
| ble dtrmm_kernel_L4_M4_BEGIN | |||
| .align 5 | |||
| dtrmm_kernel_L4_M8_20: | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| @@ -794,40 +875,64 @@ dtrmm_kernel_L4_M8_20: | |||
| add tempK, tempOffset, #4 | |||
| #endif | |||
| asr counterL , tempK, #1 // L = K / 2 | |||
| asr counterL , tempK, #3 // L = K / 8 | |||
| cmp counterL , #2 // is there at least 16 to do? | |||
| blt dtrmm_kernel_L4_M8_32 | |||
| KERNEL8x4_I // do one in the K | |||
| KERNEL8x4_M2 // do another in the K | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| subs counterL, counterL, #2 // subtract 2 | |||
| ble dtrmm_kernel_L4_M8_22a | |||
| .align 5 | |||
| .align 5 | |||
| dtrmm_kernel_L4_M8_22: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt dtrmm_kernel_L4_M8_22 | |||
| .align 5 | |||
| dtrmm_kernel_L4_M8_22a: | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_E | |||
| b dtrmm_kernel_L4_M8_44 | |||
| .align 5 | |||
| dtrmm_kernel_L4_M8_32: | |||
| tst counterL, #1 | |||
| ble dtrmm_kernel_L4_M8_40 | |||
| KERNEL8x4_I | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_M2 | |||
| KERNEL8x4_M1 | |||
| KERNEL8x4_E | |||
| b dtrmm_kernel_L4_M8_44 | |||
| @@ -838,13 +943,17 @@ dtrmm_kernel_L4_M8_40: | |||
| dtrmm_kernel_L4_M8_44: | |||
| ands counterL , tempK, #1 | |||
| ands counterL , tempK, #7 | |||
| ble dtrmm_kernel_L4_M8_100 | |||
| .align 5 | |||
| dtrmm_kernel_L4_M8_46: | |||
| KERNEL8x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bne dtrmm_kernel_L4_M8_46 | |||
| dtrmm_kernel_L4_M8_100: | |||
| SAVE8x4 | |||
| @@ -864,6 +973,9 @@ dtrmm_kernel_L4_M8_100: | |||
| #if defined(LEFT) | |||
| add tempOffset, tempOffset, #8 | |||
| #endif | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| dtrmm_kernel_L4_M8_END: | |||
| subs counterI, counterI, #1 | |||
| @@ -68,6 +68,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define SHZ 3 | |||
| #endif | |||
| #define A_PRE_SIZE 768 | |||
| #define Y_PRE_SIZE 768 | |||
| /******************************************************************************/ | |||
| .macro SAVE_REGS | |||
| @@ -105,36 +108,42 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v2.4s, v3.4s}, [A_PTR], #32 | |||
| ld1 {v4.4s, v5.4s}, [Y_IPTR], #32 | |||
| fmla v4.4s, v1.4s, v2.4s | |||
| prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] | |||
| fmla v5.4s, v1.4s, v3.4s | |||
| st1 {v4.4s, v5.4s}, [Y_OPTR], #32 | |||
| ld1 {v6.4s, v7.4s}, [A_PTR], #32 | |||
| ld1 {v8.4s, v9.4s}, [Y_IPTR], #32 | |||
| fmla v8.4s, v1.4s, v6.4s | |||
| prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] | |||
| fmla v9.4s, v1.4s, v7.4s | |||
| st1 {v8.4s, v9.4s}, [Y_OPTR], #32 | |||
| #else //DOUBLE | |||
| ld1 {v2.2d, v3.2d}, [A_PTR], #32 | |||
| ld1 {v4.2d, v5.2d}, [Y_IPTR], #32 | |||
| fmla v4.2d, v1.2d, v2.2d | |||
| prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] | |||
| fmla v5.2d, v1.2d, v3.2d | |||
| st1 {v4.2d, v5.2d}, [Y_OPTR], #32 | |||
| ld1 {v6.2d, v7.2d}, [A_PTR], #32 | |||
| ld1 {v8.2d, v9.2d}, [Y_IPTR], #32 | |||
| fmla v8.2d, v1.2d, v6.2d | |||
| prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] | |||
| fmla v9.2d, v1.2d, v7.2d | |||
| st1 {v8.2d, v9.2d}, [Y_OPTR], #32 | |||
| ld1 {v10.2d, v11.2d}, [A_PTR], #32 | |||
| ld1 {v12.2d, v13.2d}, [Y_IPTR], #32 | |||
| fmla v12.2d, v1.2d, v10.2d | |||
| prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] | |||
| fmla v13.2d, v1.2d, v11.2d | |||
| st1 {v12.2d, v13.2d}, [Y_OPTR], #32 | |||
| ld1 {v14.2d, v15.2d}, [A_PTR], #32 | |||
| ld1 {v16.2d, v17.2d}, [Y_IPTR], #32 | |||
| fmla v16.2d, v1.2d, v14.2d | |||
| prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] | |||
| fmla v17.2d, v1.2d, v15.2d | |||
| st1 {v16.2d, v17.2d}, [Y_OPTR], #32 | |||
| #endif | |||
| @@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define J x11 /* loop variable */ | |||
| #define I x12 /* loop variable */ | |||
| #define X_PREFETCH_SIZE 768 | |||
| #define A_PREFETCH_SIZE 768 | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
| @@ -112,42 +115,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v5.4s, v6.4s, v7.4s, v8.4s}, [A_PTR], #64 | |||
| ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [X_PTR], #64 | |||
| fmla v1.4s, v5.4s, v9.4s | |||
| prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] | |||
| fmla v2.4s, v6.4s, v10.4s | |||
| prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] | |||
| fmla v3.4s, v7.4s, v11.4s | |||
| ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64 | |||
| fmla v4.4s, v8.4s, v12.4s | |||
| ld1 {v13.4s, v14.4s, v15.4s, v16.4s}, [A_PTR], #64 | |||
| ld1 {v17.4s, v18.4s, v19.4s, v20.4s}, [X_PTR], #64 | |||
| fmla v1.4s, v13.4s, v17.4s | |||
| prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] | |||
| fmla v2.4s, v14.4s, v18.4s | |||
| prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] | |||
| fmla v3.4s, v15.4s, v19.4s | |||
| fmla v4.4s, v16.4s, v20.4s | |||
| #else | |||
| ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64 | |||
| ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64 | |||
| fmla v1.2d, v5.2d, v9.2d | |||
| prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] | |||
| fmla v2.2d, v6.2d, v10.2d | |||
| prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] | |||
| fmla v3.2d, v7.2d, v11.2d | |||
| fmla v4.2d, v8.2d, v12.2d | |||
| ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64 | |||
| ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64 | |||
| fmla v1.2d, v13.2d, v17.2d | |||
| prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] | |||
| fmla v2.2d, v14.2d, v18.2d | |||
| prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] | |||
| fmla v3.2d, v15.2d, v19.2d | |||
| fmla v4.2d, v16.2d, v20.2d | |||
| ld1 {v5.2d, v6.2d, v7.2d, v8.2d}, [A_PTR], #64 | |||
| ld1 {v9.2d, v10.2d, v11.2d, v12.2d}, [X_PTR], #64 | |||
| fmla v1.2d, v5.2d, v9.2d | |||
| prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] | |||
| fmla v2.2d, v6.2d, v10.2d | |||
| prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] | |||
| fmla v3.2d, v7.2d, v11.2d | |||
| fmla v4.2d, v8.2d, v12.2d | |||
| ld1 {v13.2d, v14.2d, v15.2d, v16.2d}, [A_PTR], #64 | |||
| ld1 {v17.2d, v18.2d, v19.2d, v20.2d}, [X_PTR], #64 | |||
| fmla v1.2d, v13.2d, v17.2d | |||
| prfm PLDL1KEEP, [A_PTR, #A_PREFETCH_SIZE] | |||
| fmla v2.2d, v14.2d, v18.2d | |||
| prfm PLDL1KEEP, [X_PTR, #X_PREFETCH_SIZE] | |||
| fmla v3.2d, v15.2d, v19.2d | |||
| fmla v4.2d, v16.2d, v20.2d | |||
| #endif | |||
| @@ -72,6 +72,148 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| fabs MAXF, MAXF | |||
| .endm | |||
| .macro KERNEL_F8 | |||
| #if !defined(DOUBLE) | |||
| ldp q2, q3, [X], #32 | |||
| fabs v2.4s, v2.4s | |||
| fabs v3.4s, v3.4s | |||
| fmax v2.4s, v2.4s, v3.4s | |||
| fmaxv TMPF, v2.4s | |||
| fcmp MAXF, TMPF | |||
| fcsel MAXF, MAXF, TMPF, COND | |||
| csel INDEX, INDEX, Z, COND | |||
| add Z, Z, #8 | |||
| #else | |||
| ldp q2, q3, [X], #32 | |||
| ldp q4, q5, [X], #32 | |||
| fabs v2.2d, v2.2d | |||
| fabs v3.2d, v3.2d | |||
| fabs v4.2d, v4.2d | |||
| fabs v5.2d, v5.2d | |||
| fmax v2.2d, v2.2d, v3.2d | |||
| fmax v4.2d, v4.2d, v5.2d | |||
| fmax v2.2d, v2.2d, v4.2d | |||
| fmaxp TMPF, v2.2d | |||
| fcmp MAXF, TMPF | |||
| fcsel MAXF, MAXF, TMPF, COND | |||
| csel INDEX, INDEX, Z, COND | |||
| add Z, Z, #8 | |||
| #endif | |||
| PRFM PLDL1KEEP, [X, #1024] | |||
| .endm | |||
| .macro KERNEL_F8_FINALIZE | |||
| sub x6, INDEX, #1 | |||
| #if !defined(DOUBLE) | |||
| lsl x6, x6, #2 | |||
| add x7, x7, x6 | |||
| ldp q2, q3, [x7] | |||
| fabs v2.4s, v2.4s | |||
| fabs v3.4s, v3.4s | |||
| ins v4.s[0], v3.s[0] | |||
| ins v5.s[0], v3.s[1] | |||
| ins v6.s[0], v3.s[2] | |||
| ins v7.s[0], v3.s[3] | |||
| add x6, INDEX, #7 | |||
| fcmp MAXF, s7 | |||
| csel INDEX, x6, INDEX, eq | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s6 | |||
| csel INDEX, x6, INDEX, eq | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s5 | |||
| csel INDEX, x6, INDEX, eq | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s4 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v4.s[0], v2.s[0] | |||
| ins v5.s[0], v2.s[1] | |||
| ins v6.s[0], v2.s[2] | |||
| ins v7.s[0], v2.s[3] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s7 | |||
| csel INDEX, x6, INDEX, eq | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s6 | |||
| csel INDEX, x6, INDEX, eq | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s5 | |||
| csel INDEX, x6, INDEX, eq | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s4 | |||
| csel INDEX, x6, INDEX, eq | |||
| #else | |||
| add x6, x6, #4 | |||
| lsl x6, x6, #3 | |||
| add x7, x7, x6 | |||
| ldp q2, q3, [x7] | |||
| fabs v2.2d, v2.2d | |||
| fabs v3.2d, v3.2d | |||
| ins v4.d[0], v2.d[0] | |||
| ins v5.d[0], v2.d[1] | |||
| ins v6.d[0], v3.d[0] | |||
| ins v7.d[0], v3.d[1] | |||
| add x6, INDEX, #7 | |||
| fcmp MAXF, d7 | |||
| csel INDEX, x6, INDEX, eq | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d6 | |||
| csel INDEX, x6, INDEX, eq | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d5 | |||
| csel INDEX, x6, INDEX, eq | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d4 | |||
| csel INDEX, x6, INDEX, eq | |||
| sub x7, x7, #32 | |||
| ldp q2, q3, [x7] | |||
| fabs v2.2d, v2.2d | |||
| fabs v3.2d, v3.2d | |||
| ins v4.d[0], v2.d[0] | |||
| ins v5.d[0], v2.d[1] | |||
| ins v6.d[0], v3.d[0] | |||
| ins v7.d[0], v3.d[1] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d7 | |||
| csel INDEX, x6, INDEX, eq | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d6 | |||
| csel INDEX, x6, INDEX, eq | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d5 | |||
| csel INDEX, x6, INDEX, eq | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d4 | |||
| csel INDEX, x6, INDEX, eq | |||
| #endif | |||
| .endm | |||
| .macro KERNEL_S1 | |||
| ld1 TMPVF, [X], INC_X | |||
| add Z, Z, #1 | |||
| @@ -92,6 +234,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp INC_X, xzr | |||
| ble iamax_kernel_zero | |||
| cmp INC_X, #1 | |||
| bne iamax_kernel_S_BEGIN | |||
| mov x7, X | |||
| iamax_kernel_F_BEGIN: | |||
| INIT_S | |||
| subs N, N, #1 | |||
| ble iamax_kernel_L999 | |||
| asr I, N, #3 | |||
| cmp I, xzr | |||
| beq iamax_kernel_F1 | |||
| add Z, Z, #1 | |||
| iamax_kernel_F8: | |||
| KERNEL_F8 | |||
| subs I, I, #1 | |||
| bne iamax_kernel_F8 | |||
| KERNEL_F8_FINALIZE | |||
| sub Z, Z, #1 | |||
| iamax_kernel_F1: | |||
| ands I, N, #7 | |||
| ble iamax_kernel_L999 | |||
| iamax_kernel_F10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne iamax_kernel_F10 | |||
| b iamax_kernel_L999 | |||
| iamax_kernel_S_BEGIN: | |||
| INIT_S | |||
| subs N, N, #1 | |||
| @@ -78,6 +78,179 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| .endm | |||
| .macro KERNEL_F8 | |||
| #if !defined(DOUBLE) | |||
| ldp q2, q3, [X], #32 | |||
| ldp q4, q5, [X], #32 | |||
| fabs v2.4s, v2.4s | |||
| fabs v3.4s, v3.4s | |||
| fabs v4.4s, v4.4s | |||
| fabs v5.4s, v5.4s | |||
| faddp v2.4s, v2.4s, v3.4s | |||
| faddp v3.4s, v4.4s, v5.4s | |||
| fmax v2.4s, v2.4s, v3.4s | |||
| fmaxv TMPF, v2.4s | |||
| fcmp MAXF, TMPF | |||
| fcsel MAXF, MAXF, TMPF, COND | |||
| csel INDEX, INDEX, Z, COND | |||
| add Z, Z, #8 | |||
| #else | |||
| ldp q2, q3, [X], #32 | |||
| ldp q4, q5, [X], #32 | |||
| ldp q16, q17, [X], #32 | |||
| ldp q18, q19, [X], #32 | |||
| fabs v2.2d, v2.2d | |||
| fabs v3.2d, v3.2d | |||
| fabs v4.2d, v4.2d | |||
| fabs v5.2d, v5.2d | |||
| fabs v16.2d, v16.2d | |||
| fabs v17.2d, v17.2d | |||
| fabs v18.2d, v18.2d | |||
| fabs v19.2d, v19.2d | |||
| faddp v2.2d, v2.2d, v3.2d | |||
| faddp v3.2d, v4.2d, v5.2d | |||
| faddp v4.2d, v16.2d, v17.2d | |||
| faddp v5.2d, v18.2d, v19.2d | |||
| fmax v2.2d, v2.2d, v3.2d | |||
| fmax v4.2d, v4.2d, v5.2d | |||
| fmax v2.2d, v2.2d, v4.2d | |||
| fmaxp TMPF, v2.2d | |||
| fcmp MAXF, TMPF | |||
| fcsel MAXF, MAXF, TMPF, COND | |||
| csel INDEX, INDEX, Z, COND | |||
| add Z, Z, #8 | |||
| #endif | |||
| PRFM PLDL1KEEP, [X, #1024] | |||
| .endm | |||
| .macro KERNEL_F8_FINALIZE | |||
| sub x6, INDEX, #1 | |||
| #if !defined(DOUBLE) | |||
| lsl x6, x6, #3 | |||
| add x7, x7, x6 | |||
| ldp q2, q3, [x7] | |||
| ldp q4, q5, [x7, #32] | |||
| fabs v2.4s, v2.4s | |||
| fabs v3.4s, v3.4s | |||
| fabs v4.4s, v4.4s | |||
| fabs v5.4s, v5.4s | |||
| faddp v2.4s, v2.4s, v3.4s | |||
| faddp v3.4s, v4.4s, v5.4s | |||
| ins v4.s[0], v3.s[3] | |||
| add x6, INDEX, #7 | |||
| fcmp MAXF, s4 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v4.s[0], v3.s[2] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s4 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v4.s[0], v3.s[1] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s4 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v4.s[0], v3.s[0] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s4 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v4.s[0], v2.s[3] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s4 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v4.s[0], v2.s[2] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s4 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v4.s[0], v2.s[1] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s4 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v4.s[0], v2.s[0] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, s4 | |||
| csel INDEX, x6, INDEX, eq | |||
| #else | |||
| lsl x6, x6, #4 | |||
| add x7, x7, x6 | |||
| ldp q2, q3, [x7] | |||
| ldp q4, q5, [x7, #32] | |||
| ldp q16, q17, [x7, #64] | |||
| ldp q18, q19, [x7, #96] | |||
| fabs v2.2d, v2.2d | |||
| fabs v3.2d, v3.2d | |||
| fabs v4.2d, v4.2d | |||
| fabs v5.2d, v5.2d | |||
| fabs v16.2d, v16.2d | |||
| fabs v17.2d, v17.2d | |||
| fabs v18.2d, v18.2d | |||
| fabs v19.2d, v19.2d | |||
| faddp v2.2d, v2.2d, v3.2d | |||
| faddp v3.2d, v4.2d, v5.2d | |||
| faddp v4.2d, v16.2d, v17.2d | |||
| faddp v5.2d, v18.2d, v19.2d | |||
| ins v7.d[0], v5.d[1] | |||
| add x6, INDEX, #7 | |||
| fcmp MAXF, d7 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v7.d[0], v5.d[0] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d7 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v7.d[0], v4.d[1] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d7 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v7.d[0], v4.d[0] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d7 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v7.d[0], v3.d[1] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d7 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v7.d[0], v3.d[0] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d7 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v7.d[0], v2.d[1] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d7 | |||
| csel INDEX, x6, INDEX, eq | |||
| ins v7.d[0], v2.d[0] | |||
| sub x6, x6, #1 | |||
| fcmp MAXF, d7 | |||
| csel INDEX, x6, INDEX, eq | |||
| #endif | |||
| .endm | |||
| .macro KERNEL_S1 | |||
| #if !defined(DOUBLE) | |||
| ld1 {v1.2s}, [X], INC_X | |||
| @@ -107,6 +280,50 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp INC_X, xzr | |||
| ble iamax_kernel_zero | |||
| cmp INC_X, #1 | |||
| bne iamax_kernel_S_BEGIN | |||
| mov x7, X | |||
| iamax_kernel_F_BEGIN: | |||
| INIT_S | |||
| subs N, N, #1 | |||
| ble iamax_kernel_L999 | |||
| asr I, N, #3 | |||
| cmp I, xzr | |||
| ble iamax_kernel_F1 | |||
| add Z, Z, #1 | |||
| iamax_kernel_F8: | |||
| KERNEL_F8 | |||
| subs I, I, #1 | |||
| bne iamax_kernel_F8 | |||
| KERNEL_F8_FINALIZE | |||
| sub Z, Z, #1 | |||
| iamax_kernel_F1: | |||
| ands I, N, #7 | |||
| ble iamax_kernel_L999 | |||
| iamax_kernel_F10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne iamax_kernel_F10 | |||
| b iamax_kernel_L999 | |||
| iamax_kernel_S_BEGIN: | |||
| INIT_S | |||
| subs N, N, #1 | |||
| @@ -46,20 +46,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define pCRow0 x12 | |||
| #define pCRow1 x13 | |||
| #define pCRow2 x14 | |||
| #define pA x15 | |||
| #define alpha_save_R x16 | |||
| #define alpha_save_I x17 | |||
| #define pCRow3 x15 | |||
| #define pA x16 | |||
| #define alphaR x17 | |||
| #define alphaI x18 | |||
| #define alpha0_R d10 | |||
| #define alphaV0_R v10.d[0] | |||
| #define alpha0_I d11 | |||
| #define alphaV0_I v11.d[0] | |||
| #define alpha1_R d14 | |||
| #define alphaV1_R v14.d[0] | |||
| #define alpha1_I d15 | |||
| #define alphaV1_I v15.d[0] | |||
| #define A_PRE_SIZE 2560 | |||
| #define B_PRE_SIZE 448 | |||
| #define C_PRE_SIZE 128 | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| #define OP_rr fmla | |||
| @@ -98,10 +97,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // 12 pCRow0 | |||
| // 13 pCRow1 | |||
| // 14 pCRow2 | |||
| // 15 pA | |||
| // 16 alpha_save_R | |||
| // 17 alpha_save_I | |||
| // 18 must save | |||
| // 15 pCRow3 | |||
| // 16 pA | |||
| // 17 alphaR | |||
| // 18 must save alphaI | |||
| // 19 must save | |||
| // 20 must save | |||
| // 21 must save | |||
| @@ -175,12 +174,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL4x4_I | |||
| ld2 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmul v16.2d, v0.2d, v8.d[0] | |||
| OP_ii v16.2d, v1.2d, v9.d[0] | |||
| @@ -193,16 +188,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| fmul v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v19.16b, v19.16b, v19.16b | |||
| fmls v19.2d, v2.2d, v9.d[0] | |||
| #else | |||
| fmul v19.2d, v2.2d, v9.d[0] | |||
| #endif | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmul v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| @@ -215,6 +202,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| fmul v22.2d, v2.2d, v8.d[1] | |||
| OP_ii v22.2d, v3.2d, v9.d[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -226,6 +216,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v23.2d, v3.2d, v8.d[1] | |||
| ld2 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| fmul v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v19.16b, v19.16b, v19.16b | |||
| fmls v19.2d, v2.2d, v9.d[0] | |||
| #else | |||
| fmul v19.2d, v2.2d, v9.d[0] | |||
| #endif | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| ld2 {v4.2d, v5.2d} , [pA] | |||
| add pA, pA, #32 | |||
| fmul v24.2d, v0.2d, v10.d[0] | |||
| OP_ii v24.2d, v1.2d, v11.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -237,6 +244,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v25.2d, v1.2d, v10.d[0] | |||
| ld2 {v6.2d, v7.2d} , [pA] | |||
| add pA, pA, #32 | |||
| fmul v26.2d, v2.2d, v10.d[0] | |||
| OP_ii v26.2d, v3.2d, v11.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -248,6 +258,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v27.2d, v3.2d, v10.d[0] | |||
| ld2 {v14.2d, v15.2d}, [pB] | |||
| add pB, pB, #32 | |||
| fmul v28.2d, v0.2d, v10.d[1] | |||
| OP_ii v28.2d, v1.2d, v11.d[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -259,6 +272,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v29.2d, v1.2d, v10.d[1] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmul v30.2d, v2.2d, v10.d[1] | |||
| OP_ii v30.2d, v3.2d, v11.d[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -270,14 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v31.2d, v3.2d, v10.d[1] | |||
| ld2 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v14.2d, v15.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v4.2d, v5.2d} , [pA] | |||
| add pA, pA, #32 | |||
| ld2 {v6.2d, v7.2d} , [pA] | |||
| add pA, pA, #32 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNEL4x4_M1 | |||
| @@ -286,7 +294,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v17.2d, v0.2d, v9.d[0] | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| ld2 {v12.2d, v13.2d}, [pB] // For next round | |||
| ld2 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||
| @@ -294,15 +302,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| ld2 {v14.2d, v15.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| ld2 {v4.2d, v5.2d} , [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| OP_ri v21.2d, v0.2d, v9.d[1] | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| ld2 {v4.2d, v5.2d} , [pA] // For next round | |||
| ld2 {v6.2d, v7.2d} , [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v22.2d, v2.2d, v8.d[1] | |||
| @@ -310,22 +318,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v23.2d, v2.2d, v9.d[1] | |||
| OP_ir v23.2d, v3.2d, v8.d[1] | |||
| ld2 {v6.2d, v7.2d} , [pA] // For next round | |||
| add pA, pA, #32 | |||
| ld2 {v14.2d, v15.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v24.2d, v0.2d, v10.d[0] | |||
| OP_ii v24.2d, v1.2d, v11.d[0] | |||
| OP_ri v25.2d, v0.2d, v11.d[0] | |||
| OP_ir v25.2d, v1.2d, v10.d[0] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| OP_rr v26.2d, v2.2d, v10.d[0] | |||
| OP_ii v26.2d, v3.2d, v11.d[0] | |||
| OP_ri v27.2d, v2.2d, v11.d[0] | |||
| OP_ir v27.2d, v3.2d, v10.d[0] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| OP_rr v28.2d, v0.2d, v10.d[1] | |||
| OP_ii v28.2d, v1.2d, v11.d[1] | |||
| @@ -344,7 +352,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v17.2d, v4.2d, v13.d[0] | |||
| OP_ir v17.2d, v5.2d, v12.d[0] | |||
| ld2 {v8.2d, v9.2d}, [pB] // For next round | |||
| ld2 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v18.2d, v6.2d, v12.d[0] | |||
| @@ -352,15 +360,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v19.2d, v6.2d, v13.d[0] | |||
| OP_ir v19.2d, v7.2d, v12.d[0] | |||
| ld2 {v10.2d, v11.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| ld2 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v20.2d, v4.2d, v12.d[1] | |||
| OP_ii v20.2d, v5.2d, v13.d[1] | |||
| OP_ri v21.2d, v4.2d, v13.d[1] | |||
| OP_ir v21.2d, v5.2d, v12.d[1] | |||
| ld2 {v0.2d, v1.2d}, [pA] // For next round | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v22.2d, v6.2d, v12.d[1] | |||
| @@ -368,22 +376,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v23.2d, v6.2d, v13.d[1] | |||
| OP_ir v23.2d, v7.2d, v12.d[1] | |||
| ld2 {v2.2d, v3.2d}, [pA] // For next round | |||
| add pA, pA, #32 | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v24.2d, v4.2d, v14.d[0] | |||
| OP_ii v24.2d, v5.2d, v15.d[0] | |||
| OP_ri v25.2d, v4.2d, v15.d[0] | |||
| OP_ir v25.2d, v5.2d, v14.d[0] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| OP_rr v26.2d, v6.2d, v14.d[0] | |||
| OP_ii v26.2d, v7.2d, v15.d[0] | |||
| OP_ri v27.2d, v6.2d, v15.d[0] | |||
| OP_ir v27.2d, v7.2d, v14.d[0] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| OP_rr v28.2d, v4.2d, v14.d[1] | |||
| OP_ii v28.2d, v5.2d, v15.d[1] | |||
| @@ -412,6 +420,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v21.2d, v4.2d, v13.d[1] | |||
| OP_ir v21.2d, v5.2d, v12.d[1] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| OP_rr v22.2d, v6.2d, v12.d[1] | |||
| OP_ii v22.2d, v7.2d, v13.d[1] | |||
| OP_ri v23.2d, v6.2d, v13.d[1] | |||
| @@ -422,6 +432,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v25.2d, v4.2d, v15.d[0] | |||
| OP_ir v25.2d, v5.2d, v14.d[0] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| OP_rr v26.2d, v6.2d, v14.d[0] | |||
| OP_ii v26.2d, v7.2d, v15.d[0] | |||
| OP_ri v27.2d, v6.2d, v15.d[0] | |||
| @@ -441,33 +453,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL4x4_SUB | |||
| ld2 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.2d, v0.2d, v8.d[0] | |||
| OP_ii v16.2d, v1.2d, v9.d[0] | |||
| OP_ri v17.2d, v0.2d, v9.d[0] | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| OP_ri v21.2d, v0.2d, v9.d[1] | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| OP_rr v22.2d, v2.2d, v8.d[1] | |||
| OP_ii v22.2d, v3.2d, v9.d[1] | |||
| OP_ri v23.2d, v2.2d, v9.d[1] | |||
| OP_ir v23.2d, v3.2d, v8.d[1] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| OP_rr v24.2d, v0.2d, v10.d[0] | |||
| OP_ii v24.2d, v1.2d, v11.d[0] | |||
| OP_ri v25.2d, v0.2d, v11.d[0] | |||
| @@ -490,74 +509,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x4 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||
| ld2 {v0.2d, v1.2d}, [pCRow0] | |||
| fmla v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmla v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| ld2 {v2.2d, v3.2d}, [pCRow2] | |||
| fmla v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow0] | |||
| add pCRow0, pCRow0, #32 | |||
| ld2 {v2.2d, v3.2d}, [pCRow0] | |||
| fmla v2.2d, v18.2d, alphaV0_R | |||
| fmls v2.2d, v19.2d, alphaV0_I | |||
| fmla v3.2d, v18.2d, alphaV1_I | |||
| fmla v3.2d, v19.2d, alphaV1_R | |||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||
| fmla v3.2d, v18.2d, alphaV0_I | |||
| fmla v3.2d, v19.2d, alphaV0_R | |||
| st2 {v2.2d, v3.2d}, [pCRow0] | |||
| add pCRow0, pCRow0, #32 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow1, pCRow1, LDC | |||
| ld2 {v4.2d, v5.2d}, [pCRow1] | |||
| fmla v4.2d, v20.2d, alphaV0_R | |||
| fmls v4.2d, v21.2d, alphaV0_I | |||
| fmla v5.2d, v20.2d, alphaV1_I | |||
| fmla v5.2d, v21.2d, alphaV1_R | |||
| fmla v5.2d, v20.2d, alphaV0_I | |||
| fmla v5.2d, v21.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| ld2 {v6.2d, v7.2d}, [pCRow2] | |||
| add pCRow1, pCRow1, #32 | |||
| ld2 {v6.2d, v7.2d}, [pCRow1] | |||
| fmla v6.2d, v22.2d, alphaV0_R | |||
| fmls v6.2d, v23.2d, alphaV0_I | |||
| fmla v7.2d, v22.2d, alphaV1_I | |||
| fmla v7.2d, v23.2d, alphaV1_R | |||
| st2 {v6.2d, v7.2d}, [pCRow2] | |||
| fmla v7.2d, v22.2d, alphaV0_I | |||
| fmla v7.2d, v23.2d, alphaV0_R | |||
| st2 {v6.2d, v7.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, #32 | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| ld2 {v0.2d, v1.2d}, [pCRow2] | |||
| fmla v0.2d, v24.2d, alphaV0_R | |||
| fmls v0.2d, v25.2d, alphaV0_I | |||
| fmla v1.2d, v24.2d, alphaV1_I | |||
| fmla v1.2d, v25.2d, alphaV1_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| fmla v1.2d, v24.2d, alphaV0_I | |||
| fmla v1.2d, v25.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow2] | |||
| add pCRow2, pCRow2, #32 | |||
| ld2 {v2.2d, v3.2d}, [pCRow2] | |||
| fmla v2.2d, v26.2d, alphaV0_R | |||
| fmls v2.2d, v27.2d, alphaV0_I | |||
| fmla v3.2d, v26.2d, alphaV1_I | |||
| fmla v3.2d, v27.2d, alphaV1_R | |||
| fmla v3.2d, v26.2d, alphaV0_I | |||
| fmla v3.2d, v27.2d, alphaV0_R | |||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||
| add pCRow1, pCRow1, LDC | |||
| add pCRow2, pCRow2, #32 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| ld2 {v4.2d, v5.2d}, [pCRow1] | |||
| ld2 {v4.2d, v5.2d}, [pCRow3] | |||
| fmla v4.2d, v28.2d, alphaV0_R | |||
| fmls v4.2d, v29.2d, alphaV0_I | |||
| fmla v5.2d, v28.2d, alphaV1_I | |||
| fmla v5.2d, v29.2d, alphaV1_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| ld2 {v6.2d, v7.2d}, [pCRow2] | |||
| fmla v5.2d, v28.2d, alphaV0_I | |||
| fmla v5.2d, v29.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow3] | |||
| add pCRow3, pCRow3, #32 | |||
| ld2 {v6.2d, v7.2d}, [pCRow3] | |||
| fmla v6.2d, v30.2d, alphaV0_R | |||
| fmls v6.2d, v31.2d, alphaV0_I | |||
| fmla v7.2d, v30.2d, alphaV1_I | |||
| fmla v7.2d, v31.2d, alphaV1_R | |||
| st2 {v6.2d, v7.2d}, [pCRow2] | |||
| fmla v7.2d, v30.2d, alphaV0_I | |||
| fmla v7.2d, v31.2d, alphaV0_R | |||
| st2 {v6.2d, v7.2d}, [pCRow3] | |||
| add pCRow0, pCRow0, #64 | |||
| add pCRow3, pCRow3, #32 | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -604,18 +634,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x4 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||
| fmla v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmla v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmla v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -623,8 +651,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v4.2d, v5.2d}, [pCRow1] | |||
| fmla v4.2d, v20.2d, alphaV0_R | |||
| fmls v4.2d, v21.2d, alphaV0_I | |||
| fmla v5.2d, v20.2d, alphaV1_I | |||
| fmla v5.2d, v21.2d, alphaV1_R | |||
| fmla v5.2d, v20.2d, alphaV0_I | |||
| fmla v5.2d, v21.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -632,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||
| fmla v0.2d, v24.2d, alphaV0_R | |||
| fmls v0.2d, v25.2d, alphaV0_I | |||
| fmla v1.2d, v24.2d, alphaV1_I | |||
| fmla v1.2d, v25.2d, alphaV1_R | |||
| fmla v1.2d, v24.2d, alphaV0_I | |||
| fmla v1.2d, v25.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -641,8 +669,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v4.2d, v5.2d}, [pCRow1] | |||
| fmla v4.2d, v28.2d, alphaV0_R | |||
| fmls v4.2d, v29.2d, alphaV0_I | |||
| fmla v5.2d, v28.2d, alphaV1_I | |||
| fmla v5.2d, v29.2d, alphaV1_R | |||
| fmla v5.2d, v28.2d, alphaV0_I | |||
| fmla v5.2d, v29.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -691,18 +719,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x4 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| ld2 {v0.d, v1.d}[0], [pCRow1] | |||
| fmla d0, d16, alphaV0_R | |||
| fmls d0, d17, alphaV0_I | |||
| fmla d1, d16, alphaV1_I | |||
| fmla d1, d17, alphaV1_R | |||
| fmla d1, d16, alphaV0_I | |||
| fmla d1, d17, alphaV0_R | |||
| st2 {v0.d, v1.d}[0], [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -710,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v4.d, v5.d}[0], [pCRow1] | |||
| fmla d4, d20, alphaV0_R | |||
| fmls d4, d21, alphaV0_I | |||
| fmla d5, d20, alphaV1_I | |||
| fmla d5, d21, alphaV1_R | |||
| fmla d5, d20, alphaV0_I | |||
| fmla d5, d21, alphaV0_R | |||
| st2 {v4.d, v5.d}[0], [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -719,8 +745,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.d, v1.d}[0], [pCRow1] | |||
| fmla d0, d24, alphaV0_R | |||
| fmls d0, d25, alphaV0_I | |||
| fmla d1, d24, alphaV1_I | |||
| fmla d1, d25, alphaV1_R | |||
| fmla d1, d24, alphaV0_I | |||
| fmla d1, d25, alphaV0_R | |||
| st2 {v0.d, v1.d}[0], [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -728,8 +754,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v4.d, v5.d}[0], [pCRow1] | |||
| fmla d4, d28, alphaV0_R | |||
| fmls d4, d29, alphaV0_I | |||
| fmla d5, d28, alphaV1_I | |||
| fmla d5, d29, alphaV1_R | |||
| fmla d5, d28, alphaV0_I | |||
| fmla d5, d29, alphaV0_R | |||
| st2 {v4.d, v5.d}[0], [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -778,25 +804,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x2 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||
| fmla v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmla v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmla v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| ld2 {v2.2d, v3.2d}, [pCRow2] | |||
| fmla v2.2d, v18.2d, alphaV0_R | |||
| fmls v2.2d, v19.2d, alphaV0_I | |||
| fmla v3.2d, v18.2d, alphaV1_I | |||
| fmla v3.2d, v19.2d, alphaV1_R | |||
| fmla v3.2d, v18.2d, alphaV0_I | |||
| fmla v3.2d, v19.2d, alphaV0_R | |||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -804,15 +828,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v4.2d, v5.2d}, [pCRow1] | |||
| fmla v4.2d, v20.2d, alphaV0_R | |||
| fmls v4.2d, v21.2d, alphaV0_I | |||
| fmla v5.2d, v20.2d, alphaV1_I | |||
| fmla v5.2d, v21.2d, alphaV1_R | |||
| fmla v5.2d, v20.2d, alphaV0_I | |||
| fmla v5.2d, v21.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| ld2 {v6.2d, v7.2d}, [pCRow2] | |||
| fmla v6.2d, v22.2d, alphaV0_R | |||
| fmls v6.2d, v23.2d, alphaV0_I | |||
| fmla v7.2d, v22.2d, alphaV1_I | |||
| fmla v7.2d, v23.2d, alphaV1_R | |||
| fmla v7.2d, v22.2d, alphaV0_I | |||
| fmla v7.2d, v23.2d, alphaV0_R | |||
| st2 {v6.2d, v7.2d}, [pCRow2] | |||
| add pCRow0, pCRow0, #64 | |||
| @@ -845,18 +869,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x2 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||
| fmla v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmla v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmla v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -864,8 +886,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v4.2d, v5.2d}, [pCRow1] | |||
| fmla v4.2d, v20.2d, alphaV0_R | |||
| fmls v4.2d, v21.2d, alphaV0_I | |||
| fmla v5.2d, v20.2d, alphaV1_I | |||
| fmla v5.2d, v21.2d, alphaV1_R | |||
| fmla v5.2d, v20.2d, alphaV0_I | |||
| fmla v5.2d, v21.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -898,18 +920,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x2 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| ld2 {v0.d, v1.d}[0], [pCRow1] | |||
| fmla d0, d16, alphaV0_R | |||
| fmls d0, d17, alphaV0_I | |||
| fmla d1, d16, alphaV1_I | |||
| fmla d1, d17, alphaV1_R | |||
| fmla d1, d16, alphaV0_I | |||
| fmla d1, d17, alphaV0_R | |||
| st2 {v0.d, v1.d}[0], [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| @@ -917,8 +937,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v4.d, v5.d}[0], [pCRow1] | |||
| fmla d4, d20, alphaV0_R | |||
| fmls d4, d21, alphaV0_I | |||
| fmla d5, d20, alphaV1_I | |||
| fmla d5, d21, alphaV1_R | |||
| fmla d5, d20, alphaV0_I | |||
| fmla d5, d21, alphaV0_R | |||
| st2 {v4.d, v5.d}[0], [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -953,25 +973,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x1 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||
| fmla v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmla v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmla v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| ld2 {v2.2d, v3.2d}, [pCRow2] | |||
| fmla v2.2d, v18.2d, alphaV0_R | |||
| fmls v2.2d, v19.2d, alphaV0_I | |||
| fmla v3.2d, v18.2d, alphaV1_I | |||
| fmla v3.2d, v19.2d, alphaV1_R | |||
| fmla v3.2d, v18.2d, alphaV0_I | |||
| fmla v3.2d, v19.2d, alphaV0_R | |||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||
| add pCRow0, pCRow0, #64 | |||
| @@ -997,18 +1015,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x1 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| ld2 {v0.2d, v1.2d}, [pCRow1] | |||
| fmla v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmla v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmla v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -1035,18 +1051,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x1 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| ld2 {v0.d, v1.d}[0], [pCRow1] | |||
| fmla d0, d16, alphaV0_R | |||
| fmls d0, d17, alphaV0_I | |||
| fmla d1, d16, alphaV1_I | |||
| fmla d1, d17, alphaV1_R | |||
| fmla d1, d16, alphaV0_I | |||
| fmla d1, d17, alphaV0_R | |||
| st2 {v0.d, v1.d}[0], [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -1072,8 +1086,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| fmov alpha_save_R, d0 | |||
| fmov alpha_save_I, d1 | |||
| prfm PLDL1KEEP, [origPB] | |||
| prfm PLDL1KEEP, [origPA] | |||
| fmov alphaR, d0 | |||
| fmov alphaI, d1 | |||
| lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 | |||
| @@ -1085,8 +1102,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ble zgemm_kernel_L2_BEGIN | |||
| zgemm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC // pCRow0 = C | |||
| add pC, pC, LDC, lsl #2 | |||
| mov pCRow0, pC | |||
| add pCRow1, pCRow0, LDC | |||
| add pCRow2, pCRow1, LDC | |||
| add pCRow3, pCRow2, LDC | |||
| add pC, pCRow3, LDC | |||
| mov pA, origPA // pA = start of A array | |||
| zgemm_kernel_L4_M4_BEGIN: | |||
| @@ -1096,42 +1118,68 @@ zgemm_kernel_L4_M4_BEGIN: | |||
| cmp counterI, #0 | |||
| ble zgemm_kernel_L4_M2_BEGIN | |||
| .align 5 | |||
| zgemm_kernel_L4_M4_20: | |||
| mov pB, origPB | |||
| asr counterL , origK, #1 // L = K / 2 | |||
| cmp counterL , #2 // is there at least 4 to do? | |||
| asr counterL , origK, #3 | |||
| cmp counterL , #2 | |||
| blt zgemm_kernel_L4_M4_32 | |||
| KERNEL4x4_I // do one in the K | |||
| KERNEL4x4_M2 // do another in the K | |||
| KERNEL4x4_I | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| subs counterL, counterL, #2 // subtract 2 | |||
| ble zgemm_kernel_L4_M4_22a | |||
| .align 5 | |||
| .align 5 | |||
| zgemm_kernel_L4_M4_22: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt zgemm_kernel_L4_M4_22 | |||
| .align 5 | |||
| zgemm_kernel_L4_M4_22a: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_E | |||
| b zgemm_kernel_L4_M4_44 | |||
| .align 5 | |||
| zgemm_kernel_L4_M4_32: | |||
| tst counterL, #1 | |||
| ble zgemm_kernel_L4_M4_40 | |||
| KERNEL4x4_I | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_E | |||
| b zgemm_kernel_L4_M4_44 | |||
| @@ -1143,13 +1191,20 @@ zgemm_kernel_L4_M4_40: | |||
| zgemm_kernel_L4_M4_44: | |||
| ands counterL , origK, #1 | |||
| ands counterL , origK, #7 | |||
| ble zgemm_kernel_L4_M4_100 | |||
| .align 5 | |||
| zgemm_kernel_L4_M4_46: | |||
| KERNEL4x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bne zgemm_kernel_L4_M4_46 | |||
| zgemm_kernel_L4_M4_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVE4x4 | |||
| @@ -43,6 +43,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define Y_OPTR x13 /* loop Y vector address */ | |||
| #define X_PTR x14 /* loop X vector address */ | |||
| #define A_PRE_SIZE 768 | |||
| #define Y_PRE_SIZE 768 | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
| @@ -50,14 +53,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if !defined(DOUBLE) | |||
| #define ALPHA_R s0 | |||
| #define ALPHA_I s1 | |||
| #define ALPHA_R_COPY s7 | |||
| #define ALPHA_I_COPY s8 | |||
| #define SHZ 3 | |||
| #else | |||
| #define ALPHA_R d0 | |||
| #define ALPHA_I d1 | |||
| #define ALPHA_R_COPY d7 | |||
| #define ALPHA_I_COPY d8 | |||
| #define SHZ 4 | |||
| #endif | |||
| @@ -95,20 +94,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro INIT | |||
| /********** INIT FOR F4 LOOP **********/ | |||
| fmov ALPHA_R_COPY, ALPHA_R | |||
| fmov ALPHA_I_COPY, ALPHA_I | |||
| #if !defined(DOUBLE) | |||
| ins v7.s[1], v7.s[0] // R(ALPHA), R(ALPHA) | |||
| ins v8.s[1], v8.s[0] // I(ALPHA), I(ALPHA) | |||
| ins v7.d[1], v7.d[0] | |||
| ins v8.d[1], v8.d[0] | |||
| #else | |||
| ins v7.d[1], v7.d[0] // R(ALPHA), R(ALPHA) | |||
| ins v8.d[1], v8.d[0] // I(ALPHA), I(ALPHA) | |||
| #endif | |||
| /******* INIT FOR F1 AND S1 LOOP ******/ | |||
| #if !defined(DOUBLE) | |||
| ins v0.s[1], v0.s[0] // R(ALPHA), R(ALPHA) | |||
| eor v2.16b, v2.16b, v2.16b | |||
| @@ -129,47 +114,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro INIT_LOOP | |||
| /********** INIT_LOOP FOR F4 LOOP **********/ | |||
| #if !defined(DOUBLE) | |||
| ld1 {v9.2s}, [X_PTR] // [I(X), R(X)] | |||
| ins v10.s[0], v9.s[1] | |||
| ins v9.s[1], v9.s[0] // [R(X), R(X)] | |||
| ins v10.s[1], v10.s[0] // [I(X), I(X)] | |||
| ins v9.d[1], v9.d[0] | |||
| ins v10.d[1], v10.d[0] | |||
| ld1 {v2.2s}, [X_PTR] // [I(X), R(X)] | |||
| ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)] | |||
| fmul v2.2s, v0.2s, v2.2s | |||
| fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)] | |||
| ins v3.s[0], v2.s[1] | |||
| /********** INIT_LOOP FOR F4 LOOP **********/ | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] | |||
| fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] | |||
| fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)] | |||
| fmla v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)] | |||
| dup v21.4s, v2.s[0] // R[TEMP] | |||
| dup v22.4s, v2.s[0] // R[TEMP] | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fsub s25, s25, s3 | |||
| dup v23.4s, v25.s[0] // -I[TEMP] | |||
| dup v24.4s, v3.s[0] // I[TEMP] | |||
| #else | |||
| fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] | |||
| fmla v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)] | |||
| fmul v12.4s, v9.4s, v8.4s // [+ R(X) * I(ALPHA)] | |||
| fmls v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)] | |||
| dup v21.4s, v2.s[0] // R[TEMP] | |||
| dup v22.4s, v2.s[0] // R[TEMP] | |||
| dup v23.4s, v3.s[0] // I[TEMP] | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fsub s25, s25, s3 | |||
| dup v24.4s, v25.s[0] // -I[TEMP] | |||
| #endif | |||
| #else // CONJ | |||
| #if !defined(XCONJ) | |||
| fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] | |||
| fmls v11.4s, v10.4s, v8.4s // [+ I(X) * I(ALPHA)] | |||
| fmul v12.4s, v10.4s, v7.4s // [+ I(X) * R(ALPHA)] | |||
| fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)] | |||
| dup v21.4s, v2.s[0] // R[TEMP] | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fsub s25, s25, s2 | |||
| dup v22.4s, v25.s[0] // -R[TEMP] | |||
| dup v23.4s, v3.s[0] // I[TEMP] | |||
| dup v24.4s, v3.s[0] // I[TEMP] | |||
| #else | |||
| fmul v11.4s, v9.4s, v7.4s // [+ R(X) * R(ALPHA)] | |||
| fmls v11.4s, v10.4s, v8.4s // [- I(X) * I(ALPHA)] | |||
| eor v12.16b, v12.16b, v12.16b | |||
| fmls v12.4s, v9.4s, v8.4s // [- R(X) * I(ALPHA)] | |||
| fmla v12.4s, v10.4s, v7.4s // [- I(X) * R(ALPHA)] | |||
| dup v21.4s, v2.s[0] // R[TEMP] | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fsub s25, s25, s2 | |||
| dup v22.4s, v25.s[0] // -R[TEMP] | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fsub s25, s25, s3 | |||
| dup v23.4s, v25.s[0] // -I[TEMP] | |||
| dup v24.4s, v25.s[0] // -I[TEMP] | |||
| #endif | |||
| #endif // CONJ | |||
| /****** INIT_LOOP FOR F1 AND S1 LOOP ******/ | |||
| ld1 {v2.2s}, [X_PTR] // [I(X), R(X)] | |||
| ext v3.8b, v2.8b, v2.8b, #4 // [R(X), I(X)] | |||
| fmul v2.2s, v0.2s, v2.2s | |||
| fmla v2.2s, v1.2s, v3.2s // [I(TEMP), R(TEMP)] | |||
| ins v3.s[0], v2.s[1] | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| eor v4.16b, v4.16b, v4.16b | |||
| @@ -200,45 +191,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif // CONJ | |||
| #else // DOUBLE | |||
| ld1 {v2.2d}, [X_PTR] // [I(X), R(X)] | |||
| ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)] | |||
| fmul v2.2d, v0.2d, v2.2d | |||
| fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)] | |||
| ins v3.d[0], v2.d[1] // I(TEMP) | |||
| /********** INIT_LOOP FOR F4 LOOP **********/ | |||
| ld1 {v9.2d}, [X_PTR] // [I(X), R(X)] | |||
| ins v10.d[0], v9.d[1] | |||
| ins v9.d[1], v9.d[0] // [R(X), R(X)] | |||
| ins v10.d[1], v10.d[0] // [I(X), I(X)] | |||
| /****** INIT_LOOP FOR F4 LOOP ******/ | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] | |||
| fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] | |||
| fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)] | |||
| fmla v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)] | |||
| dup v21.2d, v2.d[0] // R[TEMP] | |||
| dup v22.2d, v2.d[0] // R[TEMP] | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fsub d25, d25, d3 | |||
| dup v23.2d, v25.d[0] // -I[TEMP] | |||
| dup v24.2d, v3.d[0] // I[TEMP] | |||
| #else | |||
| fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] | |||
| fmla v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)] | |||
| fmul v12.2d, v9.2d, v8.2d // [+ R(X) * I(ALPHA)] | |||
| fmls v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)] | |||
| dup v21.2d, v2.d[0] // R[TEMP] | |||
| dup v22.2d, v2.d[0] // R[TEMP] | |||
| dup v23.2d, v3.d[0] // I[TEMP] | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fsub d25, d25, d3 | |||
| dup v24.2d, v25.d[0] // -I[TEMP] | |||
| #endif | |||
| #else // CONJ | |||
| #if !defined(XCONJ) | |||
| fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] | |||
| fmls v11.2d, v10.2d, v8.2d // [+ I(X) * I(ALPHA)] | |||
| fmul v12.2d, v10.2d, v7.2d // [+ I(X) * R(ALPHA)] | |||
| fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)] | |||
| dup v21.2d, v2.d[0] // R[TEMP] | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fsub d25, d25, d2 | |||
| dup v22.2d, v25.d[0] // -R[TEMP] | |||
| dup v23.2d, v3.d[0] // I[TEMP] | |||
| dup v24.2d, v3.d[0] // I[TEMP] | |||
| #else | |||
| fmul v11.2d, v9.2d, v7.2d // [+ R(X) * R(ALPHA)] | |||
| fmls v11.2d, v10.2d, v8.2d // [- I(X) * I(ALPHA)] | |||
| eor v12.16b, v12.16b, v12.16b | |||
| fmls v12.2d, v9.2d, v8.2d // [- R(X) * I(ALPHA)] | |||
| fmla v12.2d, v10.2d, v7.2d // [- I(X) * R(ALPHA)] | |||
| dup v21.2d, v2.d[0] // R[TEMP] | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fsub d25, d25, d2 | |||
| dup v22.2d, v25.d[0] // -R[TEMP] | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fsub d25, d25, d3 | |||
| dup v23.2d, v25.d[0] // -I[TEMP] | |||
| dup v24.2d, v25.d[0] // -I[TEMP] | |||
| #endif | |||
| #endif // CONJ | |||
| /****** INIT_LOOP FOR F1 AND S1 LOOP ******/ | |||
| ld1 {v2.2d}, [X_PTR] // [I(X), R(X)] | |||
| ext v3.16b, v2.16b, v2.16b, #8 // [R(X), I(X)] | |||
| fmul v2.2d, v0.2d, v2.2d | |||
| fmla v2.2d, v1.2d, v3.2d // [I(TEMP), R(TEMP)] | |||
| ins v3.d[0], v2.d[1] // I(TEMP) | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| eor v4.16b, v4.16b, v4.16b | |||
| @@ -276,91 +274,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v13.4s, v14.4s}, [A_PTR], #32 | |||
| ld2 {v15.4s, v16.4s}, [Y_IPTR], #32 | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] | |||
| fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I] | |||
| fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I] | |||
| fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R] | |||
| #else | |||
| fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] | |||
| fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I] | |||
| fmla v16.4s, v11.4s, v14.4s // [+ R(ALPHA * X) * A_I] | |||
| fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R] | |||
| #endif | |||
| #else // CONJ | |||
| #if !defined(XCONJ) | |||
| fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] | |||
| fmla v15.4s, v12.4s, v14.4s // [+ I(ALPHA * X) * A_I] | |||
| fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I] | |||
| fmla v16.4s, v12.4s, v13.4s // [+ I(ALPHA * X) * A_R] | |||
| #else | |||
| fmla v15.4s, v11.4s, v13.4s // [+ R(ALPHA * X) * A_R] | |||
| fmls v15.4s, v12.4s, v14.4s // [- I(ALPHA * X) * A_I] | |||
| fmls v16.4s, v11.4s, v14.4s // [- R(ALPHA * X) * A_I] | |||
| fmls v16.4s, v12.4s, v13.4s // [- I(ALPHA * X) * A_R] | |||
| #endif | |||
| #endif // CONJ | |||
| prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] | |||
| prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] | |||
| fmla v15.4s, v21.4s, v13.4s | |||
| fmla v15.4s, v23.4s, v14.4s | |||
| fmla v16.4s, v22.4s, v14.4s | |||
| fmla v16.4s, v24.4s, v13.4s | |||
| st2 {v15.4s, v16.4s}, [Y_OPTR], #32 | |||
| #else // DOUBLE | |||
| ld2 {v13.2d, v14.2d}, [A_PTR], #32 | |||
| ld2 {v15.2d, v16.2d}, [Y_IPTR], #32 | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] | |||
| fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I] | |||
| fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I] | |||
| fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R] | |||
| #else | |||
| fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] | |||
| fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I] | |||
| fmla v16.2d, v11.2d, v14.2d // [+ R(ALPHA * X) * A_I] | |||
| fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R] | |||
| #endif | |||
| #else // CONJ | |||
| #if !defined(XCONJ) | |||
| fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] | |||
| fmla v15.2d, v12.2d, v14.2d // [+ I(ALPHA * X) * A_I] | |||
| fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I] | |||
| fmla v16.2d, v12.2d, v13.2d // [+ I(ALPHA * X) * A_R] | |||
| #else | |||
| fmla v15.2d, v11.2d, v13.2d // [+ R(ALPHA * X) * A_R] | |||
| fmls v15.2d, v12.2d, v14.2d // [- I(ALPHA * X) * A_I] | |||
| fmls v16.2d, v11.2d, v14.2d // [- R(ALPHA * X) * A_I] | |||
| fmls v16.2d, v12.2d, v13.2d // [- I(ALPHA * X) * A_R] | |||
| #endif | |||
| #endif // CONJ | |||
| prfm PLDL1KEEP, [A_PTR, #A_PRE_SIZE] | |||
| fmla v15.2d, v21.2d, v13.2d | |||
| fmla v15.2d, v23.2d, v14.2d | |||
| fmla v16.2d, v22.2d, v14.2d | |||
| fmla v16.2d, v24.2d, v13.2d | |||
| st2 {v15.2d, v16.2d}, [Y_OPTR], #32 | |||
| ld2 {v17.2d, v18.2d}, [A_PTR], #32 | |||
| ld2 {v19.2d, v20.2d}, [Y_IPTR], #32 | |||
| #if !defined(CONJ) | |||
| #if !defined(XCONJ) | |||
| fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] | |||
| fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] | |||
| fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] | |||
| fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] | |||
| #else | |||
| fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] | |||
| fmla v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] | |||
| fmla v20.2d, v11.2d, v18.2d // [+ R(ALPHA * X) * A_I] | |||
| fmls v20.2d, v12.2d, v17.2d // [- I(ALPHA * X) * A_R] | |||
| #endif | |||
| #else // CONJ | |||
| #if !defined(XCONJ) | |||
| fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] | |||
| fmla v19.2d, v12.2d, v18.2d // [+ I(ALPHA * X) * A_I] | |||
| fmls v20.2d, v11.2d, v18.2d // [- R(ALPHA * X) * A_I] | |||
| fmla v20.2d, v12.2d, v17.2d // [+ I(ALPHA * X) * A_R] | |||
| #else | |||
| fmla v19.2d, v11.2d, v17.2d // [+ R(ALPHA * X) * A_R] | |||
| fmls v19.2d, v12.2d, v18.2d // [- I(ALPHA * X) * A_I] | |||
| fmls v20.2d, v11.2d, v18.2d // [- R(ALPHA * X) * A_I] | |||
| fmls v20.2d, v12.2d, v17.2d // [- I(ALPHA * X) * A_R] | |||
| #endif | |||
| #endif // CONJ | |||
| prfm PLDL1KEEP, [Y_IPTR, #Y_PRE_SIZE] | |||
| fmla v19.2d, v21.2d, v17.2d | |||
| fmla v19.2d, v23.2d, v18.2d | |||
| fmla v20.2d, v22.2d, v18.2d | |||
| fmla v20.2d, v24.2d, v17.2d | |||
| st2 {v19.2d, v20.2d}, [Y_OPTR], #32 | |||
| #endif | |||
| @@ -445,10 +391,7 @@ zgemv_n_kernel_F_LOOP: | |||
| zgemv_n_kernel_F4: | |||
| KERNEL_F1 | |||
| KERNEL_F1 | |||
| KERNEL_F1 | |||
| KERNEL_F1 | |||
| KERNEL_F4 | |||
| subs I, I, #1 | |||
| bne zgemv_n_kernel_F4 | |||
| @@ -41,6 +41,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define J x11 /* loop variable */ | |||
| #define I x12 /* loop variable */ | |||
| #define A_PRE_SIZE 768 | |||
| #define X_PRE_SIZE 768 | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
| @@ -139,6 +142,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v11.4s, v12.4s}, [X_PTR], #32 | |||
| ld2 {v13.4s, v14.4s}, [A_PTR], #32 | |||
| prfm PLDL1STRM, [X_PTR, #X_PRE_SIZE] | |||
| prfm PLDL1STRM, [A_PTR, #A_PRE_SIZE] | |||
| #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) | |||
| fmla v9.4s, v11.4s, v13.4s // [+ R(X) * A_R] | |||
| @@ -155,7 +160,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #else // DOUBLE | |||
| ld2 {v11.2d, v12.2d}, [X_PTR], #32 | |||
| ld2 {v13.2d, v14.2d}, [A_PTR], #32 | |||
| prfm PLDL1STRM, [X_PTR, #512] | |||
| prfm PLDL1STRM, [X_PTR, #X_PRE_SIZE] | |||
| #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) | |||
| fmla v9.2d, v11.2d, v13.2d // [+ R(X) * A_R] | |||
| @@ -171,7 +176,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v17.2d, v18.2d}, [X_PTR], #32 | |||
| ld2 {v19.2d, v20.2d}, [A_PTR], #32 | |||
| prfm PLDL1STRM, [A_PTR, #512] | |||
| prfm PLDL1STRM, [A_PTR, #A_PRE_SIZE] | |||
| #if (!defined(CONJ) && !defined(XCONJ)) || (defined(CONJ) && defined(XCONJ)) | |||
| fmla v15.2d, v17.2d, v19.2d // [+ R(X) * A_R] | |||
| @@ -46,23 +46,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define pCRow0 x12 | |||
| #define pCRow1 x13 | |||
| #define pCRow2 x14 | |||
| #define pA x15 | |||
| #define alpha_save_R x16 | |||
| #define alpha_save_I x17 | |||
| #define temp x18 | |||
| #define tempOffset x19 | |||
| #define tempK x20 | |||
| #define pCRow3 x15 | |||
| #define pA x16 | |||
| #define alphaR x17 | |||
| #define alphaI x18 | |||
| #define temp x19 | |||
| #define tempOffset x20 | |||
| #define tempK x21 | |||
| #define alpha0_R d10 | |||
| #define alphaV0_R v10.d[0] | |||
| #define alpha0_I d11 | |||
| #define alphaV0_I v11.d[0] | |||
| #define alpha1_R d14 | |||
| #define alphaV1_R v14.d[0] | |||
| #define alpha1_I d15 | |||
| #define alphaV1_I v15.d[0] | |||
| #define A_PRE_SIZE 2560 | |||
| #define B_PRE_SIZE 448 | |||
| #define C_PRE_SIZE 128 | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| #define OP_rr fmla | |||
| @@ -93,7 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // 04 origPB | |||
| // 05 pC | |||
| // 06 origLDC -> LDC | |||
| // 07 offset | |||
| // 07 offset -> temp | |||
| // 08 counterL | |||
| // 09 counterI | |||
| // 10 counterJ | |||
| @@ -101,13 +100,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // 12 pCRow0 | |||
| // 13 pCRow1 | |||
| // 14 pCRow2 | |||
| // 15 pA | |||
| // 16 alpha_save_R | |||
| // 17 alpha_save_I | |||
| // 18 must save temp | |||
| // 19 must save tempOffset | |||
| // 20 must save tempK | |||
| // 21 must save | |||
| // 15 pCRow3 | |||
| // 16 pA | |||
| // 17 alphaR | |||
| // 18 must save alphaI | |||
| // 19 must save temp | |||
| // 20 must save tempOffset | |||
| // 21 must save tempK | |||
| // 22 must save | |||
| // 23 must save | |||
| // 24 must save | |||
| @@ -178,12 +177,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL4x4_I | |||
| ld2 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmul v16.2d, v0.2d, v8.d[0] | |||
| OP_ii v16.2d, v1.2d, v9.d[0] | |||
| @@ -196,16 +191,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| fmul v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v19.16b, v19.16b, v19.16b | |||
| fmls v19.2d, v2.2d, v9.d[0] | |||
| #else | |||
| fmul v19.2d, v2.2d, v9.d[0] | |||
| #endif | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmul v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| @@ -218,6 +205,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| fmul v22.2d, v2.2d, v8.d[1] | |||
| OP_ii v22.2d, v3.2d, v9.d[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -229,6 +219,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v23.2d, v3.2d, v8.d[1] | |||
| ld2 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| fmul v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v19.16b, v19.16b, v19.16b | |||
| fmls v19.2d, v2.2d, v9.d[0] | |||
| #else | |||
| fmul v19.2d, v2.2d, v9.d[0] | |||
| #endif | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| ld2 {v4.2d, v5.2d} , [pA] | |||
| add pA, pA, #32 | |||
| fmul v24.2d, v0.2d, v10.d[0] | |||
| OP_ii v24.2d, v1.2d, v11.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -240,6 +247,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v25.2d, v1.2d, v10.d[0] | |||
| ld2 {v6.2d, v7.2d} , [pA] | |||
| add pA, pA, #32 | |||
| fmul v26.2d, v2.2d, v10.d[0] | |||
| OP_ii v26.2d, v3.2d, v11.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -251,6 +261,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v27.2d, v3.2d, v10.d[0] | |||
| ld2 {v14.2d, v15.2d}, [pB] | |||
| add pB, pB, #32 | |||
| fmul v28.2d, v0.2d, v10.d[1] | |||
| OP_ii v28.2d, v1.2d, v11.d[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -262,6 +275,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v29.2d, v1.2d, v10.d[1] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmul v30.2d, v2.2d, v10.d[1] | |||
| OP_ii v30.2d, v3.2d, v11.d[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| @@ -273,14 +288,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| OP_ir v31.2d, v3.2d, v10.d[1] | |||
| ld2 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v14.2d, v15.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v4.2d, v5.2d} , [pA] | |||
| add pA, pA, #32 | |||
| ld2 {v6.2d, v7.2d} , [pA] | |||
| add pA, pA, #32 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNEL4x4_M1 | |||
| @@ -289,7 +297,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v17.2d, v0.2d, v9.d[0] | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| ld2 {v12.2d, v13.2d}, [pB] // For next round | |||
| ld2 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||
| @@ -297,15 +305,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| ld2 {v14.2d, v15.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| ld2 {v4.2d, v5.2d} , [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| OP_ri v21.2d, v0.2d, v9.d[1] | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| ld2 {v4.2d, v5.2d} , [pA] // For next round | |||
| ld2 {v6.2d, v7.2d} , [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v22.2d, v2.2d, v8.d[1] | |||
| @@ -313,22 +321,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v23.2d, v2.2d, v9.d[1] | |||
| OP_ir v23.2d, v3.2d, v8.d[1] | |||
| ld2 {v6.2d, v7.2d} , [pA] // For next round | |||
| add pA, pA, #32 | |||
| ld2 {v14.2d, v15.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v24.2d, v0.2d, v10.d[0] | |||
| OP_ii v24.2d, v1.2d, v11.d[0] | |||
| OP_ri v25.2d, v0.2d, v11.d[0] | |||
| OP_ir v25.2d, v1.2d, v10.d[0] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| OP_rr v26.2d, v2.2d, v10.d[0] | |||
| OP_ii v26.2d, v3.2d, v11.d[0] | |||
| OP_ri v27.2d, v2.2d, v11.d[0] | |||
| OP_ir v27.2d, v3.2d, v10.d[0] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| OP_rr v28.2d, v0.2d, v10.d[1] | |||
| OP_ii v28.2d, v1.2d, v11.d[1] | |||
| @@ -347,7 +355,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v17.2d, v4.2d, v13.d[0] | |||
| OP_ir v17.2d, v5.2d, v12.d[0] | |||
| ld2 {v8.2d, v9.2d}, [pB] // For next round | |||
| ld2 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v18.2d, v6.2d, v12.d[0] | |||
| @@ -355,15 +363,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v19.2d, v6.2d, v13.d[0] | |||
| OP_ir v19.2d, v7.2d, v12.d[0] | |||
| ld2 {v10.2d, v11.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| ld2 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v20.2d, v4.2d, v12.d[1] | |||
| OP_ii v20.2d, v5.2d, v13.d[1] | |||
| OP_ri v21.2d, v4.2d, v13.d[1] | |||
| OP_ir v21.2d, v5.2d, v12.d[1] | |||
| ld2 {v0.2d, v1.2d}, [pA] // For next round | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v22.2d, v6.2d, v12.d[1] | |||
| @@ -371,22 +379,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v23.2d, v6.2d, v13.d[1] | |||
| OP_ir v23.2d, v7.2d, v12.d[1] | |||
| ld2 {v2.2d, v3.2d}, [pA] // For next round | |||
| add pA, pA, #32 | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v24.2d, v4.2d, v14.d[0] | |||
| OP_ii v24.2d, v5.2d, v15.d[0] | |||
| OP_ri v25.2d, v4.2d, v15.d[0] | |||
| OP_ir v25.2d, v5.2d, v14.d[0] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| OP_rr v26.2d, v6.2d, v14.d[0] | |||
| OP_ii v26.2d, v7.2d, v15.d[0] | |||
| OP_ri v27.2d, v6.2d, v15.d[0] | |||
| OP_ir v27.2d, v7.2d, v14.d[0] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| OP_rr v28.2d, v4.2d, v14.d[1] | |||
| OP_ii v28.2d, v5.2d, v15.d[1] | |||
| @@ -415,6 +423,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v21.2d, v4.2d, v13.d[1] | |||
| OP_ir v21.2d, v5.2d, v12.d[1] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| OP_rr v22.2d, v6.2d, v12.d[1] | |||
| OP_ii v22.2d, v7.2d, v13.d[1] | |||
| OP_ri v23.2d, v6.2d, v13.d[1] | |||
| @@ -425,6 +435,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| OP_ri v25.2d, v4.2d, v15.d[0] | |||
| OP_ir v25.2d, v5.2d, v14.d[0] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE+64] | |||
| OP_rr v26.2d, v6.2d, v14.d[0] | |||
| OP_ii v26.2d, v7.2d, v15.d[0] | |||
| OP_ri v27.2d, v6.2d, v15.d[0] | |||
| @@ -444,33 +456,40 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL4x4_SUB | |||
| ld2 {v8.2d, v9.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| ld2 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.2d, v0.2d, v8.d[0] | |||
| OP_ii v16.2d, v1.2d, v9.d[0] | |||
| OP_ri v17.2d, v0.2d, v9.d[0] | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| OP_ri v21.2d, v0.2d, v9.d[1] | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| ld2 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| OP_rr v22.2d, v2.2d, v8.d[1] | |||
| OP_ii v22.2d, v3.2d, v9.d[1] | |||
| OP_ri v23.2d, v2.2d, v9.d[1] | |||
| OP_ir v23.2d, v3.2d, v8.d[1] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| OP_rr v24.2d, v0.2d, v10.d[0] | |||
| OP_ii v24.2d, v1.2d, v11.d[0] | |||
| OP_ri v25.2d, v0.2d, v11.d[0] | |||
| @@ -493,66 +512,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x4 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| fmul v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmul v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| fmul v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow0] | |||
| add pCRow0, pCRow0, #32 | |||
| fmul v2.2d, v18.2d, alphaV0_R | |||
| fmls v2.2d, v19.2d, alphaV0_I | |||
| fmul v3.2d, v18.2d, alphaV1_I | |||
| fmla v3.2d, v19.2d, alphaV1_R | |||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||
| fmul v3.2d, v18.2d, alphaV0_I | |||
| fmla v3.2d, v19.2d, alphaV0_R | |||
| st2 {v2.2d, v3.2d}, [pCRow0] | |||
| add pCRow0, pCRow0, #32 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul v4.2d, v20.2d, alphaV0_R | |||
| fmls v4.2d, v21.2d, alphaV0_I | |||
| fmul v5.2d, v20.2d, alphaV1_I | |||
| fmla v5.2d, v21.2d, alphaV1_R | |||
| fmul v5.2d, v20.2d, alphaV0_I | |||
| fmla v5.2d, v21.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| add pCRow1, pCRow1, #32 | |||
| fmul v6.2d, v22.2d, alphaV0_R | |||
| fmls v6.2d, v23.2d, alphaV0_I | |||
| fmul v7.2d, v22.2d, alphaV1_I | |||
| fmla v7.2d, v23.2d, alphaV1_R | |||
| st2 {v6.2d, v7.2d}, [pCRow2] | |||
| fmul v7.2d, v22.2d, alphaV0_I | |||
| fmla v7.2d, v23.2d, alphaV0_R | |||
| st2 {v6.2d, v7.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, #32 | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul v0.2d, v24.2d, alphaV0_R | |||
| fmls v0.2d, v25.2d, alphaV0_I | |||
| fmul v1.2d, v24.2d, alphaV1_I | |||
| fmla v1.2d, v25.2d, alphaV1_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| fmul v1.2d, v24.2d, alphaV0_I | |||
| fmla v1.2d, v25.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow2] | |||
| add pCRow2, pCRow2, #32 | |||
| fmul v2.2d, v26.2d, alphaV0_R | |||
| fmls v2.2d, v27.2d, alphaV0_I | |||
| fmul v3.2d, v26.2d, alphaV1_I | |||
| fmla v3.2d, v27.2d, alphaV1_R | |||
| fmul v3.2d, v26.2d, alphaV0_I | |||
| fmla v3.2d, v27.2d, alphaV0_R | |||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||
| add pCRow1, pCRow1, LDC | |||
| add pCRow2, pCRow2, #32 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| fmul v4.2d, v28.2d, alphaV0_R | |||
| fmls v4.2d, v29.2d, alphaV0_I | |||
| fmul v5.2d, v28.2d, alphaV1_I | |||
| fmla v5.2d, v29.2d, alphaV1_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| fmul v5.2d, v28.2d, alphaV0_I | |||
| fmla v5.2d, v29.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow3] | |||
| add pCRow3, pCRow3, #32 | |||
| fmul v6.2d, v30.2d, alphaV0_R | |||
| fmls v6.2d, v31.2d, alphaV0_I | |||
| fmul v7.2d, v30.2d, alphaV1_I | |||
| fmla v7.2d, v31.2d, alphaV1_R | |||
| st2 {v6.2d, v7.2d}, [pCRow2] | |||
| fmul v7.2d, v30.2d, alphaV0_I | |||
| fmla v7.2d, v31.2d, alphaV0_R | |||
| st2 {v6.2d, v7.2d}, [pCRow3] | |||
| add pCRow0, pCRow0, #64 | |||
| add pCRow3, pCRow3, #32 | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -599,41 +629,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x4 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| fmul v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmul v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmul v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul v4.2d, v20.2d, alphaV0_R | |||
| fmls v4.2d, v21.2d, alphaV0_I | |||
| fmul v5.2d, v20.2d, alphaV1_I | |||
| fmla v5.2d, v21.2d, alphaV1_R | |||
| fmul v5.2d, v20.2d, alphaV0_I | |||
| fmla v5.2d, v21.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul v0.2d, v24.2d, alphaV0_R | |||
| fmls v0.2d, v25.2d, alphaV0_I | |||
| fmul v1.2d, v24.2d, alphaV1_I | |||
| fmla v1.2d, v25.2d, alphaV1_R | |||
| fmul v1.2d, v24.2d, alphaV0_I | |||
| fmla v1.2d, v25.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul v4.2d, v28.2d, alphaV0_R | |||
| fmls v4.2d, v29.2d, alphaV0_I | |||
| fmul v5.2d, v28.2d, alphaV1_I | |||
| fmla v5.2d, v29.2d, alphaV1_R | |||
| fmul v5.2d, v28.2d, alphaV0_I | |||
| fmla v5.2d, v29.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -682,41 +710,39 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x4 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| fmul d0, d16, alphaV0_R | |||
| fmls d0, d17, alphaV0_I | |||
| fmul d1, d16, alphaV1_I | |||
| fmla d1, d17, alphaV1_R | |||
| fmul d1, d16, alphaV0_I | |||
| fmla d1, d17, alphaV0_R | |||
| st2 {v0.d, v1.d}[0], [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul d4, d20, alphaV0_R | |||
| fmls d4, d21, alphaV0_I | |||
| fmul d5, d20, alphaV1_I | |||
| fmla d5, d21, alphaV1_R | |||
| fmul d5, d20, alphaV0_I | |||
| fmla d5, d21, alphaV0_R | |||
| st2 {v4.d, v5.d}[0], [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul d0, d24, alphaV0_R | |||
| fmls d0, d25, alphaV0_I | |||
| fmul d1, d24, alphaV1_I | |||
| fmla d1, d25, alphaV1_R | |||
| fmul d1, d24, alphaV0_I | |||
| fmla d1, d25, alphaV0_R | |||
| st2 {v0.d, v1.d}[0], [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul d4, d28, alphaV0_R | |||
| fmls d4, d29, alphaV0_I | |||
| fmul d5, d28, alphaV1_I | |||
| fmla d5, d29, alphaV1_R | |||
| fmul d5, d28, alphaV0_I | |||
| fmla d5, d29, alphaV0_R | |||
| st2 {v4.d, v5.d}[0], [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -765,37 +791,35 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x2 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| fmul v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmul v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmul v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| fmul v2.2d, v18.2d, alphaV0_R | |||
| fmls v2.2d, v19.2d, alphaV0_I | |||
| fmul v3.2d, v18.2d, alphaV1_I | |||
| fmla v3.2d, v19.2d, alphaV1_R | |||
| fmul v3.2d, v18.2d, alphaV0_I | |||
| fmla v3.2d, v19.2d, alphaV0_R | |||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul v4.2d, v20.2d, alphaV0_R | |||
| fmls v4.2d, v21.2d, alphaV0_I | |||
| fmul v5.2d, v20.2d, alphaV1_I | |||
| fmla v5.2d, v21.2d, alphaV1_R | |||
| fmul v5.2d, v20.2d, alphaV0_I | |||
| fmla v5.2d, v21.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| fmul v6.2d, v22.2d, alphaV0_R | |||
| fmls v6.2d, v23.2d, alphaV0_I | |||
| fmul v7.2d, v22.2d, alphaV1_I | |||
| fmla v7.2d, v23.2d, alphaV1_R | |||
| fmul v7.2d, v22.2d, alphaV0_I | |||
| fmla v7.2d, v23.2d, alphaV0_R | |||
| st2 {v6.2d, v7.2d}, [pCRow2] | |||
| add pCRow0, pCRow0, #64 | |||
| @@ -828,25 +852,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x2 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| fmul v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmul v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmul v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul v4.2d, v20.2d, alphaV0_R | |||
| fmls v4.2d, v21.2d, alphaV0_I | |||
| fmul v5.2d, v20.2d, alphaV1_I | |||
| fmla v5.2d, v21.2d, alphaV1_R | |||
| fmul v5.2d, v20.2d, alphaV0_I | |||
| fmla v5.2d, v21.2d, alphaV0_R | |||
| st2 {v4.2d, v5.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -879,25 +901,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x2 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| fmul d0, d16, alphaV0_R | |||
| fmls d0, d17, alphaV0_I | |||
| fmul d1, d16, alphaV1_I | |||
| fmla d1, d17, alphaV1_R | |||
| fmul d1, d16, alphaV0_I | |||
| fmla d1, d17, alphaV0_R | |||
| st2 {v0.d, v1.d}[0], [pCRow1] | |||
| add pCRow1, pCRow1, LDC | |||
| fmul d4, d20, alphaV0_R | |||
| fmls d4, d21, alphaV0_I | |||
| fmul d5, d20, alphaV1_I | |||
| fmla d5, d21, alphaV1_R | |||
| fmul d5, d20, alphaV0_I | |||
| fmla d5, d21, alphaV0_R | |||
| st2 {v4.d, v5.d}[0], [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -932,23 +952,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE4x1 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| fmul v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmul v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmul v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, #32 | |||
| fmul v2.2d, v18.2d, alphaV0_R | |||
| fmls v2.2d, v19.2d, alphaV0_I | |||
| fmul v3.2d, v18.2d, alphaV1_I | |||
| fmla v3.2d, v19.2d, alphaV1_R | |||
| fmul v3.2d, v18.2d, alphaV0_I | |||
| fmla v3.2d, v19.2d, alphaV0_R | |||
| st2 {v2.2d, v3.2d}, [pCRow2] | |||
| add pCRow0, pCRow0, #64 | |||
| @@ -974,17 +992,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE2x1 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| fmul v0.2d, v16.2d, alphaV0_R | |||
| fmls v0.2d, v17.2d, alphaV0_I | |||
| fmul v1.2d, v16.2d, alphaV1_I | |||
| fmla v1.2d, v17.2d, alphaV1_R | |||
| fmul v1.2d, v16.2d, alphaV0_I | |||
| fmla v1.2d, v17.2d, alphaV0_R | |||
| st2 {v0.2d, v1.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -1011,17 +1027,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x1 | |||
| fmov alpha0_R, alpha_save_R | |||
| fmov alpha0_I, alpha_save_I | |||
| fmov alpha1_R, alpha0_R | |||
| fmov alpha1_I, alpha0_I | |||
| fmov alpha0_R, alphaR | |||
| fmov alpha0_I, alphaI | |||
| mov pCRow1, pCRow0 | |||
| fmul d0, d16, alphaV0_R | |||
| fmls d0, d17, alphaV0_I | |||
| fmul d1, d16, alphaV1_I | |||
| fmla d1, d17, alphaV1_R | |||
| fmul d1, d16, alphaV0_I | |||
| fmla d1, d17, alphaV0_R | |||
| st2 {v0.d, v1.d}[0], [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -1047,8 +1061,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| fmov alpha_save_R, d0 | |||
| fmov alpha_save_I, d1 | |||
| prfm PLDL1KEEP, [origPB] | |||
| prfm PLDL1KEEP, [origPA] | |||
| fmov alphaR, d0 | |||
| fmov alphaI, d1 | |||
| lsl LDC, LDC, #4 // ldc = ldc * 2 * 8 | |||
| @@ -1064,8 +1081,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ble ztrmm_kernel_L2_BEGIN | |||
| ztrmm_kernel_L4_BEGIN: | |||
| mov pCRow0, pC // pCRow0 = C | |||
| add pC, pC, LDC, lsl #2 | |||
| mov pCRow0, pC | |||
| add pCRow1, pCRow0, LDC | |||
| add pCRow2, pCRow1, LDC | |||
| add pCRow3, pCRow2, LDC | |||
| add pC, pCRow3, LDC | |||
| #if defined(LEFT) | |||
| mov tempOffset, offset | |||
| @@ -1079,6 +1101,7 @@ ztrmm_kernel_L4_M4_BEGIN: | |||
| cmp counterI, #0 | |||
| ble ztrmm_kernel_L4_M2_BEGIN | |||
| .align 5 | |||
| ztrmm_kernel_L4_M4_20: | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| @@ -1098,39 +1121,64 @@ ztrmm_kernel_L4_M4_20: | |||
| add tempK, tempOffset, #4 | |||
| #endif | |||
| asr counterL , tempK, #1 // L = K / 2 | |||
| cmp counterL , #2 // is there at least 4 to do? | |||
| asr counterL , tempK, #3 | |||
| cmp counterL , #2 | |||
| blt ztrmm_kernel_L4_M4_32 | |||
| KERNEL4x4_I // do one in the K | |||
| KERNEL4x4_M2 // do another in the K | |||
| KERNEL4x4_I | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| subs counterL, counterL, #2 | |||
| ble ztrmm_kernel_L4_M4_22a | |||
| .align 5 | |||
| .align 5 | |||
| ztrmm_kernel_L4_M4_22: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| subs counterL, counterL, #1 | |||
| bgt ztrmm_kernel_L4_M4_22 | |||
| .align 5 | |||
| ztrmm_kernel_L4_M4_22a: | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_E | |||
| b ztrmm_kernel_L4_M4_44 | |||
| .align 5 | |||
| ztrmm_kernel_L4_M4_32: | |||
| tst counterL, #1 | |||
| ble ztrmm_kernel_L4_M4_40 | |||
| KERNEL4x4_I | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_M2 | |||
| KERNEL4x4_M1 | |||
| KERNEL4x4_E | |||
| b ztrmm_kernel_L4_M4_44 | |||
| @@ -1142,12 +1190,16 @@ ztrmm_kernel_L4_M4_40: | |||
| ztrmm_kernel_L4_M4_44: | |||
| ands counterL , tempK, #1 | |||
| ands counterL , tempK, #7 | |||
| ble ztrmm_kernel_L4_M4_100 | |||
| .align 5 | |||
| ztrmm_kernel_L4_M4_46: | |||
| KERNEL4x4_SUB | |||
| subs counterL, counterL, #1 | |||
| bne ztrmm_kernel_L4_M4_46 | |||
| ztrmm_kernel_L4_M4_100: | |||
| SAVE4x4 | |||
| @@ -1167,6 +1219,10 @@ ztrmm_kernel_L4_M4_100: | |||
| add tempOffset, tempOffset, #4 | |||
| #endif | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| ztrmm_kernel_L4_M4_END: | |||
| subs counterI, counterI, #1 | |||
| bne ztrmm_kernel_L4_M4_20 | |||
| @@ -2341,13 +2341,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| #define DGEMM_DEFAULT_UNROLL_M 4 | |||
| #define DGEMM_DEFAULT_UNROLL_M 8 | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| #define CGEMM_DEFAULT_UNROLL_M 4 | |||
| #define CGEMM_DEFAULT_UNROLL_M 8 | |||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 4 | |||