| @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.4) | |||
| project(OpenBLAS) | |||
| set(OpenBLAS_MAJOR_VERSION 0) | |||
| set(OpenBLAS_MINOR_VERSION 2) | |||
| set(OpenBLAS_PATCH_VERSION 17) | |||
| set(OpenBLAS_PATCH_VERSION 18) | |||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | |||
| enable_language(ASM) | |||
| @@ -147,5 +147,6 @@ In chronological order: | |||
| * [2016-03-14] Additional functional Assembly Kernels for Cortex-A57 | |||
| * [2016-03-14] Optimize Dgemm 4x4 for Cortex-A57 | |||
| * [Your name or handle] <[email or website]> | |||
| * [Date] [Brief summary of your changes] | |||
| * theoractice <https://github.com/theoractice/> | |||
| * [2016-03-20] Fix compiler error in VisualStudio with CMake | |||
| * [2016-03-22] Fix access violation on Windows while static linking | |||
| @@ -1,4 +1,22 @@ | |||
| OpenBLAS ChangeLog | |||
| ==================================================================== | |||
| Version 0.2.18 | |||
| 12-Apr-2016 | |||
| common: | |||
| * If you set the MAKE_NB_JOBS flag to a value less than or equal to zero, | |||
| make will be run without -j. | |||
| x86/x86_64: | |||
| * Support building Visual Studio static library. (#813, Thanks, theoractice) | |||
| * Fix bugs to pass buildbot CI tests (http://build.openblas.net) | |||
| ARM: | |||
| * Provide DGEMM 8x4 kernel for Cortex-A57 (Thanks, Ashwin Sekhar T K) | |||
| POWER: | |||
| * Optimize S and C BLAS3 on Power8 | |||
| * Optimize BLAS2/1 on Power8 | |||
| ==================================================================== | |||
| Version 0.2.17 | |||
| 20-Mar-2016 | |||
| @@ -3,7 +3,7 @@ | |||
| # | |||
| # This library's version | |||
| VERSION = 0.2.17 | |||
| VERSION = 0.2.18 | |||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | |||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | |||
| @@ -112,7 +112,10 @@ NO_AFFINITY = 1 | |||
| # NO_PARALLEL_MAKE = 1 | |||
| # Force number of make jobs. The default is the number of logical CPU of the host. | |||
| # This is particularly useful when using distcc | |||
| # This is particularly useful when using distcc. | |||
| # A zero or negative value disables adding a -j flag to make, so that a parent | |||
| # make's -j value is used instead. This is useful when calling the OpenBLAS make | |||
| # from another project's makefile. | |||
| # MAKE_NB_JOBS = 2 | |||
| # If you would like to know minute performance report of GotoBLAS. | |||
| @@ -1,4 +1,4 @@ | |||
| version: 0.2.15.{build} | |||
| version: 0.2.18.{build} | |||
| #environment: | |||
| @@ -33,6 +33,10 @@ LIBMKL = -L$(MKL) -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp -lpthread | |||
| # Apple vecLib | |||
| LIBVECLIB = -framework Accelerate | |||
| ESSL=/opt/ibm/lib | |||
| #LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a | |||
| LIBESSL = -lesslsmp $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a | |||
| ifeq ($(OSNAME), WINNT) | |||
| goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | |||
| @@ -44,6 +48,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | |||
| ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ | |||
| sger.goto dger.goto cger.goto zger.goto \ | |||
| sdot.goto ddot.goto \ | |||
| srot.goto drot.goto \ | |||
| saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ | |||
| scopy.goto dcopy.goto ccopy.goto zcopy.goto \ | |||
| sswap.goto dswap.goto cswap.goto zswap.goto \ | |||
| @@ -151,6 +156,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | |||
| ssyr2k.goto dsyr2k.goto csyr2k.goto zsyr2k.goto \ | |||
| sger.goto dger.goto cger.goto zger.goto \ | |||
| sdot.goto ddot.goto cdot.goto zdot.goto \ | |||
| srot.goto drot.goto \ | |||
| saxpy.goto daxpy.goto caxpy.goto zaxpy.goto \ | |||
| scopy.goto dcopy.goto ccopy.goto zcopy.goto \ | |||
| sswap.goto dswap.goto cswap.goto zswap.goto \ | |||
| @@ -253,7 +259,9 @@ mkl :: slinpack.mkl dlinpack.mkl clinpack.mkl zlinpack.mkl \ | |||
| endif | |||
| essl :: sgemm.essl strmm.essl dgemm.essl dtrmm.essl \ | |||
| cgemm.essl ctrmm.essl zgemm.essl ztrmm.essl \ | |||
| slinpack.essl clinpack.essl dlinpack.essl zlinpack.essl | |||
| veclib :: slinpack.veclib dlinpack.veclib clinpack.veclib zlinpack.veclib \ | |||
| scholesky.veclib dcholesky.veclib ccholesky.veclib zcholesky.veclib \ | |||
| @@ -306,6 +314,9 @@ slinpack.mkl : slinpack.$(SUFFIX) | |||
| slinpack.veclib : slinpack.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| slinpack.essl : slinpack.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Dlinpack #################################################### | |||
| dlinpack.goto : dlinpack.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| @@ -322,6 +333,9 @@ dlinpack.mkl : dlinpack.$(SUFFIX) | |||
| dlinpack.veclib : dlinpack.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| dlinpack.essl : dlinpack.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Clinpack #################################################### | |||
| clinpack.goto : clinpack.$(SUFFIX) ../$(LIBNAME) | |||
| @@ -339,6 +353,9 @@ clinpack.mkl : clinpack.$(SUFFIX) | |||
| clinpack.veclib : clinpack.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| clinpack.essl : clinpack.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Zlinpack #################################################### | |||
| zlinpack.goto : zlinpack.$(SUFFIX) ../$(LIBNAME) | |||
| @@ -356,6 +373,9 @@ zlinpack.mkl : zlinpack.$(SUFFIX) | |||
| zlinpack.veclib : zlinpack.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| zlinpack.essl : zlinpack.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Scholesky ################################################### | |||
| scholesky.goto : scholesky.$(SUFFIX) ../$(LIBNAME) | |||
| @@ -441,6 +461,9 @@ sgemm.mkl : sgemm.$(SUFFIX) | |||
| sgemm.veclib : sgemm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| sgemm.essl : sgemm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Dgemm #################################################### | |||
| dgemm.goto : dgemm.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| @@ -457,6 +480,9 @@ dgemm.mkl : dgemm.$(SUFFIX) | |||
| dgemm.veclib : dgemm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| dgemm.essl : dgemm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Cgemm #################################################### | |||
| cgemm.goto : cgemm.$(SUFFIX) ../$(LIBNAME) | |||
| @@ -474,6 +500,9 @@ cgemm.mkl : cgemm.$(SUFFIX) | |||
| cgemm.veclib : cgemm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| cgemm.essl : cgemm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Zgemm #################################################### | |||
| zgemm.goto : zgemm.$(SUFFIX) ../$(LIBNAME) | |||
| @@ -491,6 +520,9 @@ zgemm.mkl : zgemm.$(SUFFIX) | |||
| zgemm.veclib : zgemm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| zgemm.essl : zgemm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Ssymm #################################################### | |||
| ssymm.goto : ssymm.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| @@ -573,6 +605,9 @@ strmm.mkl : strmm.$(SUFFIX) | |||
| strmm.veclib : strmm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| strmm.essl : strmm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Dtrmm #################################################### | |||
| dtrmm.goto : dtrmm.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| @@ -589,6 +624,9 @@ dtrmm.mkl : dtrmm.$(SUFFIX) | |||
| dtrmm.veclib : dtrmm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| dtrmm.essl : dtrmm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Ctrmm #################################################### | |||
| ctrmm.goto : ctrmm.$(SUFFIX) ../$(LIBNAME) | |||
| @@ -606,6 +644,9 @@ ctrmm.mkl : ctrmm.$(SUFFIX) | |||
| ctrmm.veclib : ctrmm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ctrmm.essl : ctrmm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Ztrmm #################################################### | |||
| ztrmm.goto : ztrmm.$(SUFFIX) ../$(LIBNAME) | |||
| @@ -623,6 +664,9 @@ ztrmm.mkl : ztrmm.$(SUFFIX) | |||
| ztrmm.veclib : ztrmm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ztrmm.essl : ztrmm.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Strsm #################################################### | |||
| strsm.goto : strsm.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| @@ -1413,6 +1457,39 @@ zdot.mkl : zdot-intel.$(SUFFIX) | |||
| zdot.veclib : zdot-intel.$(SUFFIX) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Srot #################################################### | |||
| srot.goto : srot.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| srot.acml : srot.$(SUFFIX) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| srot.atlas : srot.$(SUFFIX) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| srot.mkl : srot.$(SUFFIX) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| srot.veclib : srot.$(SUFFIX) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Drot #################################################### | |||
| drot.goto : drot.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| drot.acml : drot.$(SUFFIX) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| drot.atlas : drot.$(SUFFIX) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(LIBATLAS) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| drot.mkl : drot.$(SUFFIX) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(LIBMKL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| drot.veclib : drot.$(SUFFIX) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(LIBVECLIB) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Saxpy #################################################### | |||
| saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| @@ -2124,6 +2201,13 @@ cgesv.$(SUFFIX) : gesv.c | |||
| zgesv.$(SUFFIX) : gesv.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| srot.$(SUFFIX) : rot.c | |||
| $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ | |||
| drot.$(SUFFIX) : rot.c | |||
| $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| @@ -2137,7 +2221,7 @@ smallscaling: smallscaling.c ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(EXTRALIB) -fopenmp -lm | |||
| clean :: | |||
| @rm -f *.goto *.mkl *.acml *.atlas *.veclib | |||
| @rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl | |||
| include $(TOPDIR)/Makefile.tail | |||
| @@ -0,0 +1,197 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdio.h> | |||
| #include <stdlib.h> | |||
| #ifdef __CYGWIN32__ | |||
| #include <sys/time.h> | |||
| #endif | |||
| #include "common.h" | |||
| /* Select the BLAS routine under test: drot when DOUBLE is defined, srot otherwise. | |||
| Any ROT macro already provided by an included header is dropped first so that | |||
| the definition below does not clash with it. */ | |||
| #undef ROT | |||
| #ifdef DOUBLE | |||
| #define ROT BLASFUNC(drot) | |||
| #else | |||
| #define ROT BLASFUNC(srot) | |||
| #endif | |||
| #if defined(__WIN32__) || defined(__WIN64__) | |||
| #ifndef DELTA_EPOCH_IN_MICROSECS | |||
| #define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL | |||
| #endif | |||
| /* Windows stand-in for gettimeofday(): stores the current system time in *tv as | |||
| seconds and microseconds relative to the Unix epoch. The tz argument is ignored. | |||
| When tv is NULL nothing is written. Always returns 0. */ | |||
| int gettimeofday(struct timeval *tv, void *tz){ | |||
|   FILETIME ft; | |||
|   unsigned __int64 tmpres = 0; | |||
|   if (NULL != tv) | |||
|   { | |||
|     GetSystemTimeAsFileTime(&ft); | |||
|     /* assemble one 64-bit value from the high and low halves of the FILETIME */ | |||
|     tmpres |= ft.dwHighDateTime; | |||
|     tmpres <<= 32; | |||
|     tmpres |= ft.dwLowDateTime; | |||
|     /*converting file time to unix epoch*/ | |||
|     tmpres /= 10; /*convert into microseconds*/ | |||
|     tmpres -= DELTA_EPOCH_IN_MICROSECS; | |||
|     /* split the microsecond count into whole seconds and the remainder */ | |||
|     tv->tv_sec = (long)(tmpres / 1000000UL); | |||
|     tv->tv_usec = (long)(tmpres % 1000000UL); | |||
|   } | |||
|   return 0; | |||
| } | |||
| #endif | |||
| #if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 | |||
| static void *huge_malloc(BLASLONG size){ /* Obtains a buffer for size bytes from a private shared-memory segment created with SHM_HUGETLB; prints a message and exits on failure. Currently compiled out by the enclosing "#if ... && 0". */ | |||
| int shmid; | |||
| void *address; | |||
| #ifndef SHM_HUGETLB | |||
| #define SHM_HUGETLB 04000 | |||
| #endif | |||
| if ((shmid =shmget(IPC_PRIVATE, | |||
| (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), /* rounds the request to a multiple of HUGE_PAGESIZE (assumes it is a power of two -- TODO confirm); a full page is added first, so an exact multiple grows by one page */ | |||
| SHM_HUGETLB | IPC_CREAT |0600)) < 0) { | |||
| printf( "Memory allocation failed(shmget).\n"); | |||
| exit(1); | |||
| } | |||
| address = shmat(shmid, NULL, SHM_RND); | |||
| if ((BLASLONG)address == -1){ /* shmat failure is detected by comparing the returned pointer with -1 */ | |||
| printf( "Memory allocation failed(shmat).\n"); | |||
| exit(1); | |||
| } | |||
| shmctl(shmid, IPC_RMID, 0); /* requests removal of the segment id right after attaching; the attached address is still returned to the caller */ | |||
| return address; | |||
| } | |||
| #define malloc huge_malloc | |||
| #endif | |||
| /* Benchmark driver for ROT (srot or drot, chosen by the ROT macro). | |||
| Optional command-line arguments, in order: from, to, step -- the range of vector | |||
| sizes to time (defaults 1, 200, 1; "to" is raised to "from" if it is smaller). | |||
| Environment: OPENBLAS_LOOPS (repetitions per size), OPENBLAS_INCX and | |||
| OPENBLAS_INCY (vector strides). | |||
| For every size the vectors are refilled with values in [-0.5, 0.5], ROT is timed | |||
| with gettimeofday, and the average rate is printed to stderr in MFlops. | |||
| Returns 0; exits with status 1 if a buffer cannot be allocated. */ | |||
| int main(int argc, char *argv[]){ | |||
|   FLOAT *x, *y; | |||
|   blasint m, i; | |||
|   blasint inc_x=1,inc_y=1; | |||
|   FLOAT c[1] = { 2.0 }; | |||
|   FLOAT s[1] = { 2.0 }; | |||
|   int loops = 1; | |||
|   int l; | |||
|   char *p; | |||
|   int from = 1; | |||
|   int to = 200; | |||
|   int step = 1; | |||
|   struct timeval start, stop; | |||
|   double time1,timeg; | |||
|   argc--;argv++; | |||
|   if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
|   if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
|   if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
|   if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
|   if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); | |||
|   /* A step below 1 would keep the size loop from ever reaching "to", and a loop | |||
|   count below 1 would make the average below divide by zero or a negative | |||
|   number, so both are clamped to 1. */ | |||
|   if (step < 1) step = 1; | |||
|   if (loops < 1) loops = 1; | |||
|   fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); | |||
|   /* both buffers are sized for the largest problem so they can be reused for every m */ | |||
|   if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
|   if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ | |||
|     fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
|   } | |||
| #ifdef linux | |||
|   srandom(getpid()); | |||
| #endif | |||
|   fprintf(stderr, " SIZE Flops\n"); | |||
|   for(m = from; m <= to; m += step) | |||
|   { | |||
|     timeg=0; | |||
|     fprintf(stderr, " %6d : ", (int)m); | |||
|     for (l=0; l<loops; l++) | |||
|     { | |||
|       /* refill the inputs before each timed call; only the ROT call itself is timed */ | |||
|       for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
|         x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|       for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ | |||
|         y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
|       } | |||
|       gettimeofday( &start, (struct timezone *)0); | |||
|       ROT (&m, x, &inc_x, y, &inc_y, c, s); | |||
|       gettimeofday( &stop, (struct timezone *)0); | |||
|       time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
|       timeg += time1; | |||
|     } | |||
|     /* average time per call over all repetitions */ | |||
|     timeg /= loops; | |||
|     fprintf(stderr, | |||
|     " %10.2f MFlops\n", | |||
|     COMPSIZE * COMPSIZE * 6. * (double)m / timeg * 1.e-6); | |||
|   } | |||
|   return 0; | |||
| } | |||
| // void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); | |||
| @@ -798,7 +798,7 @@ Lmcount$lazy_ptr: | |||
| #elif defined(PPC440FP2) | |||
| #define BUFFER_SIZE ( 16 << 20) | |||
| #elif defined(POWER8) | |||
| #define BUFFER_SIZE ( 64 << 20) | |||
| #define BUFFER_SIZE ( 32 << 20) | |||
| #else | |||
| #define BUFFER_SIZE ( 16 << 20) | |||
| #endif | |||
| @@ -62,7 +62,7 @@ static void __inline blas_lock(volatile BLASULONG *address){ | |||
| #if defined(_MSC_VER) && !defined(__clang__) | |||
| // use intrinsic instead of inline assembly | |||
| ret = _InterlockedExchange(address, 1); | |||
| ret = _InterlockedExchange((volatile LONG *)address, 1); | |||
| // inline assembly | |||
| /*__asm { | |||
| mov eax, address | |||
| @@ -1452,6 +1452,31 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser | |||
| } | |||
| return TRUE; | |||
| } | |||
| /* | |||
| This is to allow static linking. | |||
| Code adapted from Google performance tools: | |||
| https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc | |||
| Reference: | |||
| https://sourceware.org/ml/pthreads-win32/2008/msg00028.html | |||
| http://ci.boost.org/svn-trac/browser/trunk/libs/thread/src/win32/tss_pe.cpp | |||
| */ | |||
| static int on_process_term(void) /* Calls gotoblas_quit() and returns 0. Its address is stored in p_process_term in the .CRT$XTU section further down, presumably so the C runtime invokes it at process termination in static builds -- confirm. */ | |||
| { | |||
| gotoblas_quit(); | |||
| return 0; | |||
| } | |||
| #ifdef _WIN64 | |||
| #pragma comment(linker, "/INCLUDE:_tls_used") | |||
| #else | |||
| #pragma comment(linker, "/INCLUDE:__tls_used") | |||
| #endif | |||
| #pragma data_seg(push, old_seg) | |||
| #pragma data_seg(".CRT$XLB") | |||
| static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; | |||
| #pragma data_seg(".CRT$XTU") | |||
| static int(*p_process_term)(void) = on_process_term; | |||
| #pragma data_seg(pop, old_seg) | |||
| #endif | |||
| #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64)) | |||
| @@ -1013,7 +1013,12 @@ int main(int argc, char *argv[]){ | |||
| #endif | |||
| #ifdef MAKE_NB_JOBS | |||
| #if MAKE_NB_JOBS > 0 | |||
| printf("MAKE += -j %d\n", MAKE_NB_JOBS); | |||
| #else | |||
| // Let make use parent -j argument or -j1 if there | |||
| // is no make parent | |||
| #endif | |||
| #elif NO_PARALLEL_MAKE==1 | |||
| printf("MAKE += -j 1\n"); | |||
| #else | |||
| @@ -64,10 +64,13 @@ int main(int argc, char **argv) { | |||
| if ((argc >= 2) && (*argv[1] == '1')) { | |||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) | |||
| printf("#define SLOCAL_BUFFER_SIZE\t%ld\n", (SGEMM_DEFAULT_Q * SGEMM_DEFAULT_UNROLL_N * 4 * 1 * sizeof(float))); | |||
| printf("#define DLOCAL_BUFFER_SIZE\t%ld\n", (DGEMM_DEFAULT_Q * DGEMM_DEFAULT_UNROLL_N * 2 * 1 * sizeof(double))); | |||
| printf("#define CLOCAL_BUFFER_SIZE\t%ld\n", (CGEMM_DEFAULT_Q * CGEMM_DEFAULT_UNROLL_N * 4 * 2 * sizeof(float))); | |||
| printf("#define ZLOCAL_BUFFER_SIZE\t%ld\n", (ZGEMM_DEFAULT_Q * ZGEMM_DEFAULT_UNROLL_N * 2 * 2 * sizeof(double))); | |||
| #endif | |||
| #ifdef USE64BITINT | |||
| printf("#define USE64BITINT\n"); | |||
| @@ -179,93 +179,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v2.4s, v3.4s}, [ppA] | |||
| add ppA, ppA, #32 | |||
| fmul v16.4s, v0.4s, v8.4s[0] | |||
| OP_ii v16.4s, v1.4s, v9.4s[0] | |||
| fmul v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v9.s[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v17.16b, v17.16b, v17.16b | |||
| fmls v17.4s, v0.4s, v9.4s[0] | |||
| fmls v17.4s, v0.4s, v9.s[0] | |||
| #else | |||
| fmul v17.4s, v0.4s, v9.4s[0] | |||
| fmul v17.4s, v0.4s, v9.s[0] | |||
| #endif | |||
| OP_ir v17.4s, v1.4s, v8.4s[0] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| fmul v20.4s, v0.4s, v8.4s[1] | |||
| OP_ii v20.4s, v1.4s, v9.4s[1] | |||
| fmul v20.4s, v0.4s, v8.s[1] | |||
| OP_ii v20.4s, v1.4s, v9.s[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v21.16b, v21.16b, v21.16b | |||
| fmls v21.4s, v0.4s, v9.4s[1] | |||
| fmls v21.4s, v0.4s, v9.s[1] | |||
| #else | |||
| fmul v21.4s, v0.4s, v9.4s[1] | |||
| fmul v21.4s, v0.4s, v9.s[1] | |||
| #endif | |||
| OP_ir v21.4s, v1.4s, v8.4s[1] | |||
| OP_ir v21.4s, v1.4s, v8.s[1] | |||
| fmul v24.4s, v0.4s, v8.4s[2] | |||
| OP_ii v24.4s, v1.4s, v9.4s[2] | |||
| fmul v24.4s, v0.4s, v8.s[2] | |||
| OP_ii v24.4s, v1.4s, v9.s[2] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fmls v25.4s, v0.4s, v9.4s[2] | |||
| fmls v25.4s, v0.4s, v9.s[2] | |||
| #else | |||
| fmul v25.4s, v0.4s, v9.4s[2] | |||
| fmul v25.4s, v0.4s, v9.s[2] | |||
| #endif | |||
| OP_ir v25.4s, v1.4s, v8.4s[2] | |||
| OP_ir v25.4s, v1.4s, v8.s[2] | |||
| fmul v28.4s, v0.4s, v8.4s[3] | |||
| OP_ii v28.4s, v1.4s, v9.4s[3] | |||
| fmul v28.4s, v0.4s, v8.s[3] | |||
| OP_ii v28.4s, v1.4s, v9.s[3] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v29.16b, v29.16b, v29.16b | |||
| fmls v29.4s, v0.4s, v9.4s[3] | |||
| fmls v29.4s, v0.4s, v9.s[3] | |||
| #else | |||
| fmul v29.4s, v0.4s, v9.4s[3] | |||
| fmul v29.4s, v0.4s, v9.s[3] | |||
| #endif | |||
| OP_ir v29.4s, v1.4s, v8.4s[3] | |||
| OP_ir v29.4s, v1.4s, v8.s[3] | |||
| fmul v18.4s, v2.4s, v8.4s[0] | |||
| OP_ii v18.4s, v3.4s, v9.4s[0] | |||
| fmul v18.4s, v2.4s, v8.s[0] | |||
| OP_ii v18.4s, v3.4s, v9.s[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v19.16b, v19.16b, v19.16b | |||
| fmls v19.4s, v2.4s, v9.4s[0] | |||
| fmls v19.4s, v2.4s, v9.s[0] | |||
| #else | |||
| fmul v19.4s, v2.4s, v9.4s[0] | |||
| fmul v19.4s, v2.4s, v9.s[0] | |||
| #endif | |||
| OP_ir v19.4s, v3.4s, v8.4s[0] | |||
| OP_ir v19.4s, v3.4s, v8.s[0] | |||
| fmul v22.4s, v2.4s, v8.4s[1] | |||
| OP_ii v22.4s, v3.4s, v9.4s[1] | |||
| fmul v22.4s, v2.4s, v8.s[1] | |||
| OP_ii v22.4s, v3.4s, v9.s[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v23.16b, v23.16b, v23.16b | |||
| fmls v23.4s, v2.4s, v9.4s[1] | |||
| fmls v23.4s, v2.4s, v9.s[1] | |||
| #else | |||
| fmul v23.4s, v2.4s, v9.4s[1] | |||
| fmul v23.4s, v2.4s, v9.s[1] | |||
| #endif | |||
| OP_ir v23.4s, v3.4s, v8.4s[1] | |||
| OP_ir v23.4s, v3.4s, v8.s[1] | |||
| fmul v26.4s, v2.4s, v8.4s[2] | |||
| OP_ii v26.4s, v3.4s, v9.4s[2] | |||
| fmul v26.4s, v2.4s, v8.s[2] | |||
| OP_ii v26.4s, v3.4s, v9.s[2] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v27.16b, v27.16b, v27.16b | |||
| fmls v27.4s, v2.4s, v9.4s[2] | |||
| fmls v27.4s, v2.4s, v9.s[2] | |||
| #else | |||
| fmul v27.4s, v2.4s, v9.4s[2] | |||
| fmul v27.4s, v2.4s, v9.s[2] | |||
| #endif | |||
| OP_ir v27.4s, v3.4s, v8.4s[2] | |||
| OP_ir v27.4s, v3.4s, v8.s[2] | |||
| fmul v30.4s, v2.4s, v8.4s[3] | |||
| OP_ii v30.4s, v3.4s, v9.4s[3] | |||
| fmul v30.4s, v2.4s, v8.s[3] | |||
| OP_ii v30.4s, v3.4s, v9.s[3] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v31.16b, v31.16b, v31.16b | |||
| fmls v31.4s, v2.4s, v9.4s[3] | |||
| fmls v31.4s, v2.4s, v9.s[3] | |||
| #else | |||
| fmul v31.4s, v2.4s, v9.4s[3] | |||
| fmul v31.4s, v2.4s, v9.s[3] | |||
| #endif | |||
| OP_ir v31.4s, v3.4s, v8.4s[3] | |||
| OP_ir v31.4s, v3.4s, v8.s[3] | |||
| ld2 {v12.4s, v13.4s}, [pB] | |||
| add pB, pB, #32 | |||
| @@ -276,159 +276,159 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x4_M1 | |||
| OP_rr v16.4s, v0.4s, v8.4s[0] | |||
| OP_ii v16.4s, v1.4s, v9.4s[0] | |||
| OP_ri v17.4s, v0.4s, v9.4s[0] | |||
| OP_ir v17.4s, v1.4s, v8.4s[0] | |||
| OP_rr v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v9.s[0] | |||
| OP_ri v17.4s, v0.4s, v9.s[0] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| ld2 {v12.4s, v13.4s}, [pB] // for next round | |||
| add pB, pB, #32 | |||
| OP_rr v20.4s, v0.4s, v8.4s[1] | |||
| OP_ii v20.4s, v1.4s, v9.4s[1] | |||
| OP_ri v21.4s, v0.4s, v9.4s[1] | |||
| OP_ir v21.4s, v1.4s, v8.4s[1] | |||
| OP_rr v20.4s, v0.4s, v8.s[1] | |||
| OP_ii v20.4s, v1.4s, v9.s[1] | |||
| OP_ri v21.4s, v0.4s, v9.s[1] | |||
| OP_ir v21.4s, v1.4s, v8.s[1] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| OP_rr v24.4s, v0.4s, v8.4s[2] | |||
| OP_ii v24.4s, v1.4s, v9.4s[2] | |||
| OP_ri v25.4s, v0.4s, v9.4s[2] | |||
| OP_ir v25.4s, v1.4s, v8.4s[2] | |||
| OP_rr v24.4s, v0.4s, v8.s[2] | |||
| OP_ii v24.4s, v1.4s, v9.s[2] | |||
| OP_ri v25.4s, v0.4s, v9.s[2] | |||
| OP_ir v25.4s, v1.4s, v8.s[2] | |||
| ld2 {v4.4s, v5.4s} , [pA] // for next round | |||
| add pA, pA, #32 | |||
| OP_rr v28.4s, v0.4s, v8.4s[3] | |||
| OP_ii v28.4s, v1.4s, v9.4s[3] | |||
| OP_ri v29.4s, v0.4s, v9.4s[3] | |||
| OP_ir v29.4s, v1.4s, v8.4s[3] | |||
| OP_rr v28.4s, v0.4s, v8.s[3] | |||
| OP_ii v28.4s, v1.4s, v9.s[3] | |||
| OP_ri v29.4s, v0.4s, v9.s[3] | |||
| OP_ir v29.4s, v1.4s, v8.s[3] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| OP_rr v18.4s, v2.4s, v8.4s[0] | |||
| OP_ii v18.4s, v3.4s, v9.4s[0] | |||
| OP_ri v19.4s, v2.4s, v9.4s[0] | |||
| OP_ir v19.4s, v3.4s, v8.4s[0] | |||
| OP_rr v18.4s, v2.4s, v8.s[0] | |||
| OP_ii v18.4s, v3.4s, v9.s[0] | |||
| OP_ri v19.4s, v2.4s, v9.s[0] | |||
| OP_ir v19.4s, v3.4s, v8.s[0] | |||
| ld2 {v6.4s, v7.4s} , [ppA] // for next round | |||
| add ppA, ppA, #32 | |||
| OP_rr v22.4s, v2.4s, v8.4s[1] | |||
| OP_ii v22.4s, v3.4s, v9.4s[1] | |||
| OP_ri v23.4s, v2.4s, v9.4s[1] | |||
| OP_ir v23.4s, v3.4s, v8.4s[1] | |||
| OP_rr v22.4s, v2.4s, v8.s[1] | |||
| OP_ii v22.4s, v3.4s, v9.s[1] | |||
| OP_ri v23.4s, v2.4s, v9.s[1] | |||
| OP_ir v23.4s, v3.4s, v8.s[1] | |||
| prfm PLDL1KEEP, [ppA, #512] | |||
| OP_rr v26.4s, v2.4s, v8.4s[2] | |||
| OP_ii v26.4s, v3.4s, v9.4s[2] | |||
| OP_ri v27.4s, v2.4s, v9.4s[2] | |||
| OP_ir v27.4s, v3.4s, v8.4s[2] | |||
| OP_rr v26.4s, v2.4s, v8.s[2] | |||
| OP_ii v26.4s, v3.4s, v9.s[2] | |||
| OP_ri v27.4s, v2.4s, v9.s[2] | |||
| OP_ir v27.4s, v3.4s, v8.s[2] | |||
| OP_rr v30.4s, v2.4s, v8.4s[3] | |||
| OP_ii v30.4s, v3.4s, v9.4s[3] | |||
| OP_ri v31.4s, v2.4s, v9.4s[3] | |||
| OP_ir v31.4s, v3.4s, v8.4s[3] | |||
| OP_rr v30.4s, v2.4s, v8.s[3] | |||
| OP_ii v30.4s, v3.4s, v9.s[3] | |||
| OP_ri v31.4s, v2.4s, v9.s[3] | |||
| OP_ir v31.4s, v3.4s, v8.s[3] | |||
| .endm | |||
| .macro KERNEL8x4_M2 | |||
| OP_rr v16.4s, v4.4s, v12.4s[0] | |||
| OP_ii v16.4s, v5.4s, v13.4s[0] | |||
| OP_ri v17.4s, v4.4s, v13.4s[0] | |||
| OP_ir v17.4s, v5.4s, v12.4s[0] | |||
| OP_rr v16.4s, v4.4s, v12.s[0] | |||
| OP_ii v16.4s, v5.4s, v13.s[0] | |||
| OP_ri v17.4s, v4.4s, v13.s[0] | |||
| OP_ir v17.4s, v5.4s, v12.s[0] | |||
| ld2 {v8.4s, v9.4s}, [pB] // for next round | |||
| add pB, pB, #32 | |||
| OP_rr v20.4s, v4.4s, v12.4s[1] | |||
| OP_ii v20.4s, v5.4s, v13.4s[1] | |||
| OP_ri v21.4s, v4.4s, v13.4s[1] | |||
| OP_ir v21.4s, v5.4s, v12.4s[1] | |||
| OP_rr v20.4s, v4.4s, v12.s[1] | |||
| OP_ii v20.4s, v5.4s, v13.s[1] | |||
| OP_ri v21.4s, v4.4s, v13.s[1] | |||
| OP_ir v21.4s, v5.4s, v12.s[1] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| OP_rr v24.4s, v4.4s, v12.4s[2] | |||
| OP_ii v24.4s, v5.4s, v13.4s[2] | |||
| OP_ri v25.4s, v4.4s, v13.4s[2] | |||
| OP_ir v25.4s, v5.4s, v12.4s[2] | |||
| OP_rr v24.4s, v4.4s, v12.s[2] | |||
| OP_ii v24.4s, v5.4s, v13.s[2] | |||
| OP_ri v25.4s, v4.4s, v13.s[2] | |||
| OP_ir v25.4s, v5.4s, v12.s[2] | |||
| ld2 {v0.4s, v1.4s}, [pA] // for next round | |||
| add pA, pA, #32 | |||
| OP_rr v28.4s, v4.4s, v12.4s[3] | |||
| OP_ii v28.4s, v5.4s, v13.4s[3] | |||
| OP_ri v29.4s, v4.4s, v13.4s[3] | |||
| OP_ir v29.4s, v5.4s, v12.4s[3] | |||
| OP_rr v28.4s, v4.4s, v12.s[3] | |||
| OP_ii v28.4s, v5.4s, v13.s[3] | |||
| OP_ri v29.4s, v4.4s, v13.s[3] | |||
| OP_ir v29.4s, v5.4s, v12.s[3] | |||
| prfm PLDL1KEEP, [ppA, #512] | |||
| OP_rr v18.4s, v6.4s, v12.4s[0] | |||
| OP_ii v18.4s, v7.4s, v13.4s[0] | |||
| OP_ri v19.4s, v6.4s, v13.4s[0] | |||
| OP_ir v19.4s, v7.4s, v12.4s[0] | |||
| OP_rr v18.4s, v6.4s, v12.s[0] | |||
| OP_ii v18.4s, v7.4s, v13.s[0] | |||
| OP_ri v19.4s, v6.4s, v13.s[0] | |||
| OP_ir v19.4s, v7.4s, v12.s[0] | |||
| ld2 {v2.4s, v3.4s}, [ppA] // for next round | |||
| add ppA, ppA, #32 | |||
| OP_rr v22.4s, v6.4s, v12.4s[1] | |||
| OP_ii v22.4s, v7.4s, v13.4s[1] | |||
| OP_ri v23.4s, v6.4s, v13.4s[1] | |||
| OP_ir v23.4s, v7.4s, v12.4s[1] | |||
| OP_rr v22.4s, v6.4s, v12.s[1] | |||
| OP_ii v22.4s, v7.4s, v13.s[1] | |||
| OP_ri v23.4s, v6.4s, v13.s[1] | |||
| OP_ir v23.4s, v7.4s, v12.s[1] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| OP_rr v26.4s, v6.4s, v12.4s[2] | |||
| OP_ii v26.4s, v7.4s, v13.4s[2] | |||
| OP_ri v27.4s, v6.4s, v13.4s[2] | |||
| OP_ir v27.4s, v7.4s, v12.4s[2] | |||
| OP_rr v26.4s, v6.4s, v12.s[2] | |||
| OP_ii v26.4s, v7.4s, v13.s[2] | |||
| OP_ri v27.4s, v6.4s, v13.s[2] | |||
| OP_ir v27.4s, v7.4s, v12.s[2] | |||
| OP_rr v30.4s, v6.4s, v12.4s[3] | |||
| OP_ii v30.4s, v7.4s, v13.4s[3] | |||
| OP_ri v31.4s, v6.4s, v13.4s[3] | |||
| OP_ir v31.4s, v7.4s, v12.4s[3] | |||
| OP_rr v30.4s, v6.4s, v12.s[3] | |||
| OP_ii v30.4s, v7.4s, v13.s[3] | |||
| OP_ri v31.4s, v6.4s, v13.s[3] | |||
| OP_ir v31.4s, v7.4s, v12.s[3] | |||
| .endm | |||
| .macro KERNEL8x4_E | |||
| OP_rr v16.4s, v4.4s, v12.4s[0] | |||
| OP_ii v16.4s, v5.4s, v13.4s[0] | |||
| OP_ri v17.4s, v4.4s, v13.4s[0] | |||
| OP_ir v17.4s, v5.4s, v12.4s[0] | |||
| OP_rr v20.4s, v4.4s, v12.4s[1] | |||
| OP_ii v20.4s, v5.4s, v13.4s[1] | |||
| OP_ri v21.4s, v4.4s, v13.4s[1] | |||
| OP_ir v21.4s, v5.4s, v12.4s[1] | |||
| OP_rr v24.4s, v4.4s, v12.4s[2] | |||
| OP_ii v24.4s, v5.4s, v13.4s[2] | |||
| OP_ri v25.4s, v4.4s, v13.4s[2] | |||
| OP_ir v25.4s, v5.4s, v12.4s[2] | |||
| OP_rr v28.4s, v4.4s, v12.4s[3] | |||
| OP_ii v28.4s, v5.4s, v13.4s[3] | |||
| OP_ri v29.4s, v4.4s, v13.4s[3] | |||
| OP_ir v29.4s, v5.4s, v12.4s[3] | |||
| OP_rr v18.4s, v6.4s, v12.4s[0] | |||
| OP_ii v18.4s, v7.4s, v13.4s[0] | |||
| OP_ri v19.4s, v6.4s, v13.4s[0] | |||
| OP_ir v19.4s, v7.4s, v12.4s[0] | |||
| OP_rr v22.4s, v6.4s, v12.4s[1] | |||
| OP_ii v22.4s, v7.4s, v13.4s[1] | |||
| OP_ri v23.4s, v6.4s, v13.4s[1] | |||
| OP_ir v23.4s, v7.4s, v12.4s[1] | |||
| OP_rr v26.4s, v6.4s, v12.4s[2] | |||
| OP_ii v26.4s, v7.4s, v13.4s[2] | |||
| OP_ri v27.4s, v6.4s, v13.4s[2] | |||
| OP_ir v27.4s, v7.4s, v12.4s[2] | |||
| OP_rr v30.4s, v6.4s, v12.4s[3] | |||
| OP_ii v30.4s, v7.4s, v13.4s[3] | |||
| OP_ri v31.4s, v6.4s, v13.4s[3] | |||
| OP_ir v31.4s, v7.4s, v12.4s[3] | |||
| OP_rr v16.4s, v4.4s, v12.s[0] | |||
| OP_ii v16.4s, v5.4s, v13.s[0] | |||
| OP_ri v17.4s, v4.4s, v13.s[0] | |||
| OP_ir v17.4s, v5.4s, v12.s[0] | |||
| OP_rr v20.4s, v4.4s, v12.s[1] | |||
| OP_ii v20.4s, v5.4s, v13.s[1] | |||
| OP_ri v21.4s, v4.4s, v13.s[1] | |||
| OP_ir v21.4s, v5.4s, v12.s[1] | |||
| OP_rr v24.4s, v4.4s, v12.s[2] | |||
| OP_ii v24.4s, v5.4s, v13.s[2] | |||
| OP_ri v25.4s, v4.4s, v13.s[2] | |||
| OP_ir v25.4s, v5.4s, v12.s[2] | |||
| OP_rr v28.4s, v4.4s, v12.s[3] | |||
| OP_ii v28.4s, v5.4s, v13.s[3] | |||
| OP_ri v29.4s, v4.4s, v13.s[3] | |||
| OP_ir v29.4s, v5.4s, v12.s[3] | |||
| OP_rr v18.4s, v6.4s, v12.s[0] | |||
| OP_ii v18.4s, v7.4s, v13.s[0] | |||
| OP_ri v19.4s, v6.4s, v13.s[0] | |||
| OP_ir v19.4s, v7.4s, v12.s[0] | |||
| OP_rr v22.4s, v6.4s, v12.s[1] | |||
| OP_ii v22.4s, v7.4s, v13.s[1] | |||
| OP_ri v23.4s, v6.4s, v13.s[1] | |||
| OP_ir v23.4s, v7.4s, v12.s[1] | |||
| OP_rr v26.4s, v6.4s, v12.s[2] | |||
| OP_ii v26.4s, v7.4s, v13.s[2] | |||
| OP_ri v27.4s, v6.4s, v13.s[2] | |||
| OP_ir v27.4s, v7.4s, v12.s[2] | |||
| OP_rr v30.4s, v6.4s, v12.s[3] | |||
| OP_ii v30.4s, v7.4s, v13.s[3] | |||
| OP_ri v31.4s, v6.4s, v13.s[3] | |||
| OP_ir v31.4s, v7.4s, v12.s[3] | |||
| .endm | |||
| .macro KERNEL8x4_SUB | |||
| @@ -437,48 +437,48 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.4s, v1.4s}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.4s, v0.4s, v8.4s[0] | |||
| OP_ii v16.4s, v1.4s, v9.4s[0] | |||
| OP_ri v17.4s, v0.4s, v9.4s[0] | |||
| OP_ir v17.4s, v1.4s, v8.4s[0] | |||
| OP_rr v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v9.s[0] | |||
| OP_ri v17.4s, v0.4s, v9.s[0] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| OP_rr v20.4s, v0.4s, v8.4s[1] | |||
| OP_ii v20.4s, v1.4s, v9.4s[1] | |||
| OP_ri v21.4s, v0.4s, v9.4s[1] | |||
| OP_ir v21.4s, v1.4s, v8.4s[1] | |||
| OP_rr v20.4s, v0.4s, v8.s[1] | |||
| OP_ii v20.4s, v1.4s, v9.s[1] | |||
| OP_ri v21.4s, v0.4s, v9.s[1] | |||
| OP_ir v21.4s, v1.4s, v8.s[1] | |||
| ld2 {v2.4s, v3.4s}, [ppA] | |||
| add ppA, ppA, #32 | |||
| OP_rr v24.4s, v0.4s, v8.4s[2] | |||
| OP_ii v24.4s, v1.4s, v9.4s[2] | |||
| OP_ri v25.4s, v0.4s, v9.4s[2] | |||
| OP_ir v25.4s, v1.4s, v8.4s[2] | |||
| OP_rr v28.4s, v0.4s, v8.4s[3] | |||
| OP_ii v28.4s, v1.4s, v9.4s[3] | |||
| OP_ri v29.4s, v0.4s, v9.4s[3] | |||
| OP_ir v29.4s, v1.4s, v8.4s[3] | |||
| OP_rr v18.4s, v2.4s, v8.4s[0] | |||
| OP_ii v18.4s, v3.4s, v9.4s[0] | |||
| OP_ri v19.4s, v2.4s, v9.4s[0] | |||
| OP_ir v19.4s, v3.4s, v8.4s[0] | |||
| OP_rr v22.4s, v2.4s, v8.4s[1] | |||
| OP_ii v22.4s, v3.4s, v9.4s[1] | |||
| OP_ri v23.4s, v2.4s, v9.4s[1] | |||
| OP_ir v23.4s, v3.4s, v8.4s[1] | |||
| OP_rr v26.4s, v2.4s, v8.4s[2] | |||
| OP_ii v26.4s, v3.4s, v9.4s[2] | |||
| OP_ri v27.4s, v2.4s, v9.4s[2] | |||
| OP_ir v27.4s, v3.4s, v8.4s[2] | |||
| OP_rr v30.4s, v2.4s, v8.4s[3] | |||
| OP_ii v30.4s, v3.4s, v9.4s[3] | |||
| OP_ri v31.4s, v2.4s, v9.4s[3] | |||
| OP_ir v31.4s, v3.4s, v8.4s[3] | |||
| OP_rr v24.4s, v0.4s, v8.s[2] | |||
| OP_ii v24.4s, v1.4s, v9.s[2] | |||
| OP_ri v25.4s, v0.4s, v9.s[2] | |||
| OP_ir v25.4s, v1.4s, v8.s[2] | |||
| OP_rr v28.4s, v0.4s, v8.s[3] | |||
| OP_ii v28.4s, v1.4s, v9.s[3] | |||
| OP_ri v29.4s, v0.4s, v9.s[3] | |||
| OP_ir v29.4s, v1.4s, v8.s[3] | |||
| OP_rr v18.4s, v2.4s, v8.s[0] | |||
| OP_ii v18.4s, v3.4s, v9.s[0] | |||
| OP_ri v19.4s, v2.4s, v9.s[0] | |||
| OP_ir v19.4s, v3.4s, v8.s[0] | |||
| OP_rr v22.4s, v2.4s, v8.s[1] | |||
| OP_ii v22.4s, v3.4s, v9.s[1] | |||
| OP_ri v23.4s, v2.4s, v9.s[1] | |||
| OP_ir v23.4s, v3.4s, v8.s[1] | |||
| OP_rr v26.4s, v2.4s, v8.s[2] | |||
| OP_ii v26.4s, v3.4s, v9.s[2] | |||
| OP_ri v27.4s, v2.4s, v9.s[2] | |||
| OP_ir v27.4s, v3.4s, v8.s[2] | |||
| OP_rr v30.4s, v2.4s, v8.s[3] | |||
| OP_ii v30.4s, v3.4s, v9.s[3] | |||
| OP_ri v31.4s, v2.4s, v9.s[3] | |||
| OP_ir v31.4s, v3.4s, v8.s[3] | |||
| .endm | |||
| .macro SAVE8x4 | |||
| @@ -578,25 +578,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.4s, v1.4s}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.4s, v0.4s, v8.4s[0] | |||
| OP_ii v16.4s, v1.4s, v9.4s[0] | |||
| OP_ri v17.4s, v0.4s, v9.4s[0] | |||
| OP_ir v17.4s, v1.4s, v8.4s[0] | |||
| OP_rr v20.4s, v0.4s, v8.4s[1] | |||
| OP_ii v20.4s, v1.4s, v9.4s[1] | |||
| OP_ri v21.4s, v0.4s, v9.4s[1] | |||
| OP_ir v21.4s, v1.4s, v8.4s[1] | |||
| OP_rr v24.4s, v0.4s, v8.4s[2] | |||
| OP_ii v24.4s, v1.4s, v9.4s[2] | |||
| OP_ri v25.4s, v0.4s, v9.4s[2] | |||
| OP_ir v25.4s, v1.4s, v8.4s[2] | |||
| OP_rr v28.4s, v0.4s, v8.4s[3] | |||
| OP_ii v28.4s, v1.4s, v9.4s[3] | |||
| OP_ri v29.4s, v0.4s, v9.4s[3] | |||
| OP_ir v29.4s, v1.4s, v8.4s[3] | |||
| OP_rr v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v9.s[0] | |||
| OP_ri v17.4s, v0.4s, v9.s[0] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| OP_rr v20.4s, v0.4s, v8.s[1] | |||
| OP_ii v20.4s, v1.4s, v9.s[1] | |||
| OP_ri v21.4s, v0.4s, v9.s[1] | |||
| OP_ir v21.4s, v1.4s, v8.s[1] | |||
| OP_rr v24.4s, v0.4s, v8.s[2] | |||
| OP_ii v24.4s, v1.4s, v9.s[2] | |||
| OP_ri v25.4s, v0.4s, v9.s[2] | |||
| OP_ir v25.4s, v1.4s, v8.s[2] | |||
| OP_rr v28.4s, v0.4s, v8.s[3] | |||
| OP_ii v28.4s, v1.4s, v9.s[3] | |||
| OP_ri v29.4s, v0.4s, v9.s[3] | |||
| OP_ir v29.4s, v1.4s, v8.s[3] | |||
| .endm | |||
| .macro SAVE4x4 | |||
| @@ -658,25 +658,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.2s, v1.2s}, [pA] | |||
| add pA, pA, #16 | |||
| OP_rr v16.2s, v0.2s, v8.4s[0] | |||
| OP_ii v16.2s, v1.2s, v9.4s[0] | |||
| OP_ri v17.2s, v0.2s, v9.4s[0] | |||
| OP_ir v17.2s, v1.2s, v8.4s[0] | |||
| OP_rr v20.2s, v0.2s, v8.4s[1] | |||
| OP_ii v20.2s, v1.2s, v9.4s[1] | |||
| OP_ri v21.2s, v0.2s, v9.4s[1] | |||
| OP_ir v21.2s, v1.2s, v8.4s[1] | |||
| OP_rr v24.2s, v0.2s, v8.4s[2] | |||
| OP_ii v24.2s, v1.2s, v9.4s[2] | |||
| OP_ri v25.2s, v0.2s, v9.4s[2] | |||
| OP_ir v25.2s, v1.2s, v8.4s[2] | |||
| OP_rr v28.2s, v0.2s, v8.4s[3] | |||
| OP_ii v28.2s, v1.2s, v9.4s[3] | |||
| OP_ri v29.2s, v0.2s, v9.4s[3] | |||
| OP_ir v29.2s, v1.2s, v8.4s[3] | |||
| OP_rr v16.2s, v0.2s, v8.s[0] | |||
| OP_ii v16.2s, v1.2s, v9.s[0] | |||
| OP_ri v17.2s, v0.2s, v9.s[0] | |||
| OP_ir v17.2s, v1.2s, v8.s[0] | |||
| OP_rr v20.2s, v0.2s, v8.s[1] | |||
| OP_ii v20.2s, v1.2s, v9.s[1] | |||
| OP_ri v21.2s, v0.2s, v9.s[1] | |||
| OP_ir v21.2s, v1.2s, v8.s[1] | |||
| OP_rr v24.2s, v0.2s, v8.s[2] | |||
| OP_ii v24.2s, v1.2s, v9.s[2] | |||
| OP_ri v25.2s, v0.2s, v9.s[2] | |||
| OP_ir v25.2s, v1.2s, v8.s[2] | |||
| OP_rr v28.2s, v0.2s, v8.s[3] | |||
| OP_ii v28.2s, v1.2s, v9.s[3] | |||
| OP_ri v29.2s, v0.2s, v9.s[3] | |||
| OP_ir v29.2s, v1.2s, v8.s[3] | |||
| .endm | |||
| .macro SAVE2x4 | |||
| @@ -738,25 +738,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.s, v1.s}[0], [pA] | |||
| add pA, pA, #8 | |||
| OP_rr s16, s0, v8.4s[0] | |||
| OP_ii s16, s1, v9.4s[0] | |||
| OP_ri s17, s0, v9.4s[0] | |||
| OP_ir s17, s1, v8.4s[0] | |||
| OP_rr s20, s0, v8.4s[1] | |||
| OP_ii s20, s1, v9.4s[1] | |||
| OP_ri s21, s0, v9.4s[1] | |||
| OP_ir s21, s1, v8.4s[1] | |||
| OP_rr s24, s0, v8.4s[2] | |||
| OP_ii s24, s1, v9.4s[2] | |||
| OP_ri s25, s0, v9.4s[2] | |||
| OP_ir s25, s1, v8.4s[2] | |||
| OP_rr s28, s0, v8.4s[3] | |||
| OP_ii s28, s1, v9.4s[3] | |||
| OP_ri s29, s0, v9.4s[3] | |||
| OP_ir s29, s1, v8.4s[3] | |||
| OP_rr s16, s0, v8.s[0] | |||
| OP_ii s16, s1, v9.s[0] | |||
| OP_ri s17, s0, v9.s[0] | |||
| OP_ir s17, s1, v8.s[0] | |||
| OP_rr s20, s0, v8.s[1] | |||
| OP_ii s20, s1, v9.s[1] | |||
| OP_ri s21, s0, v9.s[1] | |||
| OP_ir s21, s1, v8.s[1] | |||
| OP_rr s24, s0, v8.s[2] | |||
| OP_ii s24, s1, v9.s[2] | |||
| OP_ri s25, s0, v9.s[2] | |||
| OP_ir s25, s1, v8.s[2] | |||
| OP_rr s28, s0, v8.s[3] | |||
| OP_ii s28, s1, v9.s[3] | |||
| OP_ri s29, s0, v9.s[3] | |||
| OP_ir s29, s1, v8.s[3] | |||
| .endm | |||
| .macro SAVE1x4 | |||
| @@ -814,15 +814,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.4s, v1.4s}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.4s, v0.4s, v8.2s[0] | |||
| OP_ii v16.4s, v1.4s, v9.2s[0] | |||
| OP_ri v17.4s, v0.4s, v9.2s[0] | |||
| OP_ir v17.4s, v1.4s, v8.2s[0] | |||
| OP_rr v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v9.s[0] | |||
| OP_ri v17.4s, v0.4s, v9.s[0] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| OP_rr v20.4s, v0.4s, v8.2s[1] | |||
| OP_ii v20.4s, v1.4s, v9.2s[1] | |||
| OP_ri v21.4s, v0.4s, v9.2s[1] | |||
| OP_ir v21.4s, v1.4s, v8.2s[1] | |||
| OP_rr v20.4s, v0.4s, v8.s[1] | |||
| OP_ii v20.4s, v1.4s, v9.s[1] | |||
| OP_ri v21.4s, v0.4s, v9.s[1] | |||
| OP_ir v21.4s, v1.4s, v8.s[1] | |||
| .endm | |||
| .macro SAVE4x2 | |||
| @@ -862,15 +862,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.2s, v1.2s}, [pA] | |||
| add pA, pA, #16 | |||
| OP_rr v16.2s, v0.2s, v8.2s[0] | |||
| OP_ii v16.2s, v1.2s, v9.2s[0] | |||
| OP_ri v17.2s, v0.2s, v9.2s[0] | |||
| OP_ir v17.2s, v1.2s, v8.2s[0] | |||
| OP_rr v16.2s, v0.2s, v8.s[0] | |||
| OP_ii v16.2s, v1.2s, v9.s[0] | |||
| OP_ri v17.2s, v0.2s, v9.s[0] | |||
| OP_ir v17.2s, v1.2s, v8.s[0] | |||
| OP_rr v20.2s, v0.2s, v8.2s[1] | |||
| OP_ii v20.2s, v1.2s, v9.2s[1] | |||
| OP_ri v21.2s, v0.2s, v9.2s[1] | |||
| OP_ir v21.2s, v1.2s, v8.2s[1] | |||
| OP_rr v20.2s, v0.2s, v8.s[1] | |||
| OP_ii v20.2s, v1.2s, v9.s[1] | |||
| OP_ri v21.2s, v0.2s, v9.s[1] | |||
| OP_ir v21.2s, v1.2s, v8.s[1] | |||
| .endm | |||
| .macro SAVE2x2 | |||
| @@ -910,15 +910,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.s, v1.s}[0], [pA] | |||
| add pA, pA, #8 | |||
| OP_rr s16, s0, v8.2s[0] | |||
| OP_ii s16, s1, v9.2s[0] | |||
| OP_ri s17, s0, v9.2s[0] | |||
| OP_ir s17, s1, v8.2s[0] | |||
| OP_rr s16, s0, v8.s[0] | |||
| OP_ii s16, s1, v9.s[0] | |||
| OP_ri s17, s0, v9.s[0] | |||
| OP_ir s17, s1, v8.s[0] | |||
| OP_rr s20, s0, v8.2s[1] | |||
| OP_ii s20, s1, v9.2s[1] | |||
| OP_ri s21, s0, v9.2s[1] | |||
| OP_ir s21, s1, v8.2s[1] | |||
| OP_rr s20, s0, v8.s[1] | |||
| OP_ii s20, s1, v9.s[1] | |||
| OP_ri s21, s0, v9.s[1] | |||
| OP_ir s21, s1, v8.s[1] | |||
| .endm | |||
| .macro SAVE1x2 | |||
| @@ -178,93 +178,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v2.4s, v3.4s}, [pA] | |||
| add pA, pA, #32 | |||
| fmul v16.4s, v0.4s, v8.4s[0] | |||
| OP_ii v16.4s, v1.4s, v9.4s[0] | |||
| fmul v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v9.s[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v17.16b, v17.16b, v17.16b | |||
| fmls v17.4s, v0.4s, v9.4s[0] | |||
| fmls v17.4s, v0.4s, v9.s[0] | |||
| #else | |||
| fmul v17.4s, v0.4s, v9.4s[0] | |||
| fmul v17.4s, v0.4s, v9.s[0] | |||
| #endif | |||
| OP_ir v17.4s, v1.4s, v8.4s[0] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| fmul v18.4s, v2.4s, v8.4s[0] | |||
| OP_ii v18.4s, v3.4s, v9.4s[0] | |||
| fmul v18.4s, v2.4s, v8.s[0] | |||
| OP_ii v18.4s, v3.4s, v9.s[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v19.16b, v19.16b, v19.16b | |||
| fmls v19.4s, v2.4s, v9.4s[0] | |||
| fmls v19.4s, v2.4s, v9.s[0] | |||
| #else | |||
| fmul v19.4s, v2.4s, v9.4s[0] | |||
| fmul v19.4s, v2.4s, v9.s[0] | |||
| #endif | |||
| OP_ir v19.4s, v3.4s, v8.4s[0] | |||
| OP_ir v19.4s, v3.4s, v8.s[0] | |||
| fmul v20.4s, v0.4s, v8.4s[1] | |||
| OP_ii v20.4s, v1.4s, v9.4s[1] | |||
| fmul v20.4s, v0.4s, v8.s[1] | |||
| OP_ii v20.4s, v1.4s, v9.s[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v21.16b, v21.16b, v21.16b | |||
| fmls v21.4s, v0.4s, v9.4s[1] | |||
| fmls v21.4s, v0.4s, v9.s[1] | |||
| #else | |||
| fmul v21.4s, v0.4s, v9.4s[1] | |||
| fmul v21.4s, v0.4s, v9.s[1] | |||
| #endif | |||
| OP_ir v21.4s, v1.4s, v8.4s[1] | |||
| OP_ir v21.4s, v1.4s, v8.s[1] | |||
| fmul v22.4s, v2.4s, v8.4s[1] | |||
| OP_ii v22.4s, v3.4s, v9.4s[1] | |||
| fmul v22.4s, v2.4s, v8.s[1] | |||
| OP_ii v22.4s, v3.4s, v9.s[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v23.16b, v23.16b, v23.16b | |||
| fmls v23.4s, v2.4s, v9.4s[1] | |||
| fmls v23.4s, v2.4s, v9.s[1] | |||
| #else | |||
| fmul v23.4s, v2.4s, v9.4s[1] | |||
| fmul v23.4s, v2.4s, v9.s[1] | |||
| #endif | |||
| OP_ir v23.4s, v3.4s, v8.4s[1] | |||
| OP_ir v23.4s, v3.4s, v8.s[1] | |||
| fmul v24.4s, v0.4s, v8.4s[2] | |||
| OP_ii v24.4s, v1.4s, v9.4s[2] | |||
| fmul v24.4s, v0.4s, v8.s[2] | |||
| OP_ii v24.4s, v1.4s, v9.s[2] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fmls v25.4s, v0.4s, v9.4s[2] | |||
| fmls v25.4s, v0.4s, v9.s[2] | |||
| #else | |||
| fmul v25.4s, v0.4s, v9.4s[2] | |||
| fmul v25.4s, v0.4s, v9.s[2] | |||
| #endif | |||
| OP_ir v25.4s, v1.4s, v8.4s[2] | |||
| OP_ir v25.4s, v1.4s, v8.s[2] | |||
| fmul v26.4s, v2.4s, v8.4s[2] | |||
| OP_ii v26.4s, v3.4s, v9.4s[2] | |||
| fmul v26.4s, v2.4s, v8.s[2] | |||
| OP_ii v26.4s, v3.4s, v9.s[2] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v27.16b, v27.16b, v27.16b | |||
| fmls v27.4s, v2.4s, v9.4s[2] | |||
| fmls v27.4s, v2.4s, v9.s[2] | |||
| #else | |||
| fmul v27.4s, v2.4s, v9.4s[2] | |||
| fmul v27.4s, v2.4s, v9.s[2] | |||
| #endif | |||
| OP_ir v27.4s, v3.4s, v8.4s[2] | |||
| OP_ir v27.4s, v3.4s, v8.s[2] | |||
| fmul v28.4s, v0.4s, v8.4s[3] | |||
| OP_ii v28.4s, v1.4s, v9.4s[3] | |||
| fmul v28.4s, v0.4s, v8.s[3] | |||
| OP_ii v28.4s, v1.4s, v9.s[3] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v29.16b, v29.16b, v29.16b | |||
| fmls v29.4s, v0.4s, v9.4s[3] | |||
| fmls v29.4s, v0.4s, v9.s[3] | |||
| #else | |||
| fmul v29.4s, v0.4s, v9.4s[3] | |||
| fmul v29.4s, v0.4s, v9.s[3] | |||
| #endif | |||
| OP_ir v29.4s, v1.4s, v8.4s[3] | |||
| OP_ir v29.4s, v1.4s, v8.s[3] | |||
| fmul v30.4s, v2.4s, v8.4s[3] | |||
| OP_ii v30.4s, v3.4s, v9.4s[3] | |||
| fmul v30.4s, v2.4s, v8.s[3] | |||
| OP_ii v30.4s, v3.4s, v9.s[3] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v31.16b, v31.16b, v31.16b | |||
| fmls v31.4s, v2.4s, v9.4s[3] | |||
| fmls v31.4s, v2.4s, v9.s[3] | |||
| #else | |||
| fmul v31.4s, v2.4s, v9.4s[3] | |||
| fmul v31.4s, v2.4s, v9.s[3] | |||
| #endif | |||
| OP_ir v31.4s, v3.4s, v8.4s[3] | |||
| OP_ir v31.4s, v3.4s, v8.s[3] | |||
| ld2 {v12.4s, v13.4s}, [pB] | |||
| add pB, pB, #32 | |||
| @@ -275,45 +275,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x4_M1 | |||
| OP_rr v16.4s, v0.4s, v8.4s[0] | |||
| OP_ii v16.4s, v1.4s, v9.4s[0] | |||
| OP_ri v17.4s, v0.4s, v9.4s[0] | |||
| OP_ir v17.4s, v1.4s, v8.4s[0] | |||
| OP_rr v18.4s, v2.4s, v8.4s[0] | |||
| OP_ii v18.4s, v3.4s, v9.4s[0] | |||
| OP_ri v19.4s, v2.4s, v9.4s[0] | |||
| OP_ir v19.4s, v3.4s, v8.4s[0] | |||
| OP_rr v20.4s, v0.4s, v8.4s[1] | |||
| OP_ii v20.4s, v1.4s, v9.4s[1] | |||
| OP_ri v21.4s, v0.4s, v9.4s[1] | |||
| OP_ir v21.4s, v1.4s, v8.4s[1] | |||
| OP_rr v22.4s, v2.4s, v8.4s[1] | |||
| OP_ii v22.4s, v3.4s, v9.4s[1] | |||
| OP_ri v23.4s, v2.4s, v9.4s[1] | |||
| OP_ir v23.4s, v3.4s, v8.4s[1] | |||
| OP_rr v24.4s, v0.4s, v8.4s[2] | |||
| OP_ii v24.4s, v1.4s, v9.4s[2] | |||
| OP_ri v25.4s, v0.4s, v9.4s[2] | |||
| OP_ir v25.4s, v1.4s, v8.4s[2] | |||
| OP_rr v26.4s, v2.4s, v8.4s[2] | |||
| OP_ii v26.4s, v3.4s, v9.4s[2] | |||
| OP_ri v27.4s, v2.4s, v9.4s[2] | |||
| OP_ir v27.4s, v3.4s, v8.4s[2] | |||
| OP_rr v28.4s, v0.4s, v8.4s[3] | |||
| OP_ii v28.4s, v1.4s, v9.4s[3] | |||
| OP_ri v29.4s, v0.4s, v9.4s[3] | |||
| OP_ir v29.4s, v1.4s, v8.4s[3] | |||
| OP_rr v30.4s, v2.4s, v8.4s[3] | |||
| OP_ii v30.4s, v3.4s, v9.4s[3] | |||
| OP_ri v31.4s, v2.4s, v9.4s[3] | |||
| OP_ir v31.4s, v3.4s, v8.4s[3] | |||
| OP_rr v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v9.s[0] | |||
| OP_ri v17.4s, v0.4s, v9.s[0] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| OP_rr v18.4s, v2.4s, v8.s[0] | |||
| OP_ii v18.4s, v3.4s, v9.s[0] | |||
| OP_ri v19.4s, v2.4s, v9.s[0] | |||
| OP_ir v19.4s, v3.4s, v8.s[0] | |||
| OP_rr v20.4s, v0.4s, v8.s[1] | |||
| OP_ii v20.4s, v1.4s, v9.s[1] | |||
| OP_ri v21.4s, v0.4s, v9.s[1] | |||
| OP_ir v21.4s, v1.4s, v8.s[1] | |||
| OP_rr v22.4s, v2.4s, v8.s[1] | |||
| OP_ii v22.4s, v3.4s, v9.s[1] | |||
| OP_ri v23.4s, v2.4s, v9.s[1] | |||
| OP_ir v23.4s, v3.4s, v8.s[1] | |||
| OP_rr v24.4s, v0.4s, v8.s[2] | |||
| OP_ii v24.4s, v1.4s, v9.s[2] | |||
| OP_ri v25.4s, v0.4s, v9.s[2] | |||
| OP_ir v25.4s, v1.4s, v8.s[2] | |||
| OP_rr v26.4s, v2.4s, v8.s[2] | |||
| OP_ii v26.4s, v3.4s, v9.s[2] | |||
| OP_ri v27.4s, v2.4s, v9.s[2] | |||
| OP_ir v27.4s, v3.4s, v8.s[2] | |||
| OP_rr v28.4s, v0.4s, v8.s[3] | |||
| OP_ii v28.4s, v1.4s, v9.s[3] | |||
| OP_ri v29.4s, v0.4s, v9.s[3] | |||
| OP_ir v29.4s, v1.4s, v8.s[3] | |||
| OP_rr v30.4s, v2.4s, v8.s[3] | |||
| OP_ii v30.4s, v3.4s, v9.s[3] | |||
| OP_ri v31.4s, v2.4s, v9.s[3] | |||
| OP_ir v31.4s, v3.4s, v8.s[3] | |||
| ld2 {v12.4s, v13.4s}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| @@ -324,45 +324,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x4_M2 | |||
| OP_rr v16.4s, v4.4s, v12.4s[0] | |||
| OP_ii v16.4s, v5.4s, v13.4s[0] | |||
| OP_ri v17.4s, v4.4s, v13.4s[0] | |||
| OP_ir v17.4s, v5.4s, v12.4s[0] | |||
| OP_rr v18.4s, v6.4s, v12.4s[0] | |||
| OP_ii v18.4s, v7.4s, v13.4s[0] | |||
| OP_ri v19.4s, v6.4s, v13.4s[0] | |||
| OP_ir v19.4s, v7.4s, v12.4s[0] | |||
| OP_rr v20.4s, v4.4s, v12.4s[1] | |||
| OP_ii v20.4s, v5.4s, v13.4s[1] | |||
| OP_ri v21.4s, v4.4s, v13.4s[1] | |||
| OP_ir v21.4s, v5.4s, v12.4s[1] | |||
| OP_rr v22.4s, v6.4s, v12.4s[1] | |||
| OP_ii v22.4s, v7.4s, v13.4s[1] | |||
| OP_ri v23.4s, v6.4s, v13.4s[1] | |||
| OP_ir v23.4s, v7.4s, v12.4s[1] | |||
| OP_rr v24.4s, v4.4s, v12.4s[2] | |||
| OP_ii v24.4s, v5.4s, v13.4s[2] | |||
| OP_ri v25.4s, v4.4s, v13.4s[2] | |||
| OP_ir v25.4s, v5.4s, v12.4s[2] | |||
| OP_rr v26.4s, v6.4s, v12.4s[2] | |||
| OP_ii v26.4s, v7.4s, v13.4s[2] | |||
| OP_ri v27.4s, v6.4s, v13.4s[2] | |||
| OP_ir v27.4s, v7.4s, v12.4s[2] | |||
| OP_rr v28.4s, v4.4s, v12.4s[3] | |||
| OP_ii v28.4s, v5.4s, v13.4s[3] | |||
| OP_ri v29.4s, v4.4s, v13.4s[3] | |||
| OP_ir v29.4s, v5.4s, v12.4s[3] | |||
| OP_rr v30.4s, v6.4s, v12.4s[3] | |||
| OP_ii v30.4s, v7.4s, v13.4s[3] | |||
| OP_ri v31.4s, v6.4s, v13.4s[3] | |||
| OP_ir v31.4s, v7.4s, v12.4s[3] | |||
| OP_rr v16.4s, v4.4s, v12.s[0] | |||
| OP_ii v16.4s, v5.4s, v13.s[0] | |||
| OP_ri v17.4s, v4.4s, v13.s[0] | |||
| OP_ir v17.4s, v5.4s, v12.s[0] | |||
| OP_rr v18.4s, v6.4s, v12.s[0] | |||
| OP_ii v18.4s, v7.4s, v13.s[0] | |||
| OP_ri v19.4s, v6.4s, v13.s[0] | |||
| OP_ir v19.4s, v7.4s, v12.s[0] | |||
| OP_rr v20.4s, v4.4s, v12.s[1] | |||
| OP_ii v20.4s, v5.4s, v13.s[1] | |||
| OP_ri v21.4s, v4.4s, v13.s[1] | |||
| OP_ir v21.4s, v5.4s, v12.s[1] | |||
| OP_rr v22.4s, v6.4s, v12.s[1] | |||
| OP_ii v22.4s, v7.4s, v13.s[1] | |||
| OP_ri v23.4s, v6.4s, v13.s[1] | |||
| OP_ir v23.4s, v7.4s, v12.s[1] | |||
| OP_rr v24.4s, v4.4s, v12.s[2] | |||
| OP_ii v24.4s, v5.4s, v13.s[2] | |||
| OP_ri v25.4s, v4.4s, v13.s[2] | |||
| OP_ir v25.4s, v5.4s, v12.s[2] | |||
| OP_rr v26.4s, v6.4s, v12.s[2] | |||
| OP_ii v26.4s, v7.4s, v13.s[2] | |||
| OP_ri v27.4s, v6.4s, v13.s[2] | |||
| OP_ir v27.4s, v7.4s, v12.s[2] | |||
| OP_rr v28.4s, v4.4s, v12.s[3] | |||
| OP_ii v28.4s, v5.4s, v13.s[3] | |||
| OP_ri v29.4s, v4.4s, v13.s[3] | |||
| OP_ir v29.4s, v5.4s, v12.s[3] | |||
| OP_rr v30.4s, v6.4s, v12.s[3] | |||
| OP_ii v30.4s, v7.4s, v13.s[3] | |||
| OP_ri v31.4s, v6.4s, v13.s[3] | |||
| OP_ir v31.4s, v7.4s, v12.s[3] | |||
| ld2 {v8.4s, v9.4s}, [pB] | |||
| add pB, pB, #32 | |||
| @@ -373,45 +373,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x4_E | |||
| OP_rr v16.4s, v4.4s, v12.4s[0] | |||
| OP_ii v16.4s, v5.4s, v13.4s[0] | |||
| OP_ri v17.4s, v4.4s, v13.4s[0] | |||
| OP_ir v17.4s, v5.4s, v12.4s[0] | |||
| OP_rr v18.4s, v6.4s, v12.4s[0] | |||
| OP_ii v18.4s, v7.4s, v13.4s[0] | |||
| OP_ri v19.4s, v6.4s, v13.4s[0] | |||
| OP_ir v19.4s, v7.4s, v12.4s[0] | |||
| OP_rr v20.4s, v4.4s, v12.4s[1] | |||
| OP_ii v20.4s, v5.4s, v13.4s[1] | |||
| OP_ri v21.4s, v4.4s, v13.4s[1] | |||
| OP_ir v21.4s, v5.4s, v12.4s[1] | |||
| OP_rr v22.4s, v6.4s, v12.4s[1] | |||
| OP_ii v22.4s, v7.4s, v13.4s[1] | |||
| OP_ri v23.4s, v6.4s, v13.4s[1] | |||
| OP_ir v23.4s, v7.4s, v12.4s[1] | |||
| OP_rr v24.4s, v4.4s, v12.4s[2] | |||
| OP_ii v24.4s, v5.4s, v13.4s[2] | |||
| OP_ri v25.4s, v4.4s, v13.4s[2] | |||
| OP_ir v25.4s, v5.4s, v12.4s[2] | |||
| OP_rr v26.4s, v6.4s, v12.4s[2] | |||
| OP_ii v26.4s, v7.4s, v13.4s[2] | |||
| OP_ri v27.4s, v6.4s, v13.4s[2] | |||
| OP_ir v27.4s, v7.4s, v12.4s[2] | |||
| OP_rr v28.4s, v4.4s, v12.4s[3] | |||
| OP_ii v28.4s, v5.4s, v13.4s[3] | |||
| OP_ri v29.4s, v4.4s, v13.4s[3] | |||
| OP_ir v29.4s, v5.4s, v12.4s[3] | |||
| OP_rr v30.4s, v6.4s, v12.4s[3] | |||
| OP_ii v30.4s, v7.4s, v13.4s[3] | |||
| OP_ri v31.4s, v6.4s, v13.4s[3] | |||
| OP_ir v31.4s, v7.4s, v12.4s[3] | |||
| OP_rr v16.4s, v4.4s, v12.s[0] | |||
| OP_ii v16.4s, v5.4s, v13.s[0] | |||
| OP_ri v17.4s, v4.4s, v13.s[0] | |||
| OP_ir v17.4s, v5.4s, v12.s[0] | |||
| OP_rr v18.4s, v6.4s, v12.s[0] | |||
| OP_ii v18.4s, v7.4s, v13.s[0] | |||
| OP_ri v19.4s, v6.4s, v13.s[0] | |||
| OP_ir v19.4s, v7.4s, v12.s[0] | |||
| OP_rr v20.4s, v4.4s, v12.s[1] | |||
| OP_ii v20.4s, v5.4s, v13.s[1] | |||
| OP_ri v21.4s, v4.4s, v13.s[1] | |||
| OP_ir v21.4s, v5.4s, v12.s[1] | |||
| OP_rr v22.4s, v6.4s, v12.s[1] | |||
| OP_ii v22.4s, v7.4s, v13.s[1] | |||
| OP_ri v23.4s, v6.4s, v13.s[1] | |||
| OP_ir v23.4s, v7.4s, v12.s[1] | |||
| OP_rr v24.4s, v4.4s, v12.s[2] | |||
| OP_ii v24.4s, v5.4s, v13.s[2] | |||
| OP_ri v25.4s, v4.4s, v13.s[2] | |||
| OP_ir v25.4s, v5.4s, v12.s[2] | |||
| OP_rr v26.4s, v6.4s, v12.s[2] | |||
| OP_ii v26.4s, v7.4s, v13.s[2] | |||
| OP_ri v27.4s, v6.4s, v13.s[2] | |||
| OP_ir v27.4s, v7.4s, v12.s[2] | |||
| OP_rr v28.4s, v4.4s, v12.s[3] | |||
| OP_ii v28.4s, v5.4s, v13.s[3] | |||
| OP_ri v29.4s, v4.4s, v13.s[3] | |||
| OP_ir v29.4s, v5.4s, v12.s[3] | |||
| OP_rr v30.4s, v6.4s, v12.s[3] | |||
| OP_ii v30.4s, v7.4s, v13.s[3] | |||
| OP_ri v31.4s, v6.4s, v13.s[3] | |||
| OP_ir v31.4s, v7.4s, v12.s[3] | |||
| .endm | |||
| @@ -423,45 +423,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v2.4s, v3.4s}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.4s, v0.4s, v8.4s[0] | |||
| OP_ii v16.4s, v1.4s, v9.4s[0] | |||
| OP_ri v17.4s, v0.4s, v9.4s[0] | |||
| OP_ir v17.4s, v1.4s, v8.4s[0] | |||
| OP_rr v18.4s, v2.4s, v8.4s[0] | |||
| OP_ii v18.4s, v3.4s, v9.4s[0] | |||
| OP_ri v19.4s, v2.4s, v9.4s[0] | |||
| OP_ir v19.4s, v3.4s, v8.4s[0] | |||
| OP_rr v20.4s, v0.4s, v8.4s[1] | |||
| OP_ii v20.4s, v1.4s, v9.4s[1] | |||
| OP_ri v21.4s, v0.4s, v9.4s[1] | |||
| OP_ir v21.4s, v1.4s, v8.4s[1] | |||
| OP_rr v22.4s, v2.4s, v8.4s[1] | |||
| OP_ii v22.4s, v3.4s, v9.4s[1] | |||
| OP_ri v23.4s, v2.4s, v9.4s[1] | |||
| OP_ir v23.4s, v3.4s, v8.4s[1] | |||
| OP_rr v24.4s, v0.4s, v8.4s[2] | |||
| OP_ii v24.4s, v1.4s, v9.4s[2] | |||
| OP_ri v25.4s, v0.4s, v9.4s[2] | |||
| OP_ir v25.4s, v1.4s, v8.4s[2] | |||
| OP_rr v26.4s, v2.4s, v8.4s[2] | |||
| OP_ii v26.4s, v3.4s, v9.4s[2] | |||
| OP_ri v27.4s, v2.4s, v9.4s[2] | |||
| OP_ir v27.4s, v3.4s, v8.4s[2] | |||
| OP_rr v28.4s, v0.4s, v8.4s[3] | |||
| OP_ii v28.4s, v1.4s, v9.4s[3] | |||
| OP_ri v29.4s, v0.4s, v9.4s[3] | |||
| OP_ir v29.4s, v1.4s, v8.4s[3] | |||
| OP_rr v30.4s, v2.4s, v8.4s[3] | |||
| OP_ii v30.4s, v3.4s, v9.4s[3] | |||
| OP_ri v31.4s, v2.4s, v9.4s[3] | |||
| OP_ir v31.4s, v3.4s, v8.4s[3] | |||
| OP_rr v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v9.s[0] | |||
| OP_ri v17.4s, v0.4s, v9.s[0] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| OP_rr v18.4s, v2.4s, v8.s[0] | |||
| OP_ii v18.4s, v3.4s, v9.s[0] | |||
| OP_ri v19.4s, v2.4s, v9.s[0] | |||
| OP_ir v19.4s, v3.4s, v8.s[0] | |||
| OP_rr v20.4s, v0.4s, v8.s[1] | |||
| OP_ii v20.4s, v1.4s, v9.s[1] | |||
| OP_ri v21.4s, v0.4s, v9.s[1] | |||
| OP_ir v21.4s, v1.4s, v8.s[1] | |||
| OP_rr v22.4s, v2.4s, v8.s[1] | |||
| OP_ii v22.4s, v3.4s, v9.s[1] | |||
| OP_ri v23.4s, v2.4s, v9.s[1] | |||
| OP_ir v23.4s, v3.4s, v8.s[1] | |||
| OP_rr v24.4s, v0.4s, v8.s[2] | |||
| OP_ii v24.4s, v1.4s, v9.s[2] | |||
| OP_ri v25.4s, v0.4s, v9.s[2] | |||
| OP_ir v25.4s, v1.4s, v8.s[2] | |||
| OP_rr v26.4s, v2.4s, v8.s[2] | |||
| OP_ii v26.4s, v3.4s, v9.s[2] | |||
| OP_ri v27.4s, v2.4s, v9.s[2] | |||
| OP_ir v27.4s, v3.4s, v8.s[2] | |||
| OP_rr v28.4s, v0.4s, v8.s[3] | |||
| OP_ii v28.4s, v1.4s, v9.s[3] | |||
| OP_ri v29.4s, v0.4s, v9.s[3] | |||
| OP_ir v29.4s, v1.4s, v8.s[3] | |||
| OP_rr v30.4s, v2.4s, v8.s[3] | |||
| OP_ii v30.4s, v3.4s, v9.s[3] | |||
| OP_ri v31.4s, v2.4s, v9.s[3] | |||
| OP_ir v31.4s, v3.4s, v8.s[3] | |||
| .endm | |||
| @@ -560,49 +560,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.4s, v1.4s}, [pA] | |||
| add pA, pA, #32 | |||
| fmul v16.4s, v0.4s, v8.4s[0] | |||
| OP_ii v16.4s, v1.4s, v9.4s[0] | |||
| fmul v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v9.s[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v17.16b, v17.16b, v17.16b | |||
| fmls v17.4s, v0.4s, v9.4s[0] | |||
| fmls v17.4s, v0.4s, v9.s[0] | |||
| #else | |||
| fmul v17.4s, v0.4s, v9.4s[0] | |||
| fmul v17.4s, v0.4s, v9.s[0] | |||
| #endif | |||
| OP_ir v17.4s, v1.4s, v8.4s[0] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| fmul v20.4s, v0.4s, v8.4s[1] | |||
| OP_ii v20.4s, v1.4s, v9.4s[1] | |||
| fmul v20.4s, v0.4s, v8.s[1] | |||
| OP_ii v20.4s, v1.4s, v9.s[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v21.16b, v21.16b, v21.16b | |||
| fmls v21.4s, v0.4s, v9.4s[1] | |||
| fmls v21.4s, v0.4s, v9.s[1] | |||
| #else | |||
| fmul v21.4s, v0.4s, v9.4s[1] | |||
| fmul v21.4s, v0.4s, v9.s[1] | |||
| #endif | |||
| OP_ir v21.4s, v1.4s, v8.4s[1] | |||
| OP_ir v21.4s, v1.4s, v8.s[1] | |||
| fmul v24.4s, v0.4s, v8.4s[2] | |||
| OP_ii v24.4s, v1.4s, v9.4s[2] | |||
| fmul v24.4s, v0.4s, v8.s[2] | |||
| OP_ii v24.4s, v1.4s, v9.s[2] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fmls v25.4s, v0.4s, v9.4s[2] | |||
| fmls v25.4s, v0.4s, v9.s[2] | |||
| #else | |||
| fmul v25.4s, v0.4s, v9.4s[2] | |||
| fmul v25.4s, v0.4s, v9.s[2] | |||
| #endif | |||
| OP_ir v25.4s, v1.4s, v8.4s[2] | |||
| OP_ir v25.4s, v1.4s, v8.s[2] | |||
| fmul v28.4s, v0.4s, v8.4s[3] | |||
| OP_ii v28.4s, v1.4s, v9.4s[3] | |||
| fmul v28.4s, v0.4s, v8.s[3] | |||
| OP_ii v28.4s, v1.4s, v9.s[3] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v29.16b, v29.16b, v29.16b | |||
| fmls v29.4s, v0.4s, v9.4s[3] | |||
| fmls v29.4s, v0.4s, v9.s[3] | |||
| #else | |||
| fmul v29.4s, v0.4s, v9.4s[3] | |||
| fmul v29.4s, v0.4s, v9.s[3] | |||
| #endif | |||
| OP_ir v29.4s, v1.4s, v8.4s[3] | |||
| OP_ir v29.4s, v1.4s, v8.s[3] | |||
| ld2 {v12.4s, v13.4s}, [pB] | |||
| add pB, pB, #32 | |||
| @@ -611,85 +611,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x4_M1 | |||
| OP_rr v16.4s, v0.4s, v8.4s[0] | |||
| OP_ii v16.4s, v1.4s, v9.4s[0] | |||
| OP_ri v17.4s, v0.4s, v9.4s[0] | |||
| OP_ir v17.4s, v1.4s, v8.4s[0] | |||
| OP_rr v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v9.s[0] | |||
| OP_ri v17.4s, v0.4s, v9.s[0] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| ld2 {v12.4s, v13.4s}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| OP_rr v20.4s, v0.4s, v8.4s[1] | |||
| OP_ii v20.4s, v1.4s, v9.4s[1] | |||
| OP_ri v21.4s, v0.4s, v9.4s[1] | |||
| OP_ir v21.4s, v1.4s, v8.4s[1] | |||
| OP_rr v20.4s, v0.4s, v8.s[1] | |||
| OP_ii v20.4s, v1.4s, v9.s[1] | |||
| OP_ri v21.4s, v0.4s, v9.s[1] | |||
| OP_ir v21.4s, v1.4s, v8.s[1] | |||
| ld2 {v4.4s, v5.4s}, [pA] // For next round | |||
| add pA, pA, #32 | |||
| OP_rr v24.4s, v0.4s, v8.4s[2] | |||
| OP_ii v24.4s, v1.4s, v9.4s[2] | |||
| OP_ri v25.4s, v0.4s, v9.4s[2] | |||
| OP_ir v25.4s, v1.4s, v8.4s[2] | |||
| OP_rr v24.4s, v0.4s, v8.s[2] | |||
| OP_ii v24.4s, v1.4s, v9.s[2] | |||
| OP_ri v25.4s, v0.4s, v9.s[2] | |||
| OP_ir v25.4s, v1.4s, v8.s[2] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| OP_rr v28.4s, v0.4s, v8.4s[3] | |||
| OP_ii v28.4s, v1.4s, v9.4s[3] | |||
| OP_ri v29.4s, v0.4s, v9.4s[3] | |||
| OP_ir v29.4s, v1.4s, v8.4s[3] | |||
| OP_rr v28.4s, v0.4s, v8.s[3] | |||
| OP_ii v28.4s, v1.4s, v9.s[3] | |||
| OP_ri v29.4s, v0.4s, v9.s[3] | |||
| OP_ir v29.4s, v1.4s, v8.s[3] | |||
| .endm | |||
| // KERNEL4x4_M2 | |||
| // Applies OP_rr / OP_ii / OP_ri / OP_ir to v16/v17, v20/v21, v24/v25 and v28/v29, | |||
| // combining v4/v5 with lanes 0..3 of v12/v13, and in between loads v8/v9 (from pB) | |||
| // and v0/v1 (from pA) "for next round". | |||
| // The OP_* macros are defined outside this view; presumably they are the four | |||
| // real/imaginary partial products of a complex multiply-accumulate -- TODO confirm. | |||
| // NOTE(review): each group of four OP_* rows appears twice, first with the | |||
| // `vN.4s[i]` lane spelling and then with `vN.s[i]`. This looks like both sides of | |||
| // a diff rendered without +/- markers -- confirm which spelling is the live one. | |||
| .macro KERNEL4x4_M2 | |||
| // lane 0 of v12/v13 -> v16/v17 | |||
| OP_rr v16.4s, v4.4s, v12.4s[0] | |||
| OP_ii v16.4s, v5.4s, v13.4s[0] | |||
| OP_ri v17.4s, v4.4s, v13.4s[0] | |||
| OP_ir v17.4s, v5.4s, v12.4s[0] | |||
| OP_rr v16.4s, v4.4s, v12.s[0] | |||
| OP_ii v16.4s, v5.4s, v13.s[0] | |||
| OP_ri v17.4s, v4.4s, v13.s[0] | |||
| OP_ir v17.4s, v5.4s, v12.s[0] | |||
| // ld2 de-interleaves pairs of 32-bit elements into v8 (even) and v9 (odd); | |||
| // pB is then stepped past the 32 bytes that were read. | |||
| ld2 {v8.4s, v9.4s}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| // lane 1 of v12/v13 -> v20/v21 | |||
| OP_rr v20.4s, v4.4s, v12.4s[1] | |||
| OP_ii v20.4s, v5.4s, v13.4s[1] | |||
| OP_ri v21.4s, v4.4s, v13.4s[1] | |||
| OP_ir v21.4s, v5.4s, v12.4s[1] | |||
| OP_rr v20.4s, v4.4s, v12.s[1] | |||
| OP_ii v20.4s, v5.4s, v13.s[1] | |||
| OP_ri v21.4s, v4.4s, v13.s[1] | |||
| OP_ir v21.4s, v5.4s, v12.s[1] | |||
| // Same de-interleaving load for the pA side, into v0/v1; pA advances by 32 bytes. | |||
| ld2 {v0.4s, v1.4s}, [pA] // For next round | |||
| add pA, pA, #32 | |||
| // lane 2 of v12/v13 -> v24/v25 | |||
| OP_rr v24.4s, v4.4s, v12.4s[2] | |||
| OP_ii v24.4s, v5.4s, v13.4s[2] | |||
| OP_ri v25.4s, v4.4s, v13.4s[2] | |||
| OP_ir v25.4s, v5.4s, v12.4s[2] | |||
| OP_rr v24.4s, v4.4s, v12.s[2] | |||
| OP_ii v24.4s, v5.4s, v13.s[2] | |||
| OP_ri v25.4s, v4.4s, v13.s[2] | |||
| OP_ir v25.4s, v5.4s, v12.s[2] | |||
| // Prefetch hint for the memory 512 bytes beyond the current pB. | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| // lane 3 of v12/v13 -> v28/v29 | |||
| OP_rr v28.4s, v4.4s, v12.4s[3] | |||
| OP_ii v28.4s, v5.4s, v13.4s[3] | |||
| OP_ri v29.4s, v4.4s, v13.4s[3] | |||
| OP_ir v29.4s, v5.4s, v12.4s[3] | |||
| OP_rr v28.4s, v4.4s, v12.s[3] | |||
| OP_ii v28.4s, v5.4s, v13.s[3] | |||
| OP_ri v29.4s, v4.4s, v13.s[3] | |||
| OP_ir v29.4s, v5.4s, v12.s[3] | |||
| .endm | |||
| // KERNEL4x4_E | |||
| // Applies OP_rr / OP_ii / OP_ri / OP_ir to v16/v17, v20/v21, v24/v25 and v28/v29, | |||
| // combining v4/v5 with lanes 0..3 of v12/v13. Unlike the _M1/_M2 macros it issues | |||
| // no loads, pointer updates or prefetches; presumably it is the final step of the | |||
| // unrolled sequence -- TODO confirm against the call sites. | |||
| // The OP_* macros are defined outside this view. | |||
| // NOTE(review): the sixteen OP_* rows appear twice, first with the `vN.4s[i]` lane | |||
| // spelling and then with `vN.s[i]`. This looks like both sides of a diff rendered | |||
| // without +/- markers -- confirm which spelling is the live one. | |||
| .macro KERNEL4x4_E | |||
| // First copy: `vN.4s[i]` lane spelling, lanes 0..3 in order. | |||
| OP_rr v16.4s, v4.4s, v12.4s[0] | |||
| OP_ii v16.4s, v5.4s, v13.4s[0] | |||
| OP_ri v17.4s, v4.4s, v13.4s[0] | |||
| OP_ir v17.4s, v5.4s, v12.4s[0] | |||
| OP_rr v20.4s, v4.4s, v12.4s[1] | |||
| OP_ii v20.4s, v5.4s, v13.4s[1] | |||
| OP_ri v21.4s, v4.4s, v13.4s[1] | |||
| OP_ir v21.4s, v5.4s, v12.4s[1] | |||
| OP_rr v24.4s, v4.4s, v12.4s[2] | |||
| OP_ii v24.4s, v5.4s, v13.4s[2] | |||
| OP_ri v25.4s, v4.4s, v13.4s[2] | |||
| OP_ir v25.4s, v5.4s, v12.4s[2] | |||
| OP_rr v28.4s, v4.4s, v12.4s[3] | |||
| OP_ii v28.4s, v5.4s, v13.4s[3] | |||
| OP_ri v29.4s, v4.4s, v13.4s[3] | |||
| OP_ir v29.4s, v5.4s, v12.4s[3] | |||
| // Second copy: same operands with the `vN.s[i]` lane spelling. | |||
| OP_rr v16.4s, v4.4s, v12.s[0] | |||
| OP_ii v16.4s, v5.4s, v13.s[0] | |||
| OP_ri v17.4s, v4.4s, v13.s[0] | |||
| OP_ir v17.4s, v5.4s, v12.s[0] | |||
| OP_rr v20.4s, v4.4s, v12.s[1] | |||
| OP_ii v20.4s, v5.4s, v13.s[1] | |||
| OP_ri v21.4s, v4.4s, v13.s[1] | |||
| OP_ir v21.4s, v5.4s, v12.s[1] | |||
| OP_rr v24.4s, v4.4s, v12.s[2] | |||
| OP_ii v24.4s, v5.4s, v13.s[2] | |||
| OP_ri v25.4s, v4.4s, v13.s[2] | |||
| OP_ir v25.4s, v5.4s, v12.s[2] | |||
| OP_rr v28.4s, v4.4s, v12.s[3] | |||
| OP_ii v28.4s, v5.4s, v13.s[3] | |||
| OP_ri v29.4s, v4.4s, v13.s[3] | |||
| OP_ir v29.4s, v5.4s, v12.s[3] | |||
| .endm | |||
| .macro KERNEL4x4_SUB | |||
| @@ -698,25 +698,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.4s, v1.4s}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.4s, v0.4s, v8.4s[0] | |||
| OP_ii v16.4s, v1.4s, v9.4s[0] | |||
| OP_ri v17.4s, v0.4s, v9.4s[0] | |||
| OP_ir v17.4s, v1.4s, v8.4s[0] | |||
| OP_rr v20.4s, v0.4s, v8.4s[1] | |||
| OP_ii v20.4s, v1.4s, v9.4s[1] | |||
| OP_ri v21.4s, v0.4s, v9.4s[1] | |||
| OP_ir v21.4s, v1.4s, v8.4s[1] | |||
| OP_rr v24.4s, v0.4s, v8.4s[2] | |||
| OP_ii v24.4s, v1.4s, v9.4s[2] | |||
| OP_ri v25.4s, v0.4s, v9.4s[2] | |||
| OP_ir v25.4s, v1.4s, v8.4s[2] | |||
| OP_rr v28.4s, v0.4s, v8.4s[3] | |||
| OP_ii v28.4s, v1.4s, v9.4s[3] | |||
| OP_ri v29.4s, v0.4s, v9.4s[3] | |||
| OP_ir v29.4s, v1.4s, v8.4s[3] | |||
| OP_rr v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v9.s[0] | |||
| OP_ri v17.4s, v0.4s, v9.s[0] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| OP_rr v20.4s, v0.4s, v8.s[1] | |||
| OP_ii v20.4s, v1.4s, v9.s[1] | |||
| OP_ri v21.4s, v0.4s, v9.s[1] | |||
| OP_ir v21.4s, v1.4s, v8.s[1] | |||
| OP_rr v24.4s, v0.4s, v8.s[2] | |||
| OP_ii v24.4s, v1.4s, v9.s[2] | |||
| OP_ri v25.4s, v0.4s, v9.s[2] | |||
| OP_ir v25.4s, v1.4s, v8.s[2] | |||
| OP_rr v28.4s, v0.4s, v8.s[3] | |||
| OP_ii v28.4s, v1.4s, v9.s[3] | |||
| OP_ri v29.4s, v0.4s, v9.s[3] | |||
| OP_ir v29.4s, v1.4s, v8.s[3] | |||
| .endm | |||
| .macro SAVE4x4 | |||
| @@ -778,25 +778,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.2s, v1.2s}, [pA] | |||
| add pA, pA, #16 | |||
| OP_rr v16.2s, v0.2s, v8.4s[0] | |||
| OP_ii v16.2s, v1.2s, v9.4s[0] | |||
| OP_ri v17.2s, v0.2s, v9.4s[0] | |||
| OP_ir v17.2s, v1.2s, v8.4s[0] | |||
| OP_rr v20.2s, v0.2s, v8.4s[1] | |||
| OP_ii v20.2s, v1.2s, v9.4s[1] | |||
| OP_ri v21.2s, v0.2s, v9.4s[1] | |||
| OP_ir v21.2s, v1.2s, v8.4s[1] | |||
| OP_rr v24.2s, v0.2s, v8.4s[2] | |||
| OP_ii v24.2s, v1.2s, v9.4s[2] | |||
| OP_ri v25.2s, v0.2s, v9.4s[2] | |||
| OP_ir v25.2s, v1.2s, v8.4s[2] | |||
| OP_rr v28.2s, v0.2s, v8.4s[3] | |||
| OP_ii v28.2s, v1.2s, v9.4s[3] | |||
| OP_ri v29.2s, v0.2s, v9.4s[3] | |||
| OP_ir v29.2s, v1.2s, v8.4s[3] | |||
| OP_rr v16.2s, v0.2s, v8.s[0] | |||
| OP_ii v16.2s, v1.2s, v9.s[0] | |||
| OP_ri v17.2s, v0.2s, v9.s[0] | |||
| OP_ir v17.2s, v1.2s, v8.s[0] | |||
| OP_rr v20.2s, v0.2s, v8.s[1] | |||
| OP_ii v20.2s, v1.2s, v9.s[1] | |||
| OP_ri v21.2s, v0.2s, v9.s[1] | |||
| OP_ir v21.2s, v1.2s, v8.s[1] | |||
| OP_rr v24.2s, v0.2s, v8.s[2] | |||
| OP_ii v24.2s, v1.2s, v9.s[2] | |||
| OP_ri v25.2s, v0.2s, v9.s[2] | |||
| OP_ir v25.2s, v1.2s, v8.s[2] | |||
| OP_rr v28.2s, v0.2s, v8.s[3] | |||
| OP_ii v28.2s, v1.2s, v9.s[3] | |||
| OP_ri v29.2s, v0.2s, v9.s[3] | |||
| OP_ir v29.2s, v1.2s, v8.s[3] | |||
| .endm | |||
| .macro SAVE2x4 | |||
| @@ -858,25 +858,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.s, v1.s}[0], [pA] | |||
| add pA, pA, #8 | |||
| OP_rr s16, s0, v8.4s[0] | |||
| OP_ii s16, s1, v9.4s[0] | |||
| OP_ri s17, s0, v9.4s[0] | |||
| OP_ir s17, s1, v8.4s[0] | |||
| OP_rr s20, s0, v8.4s[1] | |||
| OP_ii s20, s1, v9.4s[1] | |||
| OP_ri s21, s0, v9.4s[1] | |||
| OP_ir s21, s1, v8.4s[1] | |||
| OP_rr s24, s0, v8.4s[2] | |||
| OP_ii s24, s1, v9.4s[2] | |||
| OP_ri s25, s0, v9.4s[2] | |||
| OP_ir s25, s1, v8.4s[2] | |||
| OP_rr s28, s0, v8.4s[3] | |||
| OP_ii s28, s1, v9.4s[3] | |||
| OP_ri s29, s0, v9.4s[3] | |||
| OP_ir s29, s1, v8.4s[3] | |||
| OP_rr s16, s0, v8.s[0] | |||
| OP_ii s16, s1, v9.s[0] | |||
| OP_ri s17, s0, v9.s[0] | |||
| OP_ir s17, s1, v8.s[0] | |||
| OP_rr s20, s0, v8.s[1] | |||
| OP_ii s20, s1, v9.s[1] | |||
| OP_ri s21, s0, v9.s[1] | |||
| OP_ir s21, s1, v8.s[1] | |||
| OP_rr s24, s0, v8.s[2] | |||
| OP_ii s24, s1, v9.s[2] | |||
| OP_ri s25, s0, v9.s[2] | |||
| OP_ir s25, s1, v8.s[2] | |||
| OP_rr s28, s0, v8.s[3] | |||
| OP_ii s28, s1, v9.s[3] | |||
| OP_ri s29, s0, v9.s[3] | |||
| OP_ir s29, s1, v8.s[3] | |||
| .endm | |||
| .macro SAVE1x4 | |||
| @@ -940,25 +940,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v2.4s, v3.4s}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.4s, v0.4s, v8.2s[0] | |||
| OP_ii v16.4s, v1.4s, v9.2s[0] | |||
| OP_ri v17.4s, v0.4s, v9.2s[0] | |||
| OP_ir v17.4s, v1.4s, v8.2s[0] | |||
| OP_rr v18.4s, v2.4s, v8.2s[0] | |||
| OP_ii v18.4s, v3.4s, v9.2s[0] | |||
| OP_ri v19.4s, v2.4s, v9.2s[0] | |||
| OP_ir v19.4s, v3.4s, v8.2s[0] | |||
| OP_rr v20.4s, v0.4s, v8.2s[1] | |||
| OP_ii v20.4s, v1.4s, v9.2s[1] | |||
| OP_ri v21.4s, v0.4s, v9.2s[1] | |||
| OP_ir v21.4s, v1.4s, v8.2s[1] | |||
| OP_rr v22.4s, v2.4s, v8.2s[1] | |||
| OP_ii v22.4s, v3.4s, v9.2s[1] | |||
| OP_ri v23.4s, v2.4s, v9.2s[1] | |||
| OP_ir v23.4s, v3.4s, v8.2s[1] | |||
| OP_rr v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v9.s[0] | |||
| OP_ri v17.4s, v0.4s, v9.s[0] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| OP_rr v18.4s, v2.4s, v8.s[0] | |||
| OP_ii v18.4s, v3.4s, v9.s[0] | |||
| OP_ri v19.4s, v2.4s, v9.s[0] | |||
| OP_ir v19.4s, v3.4s, v8.s[0] | |||
| OP_rr v20.4s, v0.4s, v8.s[1] | |||
| OP_ii v20.4s, v1.4s, v9.s[1] | |||
| OP_ri v21.4s, v0.4s, v9.s[1] | |||
| OP_ir v21.4s, v1.4s, v8.s[1] | |||
| OP_rr v22.4s, v2.4s, v8.s[1] | |||
| OP_ii v22.4s, v3.4s, v9.s[1] | |||
| OP_ri v23.4s, v2.4s, v9.s[1] | |||
| OP_ir v23.4s, v3.4s, v8.s[1] | |||
| .endm | |||
| .macro SAVE8x2 | |||
| @@ -1016,15 +1016,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.4s, v1.4s}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.4s, v0.4s, v8.2s[0] | |||
| OP_ii v16.4s, v1.4s, v9.2s[0] | |||
| OP_ri v17.4s, v0.4s, v9.2s[0] | |||
| OP_ir v17.4s, v1.4s, v8.2s[0] | |||
| OP_rr v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v9.s[0] | |||
| OP_ri v17.4s, v0.4s, v9.s[0] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| OP_rr v20.4s, v0.4s, v8.2s[1] | |||
| OP_ii v20.4s, v1.4s, v9.2s[1] | |||
| OP_ri v21.4s, v0.4s, v9.2s[1] | |||
| OP_ir v21.4s, v1.4s, v8.2s[1] | |||
| OP_rr v20.4s, v0.4s, v8.s[1] | |||
| OP_ii v20.4s, v1.4s, v9.s[1] | |||
| OP_ri v21.4s, v0.4s, v9.s[1] | |||
| OP_ir v21.4s, v1.4s, v8.s[1] | |||
| .endm | |||
| .macro SAVE4x2 | |||
| @@ -1064,15 +1064,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.2s, v1.2s}, [pA] | |||
| add pA, pA, #16 | |||
| OP_rr v16.2s, v0.2s, v8.2s[0] | |||
| OP_ii v16.2s, v1.2s, v9.2s[0] | |||
| OP_ri v17.2s, v0.2s, v9.2s[0] | |||
| OP_ir v17.2s, v1.2s, v8.2s[0] | |||
| OP_rr v16.2s, v0.2s, v8.s[0] | |||
| OP_ii v16.2s, v1.2s, v9.s[0] | |||
| OP_ri v17.2s, v0.2s, v9.s[0] | |||
| OP_ir v17.2s, v1.2s, v8.s[0] | |||
| OP_rr v20.2s, v0.2s, v8.2s[1] | |||
| OP_ii v20.2s, v1.2s, v9.2s[1] | |||
| OP_ri v21.2s, v0.2s, v9.2s[1] | |||
| OP_ir v21.2s, v1.2s, v8.2s[1] | |||
| OP_rr v20.2s, v0.2s, v8.s[1] | |||
| OP_ii v20.2s, v1.2s, v9.s[1] | |||
| OP_ri v21.2s, v0.2s, v9.s[1] | |||
| OP_ir v21.2s, v1.2s, v8.s[1] | |||
| .endm | |||
| .macro SAVE2x2 | |||
| @@ -1112,15 +1112,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.s, v1.s}[0], [pA] | |||
| add pA, pA, #8 | |||
| OP_rr s16, s0, v8.2s[0] | |||
| OP_ii s16, s1, v9.2s[0] | |||
| OP_ri s17, s0, v9.2s[0] | |||
| OP_ir s17, s1, v8.2s[0] | |||
| OP_rr s16, s0, v8.s[0] | |||
| OP_ii s16, s1, v9.s[0] | |||
| OP_ri s17, s0, v9.s[0] | |||
| OP_ir s17, s1, v8.s[0] | |||
| OP_rr s20, s0, v8.2s[1] | |||
| OP_ii s20, s1, v9.2s[1] | |||
| OP_ri s21, s0, v9.2s[1] | |||
| OP_ir s21, s1, v8.2s[1] | |||
| OP_rr s20, s0, v8.s[1] | |||
| OP_ii s20, s1, v9.s[1] | |||
| OP_ri s21, s0, v9.s[1] | |||
| OP_ir s21, s1, v8.s[1] | |||
| .endm | |||
| .macro SAVE1x2 | |||
| @@ -1162,15 +1162,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v2.4s, v3.4s}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.4s, v0.4s, v8.4s[0] | |||
| OP_ii v16.4s, v1.4s, v8.4s[1] | |||
| OP_ri v17.4s, v0.4s, v8.4s[1] | |||
| OP_ir v17.4s, v1.4s, v8.4s[0] | |||
| OP_rr v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v8.s[1] | |||
| OP_ri v17.4s, v0.4s, v8.s[1] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| OP_rr v18.4s, v2.4s, v8.4s[0] | |||
| OP_ii v18.4s, v3.4s, v8.4s[1] | |||
| OP_ri v19.4s, v2.4s, v8.4s[1] | |||
| OP_ir v19.4s, v3.4s, v8.4s[0] | |||
| OP_rr v18.4s, v2.4s, v8.s[0] | |||
| OP_ii v18.4s, v3.4s, v8.s[1] | |||
| OP_ri v19.4s, v2.4s, v8.s[1] | |||
| OP_ir v19.4s, v3.4s, v8.s[0] | |||
| .endm | |||
| .macro SAVE8x1 | |||
| @@ -170,49 +170,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.4s, v1.4s}, [pA] | |||
| add pA, pA, #32 | |||
| fmul v16.4s, v0.4s, v8.4s[0] | |||
| OP_ii v16.4s, v1.4s, v9.4s[0] | |||
| fmul v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v9.s[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v17.16b, v17.16b, v17.16b | |||
| fmls v17.4s, v0.4s, v9.4s[0] | |||
| fmls v17.4s, v0.4s, v9.s[0] | |||
| #else | |||
| fmul v17.4s, v0.4s, v9.4s[0] | |||
| fmul v17.4s, v0.4s, v9.s[0] | |||
| #endif | |||
| OP_ir v17.4s, v1.4s, v8.4s[0] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| fmul v20.4s, v0.4s, v8.4s[1] | |||
| OP_ii v20.4s, v1.4s, v9.4s[1] | |||
| fmul v20.4s, v0.4s, v8.s[1] | |||
| OP_ii v20.4s, v1.4s, v9.s[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v21.16b, v21.16b, v21.16b | |||
| fmls v21.4s, v0.4s, v9.4s[1] | |||
| fmls v21.4s, v0.4s, v9.s[1] | |||
| #else | |||
| fmul v21.4s, v0.4s, v9.4s[1] | |||
| fmul v21.4s, v0.4s, v9.s[1] | |||
| #endif | |||
| OP_ir v21.4s, v1.4s, v8.4s[1] | |||
| OP_ir v21.4s, v1.4s, v8.s[1] | |||
| fmul v24.4s, v0.4s, v8.4s[2] | |||
| OP_ii v24.4s, v1.4s, v9.4s[2] | |||
| fmul v24.4s, v0.4s, v8.s[2] | |||
| OP_ii v24.4s, v1.4s, v9.s[2] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fmls v25.4s, v0.4s, v9.4s[2] | |||
| fmls v25.4s, v0.4s, v9.s[2] | |||
| #else | |||
| fmul v25.4s, v0.4s, v9.4s[2] | |||
| fmul v25.4s, v0.4s, v9.s[2] | |||
| #endif | |||
| OP_ir v25.4s, v1.4s, v8.4s[2] | |||
| OP_ir v25.4s, v1.4s, v8.s[2] | |||
| fmul v28.4s, v0.4s, v8.4s[3] | |||
| OP_ii v28.4s, v1.4s, v9.4s[3] | |||
| fmul v28.4s, v0.4s, v8.s[3] | |||
| OP_ii v28.4s, v1.4s, v9.s[3] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v29.16b, v29.16b, v29.16b | |||
| fmls v29.4s, v0.4s, v9.4s[3] | |||
| fmls v29.4s, v0.4s, v9.s[3] | |||
| #else | |||
| fmul v29.4s, v0.4s, v9.4s[3] | |||
| fmul v29.4s, v0.4s, v9.s[3] | |||
| #endif | |||
| OP_ir v29.4s, v1.4s, v8.4s[3] | |||
| OP_ir v29.4s, v1.4s, v8.s[3] | |||
| ld2 {v12.4s, v13.4s}, [pB] | |||
| add pB, pB, #32 | |||
| @@ -221,85 +221,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| // KERNEL4x4_M1 | |||
| // Uses v0/v1 together with lanes 0..3 of v8/v9 as inputs to the OP_rr / OP_ii / | |||
| // OP_ri / OP_ir macros, whose first operands are v16/v17, v20/v21, v24/v25 and | |||
| // v28/v29. Interleaved with that work it refills v12/v13 from pB and v4/v5 from pA | |||
| // ("for next round") and issues one prefetch on pA. | |||
| // OP_* are not defined in this view; they are presumably complex | |||
| // multiply-accumulate partial products -- TODO confirm. | |||
| // NOTE(review): every four-row OP_* group is present in two spellings | |||
| // (`vN.4s[i]` then `vN.s[i]`); this looks like the old and new sides of a diff | |||
| // shown without +/- markers -- confirm which one is meant to be assembled. | |||
| .macro KERNEL4x4_M1 | |||
| // v8/v9 lane 0, first operands v16/v17 | |||
| OP_rr v16.4s, v0.4s, v8.4s[0] | |||
| OP_ii v16.4s, v1.4s, v9.4s[0] | |||
| OP_ri v17.4s, v0.4s, v9.4s[0] | |||
| OP_ir v17.4s, v1.4s, v8.4s[0] | |||
| OP_rr v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v9.s[0] | |||
| OP_ri v17.4s, v0.4s, v9.s[0] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| // Two-register structure load (even elements -> v12, odd -> v13), then bump pB | |||
| // by the 32 bytes consumed. | |||
| ld2 {v12.4s, v13.4s}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| // v8/v9 lane 1, first operands v20/v21 | |||
| OP_rr v20.4s, v0.4s, v8.4s[1] | |||
| OP_ii v20.4s, v1.4s, v9.4s[1] | |||
| OP_ri v21.4s, v0.4s, v9.4s[1] | |||
| OP_ir v21.4s, v1.4s, v8.4s[1] | |||
| OP_rr v20.4s, v0.4s, v8.s[1] | |||
| OP_ii v20.4s, v1.4s, v9.s[1] | |||
| OP_ri v21.4s, v0.4s, v9.s[1] | |||
| OP_ir v21.4s, v1.4s, v8.s[1] | |||
| // Two-register structure load into v4/v5 from pA, then bump pA by 32 bytes. | |||
| ld2 {v4.4s, v5.4s}, [pA] // For next round | |||
| add pA, pA, #32 | |||
| // v8/v9 lane 2, first operands v24/v25 | |||
| OP_rr v24.4s, v0.4s, v8.4s[2] | |||
| OP_ii v24.4s, v1.4s, v9.4s[2] | |||
| OP_ri v25.4s, v0.4s, v9.4s[2] | |||
| OP_ir v25.4s, v1.4s, v8.4s[2] | |||
| OP_rr v24.4s, v0.4s, v8.s[2] | |||
| OP_ii v24.4s, v1.4s, v9.s[2] | |||
| OP_ri v25.4s, v0.4s, v9.s[2] | |||
| OP_ir v25.4s, v1.4s, v8.s[2] | |||
| // Prefetch hint targeting pA + 512 bytes. | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| // v8/v9 lane 3, first operands v28/v29 | |||
| OP_rr v28.4s, v0.4s, v8.4s[3] | |||
| OP_ii v28.4s, v1.4s, v9.4s[3] | |||
| OP_ri v29.4s, v0.4s, v9.4s[3] | |||
| OP_ir v29.4s, v1.4s, v8.4s[3] | |||
| OP_rr v28.4s, v0.4s, v8.s[3] | |||
| OP_ii v28.4s, v1.4s, v9.s[3] | |||
| OP_ri v29.4s, v0.4s, v9.s[3] | |||
| OP_ir v29.4s, v1.4s, v8.s[3] | |||
| .endm | |||
| // KERNEL4x4_M2 | |||
| // Uses v4/v5 together with lanes 0..3 of v12/v13 as inputs to the OP_rr / OP_ii / | |||
| // OP_ri / OP_ir macros, whose first operands are v16/v17, v20/v21, v24/v25 and | |||
| // v28/v29. Interleaved with that work it refills v8/v9 from pB and v0/v1 from pA | |||
| // ("for next round") and issues one prefetch on pB. | |||
| // OP_* are not defined in this view; they are presumably complex | |||
| // multiply-accumulate partial products -- TODO confirm. | |||
| // NOTE(review): every four-row OP_* group is present in two spellings | |||
| // (`vN.4s[i]` then `vN.s[i]`); this looks like the old and new sides of a diff | |||
| // shown without +/- markers -- confirm which one is meant to be assembled. | |||
| .macro KERNEL4x4_M2 | |||
| // v12/v13 lane 0, first operands v16/v17 | |||
| OP_rr v16.4s, v4.4s, v12.4s[0] | |||
| OP_ii v16.4s, v5.4s, v13.4s[0] | |||
| OP_ri v17.4s, v4.4s, v13.4s[0] | |||
| OP_ir v17.4s, v5.4s, v12.4s[0] | |||
| OP_rr v16.4s, v4.4s, v12.s[0] | |||
| OP_ii v16.4s, v5.4s, v13.s[0] | |||
| OP_ri v17.4s, v4.4s, v13.s[0] | |||
| OP_ir v17.4s, v5.4s, v12.s[0] | |||
| // Two-register structure load (even elements -> v8, odd -> v9), then bump pB | |||
| // by the 32 bytes consumed. | |||
| ld2 {v8.4s, v9.4s}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| // v12/v13 lane 1, first operands v20/v21 | |||
| OP_rr v20.4s, v4.4s, v12.4s[1] | |||
| OP_ii v20.4s, v5.4s, v13.4s[1] | |||
| OP_ri v21.4s, v4.4s, v13.4s[1] | |||
| OP_ir v21.4s, v5.4s, v12.4s[1] | |||
| OP_rr v20.4s, v4.4s, v12.s[1] | |||
| OP_ii v20.4s, v5.4s, v13.s[1] | |||
| OP_ri v21.4s, v4.4s, v13.s[1] | |||
| OP_ir v21.4s, v5.4s, v12.s[1] | |||
| // Two-register structure load into v0/v1 from pA, then bump pA by 32 bytes. | |||
| ld2 {v0.4s, v1.4s}, [pA] // For next round | |||
| add pA, pA, #32 | |||
| // v12/v13 lane 2, first operands v24/v25 | |||
| OP_rr v24.4s, v4.4s, v12.4s[2] | |||
| OP_ii v24.4s, v5.4s, v13.4s[2] | |||
| OP_ri v25.4s, v4.4s, v13.4s[2] | |||
| OP_ir v25.4s, v5.4s, v12.4s[2] | |||
| OP_rr v24.4s, v4.4s, v12.s[2] | |||
| OP_ii v24.4s, v5.4s, v13.s[2] | |||
| OP_ri v25.4s, v4.4s, v13.s[2] | |||
| OP_ir v25.4s, v5.4s, v12.s[2] | |||
| // Prefetch hint targeting pB + 512 bytes. | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| // v12/v13 lane 3, first operands v28/v29 | |||
| OP_rr v28.4s, v4.4s, v12.4s[3] | |||
| OP_ii v28.4s, v5.4s, v13.4s[3] | |||
| OP_ri v29.4s, v4.4s, v13.4s[3] | |||
| OP_ir v29.4s, v5.4s, v12.4s[3] | |||
| OP_rr v28.4s, v4.4s, v12.s[3] | |||
| OP_ii v28.4s, v5.4s, v13.s[3] | |||
| OP_ri v29.4s, v4.4s, v13.s[3] | |||
| OP_ir v29.4s, v5.4s, v12.s[3] | |||
| .endm | |||
| // KERNEL4x4_E | |||
| // Uses v4/v5 together with lanes 0..3 of v12/v13 as inputs to the OP_rr / OP_ii / | |||
| // OP_ri / OP_ir macros, whose first operands are v16/v17, v20/v21, v24/v25 and | |||
| // v28/v29. It contains no loads, pointer increments or prefetches, so nothing is | |||
| // fetched for a following round; presumably this closes the unrolled loop -- | |||
| // TODO confirm at the call sites. | |||
| // OP_* are not defined in this view. | |||
| // NOTE(review): the sixteen OP_* rows are present in two spellings (`vN.4s[i]` | |||
| // then `vN.s[i]`); this looks like the old and new sides of a diff shown without | |||
| // +/- markers -- confirm which one is meant to be assembled. | |||
| .macro KERNEL4x4_E | |||
| // Rows using the `vN.4s[i]` lane spelling, lanes 0..3. | |||
| OP_rr v16.4s, v4.4s, v12.4s[0] | |||
| OP_ii v16.4s, v5.4s, v13.4s[0] | |||
| OP_ri v17.4s, v4.4s, v13.4s[0] | |||
| OP_ir v17.4s, v5.4s, v12.4s[0] | |||
| OP_rr v20.4s, v4.4s, v12.4s[1] | |||
| OP_ii v20.4s, v5.4s, v13.4s[1] | |||
| OP_ri v21.4s, v4.4s, v13.4s[1] | |||
| OP_ir v21.4s, v5.4s, v12.4s[1] | |||
| OP_rr v24.4s, v4.4s, v12.4s[2] | |||
| OP_ii v24.4s, v5.4s, v13.4s[2] | |||
| OP_ri v25.4s, v4.4s, v13.4s[2] | |||
| OP_ir v25.4s, v5.4s, v12.4s[2] | |||
| OP_rr v28.4s, v4.4s, v12.4s[3] | |||
| OP_ii v28.4s, v5.4s, v13.4s[3] | |||
| OP_ri v29.4s, v4.4s, v13.4s[3] | |||
| OP_ir v29.4s, v5.4s, v12.4s[3] | |||
| // Rows using the `vN.s[i]` lane spelling, same operands and order. | |||
| OP_rr v16.4s, v4.4s, v12.s[0] | |||
| OP_ii v16.4s, v5.4s, v13.s[0] | |||
| OP_ri v17.4s, v4.4s, v13.s[0] | |||
| OP_ir v17.4s, v5.4s, v12.s[0] | |||
| OP_rr v20.4s, v4.4s, v12.s[1] | |||
| OP_ii v20.4s, v5.4s, v13.s[1] | |||
| OP_ri v21.4s, v4.4s, v13.s[1] | |||
| OP_ir v21.4s, v5.4s, v12.s[1] | |||
| OP_rr v24.4s, v4.4s, v12.s[2] | |||
| OP_ii v24.4s, v5.4s, v13.s[2] | |||
| OP_ri v25.4s, v4.4s, v13.s[2] | |||
| OP_ir v25.4s, v5.4s, v12.s[2] | |||
| OP_rr v28.4s, v4.4s, v12.s[3] | |||
| OP_ii v28.4s, v5.4s, v13.s[3] | |||
| OP_ri v29.4s, v4.4s, v13.s[3] | |||
| OP_ir v29.4s, v5.4s, v12.s[3] | |||
| .endm | |||
| .macro KERNEL4x4_SUB | |||
| @@ -308,25 +308,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.4s, v1.4s}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.4s, v0.4s, v8.4s[0] | |||
| OP_ii v16.4s, v1.4s, v9.4s[0] | |||
| OP_ri v17.4s, v0.4s, v9.4s[0] | |||
| OP_ir v17.4s, v1.4s, v8.4s[0] | |||
| OP_rr v20.4s, v0.4s, v8.4s[1] | |||
| OP_ii v20.4s, v1.4s, v9.4s[1] | |||
| OP_ri v21.4s, v0.4s, v9.4s[1] | |||
| OP_ir v21.4s, v1.4s, v8.4s[1] | |||
| OP_rr v24.4s, v0.4s, v8.4s[2] | |||
| OP_ii v24.4s, v1.4s, v9.4s[2] | |||
| OP_ri v25.4s, v0.4s, v9.4s[2] | |||
| OP_ir v25.4s, v1.4s, v8.4s[2] | |||
| OP_rr v28.4s, v0.4s, v8.4s[3] | |||
| OP_ii v28.4s, v1.4s, v9.4s[3] | |||
| OP_ri v29.4s, v0.4s, v9.4s[3] | |||
| OP_ir v29.4s, v1.4s, v8.4s[3] | |||
| OP_rr v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v9.s[0] | |||
| OP_ri v17.4s, v0.4s, v9.s[0] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| OP_rr v20.4s, v0.4s, v8.s[1] | |||
| OP_ii v20.4s, v1.4s, v9.s[1] | |||
| OP_ri v21.4s, v0.4s, v9.s[1] | |||
| OP_ir v21.4s, v1.4s, v8.s[1] | |||
| OP_rr v24.4s, v0.4s, v8.s[2] | |||
| OP_ii v24.4s, v1.4s, v9.s[2] | |||
| OP_ri v25.4s, v0.4s, v9.s[2] | |||
| OP_ir v25.4s, v1.4s, v8.s[2] | |||
| OP_rr v28.4s, v0.4s, v8.s[3] | |||
| OP_ii v28.4s, v1.4s, v9.s[3] | |||
| OP_ri v29.4s, v0.4s, v9.s[3] | |||
| OP_ir v29.4s, v1.4s, v8.s[3] | |||
| .endm | |||
| .macro SAVE4x4 | |||
| @@ -384,25 +384,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.2s, v1.2s}, [pA] | |||
| add pA, pA, #16 | |||
| OP_rr v16.2s, v0.2s, v8.4s[0] | |||
| OP_ii v16.2s, v1.2s, v9.4s[0] | |||
| OP_ri v17.2s, v0.2s, v9.4s[0] | |||
| OP_ir v17.2s, v1.2s, v8.4s[0] | |||
| OP_rr v20.2s, v0.2s, v8.4s[1] | |||
| OP_ii v20.2s, v1.2s, v9.4s[1] | |||
| OP_ri v21.2s, v0.2s, v9.4s[1] | |||
| OP_ir v21.2s, v1.2s, v8.4s[1] | |||
| OP_rr v24.2s, v0.2s, v8.4s[2] | |||
| OP_ii v24.2s, v1.2s, v9.4s[2] | |||
| OP_ri v25.2s, v0.2s, v9.4s[2] | |||
| OP_ir v25.2s, v1.2s, v8.4s[2] | |||
| OP_rr v28.2s, v0.2s, v8.4s[3] | |||
| OP_ii v28.2s, v1.2s, v9.4s[3] | |||
| OP_ri v29.2s, v0.2s, v9.4s[3] | |||
| OP_ir v29.2s, v1.2s, v8.4s[3] | |||
| OP_rr v16.2s, v0.2s, v8.s[0] | |||
| OP_ii v16.2s, v1.2s, v9.s[0] | |||
| OP_ri v17.2s, v0.2s, v9.s[0] | |||
| OP_ir v17.2s, v1.2s, v8.s[0] | |||
| OP_rr v20.2s, v0.2s, v8.s[1] | |||
| OP_ii v20.2s, v1.2s, v9.s[1] | |||
| OP_ri v21.2s, v0.2s, v9.s[1] | |||
| OP_ir v21.2s, v1.2s, v8.s[1] | |||
| OP_rr v24.2s, v0.2s, v8.s[2] | |||
| OP_ii v24.2s, v1.2s, v9.s[2] | |||
| OP_ri v25.2s, v0.2s, v9.s[2] | |||
| OP_ir v25.2s, v1.2s, v8.s[2] | |||
| OP_rr v28.2s, v0.2s, v8.s[3] | |||
| OP_ii v28.2s, v1.2s, v9.s[3] | |||
| OP_ri v29.2s, v0.2s, v9.s[3] | |||
| OP_ir v29.2s, v1.2s, v8.s[3] | |||
| .endm | |||
| .macro SAVE2x4 | |||
| @@ -460,25 +460,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.s, v1.s}[0], [pA] | |||
| add pA, pA, #8 | |||
| OP_rr s16, s0, v8.4s[0] | |||
| OP_ii s16, s1, v9.4s[0] | |||
| OP_ri s17, s0, v9.4s[0] | |||
| OP_ir s17, s1, v8.4s[0] | |||
| OP_rr s20, s0, v8.4s[1] | |||
| OP_ii s20, s1, v9.4s[1] | |||
| OP_ri s21, s0, v9.4s[1] | |||
| OP_ir s21, s1, v8.4s[1] | |||
| OP_rr s24, s0, v8.4s[2] | |||
| OP_ii s24, s1, v9.4s[2] | |||
| OP_ri s25, s0, v9.4s[2] | |||
| OP_ir s25, s1, v8.4s[2] | |||
| OP_rr s28, s0, v8.4s[3] | |||
| OP_ii s28, s1, v9.4s[3] | |||
| OP_ri s29, s0, v9.4s[3] | |||
| OP_ir s29, s1, v8.4s[3] | |||
| OP_rr s16, s0, v8.s[0] | |||
| OP_ii s16, s1, v9.s[0] | |||
| OP_ri s17, s0, v9.s[0] | |||
| OP_ir s17, s1, v8.s[0] | |||
| OP_rr s20, s0, v8.s[1] | |||
| OP_ii s20, s1, v9.s[1] | |||
| OP_ri s21, s0, v9.s[1] | |||
| OP_ir s21, s1, v8.s[1] | |||
| OP_rr s24, s0, v8.s[2] | |||
| OP_ii s24, s1, v9.s[2] | |||
| OP_ri s25, s0, v9.s[2] | |||
| OP_ir s25, s1, v8.s[2] | |||
| OP_rr s28, s0, v8.s[3] | |||
| OP_ii s28, s1, v9.s[3] | |||
| OP_ri s29, s0, v9.s[3] | |||
| OP_ir s29, s1, v8.s[3] | |||
| .endm | |||
| .macro SAVE1x4 | |||
| @@ -532,15 +532,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.4s, v1.4s}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.4s, v0.4s, v8.2s[0] | |||
| OP_ii v16.4s, v1.4s, v9.2s[0] | |||
| OP_ri v17.4s, v0.4s, v9.2s[0] | |||
| OP_ir v17.4s, v1.4s, v8.2s[0] | |||
| OP_rr v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v9.s[0] | |||
| OP_ri v17.4s, v0.4s, v9.s[0] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| OP_rr v20.4s, v0.4s, v8.2s[1] | |||
| OP_ii v20.4s, v1.4s, v9.2s[1] | |||
| OP_ri v21.4s, v0.4s, v9.2s[1] | |||
| OP_ir v21.4s, v1.4s, v8.2s[1] | |||
| OP_rr v20.4s, v0.4s, v8.s[1] | |||
| OP_ii v20.4s, v1.4s, v9.s[1] | |||
| OP_ri v21.4s, v0.4s, v9.s[1] | |||
| OP_ir v21.4s, v1.4s, v8.s[1] | |||
| .endm | |||
| .macro SAVE4x2 | |||
| @@ -578,15 +578,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.2s, v1.2s}, [pA] | |||
| add pA, pA, #16 | |||
| OP_rr v16.2s, v0.2s, v8.2s[0] | |||
| OP_ii v16.2s, v1.2s, v9.2s[0] | |||
| OP_ri v17.2s, v0.2s, v9.2s[0] | |||
| OP_ir v17.2s, v1.2s, v8.2s[0] | |||
| OP_rr v16.2s, v0.2s, v8.s[0] | |||
| OP_ii v16.2s, v1.2s, v9.s[0] | |||
| OP_ri v17.2s, v0.2s, v9.s[0] | |||
| OP_ir v17.2s, v1.2s, v8.s[0] | |||
| OP_rr v20.2s, v0.2s, v8.2s[1] | |||
| OP_ii v20.2s, v1.2s, v9.2s[1] | |||
| OP_ri v21.2s, v0.2s, v9.2s[1] | |||
| OP_ir v21.2s, v1.2s, v8.2s[1] | |||
| OP_rr v20.2s, v0.2s, v8.s[1] | |||
| OP_ii v20.2s, v1.2s, v9.s[1] | |||
| OP_ri v21.2s, v0.2s, v9.s[1] | |||
| OP_ir v21.2s, v1.2s, v8.s[1] | |||
| .endm | |||
| .macro SAVE2x2 | |||
| @@ -624,15 +624,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.s, v1.s}[0], [pA] | |||
| add pA, pA, #8 | |||
| OP_rr s16, s0, v8.2s[0] | |||
| OP_ii s16, s1, v9.2s[0] | |||
| OP_ri s17, s0, v9.2s[0] | |||
| OP_ir s17, s1, v8.2s[0] | |||
| OP_rr s16, s0, v8.s[0] | |||
| OP_ii s16, s1, v9.s[0] | |||
| OP_ri s17, s0, v9.s[0] | |||
| OP_ir s17, s1, v8.s[0] | |||
| OP_rr s20, s0, v8.2s[1] | |||
| OP_ii s20, s1, v9.2s[1] | |||
| OP_ri s21, s0, v9.2s[1] | |||
| OP_ir s21, s1, v8.2s[1] | |||
| OP_rr s20, s0, v8.s[1] | |||
| OP_ii s20, s1, v9.s[1] | |||
| OP_ri s21, s0, v9.s[1] | |||
| OP_ir s21, s1, v8.s[1] | |||
| .endm | |||
| .macro SAVE1x2 | |||
| @@ -180,93 +180,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v2.4s, v3.4s}, [pA] | |||
| add pA, pA, #32 | |||
| fmul v16.4s, v0.4s, v8.4s[0] | |||
| OP_ii v16.4s, v1.4s, v9.4s[0] | |||
| fmul v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v9.s[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v17.16b, v17.16b, v17.16b | |||
| fmls v17.4s, v0.4s, v9.4s[0] | |||
| fmls v17.4s, v0.4s, v9.s[0] | |||
| #else | |||
| fmul v17.4s, v0.4s, v9.4s[0] | |||
| fmul v17.4s, v0.4s, v9.s[0] | |||
| #endif | |||
| OP_ir v17.4s, v1.4s, v8.4s[0] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| fmul v18.4s, v2.4s, v8.4s[0] | |||
| OP_ii v18.4s, v3.4s, v9.4s[0] | |||
| fmul v18.4s, v2.4s, v8.s[0] | |||
| OP_ii v18.4s, v3.4s, v9.s[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v19.16b, v19.16b, v19.16b | |||
| fmls v19.4s, v2.4s, v9.4s[0] | |||
| fmls v19.4s, v2.4s, v9.s[0] | |||
| #else | |||
| fmul v19.4s, v2.4s, v9.4s[0] | |||
| fmul v19.4s, v2.4s, v9.s[0] | |||
| #endif | |||
| OP_ir v19.4s, v3.4s, v8.4s[0] | |||
| OP_ir v19.4s, v3.4s, v8.s[0] | |||
| fmul v20.4s, v0.4s, v8.4s[1] | |||
| OP_ii v20.4s, v1.4s, v9.4s[1] | |||
| fmul v20.4s, v0.4s, v8.s[1] | |||
| OP_ii v20.4s, v1.4s, v9.s[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v21.16b, v21.16b, v21.16b | |||
| fmls v21.4s, v0.4s, v9.4s[1] | |||
| fmls v21.4s, v0.4s, v9.s[1] | |||
| #else | |||
| fmul v21.4s, v0.4s, v9.4s[1] | |||
| fmul v21.4s, v0.4s, v9.s[1] | |||
| #endif | |||
| OP_ir v21.4s, v1.4s, v8.4s[1] | |||
| OP_ir v21.4s, v1.4s, v8.s[1] | |||
| fmul v22.4s, v2.4s, v8.4s[1] | |||
| OP_ii v22.4s, v3.4s, v9.4s[1] | |||
| fmul v22.4s, v2.4s, v8.s[1] | |||
| OP_ii v22.4s, v3.4s, v9.s[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v23.16b, v23.16b, v23.16b | |||
| fmls v23.4s, v2.4s, v9.4s[1] | |||
| fmls v23.4s, v2.4s, v9.s[1] | |||
| #else | |||
| fmul v23.4s, v2.4s, v9.4s[1] | |||
| fmul v23.4s, v2.4s, v9.s[1] | |||
| #endif | |||
| OP_ir v23.4s, v3.4s, v8.4s[1] | |||
| OP_ir v23.4s, v3.4s, v8.s[1] | |||
| fmul v24.4s, v0.4s, v8.4s[2] | |||
| OP_ii v24.4s, v1.4s, v9.4s[2] | |||
| fmul v24.4s, v0.4s, v8.s[2] | |||
| OP_ii v24.4s, v1.4s, v9.s[2] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fmls v25.4s, v0.4s, v9.4s[2] | |||
| fmls v25.4s, v0.4s, v9.s[2] | |||
| #else | |||
| fmul v25.4s, v0.4s, v9.4s[2] | |||
| fmul v25.4s, v0.4s, v9.s[2] | |||
| #endif | |||
| OP_ir v25.4s, v1.4s, v8.4s[2] | |||
| OP_ir v25.4s, v1.4s, v8.s[2] | |||
| fmul v26.4s, v2.4s, v8.4s[2] | |||
| OP_ii v26.4s, v3.4s, v9.4s[2] | |||
| fmul v26.4s, v2.4s, v8.s[2] | |||
| OP_ii v26.4s, v3.4s, v9.s[2] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v27.16b, v27.16b, v27.16b | |||
| fmls v27.4s, v2.4s, v9.4s[2] | |||
| fmls v27.4s, v2.4s, v9.s[2] | |||
| #else | |||
| fmul v27.4s, v2.4s, v9.4s[2] | |||
| fmul v27.4s, v2.4s, v9.s[2] | |||
| #endif | |||
| OP_ir v27.4s, v3.4s, v8.4s[2] | |||
| OP_ir v27.4s, v3.4s, v8.s[2] | |||
| fmul v28.4s, v0.4s, v8.4s[3] | |||
| OP_ii v28.4s, v1.4s, v9.4s[3] | |||
| fmul v28.4s, v0.4s, v8.s[3] | |||
| OP_ii v28.4s, v1.4s, v9.s[3] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v29.16b, v29.16b, v29.16b | |||
| fmls v29.4s, v0.4s, v9.4s[3] | |||
| fmls v29.4s, v0.4s, v9.s[3] | |||
| #else | |||
| fmul v29.4s, v0.4s, v9.4s[3] | |||
| fmul v29.4s, v0.4s, v9.s[3] | |||
| #endif | |||
| OP_ir v29.4s, v1.4s, v8.4s[3] | |||
| OP_ir v29.4s, v1.4s, v8.s[3] | |||
| fmul v30.4s, v2.4s, v8.4s[3] | |||
| OP_ii v30.4s, v3.4s, v9.4s[3] | |||
| fmul v30.4s, v2.4s, v8.s[3] | |||
| OP_ii v30.4s, v3.4s, v9.s[3] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v31.16b, v31.16b, v31.16b | |||
| fmls v31.4s, v2.4s, v9.4s[3] | |||
| fmls v31.4s, v2.4s, v9.s[3] | |||
| #else | |||
| fmul v31.4s, v2.4s, v9.4s[3] | |||
| fmul v31.4s, v2.4s, v9.s[3] | |||
| #endif | |||
| OP_ir v31.4s, v3.4s, v8.4s[3] | |||
| OP_ir v31.4s, v3.4s, v8.s[3] | |||
| ld2 {v12.4s, v13.4s}, [pB] | |||
| add pB, pB, #32 | |||
| @@ -277,45 +277,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x4_M1 | |||
| OP_rr v16.4s, v0.4s, v8.4s[0] | |||
| OP_ii v16.4s, v1.4s, v9.4s[0] | |||
| OP_ri v17.4s, v0.4s, v9.4s[0] | |||
| OP_ir v17.4s, v1.4s, v8.4s[0] | |||
| OP_rr v18.4s, v2.4s, v8.4s[0] | |||
| OP_ii v18.4s, v3.4s, v9.4s[0] | |||
| OP_ri v19.4s, v2.4s, v9.4s[0] | |||
| OP_ir v19.4s, v3.4s, v8.4s[0] | |||
| OP_rr v20.4s, v0.4s, v8.4s[1] | |||
| OP_ii v20.4s, v1.4s, v9.4s[1] | |||
| OP_ri v21.4s, v0.4s, v9.4s[1] | |||
| OP_ir v21.4s, v1.4s, v8.4s[1] | |||
| OP_rr v22.4s, v2.4s, v8.4s[1] | |||
| OP_ii v22.4s, v3.4s, v9.4s[1] | |||
| OP_ri v23.4s, v2.4s, v9.4s[1] | |||
| OP_ir v23.4s, v3.4s, v8.4s[1] | |||
| OP_rr v24.4s, v0.4s, v8.4s[2] | |||
| OP_ii v24.4s, v1.4s, v9.4s[2] | |||
| OP_ri v25.4s, v0.4s, v9.4s[2] | |||
| OP_ir v25.4s, v1.4s, v8.4s[2] | |||
| OP_rr v26.4s, v2.4s, v8.4s[2] | |||
| OP_ii v26.4s, v3.4s, v9.4s[2] | |||
| OP_ri v27.4s, v2.4s, v9.4s[2] | |||
| OP_ir v27.4s, v3.4s, v8.4s[2] | |||
| OP_rr v28.4s, v0.4s, v8.4s[3] | |||
| OP_ii v28.4s, v1.4s, v9.4s[3] | |||
| OP_ri v29.4s, v0.4s, v9.4s[3] | |||
| OP_ir v29.4s, v1.4s, v8.4s[3] | |||
| OP_rr v30.4s, v2.4s, v8.4s[3] | |||
| OP_ii v30.4s, v3.4s, v9.4s[3] | |||
| OP_ri v31.4s, v2.4s, v9.4s[3] | |||
| OP_ir v31.4s, v3.4s, v8.4s[3] | |||
| OP_rr v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v9.s[0] | |||
| OP_ri v17.4s, v0.4s, v9.s[0] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| OP_rr v18.4s, v2.4s, v8.s[0] | |||
| OP_ii v18.4s, v3.4s, v9.s[0] | |||
| OP_ri v19.4s, v2.4s, v9.s[0] | |||
| OP_ir v19.4s, v3.4s, v8.s[0] | |||
| OP_rr v20.4s, v0.4s, v8.s[1] | |||
| OP_ii v20.4s, v1.4s, v9.s[1] | |||
| OP_ri v21.4s, v0.4s, v9.s[1] | |||
| OP_ir v21.4s, v1.4s, v8.s[1] | |||
| OP_rr v22.4s, v2.4s, v8.s[1] | |||
| OP_ii v22.4s, v3.4s, v9.s[1] | |||
| OP_ri v23.4s, v2.4s, v9.s[1] | |||
| OP_ir v23.4s, v3.4s, v8.s[1] | |||
| OP_rr v24.4s, v0.4s, v8.s[2] | |||
| OP_ii v24.4s, v1.4s, v9.s[2] | |||
| OP_ri v25.4s, v0.4s, v9.s[2] | |||
| OP_ir v25.4s, v1.4s, v8.s[2] | |||
| OP_rr v26.4s, v2.4s, v8.s[2] | |||
| OP_ii v26.4s, v3.4s, v9.s[2] | |||
| OP_ri v27.4s, v2.4s, v9.s[2] | |||
| OP_ir v27.4s, v3.4s, v8.s[2] | |||
| OP_rr v28.4s, v0.4s, v8.s[3] | |||
| OP_ii v28.4s, v1.4s, v9.s[3] | |||
| OP_ri v29.4s, v0.4s, v9.s[3] | |||
| OP_ir v29.4s, v1.4s, v8.s[3] | |||
| OP_rr v30.4s, v2.4s, v8.s[3] | |||
| OP_ii v30.4s, v3.4s, v9.s[3] | |||
| OP_ri v31.4s, v2.4s, v9.s[3] | |||
| OP_ir v31.4s, v3.4s, v8.s[3] | |||
| ld2 {v12.4s, v13.4s}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| @@ -326,45 +326,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x4_M2 | |||
| OP_rr v16.4s, v4.4s, v12.4s[0] | |||
| OP_ii v16.4s, v5.4s, v13.4s[0] | |||
| OP_ri v17.4s, v4.4s, v13.4s[0] | |||
| OP_ir v17.4s, v5.4s, v12.4s[0] | |||
| OP_rr v18.4s, v6.4s, v12.4s[0] | |||
| OP_ii v18.4s, v7.4s, v13.4s[0] | |||
| OP_ri v19.4s, v6.4s, v13.4s[0] | |||
| OP_ir v19.4s, v7.4s, v12.4s[0] | |||
| OP_rr v20.4s, v4.4s, v12.4s[1] | |||
| OP_ii v20.4s, v5.4s, v13.4s[1] | |||
| OP_ri v21.4s, v4.4s, v13.4s[1] | |||
| OP_ir v21.4s, v5.4s, v12.4s[1] | |||
| OP_rr v22.4s, v6.4s, v12.4s[1] | |||
| OP_ii v22.4s, v7.4s, v13.4s[1] | |||
| OP_ri v23.4s, v6.4s, v13.4s[1] | |||
| OP_ir v23.4s, v7.4s, v12.4s[1] | |||
| OP_rr v24.4s, v4.4s, v12.4s[2] | |||
| OP_ii v24.4s, v5.4s, v13.4s[2] | |||
| OP_ri v25.4s, v4.4s, v13.4s[2] | |||
| OP_ir v25.4s, v5.4s, v12.4s[2] | |||
| OP_rr v26.4s, v6.4s, v12.4s[2] | |||
| OP_ii v26.4s, v7.4s, v13.4s[2] | |||
| OP_ri v27.4s, v6.4s, v13.4s[2] | |||
| OP_ir v27.4s, v7.4s, v12.4s[2] | |||
| OP_rr v28.4s, v4.4s, v12.4s[3] | |||
| OP_ii v28.4s, v5.4s, v13.4s[3] | |||
| OP_ri v29.4s, v4.4s, v13.4s[3] | |||
| OP_ir v29.4s, v5.4s, v12.4s[3] | |||
| OP_rr v30.4s, v6.4s, v12.4s[3] | |||
| OP_ii v30.4s, v7.4s, v13.4s[3] | |||
| OP_ri v31.4s, v6.4s, v13.4s[3] | |||
| OP_ir v31.4s, v7.4s, v12.4s[3] | |||
| OP_rr v16.4s, v4.4s, v12.s[0] | |||
| OP_ii v16.4s, v5.4s, v13.s[0] | |||
| OP_ri v17.4s, v4.4s, v13.s[0] | |||
| OP_ir v17.4s, v5.4s, v12.s[0] | |||
| OP_rr v18.4s, v6.4s, v12.s[0] | |||
| OP_ii v18.4s, v7.4s, v13.s[0] | |||
| OP_ri v19.4s, v6.4s, v13.s[0] | |||
| OP_ir v19.4s, v7.4s, v12.s[0] | |||
| OP_rr v20.4s, v4.4s, v12.s[1] | |||
| OP_ii v20.4s, v5.4s, v13.s[1] | |||
| OP_ri v21.4s, v4.4s, v13.s[1] | |||
| OP_ir v21.4s, v5.4s, v12.s[1] | |||
| OP_rr v22.4s, v6.4s, v12.s[1] | |||
| OP_ii v22.4s, v7.4s, v13.s[1] | |||
| OP_ri v23.4s, v6.4s, v13.s[1] | |||
| OP_ir v23.4s, v7.4s, v12.s[1] | |||
| OP_rr v24.4s, v4.4s, v12.s[2] | |||
| OP_ii v24.4s, v5.4s, v13.s[2] | |||
| OP_ri v25.4s, v4.4s, v13.s[2] | |||
| OP_ir v25.4s, v5.4s, v12.s[2] | |||
| OP_rr v26.4s, v6.4s, v12.s[2] | |||
| OP_ii v26.4s, v7.4s, v13.s[2] | |||
| OP_ri v27.4s, v6.4s, v13.s[2] | |||
| OP_ir v27.4s, v7.4s, v12.s[2] | |||
| OP_rr v28.4s, v4.4s, v12.s[3] | |||
| OP_ii v28.4s, v5.4s, v13.s[3] | |||
| OP_ri v29.4s, v4.4s, v13.s[3] | |||
| OP_ir v29.4s, v5.4s, v12.s[3] | |||
| OP_rr v30.4s, v6.4s, v12.s[3] | |||
| OP_ii v30.4s, v7.4s, v13.s[3] | |||
| OP_ri v31.4s, v6.4s, v13.s[3] | |||
| OP_ir v31.4s, v7.4s, v12.s[3] | |||
| ld2 {v8.4s, v9.4s}, [pB] | |||
| add pB, pB, #32 | |||
| @@ -375,45 +375,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x4_E | |||
| OP_rr v16.4s, v4.4s, v12.4s[0] | |||
| OP_ii v16.4s, v5.4s, v13.4s[0] | |||
| OP_ri v17.4s, v4.4s, v13.4s[0] | |||
| OP_ir v17.4s, v5.4s, v12.4s[0] | |||
| OP_rr v18.4s, v6.4s, v12.4s[0] | |||
| OP_ii v18.4s, v7.4s, v13.4s[0] | |||
| OP_ri v19.4s, v6.4s, v13.4s[0] | |||
| OP_ir v19.4s, v7.4s, v12.4s[0] | |||
| OP_rr v20.4s, v4.4s, v12.4s[1] | |||
| OP_ii v20.4s, v5.4s, v13.4s[1] | |||
| OP_ri v21.4s, v4.4s, v13.4s[1] | |||
| OP_ir v21.4s, v5.4s, v12.4s[1] | |||
| OP_rr v22.4s, v6.4s, v12.4s[1] | |||
| OP_ii v22.4s, v7.4s, v13.4s[1] | |||
| OP_ri v23.4s, v6.4s, v13.4s[1] | |||
| OP_ir v23.4s, v7.4s, v12.4s[1] | |||
| OP_rr v24.4s, v4.4s, v12.4s[2] | |||
| OP_ii v24.4s, v5.4s, v13.4s[2] | |||
| OP_ri v25.4s, v4.4s, v13.4s[2] | |||
| OP_ir v25.4s, v5.4s, v12.4s[2] | |||
| OP_rr v26.4s, v6.4s, v12.4s[2] | |||
| OP_ii v26.4s, v7.4s, v13.4s[2] | |||
| OP_ri v27.4s, v6.4s, v13.4s[2] | |||
| OP_ir v27.4s, v7.4s, v12.4s[2] | |||
| OP_rr v28.4s, v4.4s, v12.4s[3] | |||
| OP_ii v28.4s, v5.4s, v13.4s[3] | |||
| OP_ri v29.4s, v4.4s, v13.4s[3] | |||
| OP_ir v29.4s, v5.4s, v12.4s[3] | |||
| OP_rr v30.4s, v6.4s, v12.4s[3] | |||
| OP_ii v30.4s, v7.4s, v13.4s[3] | |||
| OP_ri v31.4s, v6.4s, v13.4s[3] | |||
| OP_ir v31.4s, v7.4s, v12.4s[3] | |||
| OP_rr v16.4s, v4.4s, v12.s[0] | |||
| OP_ii v16.4s, v5.4s, v13.s[0] | |||
| OP_ri v17.4s, v4.4s, v13.s[0] | |||
| OP_ir v17.4s, v5.4s, v12.s[0] | |||
| OP_rr v18.4s, v6.4s, v12.s[0] | |||
| OP_ii v18.4s, v7.4s, v13.s[0] | |||
| OP_ri v19.4s, v6.4s, v13.s[0] | |||
| OP_ir v19.4s, v7.4s, v12.s[0] | |||
| OP_rr v20.4s, v4.4s, v12.s[1] | |||
| OP_ii v20.4s, v5.4s, v13.s[1] | |||
| OP_ri v21.4s, v4.4s, v13.s[1] | |||
| OP_ir v21.4s, v5.4s, v12.s[1] | |||
| OP_rr v22.4s, v6.4s, v12.s[1] | |||
| OP_ii v22.4s, v7.4s, v13.s[1] | |||
| OP_ri v23.4s, v6.4s, v13.s[1] | |||
| OP_ir v23.4s, v7.4s, v12.s[1] | |||
| OP_rr v24.4s, v4.4s, v12.s[2] | |||
| OP_ii v24.4s, v5.4s, v13.s[2] | |||
| OP_ri v25.4s, v4.4s, v13.s[2] | |||
| OP_ir v25.4s, v5.4s, v12.s[2] | |||
| OP_rr v26.4s, v6.4s, v12.s[2] | |||
| OP_ii v26.4s, v7.4s, v13.s[2] | |||
| OP_ri v27.4s, v6.4s, v13.s[2] | |||
| OP_ir v27.4s, v7.4s, v12.s[2] | |||
| OP_rr v28.4s, v4.4s, v12.s[3] | |||
| OP_ii v28.4s, v5.4s, v13.s[3] | |||
| OP_ri v29.4s, v4.4s, v13.s[3] | |||
| OP_ir v29.4s, v5.4s, v12.s[3] | |||
| OP_rr v30.4s, v6.4s, v12.s[3] | |||
| OP_ii v30.4s, v7.4s, v13.s[3] | |||
| OP_ri v31.4s, v6.4s, v13.s[3] | |||
| OP_ir v31.4s, v7.4s, v12.s[3] | |||
| .endm | |||
| @@ -425,45 +425,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v2.4s, v3.4s}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.4s, v0.4s, v8.4s[0] | |||
| OP_ii v16.4s, v1.4s, v9.4s[0] | |||
| OP_ri v17.4s, v0.4s, v9.4s[0] | |||
| OP_ir v17.4s, v1.4s, v8.4s[0] | |||
| OP_rr v18.4s, v2.4s, v8.4s[0] | |||
| OP_ii v18.4s, v3.4s, v9.4s[0] | |||
| OP_ri v19.4s, v2.4s, v9.4s[0] | |||
| OP_ir v19.4s, v3.4s, v8.4s[0] | |||
| OP_rr v20.4s, v0.4s, v8.4s[1] | |||
| OP_ii v20.4s, v1.4s, v9.4s[1] | |||
| OP_ri v21.4s, v0.4s, v9.4s[1] | |||
| OP_ir v21.4s, v1.4s, v8.4s[1] | |||
| OP_rr v22.4s, v2.4s, v8.4s[1] | |||
| OP_ii v22.4s, v3.4s, v9.4s[1] | |||
| OP_ri v23.4s, v2.4s, v9.4s[1] | |||
| OP_ir v23.4s, v3.4s, v8.4s[1] | |||
| OP_rr v24.4s, v0.4s, v8.4s[2] | |||
| OP_ii v24.4s, v1.4s, v9.4s[2] | |||
| OP_ri v25.4s, v0.4s, v9.4s[2] | |||
| OP_ir v25.4s, v1.4s, v8.4s[2] | |||
| OP_rr v26.4s, v2.4s, v8.4s[2] | |||
| OP_ii v26.4s, v3.4s, v9.4s[2] | |||
| OP_ri v27.4s, v2.4s, v9.4s[2] | |||
| OP_ir v27.4s, v3.4s, v8.4s[2] | |||
| OP_rr v28.4s, v0.4s, v8.4s[3] | |||
| OP_ii v28.4s, v1.4s, v9.4s[3] | |||
| OP_ri v29.4s, v0.4s, v9.4s[3] | |||
| OP_ir v29.4s, v1.4s, v8.4s[3] | |||
| OP_rr v30.4s, v2.4s, v8.4s[3] | |||
| OP_ii v30.4s, v3.4s, v9.4s[3] | |||
| OP_ri v31.4s, v2.4s, v9.4s[3] | |||
| OP_ir v31.4s, v3.4s, v8.4s[3] | |||
| OP_rr v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v9.s[0] | |||
| OP_ri v17.4s, v0.4s, v9.s[0] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| OP_rr v18.4s, v2.4s, v8.s[0] | |||
| OP_ii v18.4s, v3.4s, v9.s[0] | |||
| OP_ri v19.4s, v2.4s, v9.s[0] | |||
| OP_ir v19.4s, v3.4s, v8.s[0] | |||
| OP_rr v20.4s, v0.4s, v8.s[1] | |||
| OP_ii v20.4s, v1.4s, v9.s[1] | |||
| OP_ri v21.4s, v0.4s, v9.s[1] | |||
| OP_ir v21.4s, v1.4s, v8.s[1] | |||
| OP_rr v22.4s, v2.4s, v8.s[1] | |||
| OP_ii v22.4s, v3.4s, v9.s[1] | |||
| OP_ri v23.4s, v2.4s, v9.s[1] | |||
| OP_ir v23.4s, v3.4s, v8.s[1] | |||
| OP_rr v24.4s, v0.4s, v8.s[2] | |||
| OP_ii v24.4s, v1.4s, v9.s[2] | |||
| OP_ri v25.4s, v0.4s, v9.s[2] | |||
| OP_ir v25.4s, v1.4s, v8.s[2] | |||
| OP_rr v26.4s, v2.4s, v8.s[2] | |||
| OP_ii v26.4s, v3.4s, v9.s[2] | |||
| OP_ri v27.4s, v2.4s, v9.s[2] | |||
| OP_ir v27.4s, v3.4s, v8.s[2] | |||
| OP_rr v28.4s, v0.4s, v8.s[3] | |||
| OP_ii v28.4s, v1.4s, v9.s[3] | |||
| OP_ri v29.4s, v0.4s, v9.s[3] | |||
| OP_ir v29.4s, v1.4s, v8.s[3] | |||
| OP_rr v30.4s, v2.4s, v8.s[3] | |||
| OP_ii v30.4s, v3.4s, v9.s[3] | |||
| OP_ri v31.4s, v2.4s, v9.s[3] | |||
| OP_ir v31.4s, v3.4s, v8.s[3] | |||
| .endm | |||
| @@ -562,49 +562,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.4s, v1.4s}, [pA] | |||
| add pA, pA, #32 | |||
| fmul v16.4s, v0.4s, v8.4s[0] | |||
| OP_ii v16.4s, v1.4s, v9.4s[0] | |||
| fmul v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v9.s[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v17.16b, v17.16b, v17.16b | |||
| fmls v17.4s, v0.4s, v9.4s[0] | |||
| fmls v17.4s, v0.4s, v9.s[0] | |||
| #else | |||
| fmul v17.4s, v0.4s, v9.4s[0] | |||
| fmul v17.4s, v0.4s, v9.s[0] | |||
| #endif | |||
| OP_ir v17.4s, v1.4s, v8.4s[0] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| fmul v20.4s, v0.4s, v8.4s[1] | |||
| OP_ii v20.4s, v1.4s, v9.4s[1] | |||
| fmul v20.4s, v0.4s, v8.s[1] | |||
| OP_ii v20.4s, v1.4s, v9.s[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v21.16b, v21.16b, v21.16b | |||
| fmls v21.4s, v0.4s, v9.4s[1] | |||
| fmls v21.4s, v0.4s, v9.s[1] | |||
| #else | |||
| fmul v21.4s, v0.4s, v9.4s[1] | |||
| fmul v21.4s, v0.4s, v9.s[1] | |||
| #endif | |||
| OP_ir v21.4s, v1.4s, v8.4s[1] | |||
| OP_ir v21.4s, v1.4s, v8.s[1] | |||
| fmul v24.4s, v0.4s, v8.4s[2] | |||
| OP_ii v24.4s, v1.4s, v9.4s[2] | |||
| fmul v24.4s, v0.4s, v8.s[2] | |||
| OP_ii v24.4s, v1.4s, v9.s[2] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fmls v25.4s, v0.4s, v9.4s[2] | |||
| fmls v25.4s, v0.4s, v9.s[2] | |||
| #else | |||
| fmul v25.4s, v0.4s, v9.4s[2] | |||
| fmul v25.4s, v0.4s, v9.s[2] | |||
| #endif | |||
| OP_ir v25.4s, v1.4s, v8.4s[2] | |||
| OP_ir v25.4s, v1.4s, v8.s[2] | |||
| fmul v28.4s, v0.4s, v8.4s[3] | |||
| OP_ii v28.4s, v1.4s, v9.4s[3] | |||
| fmul v28.4s, v0.4s, v8.s[3] | |||
| OP_ii v28.4s, v1.4s, v9.s[3] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v29.16b, v29.16b, v29.16b | |||
| fmls v29.4s, v0.4s, v9.4s[3] | |||
| fmls v29.4s, v0.4s, v9.s[3] | |||
| #else | |||
| fmul v29.4s, v0.4s, v9.4s[3] | |||
| fmul v29.4s, v0.4s, v9.s[3] | |||
| #endif | |||
| OP_ir v29.4s, v1.4s, v8.4s[3] | |||
| OP_ir v29.4s, v1.4s, v8.s[3] | |||
| ld2 {v12.4s, v13.4s}, [pB] | |||
| add pB, pB, #32 | |||
| @@ -613,85 +613,85 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x4_M1 | |||
| OP_rr v16.4s, v0.4s, v8.4s[0] | |||
| OP_ii v16.4s, v1.4s, v9.4s[0] | |||
| OP_ri v17.4s, v0.4s, v9.4s[0] | |||
| OP_ir v17.4s, v1.4s, v8.4s[0] | |||
| OP_rr v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v9.s[0] | |||
| OP_ri v17.4s, v0.4s, v9.s[0] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| ld2 {v12.4s, v13.4s}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| OP_rr v20.4s, v0.4s, v8.4s[1] | |||
| OP_ii v20.4s, v1.4s, v9.4s[1] | |||
| OP_ri v21.4s, v0.4s, v9.4s[1] | |||
| OP_ir v21.4s, v1.4s, v8.4s[1] | |||
| OP_rr v20.4s, v0.4s, v8.s[1] | |||
| OP_ii v20.4s, v1.4s, v9.s[1] | |||
| OP_ri v21.4s, v0.4s, v9.s[1] | |||
| OP_ir v21.4s, v1.4s, v8.s[1] | |||
| ld2 {v4.4s, v5.4s}, [pA] // For next round | |||
| add pA, pA, #32 | |||
| OP_rr v24.4s, v0.4s, v8.4s[2] | |||
| OP_ii v24.4s, v1.4s, v9.4s[2] | |||
| OP_ri v25.4s, v0.4s, v9.4s[2] | |||
| OP_ir v25.4s, v1.4s, v8.4s[2] | |||
| OP_rr v24.4s, v0.4s, v8.s[2] | |||
| OP_ii v24.4s, v1.4s, v9.s[2] | |||
| OP_ri v25.4s, v0.4s, v9.s[2] | |||
| OP_ir v25.4s, v1.4s, v8.s[2] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| OP_rr v28.4s, v0.4s, v8.4s[3] | |||
| OP_ii v28.4s, v1.4s, v9.4s[3] | |||
| OP_ri v29.4s, v0.4s, v9.4s[3] | |||
| OP_ir v29.4s, v1.4s, v8.4s[3] | |||
| OP_rr v28.4s, v0.4s, v8.s[3] | |||
| OP_ii v28.4s, v1.4s, v9.s[3] | |||
| OP_ri v29.4s, v0.4s, v9.s[3] | |||
| OP_ir v29.4s, v1.4s, v8.s[3] | |||
| .endm | |||
| .macro KERNEL4x4_M2 | |||
| OP_rr v16.4s, v4.4s, v12.4s[0] | |||
| OP_ii v16.4s, v5.4s, v13.4s[0] | |||
| OP_ri v17.4s, v4.4s, v13.4s[0] | |||
| OP_ir v17.4s, v5.4s, v12.4s[0] | |||
| OP_rr v16.4s, v4.4s, v12.s[0] | |||
| OP_ii v16.4s, v5.4s, v13.s[0] | |||
| OP_ri v17.4s, v4.4s, v13.s[0] | |||
| OP_ir v17.4s, v5.4s, v12.s[0] | |||
| ld2 {v8.4s, v9.4s}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| OP_rr v20.4s, v4.4s, v12.4s[1] | |||
| OP_ii v20.4s, v5.4s, v13.4s[1] | |||
| OP_ri v21.4s, v4.4s, v13.4s[1] | |||
| OP_ir v21.4s, v5.4s, v12.4s[1] | |||
| OP_rr v20.4s, v4.4s, v12.s[1] | |||
| OP_ii v20.4s, v5.4s, v13.s[1] | |||
| OP_ri v21.4s, v4.4s, v13.s[1] | |||
| OP_ir v21.4s, v5.4s, v12.s[1] | |||
| ld2 {v0.4s, v1.4s}, [pA] // For next round | |||
| add pA, pA, #32 | |||
| OP_rr v24.4s, v4.4s, v12.4s[2] | |||
| OP_ii v24.4s, v5.4s, v13.4s[2] | |||
| OP_ri v25.4s, v4.4s, v13.4s[2] | |||
| OP_ir v25.4s, v5.4s, v12.4s[2] | |||
| OP_rr v24.4s, v4.4s, v12.s[2] | |||
| OP_ii v24.4s, v5.4s, v13.s[2] | |||
| OP_ri v25.4s, v4.4s, v13.s[2] | |||
| OP_ir v25.4s, v5.4s, v12.s[2] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| OP_rr v28.4s, v4.4s, v12.4s[3] | |||
| OP_ii v28.4s, v5.4s, v13.4s[3] | |||
| OP_ri v29.4s, v4.4s, v13.4s[3] | |||
| OP_ir v29.4s, v5.4s, v12.4s[3] | |||
| OP_rr v28.4s, v4.4s, v12.s[3] | |||
| OP_ii v28.4s, v5.4s, v13.s[3] | |||
| OP_ri v29.4s, v4.4s, v13.s[3] | |||
| OP_ir v29.4s, v5.4s, v12.s[3] | |||
| .endm | |||
| .macro KERNEL4x4_E | |||
| OP_rr v16.4s, v4.4s, v12.4s[0] | |||
| OP_ii v16.4s, v5.4s, v13.4s[0] | |||
| OP_ri v17.4s, v4.4s, v13.4s[0] | |||
| OP_ir v17.4s, v5.4s, v12.4s[0] | |||
| OP_rr v20.4s, v4.4s, v12.4s[1] | |||
| OP_ii v20.4s, v5.4s, v13.4s[1] | |||
| OP_ri v21.4s, v4.4s, v13.4s[1] | |||
| OP_ir v21.4s, v5.4s, v12.4s[1] | |||
| OP_rr v24.4s, v4.4s, v12.4s[2] | |||
| OP_ii v24.4s, v5.4s, v13.4s[2] | |||
| OP_ri v25.4s, v4.4s, v13.4s[2] | |||
| OP_ir v25.4s, v5.4s, v12.4s[2] | |||
| OP_rr v28.4s, v4.4s, v12.4s[3] | |||
| OP_ii v28.4s, v5.4s, v13.4s[3] | |||
| OP_ri v29.4s, v4.4s, v13.4s[3] | |||
| OP_ir v29.4s, v5.4s, v12.4s[3] | |||
| OP_rr v16.4s, v4.4s, v12.s[0] | |||
| OP_ii v16.4s, v5.4s, v13.s[0] | |||
| OP_ri v17.4s, v4.4s, v13.s[0] | |||
| OP_ir v17.4s, v5.4s, v12.s[0] | |||
| OP_rr v20.4s, v4.4s, v12.s[1] | |||
| OP_ii v20.4s, v5.4s, v13.s[1] | |||
| OP_ri v21.4s, v4.4s, v13.s[1] | |||
| OP_ir v21.4s, v5.4s, v12.s[1] | |||
| OP_rr v24.4s, v4.4s, v12.s[2] | |||
| OP_ii v24.4s, v5.4s, v13.s[2] | |||
| OP_ri v25.4s, v4.4s, v13.s[2] | |||
| OP_ir v25.4s, v5.4s, v12.s[2] | |||
| OP_rr v28.4s, v4.4s, v12.s[3] | |||
| OP_ii v28.4s, v5.4s, v13.s[3] | |||
| OP_ri v29.4s, v4.4s, v13.s[3] | |||
| OP_ir v29.4s, v5.4s, v12.s[3] | |||
| .endm | |||
| .macro KERNEL4x4_SUB | |||
| @@ -700,25 +700,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.4s, v1.4s}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.4s, v0.4s, v8.4s[0] | |||
| OP_ii v16.4s, v1.4s, v9.4s[0] | |||
| OP_ri v17.4s, v0.4s, v9.4s[0] | |||
| OP_ir v17.4s, v1.4s, v8.4s[0] | |||
| OP_rr v20.4s, v0.4s, v8.4s[1] | |||
| OP_ii v20.4s, v1.4s, v9.4s[1] | |||
| OP_ri v21.4s, v0.4s, v9.4s[1] | |||
| OP_ir v21.4s, v1.4s, v8.4s[1] | |||
| OP_rr v24.4s, v0.4s, v8.4s[2] | |||
| OP_ii v24.4s, v1.4s, v9.4s[2] | |||
| OP_ri v25.4s, v0.4s, v9.4s[2] | |||
| OP_ir v25.4s, v1.4s, v8.4s[2] | |||
| OP_rr v28.4s, v0.4s, v8.4s[3] | |||
| OP_ii v28.4s, v1.4s, v9.4s[3] | |||
| OP_ri v29.4s, v0.4s, v9.4s[3] | |||
| OP_ir v29.4s, v1.4s, v8.4s[3] | |||
| OP_rr v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v9.s[0] | |||
| OP_ri v17.4s, v0.4s, v9.s[0] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| OP_rr v20.4s, v0.4s, v8.s[1] | |||
| OP_ii v20.4s, v1.4s, v9.s[1] | |||
| OP_ri v21.4s, v0.4s, v9.s[1] | |||
| OP_ir v21.4s, v1.4s, v8.s[1] | |||
| OP_rr v24.4s, v0.4s, v8.s[2] | |||
| OP_ii v24.4s, v1.4s, v9.s[2] | |||
| OP_ri v25.4s, v0.4s, v9.s[2] | |||
| OP_ir v25.4s, v1.4s, v8.s[2] | |||
| OP_rr v28.4s, v0.4s, v8.s[3] | |||
| OP_ii v28.4s, v1.4s, v9.s[3] | |||
| OP_ri v29.4s, v0.4s, v9.s[3] | |||
| OP_ir v29.4s, v1.4s, v8.s[3] | |||
| .endm | |||
| .macro SAVE4x4 | |||
| @@ -780,25 +780,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.2s, v1.2s}, [pA] | |||
| add pA, pA, #16 | |||
| OP_rr v16.2s, v0.2s, v8.4s[0] | |||
| OP_ii v16.2s, v1.2s, v9.4s[0] | |||
| OP_ri v17.2s, v0.2s, v9.4s[0] | |||
| OP_ir v17.2s, v1.2s, v8.4s[0] | |||
| OP_rr v20.2s, v0.2s, v8.4s[1] | |||
| OP_ii v20.2s, v1.2s, v9.4s[1] | |||
| OP_ri v21.2s, v0.2s, v9.4s[1] | |||
| OP_ir v21.2s, v1.2s, v8.4s[1] | |||
| OP_rr v24.2s, v0.2s, v8.4s[2] | |||
| OP_ii v24.2s, v1.2s, v9.4s[2] | |||
| OP_ri v25.2s, v0.2s, v9.4s[2] | |||
| OP_ir v25.2s, v1.2s, v8.4s[2] | |||
| OP_rr v28.2s, v0.2s, v8.4s[3] | |||
| OP_ii v28.2s, v1.2s, v9.4s[3] | |||
| OP_ri v29.2s, v0.2s, v9.4s[3] | |||
| OP_ir v29.2s, v1.2s, v8.4s[3] | |||
| OP_rr v16.2s, v0.2s, v8.s[0] | |||
| OP_ii v16.2s, v1.2s, v9.s[0] | |||
| OP_ri v17.2s, v0.2s, v9.s[0] | |||
| OP_ir v17.2s, v1.2s, v8.s[0] | |||
| OP_rr v20.2s, v0.2s, v8.s[1] | |||
| OP_ii v20.2s, v1.2s, v9.s[1] | |||
| OP_ri v21.2s, v0.2s, v9.s[1] | |||
| OP_ir v21.2s, v1.2s, v8.s[1] | |||
| OP_rr v24.2s, v0.2s, v8.s[2] | |||
| OP_ii v24.2s, v1.2s, v9.s[2] | |||
| OP_ri v25.2s, v0.2s, v9.s[2] | |||
| OP_ir v25.2s, v1.2s, v8.s[2] | |||
| OP_rr v28.2s, v0.2s, v8.s[3] | |||
| OP_ii v28.2s, v1.2s, v9.s[3] | |||
| OP_ri v29.2s, v0.2s, v9.s[3] | |||
| OP_ir v29.2s, v1.2s, v8.s[3] | |||
| .endm | |||
| .macro SAVE2x4 | |||
| @@ -860,25 +860,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.s, v1.s}[0], [pA] | |||
| add pA, pA, #8 | |||
| OP_rr s16, s0, v8.4s[0] | |||
| OP_ii s16, s1, v9.4s[0] | |||
| OP_ri s17, s0, v9.4s[0] | |||
| OP_ir s17, s1, v8.4s[0] | |||
| OP_rr s20, s0, v8.4s[1] | |||
| OP_ii s20, s1, v9.4s[1] | |||
| OP_ri s21, s0, v9.4s[1] | |||
| OP_ir s21, s1, v8.4s[1] | |||
| OP_rr s24, s0, v8.4s[2] | |||
| OP_ii s24, s1, v9.4s[2] | |||
| OP_ri s25, s0, v9.4s[2] | |||
| OP_ir s25, s1, v8.4s[2] | |||
| OP_rr s28, s0, v8.4s[3] | |||
| OP_ii s28, s1, v9.4s[3] | |||
| OP_ri s29, s0, v9.4s[3] | |||
| OP_ir s29, s1, v8.4s[3] | |||
| OP_rr s16, s0, v8.s[0] | |||
| OP_ii s16, s1, v9.s[0] | |||
| OP_ri s17, s0, v9.s[0] | |||
| OP_ir s17, s1, v8.s[0] | |||
| OP_rr s20, s0, v8.s[1] | |||
| OP_ii s20, s1, v9.s[1] | |||
| OP_ri s21, s0, v9.s[1] | |||
| OP_ir s21, s1, v8.s[1] | |||
| OP_rr s24, s0, v8.s[2] | |||
| OP_ii s24, s1, v9.s[2] | |||
| OP_ri s25, s0, v9.s[2] | |||
| OP_ir s25, s1, v8.s[2] | |||
| OP_rr s28, s0, v8.s[3] | |||
| OP_ii s28, s1, v9.s[3] | |||
| OP_ri s29, s0, v9.s[3] | |||
| OP_ir s29, s1, v8.s[3] | |||
| .endm | |||
| .macro SAVE1x4 | |||
| @@ -942,25 +942,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v2.4s, v3.4s}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.4s, v0.4s, v8.2s[0] | |||
| OP_ii v16.4s, v1.4s, v9.2s[0] | |||
| OP_ri v17.4s, v0.4s, v9.2s[0] | |||
| OP_ir v17.4s, v1.4s, v8.2s[0] | |||
| OP_rr v18.4s, v2.4s, v8.2s[0] | |||
| OP_ii v18.4s, v3.4s, v9.2s[0] | |||
| OP_ri v19.4s, v2.4s, v9.2s[0] | |||
| OP_ir v19.4s, v3.4s, v8.2s[0] | |||
| OP_rr v20.4s, v0.4s, v8.2s[1] | |||
| OP_ii v20.4s, v1.4s, v9.2s[1] | |||
| OP_ri v21.4s, v0.4s, v9.2s[1] | |||
| OP_ir v21.4s, v1.4s, v8.2s[1] | |||
| OP_rr v22.4s, v2.4s, v8.2s[1] | |||
| OP_ii v22.4s, v3.4s, v9.2s[1] | |||
| OP_ri v23.4s, v2.4s, v9.2s[1] | |||
| OP_ir v23.4s, v3.4s, v8.2s[1] | |||
| OP_rr v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v9.s[0] | |||
| OP_ri v17.4s, v0.4s, v9.s[0] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| OP_rr v18.4s, v2.4s, v8.s[0] | |||
| OP_ii v18.4s, v3.4s, v9.s[0] | |||
| OP_ri v19.4s, v2.4s, v9.s[0] | |||
| OP_ir v19.4s, v3.4s, v8.s[0] | |||
| OP_rr v20.4s, v0.4s, v8.s[1] | |||
| OP_ii v20.4s, v1.4s, v9.s[1] | |||
| OP_ri v21.4s, v0.4s, v9.s[1] | |||
| OP_ir v21.4s, v1.4s, v8.s[1] | |||
| OP_rr v22.4s, v2.4s, v8.s[1] | |||
| OP_ii v22.4s, v3.4s, v9.s[1] | |||
| OP_ri v23.4s, v2.4s, v9.s[1] | |||
| OP_ir v23.4s, v3.4s, v8.s[1] | |||
| .endm | |||
| .macro SAVE8x2 | |||
| @@ -1018,15 +1018,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.4s, v1.4s}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.4s, v0.4s, v8.2s[0] | |||
| OP_ii v16.4s, v1.4s, v9.2s[0] | |||
| OP_ri v17.4s, v0.4s, v9.2s[0] | |||
| OP_ir v17.4s, v1.4s, v8.2s[0] | |||
| OP_rr v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v9.s[0] | |||
| OP_ri v17.4s, v0.4s, v9.s[0] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| OP_rr v20.4s, v0.4s, v8.2s[1] | |||
| OP_ii v20.4s, v1.4s, v9.2s[1] | |||
| OP_ri v21.4s, v0.4s, v9.2s[1] | |||
| OP_ir v21.4s, v1.4s, v8.2s[1] | |||
| OP_rr v20.4s, v0.4s, v8.s[1] | |||
| OP_ii v20.4s, v1.4s, v9.s[1] | |||
| OP_ri v21.4s, v0.4s, v9.s[1] | |||
| OP_ir v21.4s, v1.4s, v8.s[1] | |||
| .endm | |||
| .macro SAVE4x2 | |||
| @@ -1066,15 +1066,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.2s, v1.2s}, [pA] | |||
| add pA, pA, #16 | |||
| OP_rr v16.2s, v0.2s, v8.2s[0] | |||
| OP_ii v16.2s, v1.2s, v9.2s[0] | |||
| OP_ri v17.2s, v0.2s, v9.2s[0] | |||
| OP_ir v17.2s, v1.2s, v8.2s[0] | |||
| OP_rr v16.2s, v0.2s, v8.s[0] | |||
| OP_ii v16.2s, v1.2s, v9.s[0] | |||
| OP_ri v17.2s, v0.2s, v9.s[0] | |||
| OP_ir v17.2s, v1.2s, v8.s[0] | |||
| OP_rr v20.2s, v0.2s, v8.2s[1] | |||
| OP_ii v20.2s, v1.2s, v9.2s[1] | |||
| OP_ri v21.2s, v0.2s, v9.2s[1] | |||
| OP_ir v21.2s, v1.2s, v8.2s[1] | |||
| OP_rr v20.2s, v0.2s, v8.s[1] | |||
| OP_ii v20.2s, v1.2s, v9.s[1] | |||
| OP_ri v21.2s, v0.2s, v9.s[1] | |||
| OP_ir v21.2s, v1.2s, v8.s[1] | |||
| .endm | |||
| .macro SAVE2x2 | |||
| @@ -1114,15 +1114,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.s, v1.s}[0], [pA] | |||
| add pA, pA, #8 | |||
| OP_rr s16, s0, v8.2s[0] | |||
| OP_ii s16, s1, v9.2s[0] | |||
| OP_ri s17, s0, v9.2s[0] | |||
| OP_ir s17, s1, v8.2s[0] | |||
| OP_rr s16, s0, v8.s[0] | |||
| OP_ii s16, s1, v9.s[0] | |||
| OP_ri s17, s0, v9.s[0] | |||
| OP_ir s17, s1, v8.s[0] | |||
| OP_rr s20, s0, v8.2s[1] | |||
| OP_ii s20, s1, v9.2s[1] | |||
| OP_ri s21, s0, v9.2s[1] | |||
| OP_ir s21, s1, v8.2s[1] | |||
| OP_rr s20, s0, v8.s[1] | |||
| OP_ii s20, s1, v9.s[1] | |||
| OP_ri s21, s0, v9.s[1] | |||
| OP_ir s21, s1, v8.s[1] | |||
| .endm | |||
| .macro SAVE1x2 | |||
| @@ -1164,15 +1164,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v2.4s, v3.4s}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.4s, v0.4s, v8.4s[0] | |||
| OP_ii v16.4s, v1.4s, v8.4s[1] | |||
| OP_ri v17.4s, v0.4s, v8.4s[1] | |||
| OP_ir v17.4s, v1.4s, v8.4s[0] | |||
| OP_rr v16.4s, v0.4s, v8.s[0] | |||
| OP_ii v16.4s, v1.4s, v8.s[1] | |||
| OP_ri v17.4s, v0.4s, v8.s[1] | |||
| OP_ir v17.4s, v1.4s, v8.s[0] | |||
| OP_rr v18.4s, v2.4s, v8.4s[0] | |||
| OP_ii v18.4s, v3.4s, v8.4s[1] | |||
| OP_ri v19.4s, v2.4s, v8.4s[1] | |||
| OP_ir v19.4s, v3.4s, v8.4s[0] | |||
| OP_rr v18.4s, v2.4s, v8.s[0] | |||
| OP_ii v18.4s, v3.4s, v8.s[1] | |||
| OP_ri v19.4s, v2.4s, v8.s[1] | |||
| OP_ir v19.4s, v3.4s, v8.s[0] | |||
| .endm | |||
| .macro SAVE8x1 | |||
| @@ -161,150 +161,150 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ldp q0, q1, [pA] | |||
| add pA, pA, #32 | |||
| fmul v16.2d, v0.2d, v8.2d[0] | |||
| fmul v29.2d, v1.2d, v11.2d[0] | |||
| fmul v16.2d, v0.2d, v8.d[0] | |||
| fmul v29.2d, v1.2d, v11.d[0] | |||
| ldp q2, q3, [ppA] | |||
| add ppA, ppA, #32 | |||
| fmul v20.2d, v0.2d, v9.2d[0] | |||
| fmul v25.2d, v1.2d, v10.2d[0] | |||
| fmul v20.2d, v0.2d, v9.d[0] | |||
| fmul v25.2d, v1.2d, v10.d[0] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmul v18.2d, v2.2d, v8.2d[0] | |||
| fmul v31.2d, v3.2d, v11.2d[0] | |||
| fmul v18.2d, v2.2d, v8.d[0] | |||
| fmul v31.2d, v3.2d, v11.d[0] | |||
| prfm PLDL1KEEP, [ppA, #A_PRE_SIZE] | |||
| fmul v22.2d, v2.2d, v9.2d[0] | |||
| fmul v27.2d, v3.2d, v10.2d[0] | |||
| fmul v22.2d, v2.2d, v9.d[0] | |||
| fmul v27.2d, v3.2d, v10.d[0] | |||
| ldp d12, d13, [pB] | |||
| add pB, pB, #16 | |||
| fmul v24.2d, v0.2d, v10.2d[0] | |||
| fmul v21.2d, v1.2d, v9.2d[0] | |||
| fmul v24.2d, v0.2d, v10.d[0] | |||
| fmul v21.2d, v1.2d, v9.d[0] | |||
| ldp q4, q5, [pA] // for next round | |||
| add pA, pA, #32 | |||
| fmul v26.2d, v2.2d, v10.2d[0] | |||
| fmul v23.2d, v3.2d, v9.2d[0] | |||
| fmul v26.2d, v2.2d, v10.d[0] | |||
| fmul v23.2d, v3.2d, v9.d[0] | |||
| ldp q6, q7, [ppA] // for next round | |||
| add ppA, ppA, #32 | |||
| fmul v28.2d, v0.2d, v11.2d[0] | |||
| fmul v17.2d, v1.2d, v8.2d[0] | |||
| fmul v28.2d, v0.2d, v11.d[0] | |||
| fmul v17.2d, v1.2d, v8.d[0] | |||
| ldp d14, d15, [pB] | |||
| add pB, pB, #16 | |||
| fmul v30.2d, v2.2d, v11.2d[0] | |||
| fmul v19.2d, v3.2d, v8.2d[0] | |||
| fmul v30.2d, v2.2d, v11.d[0] | |||
| fmul v19.2d, v3.2d, v8.d[0] | |||
| .endm | |||
| .macro KERNEL8x4_M2 | |||
| fmla v16.2d, v4.2d, v12.2d[0] | |||
| fmla v29.2d, v5.2d, v15.2d[0] | |||
| fmla v16.2d, v4.2d, v12.d[0] | |||
| fmla v29.2d, v5.2d, v15.d[0] | |||
| ldp d8, d9, [pB] | |||
| add pB, pB, #16 | |||
| fmla v18.2d, v6.2d, v12.2d[0] | |||
| fmla v31.2d, v7.2d, v15.2d[0] | |||
| fmla v18.2d, v6.2d, v12.d[0] | |||
| fmla v31.2d, v7.2d, v15.d[0] | |||
| ldp d10, d11, [pB] | |||
| add pB, pB, #16 | |||
| fmla v20.2d, v4.2d, v13.2d[0] | |||
| fmla v25.2d, v5.2d, v14.2d[0] | |||
| fmla v20.2d, v4.2d, v13.d[0] | |||
| fmla v25.2d, v5.2d, v14.d[0] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla v22.2d, v6.2d, v13.2d[0] | |||
| fmla v27.2d, v7.2d, v14.2d[0] | |||
| fmla v24.2d, v4.2d, v14.2d[0] | |||
| fmla v21.2d, v5.2d, v13.2d[0] | |||
| fmla v22.2d, v6.2d, v13.d[0] | |||
| fmla v27.2d, v7.2d, v14.d[0] | |||
| fmla v24.2d, v4.2d, v14.d[0] | |||
| fmla v21.2d, v5.2d, v13.d[0] | |||
| ldp q0, q1, [pA] | |||
| add pA, pA, #32 | |||
| fmla v26.2d, v6.2d, v14.2d[0] | |||
| fmla v23.2d, v7.2d, v13.2d[0] | |||
| fmla v28.2d, v4.2d, v15.2d[0] | |||
| fmla v17.2d, v5.2d, v12.2d[0] | |||
| fmla v26.2d, v6.2d, v14.d[0] | |||
| fmla v23.2d, v7.2d, v13.d[0] | |||
| fmla v28.2d, v4.2d, v15.d[0] | |||
| fmla v17.2d, v5.2d, v12.d[0] | |||
| ldp q2, q3, [ppA] | |||
| add ppA, ppA, #32 | |||
| fmla v30.2d, v6.2d, v15.2d[0] | |||
| fmla v19.2d, v7.2d, v12.2d[0] | |||
| fmla v30.2d, v6.2d, v15.d[0] | |||
| fmla v19.2d, v7.2d, v12.d[0] | |||
| .endm | |||
| .macro KERNEL8x4_M1 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v29.2d, v1.2d, v11.2d[0] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v29.2d, v1.2d, v11.d[0] | |||
| ldp d12, d13, [pB] | |||
| add pB, pB, #16 | |||
| fmla v18.2d, v2.2d, v8.2d[0] | |||
| fmla v31.2d, v3.2d, v11.2d[0] | |||
| fmla v18.2d, v2.2d, v8.d[0] | |||
| fmla v31.2d, v3.2d, v11.d[0] | |||
| ldp d14, d15, [pB] | |||
| add pB, pB, #16 | |||
| fmla v20.2d, v0.2d, v9.2d[0] | |||
| fmla v25.2d, v1.2d, v10.2d[0] | |||
| fmla v20.2d, v0.2d, v9.d[0] | |||
| fmla v25.2d, v1.2d, v10.d[0] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla v22.2d, v2.2d, v9.2d[0] | |||
| fmla v27.2d, v3.2d, v10.2d[0] | |||
| fmla v22.2d, v2.2d, v9.d[0] | |||
| fmla v27.2d, v3.2d, v10.d[0] | |||
| prfm PLDL1KEEP, [ppA, #A_PRE_SIZE] | |||
| fmla v24.2d, v0.2d, v10.2d[0] | |||
| fmla v21.2d, v1.2d, v9.2d[0] | |||
| fmla v24.2d, v0.2d, v10.d[0] | |||
| fmla v21.2d, v1.2d, v9.d[0] | |||
| ldp q4, q5, [pA] | |||
| add pA, pA, #32 | |||
| fmla v26.2d, v2.2d, v10.2d[0] | |||
| fmla v23.2d, v3.2d, v9.2d[0] | |||
| fmla v26.2d, v2.2d, v10.d[0] | |||
| fmla v23.2d, v3.2d, v9.d[0] | |||
| fmla v28.2d, v0.2d, v11.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v28.2d, v0.2d, v11.d[0] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| ldp q6, q7, [ppA] | |||
| add ppA, ppA, #32 | |||
| fmla v30.2d, v2.2d, v11.2d[0] | |||
| fmla v19.2d, v3.2d, v8.2d[0] | |||
| fmla v30.2d, v2.2d, v11.d[0] | |||
| fmla v19.2d, v3.2d, v8.d[0] | |||
| .endm | |||
| .macro KERNEL8x4_E | |||
| fmla v16.2d, v4.2d, v12.2d[0] | |||
| fmla v25.2d, v5.2d, v14.2d[0] | |||
| fmla v18.2d, v6.2d, v12.2d[0] | |||
| fmla v27.2d, v7.2d, v14.2d[0] | |||
| fmla v20.2d, v4.2d, v13.2d[0] | |||
| fmla v29.2d, v5.2d, v15.2d[0] | |||
| fmla v22.2d, v6.2d, v13.2d[0] | |||
| fmla v31.2d, v7.2d, v15.2d[0] | |||
| fmla v24.2d, v4.2d, v14.2d[0] | |||
| fmla v17.2d, v5.2d, v12.2d[0] | |||
| fmla v26.2d, v6.2d, v14.2d[0] | |||
| fmla v19.2d, v7.2d, v12.2d[0] | |||
| fmla v28.2d, v4.2d, v15.2d[0] | |||
| fmla v21.2d, v5.2d, v13.2d[0] | |||
| fmla v30.2d, v6.2d, v15.2d[0] | |||
| fmla v23.2d, v7.2d, v13.2d[0] | |||
| fmla v16.2d, v4.2d, v12.d[0] | |||
| fmla v25.2d, v5.2d, v14.d[0] | |||
| fmla v18.2d, v6.2d, v12.d[0] | |||
| fmla v27.2d, v7.2d, v14.d[0] | |||
| fmla v20.2d, v4.2d, v13.d[0] | |||
| fmla v29.2d, v5.2d, v15.d[0] | |||
| fmla v22.2d, v6.2d, v13.d[0] | |||
| fmla v31.2d, v7.2d, v15.d[0] | |||
| fmla v24.2d, v4.2d, v14.d[0] | |||
| fmla v17.2d, v5.2d, v12.d[0] | |||
| fmla v26.2d, v6.2d, v14.d[0] | |||
| fmla v19.2d, v7.2d, v12.d[0] | |||
| fmla v28.2d, v4.2d, v15.d[0] | |||
| fmla v21.2d, v5.2d, v13.d[0] | |||
| fmla v30.2d, v6.2d, v15.d[0] | |||
| fmla v23.2d, v7.2d, v13.d[0] | |||
| .endm | |||
| .macro KERNEL8x4_SUB | |||
| @@ -315,28 +315,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ldp q0, q1, [pA] | |||
| add pA, pA, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v29.2d, v1.2d, v11.2d[0] | |||
| fmla v20.2d, v0.2d, v9.2d[0] | |||
| fmla v25.2d, v1.2d, v10.2d[0] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v29.2d, v1.2d, v11.d[0] | |||
| fmla v20.2d, v0.2d, v9.d[0] | |||
| fmla v25.2d, v1.2d, v10.d[0] | |||
| ldp q2, q3, [ppA] | |||
| add ppA, ppA, #32 | |||
| fmla v24.2d, v0.2d, v10.2d[0] | |||
| fmla v21.2d, v1.2d, v9.2d[0] | |||
| fmla v28.2d, v0.2d, v11.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v24.2d, v0.2d, v10.d[0] | |||
| fmla v21.2d, v1.2d, v9.d[0] | |||
| fmla v28.2d, v0.2d, v11.d[0] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| fmla v18.2d, v2.2d, v8.2d[0] | |||
| fmla v31.2d, v3.2d, v11.2d[0] | |||
| fmla v22.2d, v2.2d, v9.2d[0] | |||
| fmla v27.2d, v3.2d, v10.2d[0] | |||
| fmla v18.2d, v2.2d, v8.d[0] | |||
| fmla v31.2d, v3.2d, v11.d[0] | |||
| fmla v22.2d, v2.2d, v9.d[0] | |||
| fmla v27.2d, v3.2d, v10.d[0] | |||
| fmla v26.2d, v2.2d, v10.2d[0] | |||
| fmla v23.2d, v3.2d, v9.2d[0] | |||
| fmla v30.2d, v2.2d, v11.2d[0] | |||
| fmla v19.2d, v3.2d, v8.2d[0] | |||
| fmla v26.2d, v2.2d, v10.d[0] | |||
| fmla v23.2d, v3.2d, v9.d[0] | |||
| fmla v30.2d, v2.2d, v11.d[0] | |||
| fmla v19.2d, v3.2d, v8.d[0] | |||
| .endm | |||
| .macro SAVE8x4 | |||
| @@ -422,17 +422,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v29.2d, v1.2d, v9.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v29.2d, v1.2d, v9.d[1] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v25.2d, v1.2d, v9.2d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v25.2d, v1.2d, v9.d[0] | |||
| fmla v24.2d, v0.2d, v9.2d[0] | |||
| fmla v21.2d, v1.2d, v8.2d[1] | |||
| fmla v24.2d, v0.2d, v9.d[0] | |||
| fmla v21.2d, v1.2d, v8.d[1] | |||
| fmla v28.2d, v0.2d, v9.2d[1] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v28.2d, v0.2d, v9.d[1] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| .endm | |||
| .macro SAVE4x4 | |||
| @@ -482,10 +482,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v24.2d, v0.2d, v9.2d[0] | |||
| fmla v28.2d, v0.2d, v9.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v24.2d, v0.2d, v9.d[0] | |||
| fmla v28.2d, v0.2d, v9.d[1] | |||
| .endm | |||
| .macro SAVE2x4 | |||
| @@ -572,10 +572,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v21.2d, v1.2d, v8.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v21.2d, v1.2d, v8.d[1] | |||
| .endm | |||
| .macro SAVE4x2 | |||
| @@ -610,8 +610,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| .endm | |||
| .macro SAVE2x2 | |||
| @@ -643,7 +643,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ldr d0 , [pA] | |||
| add pA, pA, #8 | |||
| fmla v16.2d, v8.2d, v0.2d[0] | |||
| fmla v16.2d, v8.2d, v0.d[0] | |||
| .endm | |||
| .macro SAVE1x2 | |||
| @@ -674,8 +674,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA , pA, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| .endm | |||
| .macro SAVE4x1 | |||
| @@ -705,7 +705,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d}, [pA] | |||
| add pA , pA, #16 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| .endm | |||
| .macro SAVE2x1 | |||
| @@ -154,25 +154,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| fmul v16.2d, v0.2d, v8.2d[0] | |||
| fmul v17.2d, v1.2d, v8.2d[0] | |||
| fmul v18.2d, v0.2d, v8.2d[1] | |||
| fmul v19.2d, v1.2d, v8.2d[1] | |||
| fmul v20.2d, v0.2d, v9.2d[0] | |||
| fmul v21.2d, v1.2d, v9.2d[0] | |||
| fmul v22.2d, v0.2d, v9.2d[1] | |||
| fmul v23.2d, v1.2d, v9.2d[1] | |||
| fmul v24.2d, v0.2d, v10.2d[0] | |||
| fmul v25.2d, v1.2d, v10.2d[0] | |||
| fmul v26.2d, v0.2d, v10.2d[1] | |||
| fmul v27.2d, v1.2d, v10.2d[1] | |||
| fmul v28.2d, v0.2d, v11.2d[0] | |||
| fmul v29.2d, v1.2d, v11.2d[0] | |||
| fmul v30.2d, v0.2d, v11.2d[1] | |||
| fmul v31.2d, v1.2d, v11.2d[1] | |||
| fmul v16.2d, v0.2d, v8.d[0] | |||
| fmul v17.2d, v1.2d, v8.d[0] | |||
| fmul v18.2d, v0.2d, v8.d[1] | |||
| fmul v19.2d, v1.2d, v8.d[1] | |||
| fmul v20.2d, v0.2d, v9.d[0] | |||
| fmul v21.2d, v1.2d, v9.d[0] | |||
| fmul v22.2d, v0.2d, v9.d[1] | |||
| fmul v23.2d, v1.2d, v9.d[1] | |||
| fmul v24.2d, v0.2d, v10.d[0] | |||
| fmul v25.2d, v1.2d, v10.d[0] | |||
| fmul v26.2d, v0.2d, v10.d[1] | |||
| fmul v27.2d, v1.2d, v10.d[1] | |||
| fmul v28.2d, v0.2d, v11.d[0] | |||
| fmul v29.2d, v1.2d, v11.d[0] | |||
| fmul v30.2d, v0.2d, v11.d[1] | |||
| fmul v31.2d, v1.2d, v11.d[1] | |||
| ld1 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| @@ -183,25 +183,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x8_M1 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v18.2d, v0.2d, v8.2d[1] | |||
| fmla v19.2d, v1.2d, v8.2d[1] | |||
| fmla v20.2d, v0.2d, v9.2d[0] | |||
| fmla v21.2d, v1.2d, v9.2d[0] | |||
| fmla v22.2d, v0.2d, v9.2d[1] | |||
| fmla v23.2d, v1.2d, v9.2d[1] | |||
| fmla v24.2d, v0.2d, v10.2d[0] | |||
| fmla v25.2d, v1.2d, v10.2d[0] | |||
| fmla v26.2d, v0.2d, v10.2d[1] | |||
| fmla v27.2d, v1.2d, v10.2d[1] | |||
| fmla v28.2d, v0.2d, v11.2d[0] | |||
| fmla v29.2d, v1.2d, v11.2d[0] | |||
| fmla v30.2d, v0.2d, v11.2d[1] | |||
| fmla v31.2d, v1.2d, v11.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| fmla v18.2d, v0.2d, v8.d[1] | |||
| fmla v19.2d, v1.2d, v8.d[1] | |||
| fmla v20.2d, v0.2d, v9.d[0] | |||
| fmla v21.2d, v1.2d, v9.d[0] | |||
| fmla v22.2d, v0.2d, v9.d[1] | |||
| fmla v23.2d, v1.2d, v9.d[1] | |||
| fmla v24.2d, v0.2d, v10.d[0] | |||
| fmla v25.2d, v1.2d, v10.d[0] | |||
| fmla v26.2d, v0.2d, v10.d[1] | |||
| fmla v27.2d, v1.2d, v10.d[1] | |||
| fmla v28.2d, v0.2d, v11.d[0] | |||
| fmla v29.2d, v1.2d, v11.d[0] | |||
| fmla v30.2d, v0.2d, v11.d[1] | |||
| fmla v31.2d, v1.2d, v11.d[1] | |||
| ld1 {v12.2d, v13.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| @@ -214,25 +214,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x8_M2 | |||
| fmla v16.2d, v4.2d, v12.2d[0] | |||
| fmla v17.2d, v5.2d, v12.2d[0] | |||
| fmla v18.2d, v4.2d, v12.2d[1] | |||
| fmla v19.2d, v5.2d, v12.2d[1] | |||
| fmla v20.2d, v4.2d, v13.2d[0] | |||
| fmla v21.2d, v5.2d, v13.2d[0] | |||
| fmla v22.2d, v4.2d, v13.2d[1] | |||
| fmla v23.2d, v5.2d, v13.2d[1] | |||
| fmla v24.2d, v4.2d, v14.2d[0] | |||
| fmla v25.2d, v5.2d, v14.2d[0] | |||
| fmla v26.2d, v4.2d, v14.2d[1] | |||
| fmla v27.2d, v5.2d, v14.2d[1] | |||
| fmla v28.2d, v4.2d, v15.2d[0] | |||
| fmla v29.2d, v5.2d, v15.2d[0] | |||
| fmla v30.2d, v4.2d, v15.2d[1] | |||
| fmla v31.2d, v5.2d, v15.2d[1] | |||
| fmla v16.2d, v4.2d, v12.d[0] | |||
| fmla v17.2d, v5.2d, v12.d[0] | |||
| fmla v18.2d, v4.2d, v12.d[1] | |||
| fmla v19.2d, v5.2d, v12.d[1] | |||
| fmla v20.2d, v4.2d, v13.d[0] | |||
| fmla v21.2d, v5.2d, v13.d[0] | |||
| fmla v22.2d, v4.2d, v13.d[1] | |||
| fmla v23.2d, v5.2d, v13.d[1] | |||
| fmla v24.2d, v4.2d, v14.d[0] | |||
| fmla v25.2d, v5.2d, v14.d[0] | |||
| fmla v26.2d, v4.2d, v14.d[1] | |||
| fmla v27.2d, v5.2d, v14.d[1] | |||
| fmla v28.2d, v4.2d, v15.d[0] | |||
| fmla v29.2d, v5.2d, v15.d[0] | |||
| fmla v30.2d, v4.2d, v15.d[1] | |||
| fmla v31.2d, v5.2d, v15.d[1] | |||
| ld1 {v8.2d, v9.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| @@ -245,25 +245,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x8_E | |||
| fmla v16.2d, v4.2d, v12.2d[0] | |||
| fmla v17.2d, v5.2d, v12.2d[0] | |||
| fmla v18.2d, v4.2d, v12.2d[1] | |||
| fmla v19.2d, v5.2d, v12.2d[1] | |||
| fmla v20.2d, v4.2d, v13.2d[0] | |||
| fmla v21.2d, v5.2d, v13.2d[0] | |||
| fmla v22.2d, v4.2d, v13.2d[1] | |||
| fmla v23.2d, v5.2d, v13.2d[1] | |||
| fmla v24.2d, v4.2d, v14.2d[0] | |||
| fmla v25.2d, v5.2d, v14.2d[0] | |||
| fmla v26.2d, v4.2d, v14.2d[1] | |||
| fmla v27.2d, v5.2d, v14.2d[1] | |||
| fmla v28.2d, v4.2d, v15.2d[0] | |||
| fmla v29.2d, v5.2d, v15.2d[0] | |||
| fmla v30.2d, v4.2d, v15.2d[1] | |||
| fmla v31.2d, v5.2d, v15.2d[1] | |||
| fmla v16.2d, v4.2d, v12.d[0] | |||
| fmla v17.2d, v5.2d, v12.d[0] | |||
| fmla v18.2d, v4.2d, v12.d[1] | |||
| fmla v19.2d, v5.2d, v12.d[1] | |||
| fmla v20.2d, v4.2d, v13.d[0] | |||
| fmla v21.2d, v5.2d, v13.d[0] | |||
| fmla v22.2d, v4.2d, v13.d[1] | |||
| fmla v23.2d, v5.2d, v13.d[1] | |||
| fmla v24.2d, v4.2d, v14.d[0] | |||
| fmla v25.2d, v5.2d, v14.d[0] | |||
| fmla v26.2d, v4.2d, v14.d[1] | |||
| fmla v27.2d, v5.2d, v14.d[1] | |||
| fmla v28.2d, v4.2d, v15.d[0] | |||
| fmla v29.2d, v5.2d, v15.d[0] | |||
| fmla v30.2d, v4.2d, v15.d[1] | |||
| fmla v31.2d, v5.2d, v15.d[1] | |||
| .endm | |||
| .macro KERNEL4x8_SUB | |||
| @@ -274,25 +274,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v18.2d, v0.2d, v8.2d[1] | |||
| fmla v19.2d, v1.2d, v8.2d[1] | |||
| fmla v20.2d, v0.2d, v9.2d[0] | |||
| fmla v21.2d, v1.2d, v9.2d[0] | |||
| fmla v22.2d, v0.2d, v9.2d[1] | |||
| fmla v23.2d, v1.2d, v9.2d[1] | |||
| fmla v24.2d, v0.2d, v10.2d[0] | |||
| fmla v25.2d, v1.2d, v10.2d[0] | |||
| fmla v26.2d, v0.2d, v10.2d[1] | |||
| fmla v27.2d, v1.2d, v10.2d[1] | |||
| fmla v28.2d, v0.2d, v11.2d[0] | |||
| fmla v29.2d, v1.2d, v11.2d[0] | |||
| fmla v30.2d, v0.2d, v11.2d[1] | |||
| fmla v31.2d, v1.2d, v11.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| fmla v18.2d, v0.2d, v8.d[1] | |||
| fmla v19.2d, v1.2d, v8.d[1] | |||
| fmla v20.2d, v0.2d, v9.d[0] | |||
| fmla v21.2d, v1.2d, v9.d[0] | |||
| fmla v22.2d, v0.2d, v9.d[1] | |||
| fmla v23.2d, v1.2d, v9.d[1] | |||
| fmla v24.2d, v0.2d, v10.d[0] | |||
| fmla v25.2d, v1.2d, v10.d[0] | |||
| fmla v26.2d, v0.2d, v10.d[1] | |||
| fmla v27.2d, v1.2d, v10.d[1] | |||
| fmla v28.2d, v0.2d, v11.d[0] | |||
| fmla v29.2d, v1.2d, v11.d[0] | |||
| fmla v30.2d, v0.2d, v11.d[1] | |||
| fmla v31.2d, v1.2d, v11.d[1] | |||
| .endm | |||
| .macro SAVE4x8 | |||
| @@ -374,17 +374,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v18.2d, v0.2d, v8.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v18.2d, v0.2d, v8.d[1] | |||
| fmla v20.2d, v0.2d, v9.2d[0] | |||
| fmla v22.2d, v0.2d, v9.2d[1] | |||
| fmla v20.2d, v0.2d, v9.d[0] | |||
| fmla v22.2d, v0.2d, v9.d[1] | |||
| fmla v24.2d, v0.2d, v10.2d[0] | |||
| fmla v26.2d, v0.2d, v10.2d[1] | |||
| fmla v24.2d, v0.2d, v10.d[0] | |||
| fmla v26.2d, v0.2d, v10.d[1] | |||
| fmla v28.2d, v0.2d, v11.2d[0] | |||
| fmla v30.2d, v0.2d, v11.2d[1] | |||
| fmla v28.2d, v0.2d, v11.d[0] | |||
| fmla v30.2d, v0.2d, v11.d[1] | |||
| .endm | |||
| .macro SAVE2x8 | |||
| @@ -520,17 +520,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmul v16.2d, v0.2d, v8.2d[0] | |||
| fmul v29.2d, v1.2d, v9.2d[1] | |||
| fmul v16.2d, v0.2d, v8.d[0] | |||
| fmul v29.2d, v1.2d, v9.d[1] | |||
| fmul v20.2d, v0.2d, v8.2d[1] | |||
| fmul v25.2d, v1.2d, v9.2d[0] | |||
| fmul v20.2d, v0.2d, v8.d[1] | |||
| fmul v25.2d, v1.2d, v9.d[0] | |||
| fmul v24.2d, v0.2d, v9.2d[0] | |||
| fmul v21.2d, v1.2d, v8.2d[1] | |||
| fmul v24.2d, v0.2d, v9.d[0] | |||
| fmul v21.2d, v1.2d, v8.d[1] | |||
| fmul v28.2d, v0.2d, v9.2d[1] | |||
| fmul v17.2d, v1.2d, v8.2d[0] | |||
| fmul v28.2d, v0.2d, v9.d[1] | |||
| fmul v17.2d, v1.2d, v8.d[0] | |||
| ld1 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| @@ -539,61 +539,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x4_M1 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v29.2d, v1.2d, v9.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v29.2d, v1.2d, v9.d[1] | |||
| ld1 {v12.2d, v13.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v25.2d, v1.2d, v9.2d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v25.2d, v1.2d, v9.d[0] | |||
| ld1 {v4.2d, v5.2d}, [pA] // For next round | |||
| add pA, pA, #32 | |||
| fmla v24.2d, v0.2d, v9.2d[0] | |||
| fmla v21.2d, v1.2d, v8.2d[1] | |||
| fmla v24.2d, v0.2d, v9.d[0] | |||
| fmla v21.2d, v1.2d, v8.d[1] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| fmla v28.2d, v0.2d, v9.2d[1] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v28.2d, v0.2d, v9.d[1] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| .endm | |||
| .macro KERNEL4x4_M2 | |||
| fmla v16.2d, v4.2d, v12.2d[0] | |||
| fmla v29.2d, v5.2d, v13.2d[1] | |||
| fmla v16.2d, v4.2d, v12.d[0] | |||
| fmla v29.2d, v5.2d, v13.d[1] | |||
| ld1 {v8.2d, v9.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| fmla v20.2d, v4.2d, v12.2d[1] | |||
| fmla v25.2d, v5.2d, v13.2d[0] | |||
| fmla v20.2d, v4.2d, v12.d[1] | |||
| fmla v25.2d, v5.2d, v13.d[0] | |||
| ld1 {v0.2d, v1.2d}, [pA] // For next round | |||
| add pA, pA, #32 | |||
| fmla v24.2d, v4.2d, v13.2d[0] | |||
| fmla v21.2d, v5.2d, v12.2d[1] | |||
| fmla v24.2d, v4.2d, v13.d[0] | |||
| fmla v21.2d, v5.2d, v12.d[1] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| fmla v28.2d, v4.2d, v13.2d[1] | |||
| fmla v17.2d, v5.2d, v12.2d[0] | |||
| fmla v28.2d, v4.2d, v13.d[1] | |||
| fmla v17.2d, v5.2d, v12.d[0] | |||
| .endm | |||
| .macro KERNEL4x4_E | |||
| fmla v16.2d, v4.2d, v12.2d[0] | |||
| fmla v29.2d, v5.2d, v13.2d[1] | |||
| fmla v16.2d, v4.2d, v12.d[0] | |||
| fmla v29.2d, v5.2d, v13.d[1] | |||
| fmla v20.2d, v4.2d, v12.2d[1] | |||
| fmla v25.2d, v5.2d, v13.2d[0] | |||
| fmla v20.2d, v4.2d, v12.d[1] | |||
| fmla v25.2d, v5.2d, v13.d[0] | |||
| fmla v24.2d, v4.2d, v13.2d[0] | |||
| fmla v21.2d, v5.2d, v12.2d[1] | |||
| fmla v24.2d, v4.2d, v13.d[0] | |||
| fmla v21.2d, v5.2d, v12.d[1] | |||
| fmla v28.2d, v4.2d, v13.2d[1] | |||
| fmla v17.2d, v5.2d, v12.2d[0] | |||
| fmla v28.2d, v4.2d, v13.d[1] | |||
| fmla v17.2d, v5.2d, v12.d[0] | |||
| .endm | |||
| .macro KERNEL4x4_SUB | |||
| @@ -602,17 +602,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v29.2d, v1.2d, v9.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v29.2d, v1.2d, v9.d[1] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v25.2d, v1.2d, v9.2d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v25.2d, v1.2d, v9.d[0] | |||
| fmla v24.2d, v0.2d, v9.2d[0] | |||
| fmla v21.2d, v1.2d, v8.2d[1] | |||
| fmla v24.2d, v0.2d, v9.d[0] | |||
| fmla v21.2d, v1.2d, v8.d[1] | |||
| fmla v28.2d, v0.2d, v9.2d[1] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v28.2d, v0.2d, v9.d[1] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| .endm | |||
| .macro SAVE4x4 | |||
| @@ -660,10 +660,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v24.2d, v0.2d, v9.2d[0] | |||
| fmla v28.2d, v0.2d, v9.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v24.2d, v0.2d, v9.d[0] | |||
| fmla v28.2d, v0.2d, v9.d[1] | |||
| .endm | |||
| .macro SAVE2x4 | |||
| @@ -746,10 +746,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v21.2d, v1.2d, v8.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v21.2d, v1.2d, v8.d[1] | |||
| .endm | |||
| .macro SAVE4x2 | |||
| @@ -782,8 +782,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| .endm | |||
| .macro SAVE2x2 | |||
| @@ -813,7 +813,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ldr d0 , [pA] | |||
| add pA, pA, #8 | |||
| fmla v16.2d, v8.2d, v0.2d[0] | |||
| fmla v16.2d, v8.2d, v0.d[0] | |||
| .endm | |||
| .macro SAVE1x2 | |||
| @@ -842,8 +842,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA , pA, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| .endm | |||
| .macro SAVE4x1 | |||
| @@ -871,7 +871,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d}, [pA] | |||
| add pA , pA, #16 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| .endm | |||
| .macro SAVE2x1 | |||
| @@ -52,12 +52,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define alpha0 d10 | |||
| #define alphaV0 v10.d[0] | |||
| #define alpha1 d11 | |||
| #define alphaV1 v11.d[0] | |||
| #define alpha2 d14 | |||
| #define alphaV2 v14.d[0] | |||
| #define alpha3 d15 | |||
| #define alphaV3 v15.d[0] | |||
| #define A_PRE_SIZE 2560 | |||
| #define B_PRE_SIZE 448 | |||
| #define C_PRE_SIZE 128 | |||
| // 00 origM | |||
| // 01 origN | |||
| @@ -74,8 +72,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| // 12 pCRow0 | |||
| // 13 pCRow1 | |||
| // 14 pCRow2 | |||
| // 15 pA | |||
| // 16 | |||
| // 15 pCRow3 | |||
| // 16 pA | |||
| // 17 | |||
| // 18 must save | |||
| // 19 must save | |||
| @@ -100,14 +98,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| //v05 pA1_2, pA1_3 | |||
| //v06 pA1_4, pA1_5 | |||
| //v07 pA1_6, pA1_7 | |||
| //v08 must save pB0_0, pB0_1 | |||
| //v09 must save pB0_2, pB0_3 | |||
| //v10 must save ALPHA0 | |||
| //v11 must save ALPHA1 | |||
| //v12 must save pB1_0, pB1_1 | |||
| //v13 must save pB1_2, pB1_3 | |||
| //v14 must save ALPHA2 | |||
| //v15 must save ALPHA3 | |||
| //v08 must save pB0_0 | |||
| //v09 must save pB0_1 | |||
| //v10 must save pB0_2 --> ALPHA0 | |||
| //v11 must save pB0_3 | |||
| //v12 must save pB1_0 | |||
| //v13 must save pB1_1 | |||
| //v14 must save pB1_2 | |||
| //v15 must save pB1_3 | |||
| //v16 must save C00, C01 | |||
| //v17 must save C02, C03 | |||
| //v18 C04, C05 | |||
| @@ -149,244 +147,257 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x4_I | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld1 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ldp d8, d9, [pB] | |||
| add pB, pB, #16 | |||
| ldp d10, d11, [pB] | |||
| add pB, pB, #16 | |||
| ldp q0, q1, [pA], #32 | |||
| fmul v16.2d, v0.2d, v8.2d[0] | |||
| fmul v17.2d, v1.2d, v8.2d[0] | |||
| ldp d8, d9, [pB], #16 | |||
| fmul v18.2d, v2.2d, v8.2d[0] | |||
| fmul v19.2d, v3.2d, v8.2d[0] | |||
| fmul v16.2d, v0.2d, v8.d[0] | |||
| fmul v20.2d, v0.2d, v9.d[0] | |||
| fmul v20.2d, v0.2d, v9.2d[0] | |||
| fmul v21.2d, v1.2d, v9.2d[0] | |||
| ldp d10, d11, [pB], #16 | |||
| fmul v22.2d, v2.2d, v9.2d[0] | |||
| fmul v23.2d, v3.2d, v9.2d[0] | |||
| fmul v17.2d, v1.2d, v8.d[0] | |||
| fmul v21.2d, v1.2d, v9.d[0] | |||
| fmul v24.2d, v0.2d, v10.2d[0] | |||
| fmul v25.2d, v1.2d, v10.2d[0] | |||
| ldp q2, q3, [pA], #32 | |||
| fmul v26.2d, v2.2d, v10.2d[0] | |||
| fmul v27.2d, v3.2d, v10.2d[0] | |||
| fmul v24.2d, v0.2d, v10.d[0] | |||
| fmul v28.2d, v0.2d, v11.d[0] | |||
| fmul v28.2d, v0.2d, v11.2d[0] | |||
| fmul v29.2d, v1.2d, v11.2d[0] | |||
| ldp q4, q5, [pA], #32 | |||
| fmul v30.2d, v2.2d, v11.2d[0] | |||
| fmul v31.2d, v3.2d, v11.2d[0] | |||
| fmul v25.2d, v1.2d, v10.d[0] | |||
| fmul v29.2d, v1.2d, v11.d[0] | |||
| ld1 {v4.2d, v5.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld1 {v6.2d, v7.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ldp d12, d13, [pB] | |||
| add pB, pB, #16 | |||
| ldp d14, d15, [pB] | |||
| add pB, pB, #16 | |||
| ldp d12, d13, [pB], #16 | |||
| fmul v18.2d, v2.2d, v8.d[0] | |||
| fmul v22.2d, v2.2d, v9.d[0] | |||
| ldp d14, d15, [pB], #16 | |||
| fmul v26.2d, v2.2d, v10.d[0] | |||
| fmul v30.2d, v2.2d, v11.d[0] | |||
| ldp q6, q7, [pA], #32 | |||
| fmul v19.2d, v3.2d, v8.d[0] | |||
| fmul v27.2d, v3.2d, v10.d[0] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmul v31.2d, v3.2d, v11.d[0] | |||
| fmul v23.2d, v3.2d, v9.d[0] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| .endm | |||
| .macro KERNEL8x4_M1 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v21.2d, v1.2d, v9.2d[0] | |||
| fmla v26.2d, v2.2d, v10.2d[0] | |||
| fmla v31.2d, v3.2d, v11.2d[0] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v9.d[0] | |||
| ld1 {v4.2d}, [pA], #16 | |||
| ldp q4, q5, [pA], #32 | |||
| fmla v20.2d, v0.2d, v9.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v24.2d, v0.2d, v10.d[0] | |||
| fmla v28.2d, v0.2d, v11.d[0] | |||
| ld1 {v5.2d}, [pA], #16 | |||
| ldp d12, d13, [pB], #16 | |||
| fmla v30.2d, v2.2d, v11.2d[0] | |||
| fmla v27.2d, v3.2d, v10.2d[0] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| fmla v25.2d, v1.2d, v10.d[0] | |||
| ldp d12, d13, [pB] | |||
| add pB, pB, #16 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| fmla v28.2d, v0.2d, v11.2d[0] | |||
| fmla v25.2d, v1.2d, v10.2d[0] | |||
| fmla v21.2d, v1.2d, v9.d[0] | |||
| fmla v29.2d, v1.2d, v11.d[0] | |||
| ldp d14, d15, [pB] | |||
| add pB, pB, #16 | |||
| ldp d14, d15, [pB], #16 | |||
| fmla v18.2d, v2.2d, v8.2d[0] | |||
| fmla v23.2d, v3.2d, v9.2d[0] | |||
| fmla v18.2d, v2.2d, v8.d[0] | |||
| fmla v22.2d, v2.2d, v9.d[0] | |||
| ld1 {v6.2d}, [pA], #16 | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla v24.2d, v0.2d, v10.2d[0] | |||
| fmla v29.2d, v1.2d, v11.2d[0] | |||
| fmla v26.2d, v2.2d, v10.d[0] | |||
| fmla v30.2d, v2.2d, v11.d[0] | |||
| fmla v19.2d, v3.2d, v8.d[0] | |||
| fmla v23.2d, v3.2d, v9.d[0] | |||
| ld1 {v7.2d}, [pA], #16 | |||
| ldp q6, q7, [pA], #32 | |||
| fmla v22.2d, v2.2d, v9.2d[0] | |||
| fmla v19.2d, v3.2d, v8.2d[0] | |||
| prfm PLDL1KEEP, [pA, #224] | |||
| prfm PLDL1KEEP, [pA, #224+64] | |||
| fmla v27.2d, v3.2d, v10.d[0] | |||
| fmla v31.2d, v3.2d, v11.d[0] | |||
| .endm | |||
| .macro KERNEL8x4_M2 | |||
| fmla v16.2d, v4.2d, v12.2d[0] | |||
| fmla v21.2d, v5.2d, v13.2d[0] | |||
| fmla v26.2d, v6.2d, v14.2d[0] | |||
| fmla v31.2d, v7.2d, v15.2d[0] | |||
| ld1 {v0.2d}, [pA], #16 | |||
| fmla v16.2d, v4.2d, v12.d[0] | |||
| fmla v20.2d, v4.2d, v13.d[0] | |||
| fmla v24.2d, v4.2d, v14.d[0] | |||
| fmla v28.2d, v4.2d, v15.d[0] | |||
| fmla v20.2d, v4.2d, v13.2d[0] | |||
| fmla v17.2d, v5.2d, v12.2d[0] | |||
| ldp q0, q1, [pA], #32 | |||
| ld1 {v1.2d}, [pA], #16 | |||
| fmla v17.2d, v5.2d, v12.d[0] | |||
| fmla v25.2d, v5.2d, v14.d[0] | |||
| fmla v30.2d, v6.2d, v15.2d[0] | |||
| fmla v27.2d, v7.2d, v14.2d[0] | |||
| ldp d8, d9, [pB], #16 | |||
| ldp d8, d9, [pB] | |||
| add pB, pB, #16 | |||
| fmla v21.2d, v5.2d, v13.d[0] | |||
| fmla v29.2d, v5.2d, v15.d[0] | |||
| fmla v28.2d, v4.2d, v15.2d[0] | |||
| fmla v25.2d, v5.2d, v14.2d[0] | |||
| ldp d10, d11, [pB] | |||
| add pB, pB, #16 | |||
| ldp d10, d11, [pB], #16 | |||
| fmla v22.2d, v6.2d, v13.2d[0] | |||
| fmla v19.2d, v7.2d, v12.2d[0] | |||
| fmla v18.2d, v6.2d, v12.d[0] | |||
| fmla v22.2d, v6.2d, v13.d[0] | |||
| ld1 {v2.2d}, [pA], #16 | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla v24.2d, v4.2d, v14.2d[0] | |||
| fmla v29.2d, v5.2d, v15.2d[0] | |||
| fmla v26.2d, v6.2d, v14.d[0] | |||
| fmla v30.2d, v6.2d, v15.d[0] | |||
| ld1 {v3.2d}, [pA], #16 | |||
| fmla v19.2d, v7.2d, v12.d[0] | |||
| fmla v23.2d, v7.2d, v13.d[0] | |||
| fmla v18.2d, v6.2d, v12.2d[0] | |||
| fmla v23.2d, v7.2d, v13.2d[0] | |||
| ldp q2, q3, [pA], #32 | |||
| prfm PLDL1KEEP, [pB, #640] | |||
| fmla v27.2d, v7.2d, v14.d[0] | |||
| fmla v31.2d, v7.2d, v15.d[0] | |||
| .endm | |||
| .macro KERNEL8x4_E | |||
| fmla v16.2d, v4.2d, v12.2d[0] | |||
| fmla v17.2d, v5.2d, v12.2d[0] | |||
| fmla v18.2d, v6.2d, v12.2d[0] | |||
| fmla v19.2d, v7.2d, v12.2d[0] | |||
| fmla v20.2d, v4.2d, v13.2d[0] | |||
| fmla v21.2d, v5.2d, v13.2d[0] | |||
| fmla v22.2d, v6.2d, v13.2d[0] | |||
| fmla v23.2d, v7.2d, v13.2d[0] | |||
| fmla v24.2d, v4.2d, v14.2d[0] | |||
| fmla v25.2d, v5.2d, v14.2d[0] | |||
| fmla v26.2d, v6.2d, v14.2d[0] | |||
| fmla v27.2d, v7.2d, v14.2d[0] | |||
| fmla v28.2d, v4.2d, v15.2d[0] | |||
| fmla v29.2d, v5.2d, v15.2d[0] | |||
| fmla v30.2d, v6.2d, v15.2d[0] | |||
| fmla v31.2d, v7.2d, v15.2d[0] | |||
| fmla v16.2d, v4.2d, v12.d[0] | |||
| fmla v20.2d, v4.2d, v13.d[0] | |||
| fmla v24.2d, v4.2d, v14.d[0] | |||
| fmla v28.2d, v4.2d, v15.d[0] | |||
| fmla v17.2d, v5.2d, v12.d[0] | |||
| fmla v25.2d, v5.2d, v14.d[0] | |||
| fmla v21.2d, v5.2d, v13.d[0] | |||
| fmla v29.2d, v5.2d, v15.d[0] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla v18.2d, v6.2d, v12.d[0] | |||
| fmla v22.2d, v6.2d, v13.d[0] | |||
| fmla v26.2d, v6.2d, v14.d[0] | |||
| fmla v30.2d, v6.2d, v15.d[0] | |||
| fmla v19.2d, v7.2d, v12.d[0] | |||
| fmla v23.2d, v7.2d, v13.d[0] | |||
| fmla v27.2d, v7.2d, v14.d[0] | |||
| fmla v31.2d, v7.2d, v15.d[0] | |||
| .endm | |||
| .macro KERNEL8x4_SUB | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ld1 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| ldp d8, d9, [pB] | |||
| add pB, pB, #16 | |||
| ldp d10, d11, [pB] | |||
| add pB, pB, #16 | |||
| ldp q0, q1, [pA], #32 | |||
| ldp d8, d9, [pB], #16 | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v9.d[0] | |||
| ldp d10, d11, [pB], #16 | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| fmla v21.2d, v1.2d, v9.d[0] | |||
| ldp q2, q3, [pA], #32 | |||
| fmla v24.2d, v0.2d, v10.d[0] | |||
| fmla v28.2d, v0.2d, v11.d[0] | |||
| fmla v25.2d, v1.2d, v10.d[0] | |||
| fmla v29.2d, v1.2d, v11.d[0] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE] | |||
| fmla v18.2d, v2.2d, v8.d[0] | |||
| fmla v22.2d, v2.2d, v9.d[0] | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v18.2d, v2.2d, v8.2d[0] | |||
| fmla v19.2d, v3.2d, v8.2d[0] | |||
| fmla v20.2d, v0.2d, v9.2d[0] | |||
| fmla v21.2d, v1.2d, v9.2d[0] | |||
| fmla v22.2d, v2.2d, v9.2d[0] | |||
| fmla v23.2d, v3.2d, v9.2d[0] | |||
| fmla v24.2d, v0.2d, v10.2d[0] | |||
| fmla v25.2d, v1.2d, v10.2d[0] | |||
| fmla v26.2d, v2.2d, v10.2d[0] | |||
| fmla v27.2d, v3.2d, v10.2d[0] | |||
| fmla v28.2d, v0.2d, v11.2d[0] | |||
| fmla v29.2d, v1.2d, v11.2d[0] | |||
| fmla v30.2d, v2.2d, v11.2d[0] | |||
| fmla v31.2d, v3.2d, v11.2d[0] | |||
| prfm PLDL1KEEP, [pA, #A_PRE_SIZE+64] | |||
| fmla v26.2d, v2.2d, v10.d[0] | |||
| fmla v30.2d, v2.2d, v11.d[0] | |||
| prfm PLDL1KEEP, [pB, #B_PRE_SIZE] | |||
| fmla v19.2d, v3.2d, v8.d[0] | |||
| fmla v27.2d, v3.2d, v10.d[0] | |||
| fmla v31.2d, v3.2d, v11.d[0] | |||
| fmla v23.2d, v3.2d, v9.d[0] | |||
| .endm | |||
| .macro SAVE8x4 | |||
| fmov alpha0, alpha | |||
| ld1 {v0.2d, v1.2d}, [pCRow0] | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ldp q0, q1, [pCRow0] | |||
| fmla v0.2d, v16.2d, alphaV0 | |||
| fmla v1.2d, v17.2d, alphaV0 | |||
| st1 {v0.2d, v1.2d}, [pCRow0] | |||
| stp q0, q1, [pCRow0] | |||
| add pCRow0, pCRow0, #32 | |||
| prfm PLDL2KEEP, [pCRow0, #C_PRE_SIZE] | |||
| ld1 {v2.2d, v3.2d}, [pCRow0] | |||
| ldp q2, q3, [pCRow0] | |||
| fmla v2.2d, v18.2d, alphaV0 | |||
| fmla v3.2d, v19.2d, alphaV0 | |||
| st1 {v2.2d, v3.2d}, [pCRow0] | |||
| stp q2, q3, [pCRow0] | |||
| add pCRow0, pCRow0, #32 | |||
| ld1 {v4.2d, v5.2d}, [pCRow1] | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| ldp q4, q5, [pCRow1] | |||
| fmla v4.2d, v20.2d, alphaV0 | |||
| fmla v5.2d, v21.2d, alphaV0 | |||
| st1 {v4.2d, v5.2d}, [pCRow1] | |||
| stp q4, q5, [pCRow1] | |||
| add pCRow1, pCRow1, #32 | |||
| prfm PLDL2KEEP, [pCRow1, #C_PRE_SIZE] | |||
| ld1 {v6.2d, v7.2d}, [pCRow1] | |||
| ldp q6, q7, [pCRow1] | |||
| fmla v6.2d, v22.2d, alphaV0 | |||
| fmla v7.2d, v23.2d, alphaV0 | |||
| st1 {v6.2d, v7.2d}, [pCRow1] | |||
| stp q6, q7, [pCRow1] | |||
| add pCRow1, pCRow1, #32 | |||
| ld1 {v0.2d, v1.2d}, [pCRow2] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| ldp q0, q1, [pCRow2] | |||
| fmla v0.2d, v24.2d, alphaV0 | |||
| fmla v1.2d, v25.2d, alphaV0 | |||
| st1 {v0.2d, v1.2d}, [pCRow2] | |||
| stp q0, q1, [pCRow2] | |||
| add pCRow2, pCRow2, #32 | |||
| ld1 {v2.2d, v3.2d}, [pCRow2] | |||
| prfm PLDL2KEEP, [pCRow2, #C_PRE_SIZE] | |||
| ldp q2, q3, [pCRow2] | |||
| fmla v2.2d, v26.2d, alphaV0 | |||
| fmla v3.2d, v27.2d, alphaV0 | |||
| st1 {v2.2d, v3.2d}, [pCRow2] | |||
| stp q2, q3, [pCRow2] | |||
| add pCRow2, pCRow2, #32 | |||
| ld1 {v4.2d, v5.2d}, [pCRow3] | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| ldp q4, q5, [pCRow3] | |||
| fmla v4.2d, v28.2d, alphaV0 | |||
| fmla v5.2d, v29.2d, alphaV0 | |||
| st1 {v4.2d, v5.2d}, [pCRow3] | |||
| stp q4, q5, [pCRow3] | |||
| add pCRow3, pCRow3, #32 | |||
| prfm PLDL2KEEP, [pCRow3, #C_PRE_SIZE] | |||
| ld1 {v6.2d, v7.2d}, [pCRow3] | |||
| ldp q6, q7, [pCRow3] | |||
| fmla v6.2d, v30.2d, alphaV0 | |||
| fmla v7.2d, v31.2d, alphaV0 | |||
| st1 {v6.2d, v7.2d}, [pCRow3] | |||
| stp q6, q7, [pCRow3] | |||
| add pCRow3, pCRow3, #32 | |||
| prfm PLDL2KEEP, [pCRow0, #128] | |||
| prfm PLDL2KEEP, [pCRow1, #128] | |||
| prfm PLDL2KEEP, [pCRow2, #128] | |||
| prfm PLDL2KEEP, [pCRow3, #128] | |||
| .endm | |||
| /******************************************************************************/ | |||
| @@ -408,44 +419,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v29.2d, v1.2d, v9.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v29.2d, v1.2d, v9.d[1] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v25.2d, v1.2d, v9.2d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v25.2d, v1.2d, v9.d[0] | |||
| fmla v24.2d, v0.2d, v9.2d[0] | |||
| fmla v21.2d, v1.2d, v8.2d[1] | |||
| fmla v24.2d, v0.2d, v9.d[0] | |||
| fmla v21.2d, v1.2d, v8.d[1] | |||
| fmla v28.2d, v0.2d, v9.2d[1] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v28.2d, v0.2d, v9.d[1] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| .endm | |||
| .macro SAVE4x4 | |||
| fmov alpha0, alpha | |||
| ld1 {v8.2d, v9.2d}, [pCRow0] | |||
| fmla v8.2d, v16.2d, alphaV0 | |||
| fmla v9.2d, v17.2d, alphaV1 | |||
| fmla v9.2d, v17.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow0] | |||
| add pCRow1, pCRow0, LDC | |||
| ld1 {v12.2d, v13.2d}, [pCRow1] | |||
| fmla v12.2d, v20.2d, alphaV2 | |||
| fmla v13.2d, v21.2d, alphaV3 | |||
| fmla v12.2d, v20.2d, alphaV0 | |||
| fmla v13.2d, v21.2d, alphaV0 | |||
| st1 {v12.2d, v13.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1 {v8.2d, v9.2d}, [pCRow2] | |||
| fmla v8.2d, v24.2d, alphaV0 | |||
| fmla v9.2d, v25.2d, alphaV1 | |||
| fmla v9.2d, v25.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow2] | |||
| add pCRow1, pCRow2, LDC | |||
| ld1 {v12.2d, v13.2d}, [pCRow1] | |||
| fmla v12.2d, v28.2d, alphaV2 | |||
| fmla v13.2d, v29.2d, alphaV3 | |||
| fmla v12.2d, v28.2d, alphaV0 | |||
| fmla v13.2d, v29.2d, alphaV0 | |||
| st1 {v12.2d, v13.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -467,13 +479,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v24.2d, v0.2d, v9.2d[0] | |||
| fmla v28.2d, v0.2d, v9.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v24.2d, v0.2d, v9.d[0] | |||
| fmla v28.2d, v0.2d, v9.d[1] | |||
| .endm | |||
| .macro SAVE2x4 | |||
| fmov alpha0, alpha | |||
| ld1 {v8.2d}, [pCRow0] | |||
| fmla v8.2d, v16.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow0] | |||
| @@ -481,19 +494,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add pCRow1, pCRow0, LDC | |||
| ld1 {v12.2d}, [pCRow1] | |||
| fmla v12.2d, v20.2d, alphaV1 | |||
| fmla v12.2d, v20.2d, alphaV0 | |||
| st1 {v12.2d}, [pCRow1] | |||
| add pCRow2, pCRow1, LDC | |||
| ld1 {v8.2d}, [pCRow2] | |||
| fmla v8.2d, v24.2d, alphaV2 | |||
| fmla v8.2d, v24.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow2] | |||
| add pCRow1, pCRow2, LDC | |||
| ld1 {v12.2d}, [pCRow1] | |||
| fmla v12.2d, v28.2d, alphaV3 | |||
| fmla v12.2d, v28.2d, alphaV0 | |||
| st1 {v12.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -518,6 +531,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x4 | |||
| fmov alpha0, alpha | |||
| add pCRow1, pCRow0, LDC | |||
| ld1 {v8.d}[0], [pCRow0] | |||
| @@ -531,7 +545,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v12.d}[0], [pCRow2] | |||
| ld1 {v12.d}[1], [pCRow1] | |||
| fmla v12.2d, v20.2d, alphaV1 | |||
| fmla v12.2d, v20.2d, alphaV0 | |||
| st1 {v12.d}[0], [pCRow2] | |||
| st1 {v12.d}[1], [pCRow1] | |||
| @@ -559,32 +573,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v18.2d, v2.2d, v8.2d[0] | |||
| fmla v19.2d, v3.2d, v8.2d[0] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| fmla v18.2d, v2.2d, v8.d[0] | |||
| fmla v19.2d, v3.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v21.2d, v1.2d, v8.2d[1] | |||
| fmla v22.2d, v2.2d, v8.2d[1] | |||
| fmla v23.2d, v3.2d, v8.2d[1] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v21.2d, v1.2d, v8.d[1] | |||
| fmla v22.2d, v2.2d, v8.d[1] | |||
| fmla v23.2d, v3.2d, v8.d[1] | |||
| .endm | |||
| .macro SAVE8x2 | |||
| fmov alpha0, alpha | |||
| add pCRow1, pCRow0, LDC | |||
| ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | |||
| fmla v0.2d, v16.2d, alphaV0 | |||
| fmla v1.2d, v17.2d, alphaV1 | |||
| fmla v2.2d, v18.2d, alphaV2 | |||
| fmla v3.2d, v19.2d, alphaV3 | |||
| fmla v1.2d, v17.2d, alphaV0 | |||
| fmla v2.2d, v18.2d, alphaV0 | |||
| fmla v3.2d, v19.2d, alphaV0 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | |||
| ld1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | |||
| fmla v4.2d, v20.2d, alphaV0 | |||
| fmla v5.2d, v21.2d, alphaV1 | |||
| fmla v6.2d, v22.2d, alphaV2 | |||
| fmla v7.2d, v23.2d, alphaV3 | |||
| fmla v5.2d, v21.2d, alphaV0 | |||
| fmla v6.2d, v22.2d, alphaV0 | |||
| fmla v7.2d, v23.2d, alphaV0 | |||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #64 | |||
| @@ -605,23 +620,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v21.2d, v1.2d, v8.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v21.2d, v1.2d, v8.d[1] | |||
| .endm | |||
| .macro SAVE4x2 | |||
| fmov alpha0, alpha | |||
| ld1 {v8.2d, v9.2d}, [pCRow0] | |||
| fmla v8.2d, v16.2d, alphaV0 | |||
| fmla v9.2d, v17.2d, alphaV1 | |||
| fmla v9.2d, v17.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow0] | |||
| add pCRow1, pCRow0, LDC | |||
| ld1 {v12.2d, v13.2d}, [pCRow1] | |||
| fmla v12.2d, v20.2d, alphaV2 | |||
| fmla v13.2d, v21.2d, alphaV3 | |||
| fmla v12.2d, v20.2d, alphaV0 | |||
| fmla v13.2d, v21.2d, alphaV0 | |||
| st1 {v12.2d, v13.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -641,11 +657,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| .endm | |||
| .macro SAVE2x2 | |||
| fmov alpha0, alpha | |||
| ld1 {v8.2d}, [pCRow0] | |||
| fmla v8.2d, v16.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow0] | |||
| @@ -653,7 +670,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add pCRow1 , pCRow0, LDC | |||
| ld1 {v12.2d}, [pCRow1] | |||
| fmla v12.2d, v20.2d, alphaV1 | |||
| fmla v12.2d, v20.2d, alphaV0 | |||
| st1 {v12.2d}, [pCRow1] | |||
| add pCRow0, pCRow0, #16 | |||
| @@ -672,10 +689,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ldr d0 , [pA] | |||
| add pA, pA, #8 | |||
| fmla v16.2d, v8.2d, v0.2d[0] | |||
| fmla v16.2d, v8.2d, v0.d[0] | |||
| .endm | |||
| .macro SAVE1x2 | |||
| fmov alpha0, alpha | |||
| add pCRow1 , pCRow0, LDC | |||
| ld1 {v8.d}[0], [pCRow0] | |||
| @@ -706,18 +724,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v18.2d, v2.2d, v8.2d[0] | |||
| fmla v19.2d, v3.2d, v8.2d[0] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| fmla v18.2d, v2.2d, v8.d[0] | |||
| fmla v19.2d, v3.2d, v8.d[0] | |||
| .endm | |||
| .macro SAVE8x1 | |||
| fmov alpha0, alpha | |||
| ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | |||
| fmla v0.2d, v16.2d, alphaV0 | |||
| fmla v1.2d, v17.2d, alphaV1 | |||
| fmla v2.2d, v18.2d, alphaV2 | |||
| fmla v3.2d, v19.2d, alphaV3 | |||
| fmla v1.2d, v17.2d, alphaV0 | |||
| fmla v2.2d, v18.2d, alphaV0 | |||
| fmla v3.2d, v19.2d, alphaV0 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [pCRow0] | |||
| add pCRow0, pCRow0, #64 | |||
| @@ -738,14 +757,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA , pA, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| .endm | |||
| .macro SAVE4x1 | |||
| fmov alpha0, alpha | |||
| ld1 {v8.2d, v9.2d}, [pCRow0] | |||
| fmla v8.2d, v16.2d, alphaV0 | |||
| fmla v9.2d, v17.2d, alphaV1 | |||
| fmla v9.2d, v17.2d, alphaV0 | |||
| st1 {v8.2d, v9.2d}, [pCRow0] | |||
| add pCRow0, pCRow0, #32 | |||
| @@ -765,10 +785,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d}, [pA] | |||
| add pA , pA, #16 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| .endm | |||
| .macro SAVE2x1 | |||
| fmov alpha0, alpha | |||
| ld1 {v8.2d}, [pCRow0] | |||
| fmla v8.2d, v16.2d, alphaV0 | |||
| st1 {v8.2d}, [pCRow0] | |||
| @@ -793,6 +814,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro SAVE1x1 | |||
| fmov alpha0, alpha | |||
| ldr d8, [pCRow0] | |||
| fmadd d8, d16, alpha0, d8 | |||
| str d8, [pCRow0] | |||
| @@ -820,6 +842,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| prfm PLDL1KEEP, [origPB] | |||
| prfm PLDL1KEEP, [origPA] | |||
| fmov alpha, d0 | |||
| lsl LDC, LDC, #3 // ldc = ldc * 8 | |||
| @@ -838,6 +863,7 @@ dgemm_kernel_L4_BEGIN: | |||
| add pCRow1, pCRow0, LDC | |||
| add pCRow2, pCRow1, LDC | |||
| add pCRow3, pCRow2, LDC | |||
| add pC, pCRow3, LDC | |||
| mov pA, origPA // pA = start of A array | |||
| @@ -849,6 +875,7 @@ dgemm_kernel_L4_M8_BEGIN: | |||
| cmp counterI, #0 | |||
| ble dgemm_kernel_L4_M4_BEGIN | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_20: | |||
| mov pB, origPB | |||
| @@ -868,8 +895,8 @@ dgemm_kernel_L4_M8_20: | |||
| subs counterL, counterL, #2 // subtract 2 | |||
| ble dgemm_kernel_L4_M8_22a | |||
| .align 5 | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_22: | |||
| KERNEL8x4_M1 | |||
| @@ -884,7 +911,7 @@ dgemm_kernel_L4_M8_22: | |||
| subs counterL, counterL, #1 | |||
| bgt dgemm_kernel_L4_M8_22 | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_22a: | |||
| KERNEL8x4_M1 | |||
| @@ -898,6 +925,7 @@ dgemm_kernel_L4_M8_22a: | |||
| b dgemm_kernel_L4_M8_44 | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_32: | |||
| tst counterL, #1 | |||
| @@ -923,6 +951,7 @@ dgemm_kernel_L4_M8_44: | |||
| ands counterL , origK, #7 | |||
| ble dgemm_kernel_L4_M8_100 | |||
| .align 5 | |||
| dgemm_kernel_L4_M8_46: | |||
| KERNEL8x4_SUB | |||
| @@ -931,6 +960,9 @@ dgemm_kernel_L4_M8_46: | |||
| bne dgemm_kernel_L4_M8_46 | |||
| dgemm_kernel_L4_M8_100: | |||
| prfm PLDL1KEEP, [pA] | |||
| prfm PLDL1KEEP, [pA, #64] | |||
| prfm PLDL1KEEP, [origPB] | |||
| SAVE8x4 | |||
| @@ -147,17 +147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmul v16.2d, v0.2d, v8.2d[0] | |||
| fmul v29.2d, v1.2d, v9.2d[1] | |||
| fmul v16.2d, v0.2d, v8.d[0] | |||
| fmul v29.2d, v1.2d, v9.d[1] | |||
| fmul v20.2d, v0.2d, v8.2d[1] | |||
| fmul v25.2d, v1.2d, v9.2d[0] | |||
| fmul v20.2d, v0.2d, v8.d[1] | |||
| fmul v25.2d, v1.2d, v9.d[0] | |||
| fmul v24.2d, v0.2d, v9.2d[0] | |||
| fmul v21.2d, v1.2d, v8.2d[1] | |||
| fmul v24.2d, v0.2d, v9.d[0] | |||
| fmul v21.2d, v1.2d, v8.d[1] | |||
| fmul v28.2d, v0.2d, v9.2d[1] | |||
| fmul v17.2d, v1.2d, v8.2d[0] | |||
| fmul v28.2d, v0.2d, v9.d[1] | |||
| fmul v17.2d, v1.2d, v8.d[0] | |||
| ld1 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| @@ -166,61 +166,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x4_M1 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v29.2d, v1.2d, v9.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v29.2d, v1.2d, v9.d[1] | |||
| ld1 {v12.2d, v13.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v25.2d, v1.2d, v9.2d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v25.2d, v1.2d, v9.d[0] | |||
| ld1 {v4.2d, v5.2d}, [pA] // For next round | |||
| add pA, pA, #32 | |||
| fmla v24.2d, v0.2d, v9.2d[0] | |||
| fmla v21.2d, v1.2d, v8.2d[1] | |||
| fmla v24.2d, v0.2d, v9.d[0] | |||
| fmla v21.2d, v1.2d, v8.d[1] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| fmla v28.2d, v0.2d, v9.2d[1] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v28.2d, v0.2d, v9.d[1] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| .endm | |||
| .macro KERNEL4x4_M2 | |||
| fmla v16.2d, v4.2d, v12.2d[0] | |||
| fmla v29.2d, v5.2d, v13.2d[1] | |||
| fmla v16.2d, v4.2d, v12.d[0] | |||
| fmla v29.2d, v5.2d, v13.d[1] | |||
| ld1 {v8.2d, v9.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| fmla v20.2d, v4.2d, v12.2d[1] | |||
| fmla v25.2d, v5.2d, v13.2d[0] | |||
| fmla v20.2d, v4.2d, v12.d[1] | |||
| fmla v25.2d, v5.2d, v13.d[0] | |||
| ld1 {v0.2d, v1.2d}, [pA] // For next round | |||
| add pA, pA, #32 | |||
| fmla v24.2d, v4.2d, v13.2d[0] | |||
| fmla v21.2d, v5.2d, v12.2d[1] | |||
| fmla v24.2d, v4.2d, v13.d[0] | |||
| fmla v21.2d, v5.2d, v12.d[1] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| fmla v28.2d, v4.2d, v13.2d[1] | |||
| fmla v17.2d, v5.2d, v12.2d[0] | |||
| fmla v28.2d, v4.2d, v13.d[1] | |||
| fmla v17.2d, v5.2d, v12.d[0] | |||
| .endm | |||
| .macro KERNEL4x4_E | |||
| fmla v16.2d, v4.2d, v12.2d[0] | |||
| fmla v29.2d, v5.2d, v13.2d[1] | |||
| fmla v16.2d, v4.2d, v12.d[0] | |||
| fmla v29.2d, v5.2d, v13.d[1] | |||
| fmla v20.2d, v4.2d, v12.2d[1] | |||
| fmla v25.2d, v5.2d, v13.2d[0] | |||
| fmla v20.2d, v4.2d, v12.d[1] | |||
| fmla v25.2d, v5.2d, v13.d[0] | |||
| fmla v24.2d, v4.2d, v13.2d[0] | |||
| fmla v21.2d, v5.2d, v12.2d[1] | |||
| fmla v24.2d, v4.2d, v13.d[0] | |||
| fmla v21.2d, v5.2d, v12.d[1] | |||
| fmla v28.2d, v4.2d, v13.2d[1] | |||
| fmla v17.2d, v5.2d, v12.2d[0] | |||
| fmla v28.2d, v4.2d, v13.d[1] | |||
| fmla v17.2d, v5.2d, v12.d[0] | |||
| .endm | |||
| .macro KERNEL4x4_SUB | |||
| @@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v29.2d, v1.2d, v9.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v29.2d, v1.2d, v9.d[1] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v25.2d, v1.2d, v9.2d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v25.2d, v1.2d, v9.d[0] | |||
| fmla v24.2d, v0.2d, v9.2d[0] | |||
| fmla v21.2d, v1.2d, v8.2d[1] | |||
| fmla v24.2d, v0.2d, v9.d[0] | |||
| fmla v21.2d, v1.2d, v8.d[1] | |||
| fmla v28.2d, v0.2d, v9.2d[1] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v28.2d, v0.2d, v9.d[1] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| .endm | |||
| .macro SAVE4x4 | |||
| @@ -283,10 +283,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v24.2d, v0.2d, v9.2d[0] | |||
| fmla v28.2d, v0.2d, v9.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v24.2d, v0.2d, v9.d[0] | |||
| fmla v28.2d, v0.2d, v9.d[1] | |||
| .endm | |||
| .macro SAVE2x4 | |||
| @@ -361,10 +361,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v21.2d, v1.2d, v8.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v21.2d, v1.2d, v8.d[1] | |||
| .endm | |||
| .macro SAVE4x2 | |||
| @@ -395,8 +395,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| .endm | |||
| .macro SAVE2x2 | |||
| @@ -424,7 +424,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ldr d0 , [pA] | |||
| add pA, pA, #8 | |||
| fmla v16.2d, v8.2d, v0.2d[0] | |||
| fmla v16.2d, v8.2d, v0.d[0] | |||
| .endm | |||
| .macro SAVE1x2 | |||
| @@ -451,8 +451,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA , pA, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| .endm | |||
| .macro SAVE4x1 | |||
| @@ -479,7 +479,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d}, [pA] | |||
| add pA , pA, #16 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| .endm | |||
| .macro SAVE2x1 | |||
| @@ -157,25 +157,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| fmul v16.2d, v0.2d, v8.2d[0] | |||
| fmul v17.2d, v1.2d, v8.2d[0] | |||
| fmul v18.2d, v0.2d, v8.2d[1] | |||
| fmul v19.2d, v1.2d, v8.2d[1] | |||
| fmul v20.2d, v0.2d, v9.2d[0] | |||
| fmul v21.2d, v1.2d, v9.2d[0] | |||
| fmul v22.2d, v0.2d, v9.2d[1] | |||
| fmul v23.2d, v1.2d, v9.2d[1] | |||
| fmul v24.2d, v0.2d, v10.2d[0] | |||
| fmul v25.2d, v1.2d, v10.2d[0] | |||
| fmul v26.2d, v0.2d, v10.2d[1] | |||
| fmul v27.2d, v1.2d, v10.2d[1] | |||
| fmul v28.2d, v0.2d, v11.2d[0] | |||
| fmul v29.2d, v1.2d, v11.2d[0] | |||
| fmul v30.2d, v0.2d, v11.2d[1] | |||
| fmul v31.2d, v1.2d, v11.2d[1] | |||
| fmul v16.2d, v0.2d, v8.d[0] | |||
| fmul v17.2d, v1.2d, v8.d[0] | |||
| fmul v18.2d, v0.2d, v8.d[1] | |||
| fmul v19.2d, v1.2d, v8.d[1] | |||
| fmul v20.2d, v0.2d, v9.d[0] | |||
| fmul v21.2d, v1.2d, v9.d[0] | |||
| fmul v22.2d, v0.2d, v9.d[1] | |||
| fmul v23.2d, v1.2d, v9.d[1] | |||
| fmul v24.2d, v0.2d, v10.d[0] | |||
| fmul v25.2d, v1.2d, v10.d[0] | |||
| fmul v26.2d, v0.2d, v10.d[1] | |||
| fmul v27.2d, v1.2d, v10.d[1] | |||
| fmul v28.2d, v0.2d, v11.d[0] | |||
| fmul v29.2d, v1.2d, v11.d[0] | |||
| fmul v30.2d, v0.2d, v11.d[1] | |||
| fmul v31.2d, v1.2d, v11.d[1] | |||
| ld1 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| @@ -186,25 +186,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x8_M1 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v18.2d, v0.2d, v8.2d[1] | |||
| fmla v19.2d, v1.2d, v8.2d[1] | |||
| fmla v20.2d, v0.2d, v9.2d[0] | |||
| fmla v21.2d, v1.2d, v9.2d[0] | |||
| fmla v22.2d, v0.2d, v9.2d[1] | |||
| fmla v23.2d, v1.2d, v9.2d[1] | |||
| fmla v24.2d, v0.2d, v10.2d[0] | |||
| fmla v25.2d, v1.2d, v10.2d[0] | |||
| fmla v26.2d, v0.2d, v10.2d[1] | |||
| fmla v27.2d, v1.2d, v10.2d[1] | |||
| fmla v28.2d, v0.2d, v11.2d[0] | |||
| fmla v29.2d, v1.2d, v11.2d[0] | |||
| fmla v30.2d, v0.2d, v11.2d[1] | |||
| fmla v31.2d, v1.2d, v11.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| fmla v18.2d, v0.2d, v8.d[1] | |||
| fmla v19.2d, v1.2d, v8.d[1] | |||
| fmla v20.2d, v0.2d, v9.d[0] | |||
| fmla v21.2d, v1.2d, v9.d[0] | |||
| fmla v22.2d, v0.2d, v9.d[1] | |||
| fmla v23.2d, v1.2d, v9.d[1] | |||
| fmla v24.2d, v0.2d, v10.d[0] | |||
| fmla v25.2d, v1.2d, v10.d[0] | |||
| fmla v26.2d, v0.2d, v10.d[1] | |||
| fmla v27.2d, v1.2d, v10.d[1] | |||
| fmla v28.2d, v0.2d, v11.d[0] | |||
| fmla v29.2d, v1.2d, v11.d[0] | |||
| fmla v30.2d, v0.2d, v11.d[1] | |||
| fmla v31.2d, v1.2d, v11.d[1] | |||
| ld1 {v12.2d, v13.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| @@ -217,25 +217,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x8_M2 | |||
| fmla v16.2d, v4.2d, v12.2d[0] | |||
| fmla v17.2d, v5.2d, v12.2d[0] | |||
| fmla v18.2d, v4.2d, v12.2d[1] | |||
| fmla v19.2d, v5.2d, v12.2d[1] | |||
| fmla v20.2d, v4.2d, v13.2d[0] | |||
| fmla v21.2d, v5.2d, v13.2d[0] | |||
| fmla v22.2d, v4.2d, v13.2d[1] | |||
| fmla v23.2d, v5.2d, v13.2d[1] | |||
| fmla v24.2d, v4.2d, v14.2d[0] | |||
| fmla v25.2d, v5.2d, v14.2d[0] | |||
| fmla v26.2d, v4.2d, v14.2d[1] | |||
| fmla v27.2d, v5.2d, v14.2d[1] | |||
| fmla v28.2d, v4.2d, v15.2d[0] | |||
| fmla v29.2d, v5.2d, v15.2d[0] | |||
| fmla v30.2d, v4.2d, v15.2d[1] | |||
| fmla v31.2d, v5.2d, v15.2d[1] | |||
| fmla v16.2d, v4.2d, v12.d[0] | |||
| fmla v17.2d, v5.2d, v12.d[0] | |||
| fmla v18.2d, v4.2d, v12.d[1] | |||
| fmla v19.2d, v5.2d, v12.d[1] | |||
| fmla v20.2d, v4.2d, v13.d[0] | |||
| fmla v21.2d, v5.2d, v13.d[0] | |||
| fmla v22.2d, v4.2d, v13.d[1] | |||
| fmla v23.2d, v5.2d, v13.d[1] | |||
| fmla v24.2d, v4.2d, v14.d[0] | |||
| fmla v25.2d, v5.2d, v14.d[0] | |||
| fmla v26.2d, v4.2d, v14.d[1] | |||
| fmla v27.2d, v5.2d, v14.d[1] | |||
| fmla v28.2d, v4.2d, v15.d[0] | |||
| fmla v29.2d, v5.2d, v15.d[0] | |||
| fmla v30.2d, v4.2d, v15.d[1] | |||
| fmla v31.2d, v5.2d, v15.d[1] | |||
| ld1 {v8.2d, v9.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| @@ -248,25 +248,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x8_E | |||
| fmla v16.2d, v4.2d, v12.2d[0] | |||
| fmla v17.2d, v5.2d, v12.2d[0] | |||
| fmla v18.2d, v4.2d, v12.2d[1] | |||
| fmla v19.2d, v5.2d, v12.2d[1] | |||
| fmla v20.2d, v4.2d, v13.2d[0] | |||
| fmla v21.2d, v5.2d, v13.2d[0] | |||
| fmla v22.2d, v4.2d, v13.2d[1] | |||
| fmla v23.2d, v5.2d, v13.2d[1] | |||
| fmla v24.2d, v4.2d, v14.2d[0] | |||
| fmla v25.2d, v5.2d, v14.2d[0] | |||
| fmla v26.2d, v4.2d, v14.2d[1] | |||
| fmla v27.2d, v5.2d, v14.2d[1] | |||
| fmla v28.2d, v4.2d, v15.2d[0] | |||
| fmla v29.2d, v5.2d, v15.2d[0] | |||
| fmla v30.2d, v4.2d, v15.2d[1] | |||
| fmla v31.2d, v5.2d, v15.2d[1] | |||
| fmla v16.2d, v4.2d, v12.d[0] | |||
| fmla v17.2d, v5.2d, v12.d[0] | |||
| fmla v18.2d, v4.2d, v12.d[1] | |||
| fmla v19.2d, v5.2d, v12.d[1] | |||
| fmla v20.2d, v4.2d, v13.d[0] | |||
| fmla v21.2d, v5.2d, v13.d[0] | |||
| fmla v22.2d, v4.2d, v13.d[1] | |||
| fmla v23.2d, v5.2d, v13.d[1] | |||
| fmla v24.2d, v4.2d, v14.d[0] | |||
| fmla v25.2d, v5.2d, v14.d[0] | |||
| fmla v26.2d, v4.2d, v14.d[1] | |||
| fmla v27.2d, v5.2d, v14.d[1] | |||
| fmla v28.2d, v4.2d, v15.d[0] | |||
| fmla v29.2d, v5.2d, v15.d[0] | |||
| fmla v30.2d, v4.2d, v15.d[1] | |||
| fmla v31.2d, v5.2d, v15.d[1] | |||
| .endm | |||
| .macro KERNEL4x8_SUB | |||
| @@ -277,25 +277,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v18.2d, v0.2d, v8.2d[1] | |||
| fmla v19.2d, v1.2d, v8.2d[1] | |||
| fmla v20.2d, v0.2d, v9.2d[0] | |||
| fmla v21.2d, v1.2d, v9.2d[0] | |||
| fmla v22.2d, v0.2d, v9.2d[1] | |||
| fmla v23.2d, v1.2d, v9.2d[1] | |||
| fmla v24.2d, v0.2d, v10.2d[0] | |||
| fmla v25.2d, v1.2d, v10.2d[0] | |||
| fmla v26.2d, v0.2d, v10.2d[1] | |||
| fmla v27.2d, v1.2d, v10.2d[1] | |||
| fmla v28.2d, v0.2d, v11.2d[0] | |||
| fmla v29.2d, v1.2d, v11.2d[0] | |||
| fmla v30.2d, v0.2d, v11.2d[1] | |||
| fmla v31.2d, v1.2d, v11.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| fmla v18.2d, v0.2d, v8.d[1] | |||
| fmla v19.2d, v1.2d, v8.d[1] | |||
| fmla v20.2d, v0.2d, v9.d[0] | |||
| fmla v21.2d, v1.2d, v9.d[0] | |||
| fmla v22.2d, v0.2d, v9.d[1] | |||
| fmla v23.2d, v1.2d, v9.d[1] | |||
| fmla v24.2d, v0.2d, v10.d[0] | |||
| fmla v25.2d, v1.2d, v10.d[0] | |||
| fmla v26.2d, v0.2d, v10.d[1] | |||
| fmla v27.2d, v1.2d, v10.d[1] | |||
| fmla v28.2d, v0.2d, v11.d[0] | |||
| fmla v29.2d, v1.2d, v11.d[0] | |||
| fmla v30.2d, v0.2d, v11.d[1] | |||
| fmla v31.2d, v1.2d, v11.d[1] | |||
| .endm | |||
| .macro SAVE4x8 | |||
| @@ -369,17 +369,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v10.2d, v11.2d}, [pB] | |||
| add pB, pB, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v18.2d, v0.2d, v8.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v18.2d, v0.2d, v8.d[1] | |||
| fmla v20.2d, v0.2d, v9.2d[0] | |||
| fmla v22.2d, v0.2d, v9.2d[1] | |||
| fmla v20.2d, v0.2d, v9.d[0] | |||
| fmla v22.2d, v0.2d, v9.d[1] | |||
| fmla v24.2d, v0.2d, v10.2d[0] | |||
| fmla v26.2d, v0.2d, v10.2d[1] | |||
| fmla v24.2d, v0.2d, v10.d[0] | |||
| fmla v26.2d, v0.2d, v10.d[1] | |||
| fmla v28.2d, v0.2d, v11.2d[0] | |||
| fmla v30.2d, v0.2d, v11.2d[1] | |||
| fmla v28.2d, v0.2d, v11.d[0] | |||
| fmla v30.2d, v0.2d, v11.d[1] | |||
| .endm | |||
| .macro SAVE2x8 | |||
| @@ -499,17 +499,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmul v16.2d, v0.2d, v8.2d[0] | |||
| fmul v29.2d, v1.2d, v9.2d[1] | |||
| fmul v16.2d, v0.2d, v8.d[0] | |||
| fmul v29.2d, v1.2d, v9.d[1] | |||
| fmul v20.2d, v0.2d, v8.2d[1] | |||
| fmul v25.2d, v1.2d, v9.2d[0] | |||
| fmul v20.2d, v0.2d, v8.d[1] | |||
| fmul v25.2d, v1.2d, v9.d[0] | |||
| fmul v24.2d, v0.2d, v9.2d[0] | |||
| fmul v21.2d, v1.2d, v8.2d[1] | |||
| fmul v24.2d, v0.2d, v9.d[0] | |||
| fmul v21.2d, v1.2d, v8.d[1] | |||
| fmul v28.2d, v0.2d, v9.2d[1] | |||
| fmul v17.2d, v1.2d, v8.2d[0] | |||
| fmul v28.2d, v0.2d, v9.d[1] | |||
| fmul v17.2d, v1.2d, v8.d[0] | |||
| ld1 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| @@ -518,61 +518,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x4_M1 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v29.2d, v1.2d, v9.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v29.2d, v1.2d, v9.d[1] | |||
| ld1 {v12.2d, v13.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v25.2d, v1.2d, v9.2d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v25.2d, v1.2d, v9.d[0] | |||
| ld1 {v4.2d, v5.2d}, [pA] // For next round | |||
| add pA, pA, #32 | |||
| fmla v24.2d, v0.2d, v9.2d[0] | |||
| fmla v21.2d, v1.2d, v8.2d[1] | |||
| fmla v24.2d, v0.2d, v9.d[0] | |||
| fmla v21.2d, v1.2d, v8.d[1] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| fmla v28.2d, v0.2d, v9.2d[1] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v28.2d, v0.2d, v9.d[1] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| .endm | |||
| .macro KERNEL4x4_M2 | |||
| fmla v16.2d, v4.2d, v12.2d[0] | |||
| fmla v29.2d, v5.2d, v13.2d[1] | |||
| fmla v16.2d, v4.2d, v12.d[0] | |||
| fmla v29.2d, v5.2d, v13.d[1] | |||
| ld1 {v8.2d, v9.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| fmla v20.2d, v4.2d, v12.2d[1] | |||
| fmla v25.2d, v5.2d, v13.2d[0] | |||
| fmla v20.2d, v4.2d, v12.d[1] | |||
| fmla v25.2d, v5.2d, v13.d[0] | |||
| ld1 {v0.2d, v1.2d}, [pA] // For next round | |||
| add pA, pA, #32 | |||
| fmla v24.2d, v4.2d, v13.2d[0] | |||
| fmla v21.2d, v5.2d, v12.2d[1] | |||
| fmla v24.2d, v4.2d, v13.d[0] | |||
| fmla v21.2d, v5.2d, v12.d[1] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| fmla v28.2d, v4.2d, v13.2d[1] | |||
| fmla v17.2d, v5.2d, v12.2d[0] | |||
| fmla v28.2d, v4.2d, v13.d[1] | |||
| fmla v17.2d, v5.2d, v12.d[0] | |||
| .endm | |||
| .macro KERNEL4x4_E | |||
| fmla v16.2d, v4.2d, v12.2d[0] | |||
| fmla v29.2d, v5.2d, v13.2d[1] | |||
| fmla v16.2d, v4.2d, v12.d[0] | |||
| fmla v29.2d, v5.2d, v13.d[1] | |||
| fmla v20.2d, v4.2d, v12.2d[1] | |||
| fmla v25.2d, v5.2d, v13.2d[0] | |||
| fmla v20.2d, v4.2d, v12.d[1] | |||
| fmla v25.2d, v5.2d, v13.d[0] | |||
| fmla v24.2d, v4.2d, v13.2d[0] | |||
| fmla v21.2d, v5.2d, v12.2d[1] | |||
| fmla v24.2d, v4.2d, v13.d[0] | |||
| fmla v21.2d, v5.2d, v12.d[1] | |||
| fmla v28.2d, v4.2d, v13.2d[1] | |||
| fmla v17.2d, v5.2d, v12.2d[0] | |||
| fmla v28.2d, v4.2d, v13.d[1] | |||
| fmla v17.2d, v5.2d, v12.d[0] | |||
| .endm | |||
| .macro KERNEL4x4_SUB | |||
| @@ -581,17 +581,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v29.2d, v1.2d, v9.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v29.2d, v1.2d, v9.d[1] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v25.2d, v1.2d, v9.2d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v25.2d, v1.2d, v9.d[0] | |||
| fmla v24.2d, v0.2d, v9.2d[0] | |||
| fmla v21.2d, v1.2d, v8.2d[1] | |||
| fmla v24.2d, v0.2d, v9.d[0] | |||
| fmla v21.2d, v1.2d, v8.d[1] | |||
| fmla v28.2d, v0.2d, v9.2d[1] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v28.2d, v0.2d, v9.d[1] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| .endm | |||
| .macro SAVE4x4 | |||
| @@ -635,10 +635,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v24.2d, v0.2d, v9.2d[0] | |||
| fmla v28.2d, v0.2d, v9.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v24.2d, v0.2d, v9.d[0] | |||
| fmla v28.2d, v0.2d, v9.d[1] | |||
| .endm | |||
| .macro SAVE2x4 | |||
| @@ -713,10 +713,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v21.2d, v1.2d, v8.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v21.2d, v1.2d, v8.d[1] | |||
| .endm | |||
| .macro SAVE4x2 | |||
| @@ -747,8 +747,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| .endm | |||
| .macro SAVE2x2 | |||
| @@ -776,7 +776,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ldr d0 , [pA] | |||
| add pA, pA, #8 | |||
| fmla v16.2d, v8.2d, v0.2d[0] | |||
| fmla v16.2d, v8.2d, v0.d[0] | |||
| .endm | |||
| .macro SAVE1x2 | |||
| @@ -803,8 +803,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA , pA, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| .endm | |||
| .macro SAVE4x1 | |||
| @@ -831,7 +831,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d}, [pA] | |||
| add pA , pA, #16 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| .endm | |||
| .macro SAVE2x1 | |||
| @@ -157,25 +157,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmul v16.2d, v0.2d, v8.2d[0] | |||
| fmul v17.2d, v1.2d, v8.2d[0] | |||
| fmul v18.2d, v2.2d, v8.2d[0] | |||
| fmul v19.2d, v3.2d, v8.2d[0] | |||
| fmul v20.2d, v0.2d, v8.2d[1] | |||
| fmul v21.2d, v1.2d, v8.2d[1] | |||
| fmul v22.2d, v2.2d, v8.2d[1] | |||
| fmul v23.2d, v3.2d, v8.2d[1] | |||
| fmul v24.2d, v0.2d, v9.2d[0] | |||
| fmul v25.2d, v1.2d, v9.2d[0] | |||
| fmul v26.2d, v2.2d, v9.2d[0] | |||
| fmul v27.2d, v3.2d, v9.2d[0] | |||
| fmul v28.2d, v0.2d, v9.2d[1] | |||
| fmul v29.2d, v1.2d, v9.2d[1] | |||
| fmul v30.2d, v2.2d, v9.2d[1] | |||
| fmul v31.2d, v3.2d, v9.2d[1] | |||
| fmul v16.2d, v0.2d, v8.d[0] | |||
| fmul v17.2d, v1.2d, v8.d[0] | |||
| fmul v18.2d, v2.2d, v8.d[0] | |||
| fmul v19.2d, v3.2d, v8.d[0] | |||
| fmul v20.2d, v0.2d, v8.d[1] | |||
| fmul v21.2d, v1.2d, v8.d[1] | |||
| fmul v22.2d, v2.2d, v8.d[1] | |||
| fmul v23.2d, v3.2d, v8.d[1] | |||
| fmul v24.2d, v0.2d, v9.d[0] | |||
| fmul v25.2d, v1.2d, v9.d[0] | |||
| fmul v26.2d, v2.2d, v9.d[0] | |||
| fmul v27.2d, v3.2d, v9.d[0] | |||
| fmul v28.2d, v0.2d, v9.d[1] | |||
| fmul v29.2d, v1.2d, v9.d[1] | |||
| fmul v30.2d, v2.2d, v9.d[1] | |||
| fmul v31.2d, v3.2d, v9.d[1] | |||
| ld1 {v4.2d, v5.2d}, [pA] | |||
| add pA, pA, #32 | |||
| @@ -186,25 +186,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x4_M1 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v18.2d, v2.2d, v8.2d[0] | |||
| fmla v19.2d, v3.2d, v8.2d[0] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v21.2d, v1.2d, v8.2d[1] | |||
| fmla v22.2d, v2.2d, v8.2d[1] | |||
| fmla v23.2d, v3.2d, v8.2d[1] | |||
| fmla v24.2d, v0.2d, v9.2d[0] | |||
| fmla v25.2d, v1.2d, v9.2d[0] | |||
| fmla v26.2d, v2.2d, v9.2d[0] | |||
| fmla v27.2d, v3.2d, v9.2d[0] | |||
| fmla v28.2d, v0.2d, v9.2d[1] | |||
| fmla v29.2d, v1.2d, v9.2d[1] | |||
| fmla v30.2d, v2.2d, v9.2d[1] | |||
| fmla v31.2d, v3.2d, v9.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| fmla v18.2d, v2.2d, v8.d[0] | |||
| fmla v19.2d, v3.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v21.2d, v1.2d, v8.d[1] | |||
| fmla v22.2d, v2.2d, v8.d[1] | |||
| fmla v23.2d, v3.2d, v8.d[1] | |||
| fmla v24.2d, v0.2d, v9.d[0] | |||
| fmla v25.2d, v1.2d, v9.d[0] | |||
| fmla v26.2d, v2.2d, v9.d[0] | |||
| fmla v27.2d, v3.2d, v9.d[0] | |||
| fmla v28.2d, v0.2d, v9.d[1] | |||
| fmla v29.2d, v1.2d, v9.d[1] | |||
| fmla v30.2d, v2.2d, v9.d[1] | |||
| fmla v31.2d, v3.2d, v9.d[1] | |||
| ld1 {v4.2d, v5.2d}, [pA] | |||
| add pA, pA, #32 | |||
| @@ -217,25 +217,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x4_M2 | |||
| fmla v16.2d, v4.2d, v12.2d[0] | |||
| fmla v17.2d, v5.2d, v12.2d[0] | |||
| fmla v18.2d, v6.2d, v12.2d[0] | |||
| fmla v19.2d, v7.2d, v12.2d[0] | |||
| fmla v20.2d, v4.2d, v12.2d[1] | |||
| fmla v21.2d, v5.2d, v12.2d[1] | |||
| fmla v22.2d, v6.2d, v12.2d[1] | |||
| fmla v23.2d, v7.2d, v12.2d[1] | |||
| fmla v24.2d, v4.2d, v13.2d[0] | |||
| fmla v25.2d, v5.2d, v13.2d[0] | |||
| fmla v26.2d, v6.2d, v13.2d[0] | |||
| fmla v27.2d, v7.2d, v13.2d[0] | |||
| fmla v28.2d, v4.2d, v13.2d[1] | |||
| fmla v29.2d, v5.2d, v13.2d[1] | |||
| fmla v30.2d, v6.2d, v13.2d[1] | |||
| fmla v31.2d, v7.2d, v13.2d[1] | |||
| fmla v16.2d, v4.2d, v12.d[0] | |||
| fmla v17.2d, v5.2d, v12.d[0] | |||
| fmla v18.2d, v6.2d, v12.d[0] | |||
| fmla v19.2d, v7.2d, v12.d[0] | |||
| fmla v20.2d, v4.2d, v12.d[1] | |||
| fmla v21.2d, v5.2d, v12.d[1] | |||
| fmla v22.2d, v6.2d, v12.d[1] | |||
| fmla v23.2d, v7.2d, v12.d[1] | |||
| fmla v24.2d, v4.2d, v13.d[0] | |||
| fmla v25.2d, v5.2d, v13.d[0] | |||
| fmla v26.2d, v6.2d, v13.d[0] | |||
| fmla v27.2d, v7.2d, v13.d[0] | |||
| fmla v28.2d, v4.2d, v13.d[1] | |||
| fmla v29.2d, v5.2d, v13.d[1] | |||
| fmla v30.2d, v6.2d, v13.d[1] | |||
| fmla v31.2d, v7.2d, v13.d[1] | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| @@ -248,25 +248,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x4_E | |||
| fmla v16.2d, v4.2d, v12.2d[0] | |||
| fmla v17.2d, v5.2d, v12.2d[0] | |||
| fmla v18.2d, v6.2d, v12.2d[0] | |||
| fmla v19.2d, v7.2d, v12.2d[0] | |||
| fmla v20.2d, v4.2d, v12.2d[1] | |||
| fmla v21.2d, v5.2d, v12.2d[1] | |||
| fmla v22.2d, v6.2d, v12.2d[1] | |||
| fmla v23.2d, v7.2d, v12.2d[1] | |||
| fmla v24.2d, v4.2d, v13.2d[0] | |||
| fmla v25.2d, v5.2d, v13.2d[0] | |||
| fmla v26.2d, v6.2d, v13.2d[0] | |||
| fmla v27.2d, v7.2d, v13.2d[0] | |||
| fmla v28.2d, v4.2d, v13.2d[1] | |||
| fmla v29.2d, v5.2d, v13.2d[1] | |||
| fmla v30.2d, v6.2d, v13.2d[1] | |||
| fmla v31.2d, v7.2d, v13.2d[1] | |||
| fmla v16.2d, v4.2d, v12.d[0] | |||
| fmla v17.2d, v5.2d, v12.d[0] | |||
| fmla v18.2d, v6.2d, v12.d[0] | |||
| fmla v19.2d, v7.2d, v12.d[0] | |||
| fmla v20.2d, v4.2d, v12.d[1] | |||
| fmla v21.2d, v5.2d, v12.d[1] | |||
| fmla v22.2d, v6.2d, v12.d[1] | |||
| fmla v23.2d, v7.2d, v12.d[1] | |||
| fmla v24.2d, v4.2d, v13.d[0] | |||
| fmla v25.2d, v5.2d, v13.d[0] | |||
| fmla v26.2d, v6.2d, v13.d[0] | |||
| fmla v27.2d, v7.2d, v13.d[0] | |||
| fmla v28.2d, v4.2d, v13.d[1] | |||
| fmla v29.2d, v5.2d, v13.d[1] | |||
| fmla v30.2d, v6.2d, v13.d[1] | |||
| fmla v31.2d, v7.2d, v13.d[1] | |||
| .endm | |||
| .macro KERNEL8x4_SUB | |||
| @@ -277,25 +277,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v18.2d, v2.2d, v8.2d[0] | |||
| fmla v19.2d, v3.2d, v8.2d[0] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v21.2d, v1.2d, v8.2d[1] | |||
| fmla v22.2d, v2.2d, v8.2d[1] | |||
| fmla v23.2d, v3.2d, v8.2d[1] | |||
| fmla v24.2d, v0.2d, v9.2d[0] | |||
| fmla v25.2d, v1.2d, v9.2d[0] | |||
| fmla v26.2d, v2.2d, v9.2d[0] | |||
| fmla v27.2d, v3.2d, v9.2d[0] | |||
| fmla v28.2d, v0.2d, v9.2d[1] | |||
| fmla v29.2d, v1.2d, v9.2d[1] | |||
| fmla v30.2d, v2.2d, v9.2d[1] | |||
| fmla v31.2d, v3.2d, v9.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| fmla v18.2d, v2.2d, v8.d[0] | |||
| fmla v19.2d, v3.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v21.2d, v1.2d, v8.d[1] | |||
| fmla v22.2d, v2.2d, v8.d[1] | |||
| fmla v23.2d, v3.2d, v8.d[1] | |||
| fmla v24.2d, v0.2d, v9.d[0] | |||
| fmla v25.2d, v1.2d, v9.d[0] | |||
| fmla v26.2d, v2.2d, v9.d[0] | |||
| fmla v27.2d, v3.2d, v9.d[0] | |||
| fmla v28.2d, v0.2d, v9.d[1] | |||
| fmla v29.2d, v1.2d, v9.d[1] | |||
| fmla v30.2d, v2.2d, v9.d[1] | |||
| fmla v31.2d, v3.2d, v9.d[1] | |||
| .endm | |||
| .macro SAVE8x4 | |||
| @@ -351,17 +351,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v29.2d, v1.2d, v9.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v29.2d, v1.2d, v9.d[1] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v25.2d, v1.2d, v9.2d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v25.2d, v1.2d, v9.d[0] | |||
| fmla v24.2d, v0.2d, v9.2d[0] | |||
| fmla v21.2d, v1.2d, v8.2d[1] | |||
| fmla v24.2d, v0.2d, v9.d[0] | |||
| fmla v21.2d, v1.2d, v8.d[1] | |||
| fmla v28.2d, v0.2d, v9.2d[1] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v28.2d, v0.2d, v9.d[1] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| .endm | |||
| .macro SAVE4x4 | |||
| @@ -406,10 +406,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v24.2d, v0.2d, v9.2d[0] | |||
| fmla v28.2d, v0.2d, v9.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v24.2d, v0.2d, v9.d[0] | |||
| fmla v28.2d, v0.2d, v9.d[1] | |||
| .endm | |||
| .macro SAVE2x4 | |||
| @@ -490,15 +490,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v18.2d, v2.2d, v8.2d[0] | |||
| fmla v19.2d, v3.2d, v8.2d[0] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| fmla v18.2d, v2.2d, v8.d[0] | |||
| fmla v19.2d, v3.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v21.2d, v1.2d, v8.2d[1] | |||
| fmla v22.2d, v2.2d, v8.2d[1] | |||
| fmla v23.2d, v3.2d, v8.2d[1] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v21.2d, v1.2d, v8.d[1] | |||
| fmla v22.2d, v2.2d, v8.d[1] | |||
| fmla v23.2d, v3.2d, v8.d[1] | |||
| .endm | |||
| .macro SAVE8x2 | |||
| @@ -534,10 +534,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v21.2d, v1.2d, v8.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| fmla v21.2d, v1.2d, v8.d[1] | |||
| .endm | |||
| .macro SAVE4x2 | |||
| @@ -568,8 +568,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v20.2d, v0.2d, v8.2d[1] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v20.2d, v0.2d, v8.d[1] | |||
| .endm | |||
| .macro SAVE2x2 | |||
| @@ -597,7 +597,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ldr d0 , [pA] | |||
| add pA, pA, #8 | |||
| fmla v16.2d, v8.2d, v0.2d[0] | |||
| fmla v16.2d, v8.2d, v0.d[0] | |||
| .endm | |||
| .macro SAVE1x2 | |||
| @@ -629,10 +629,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v18.2d, v2.2d, v8.2d[0] | |||
| fmla v19.2d, v3.2d, v8.2d[0] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| fmla v18.2d, v2.2d, v8.d[0] | |||
| fmla v19.2d, v3.2d, v8.d[0] | |||
| .endm | |||
| .macro SAVE8x1 | |||
| @@ -660,8 +660,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d, v1.2d}, [pA] | |||
| add pA , pA, #32 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v17.2d, v1.2d, v8.2d[0] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| fmla v17.2d, v1.2d, v8.d[0] | |||
| .endm | |||
| .macro SAVE4x1 | |||
| @@ -686,7 +686,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2d}, [pA] | |||
| add pA , pA, #16 | |||
| fmla v16.2d, v0.2d, v8.2d[0] | |||
| fmla v16.2d, v0.2d, v8.d[0] | |||
| .endm | |||
| .macro SAVE2x1 | |||
| @@ -158,25 +158,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v3.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmul v16.4s, v0.4s, v8.2s[0] | |||
| fmul v17.4s, v1.4s, v8.2s[0] | |||
| fmul v18.4s, v2.4s, v8.2s[0] | |||
| fmul v19.4s, v3.4s, v8.2s[0] | |||
| fmul v20.4s, v0.4s, v8.2s[1] | |||
| fmul v21.4s, v1.4s, v8.2s[1] | |||
| fmul v22.4s, v2.4s, v8.2s[1] | |||
| fmul v23.4s, v3.4s, v8.2s[1] | |||
| fmul v24.4s, v0.4s, v9.2s[0] | |||
| fmul v25.4s, v1.4s, v9.2s[0] | |||
| fmul v26.4s, v2.4s, v9.2s[0] | |||
| fmul v27.4s, v3.4s, v9.2s[0] | |||
| fmul v28.4s, v0.4s, v9.2s[1] | |||
| fmul v29.4s, v1.4s, v9.2s[1] | |||
| fmul v30.4s, v2.4s, v9.2s[1] | |||
| fmul v31.4s, v3.4s, v9.2s[1] | |||
| fmul v16.4s, v0.4s, v8.s[0] | |||
| fmul v17.4s, v1.4s, v8.s[0] | |||
| fmul v18.4s, v2.4s, v8.s[0] | |||
| fmul v19.4s, v3.4s, v8.s[0] | |||
| fmul v20.4s, v0.4s, v8.s[1] | |||
| fmul v21.4s, v1.4s, v8.s[1] | |||
| fmul v22.4s, v2.4s, v8.s[1] | |||
| fmul v23.4s, v3.4s, v8.s[1] | |||
| fmul v24.4s, v0.4s, v9.s[0] | |||
| fmul v25.4s, v1.4s, v9.s[0] | |||
| fmul v26.4s, v2.4s, v9.s[0] | |||
| fmul v27.4s, v3.4s, v9.s[0] | |||
| fmul v28.4s, v0.4s, v9.s[1] | |||
| fmul v29.4s, v1.4s, v9.s[1] | |||
| fmul v30.4s, v2.4s, v9.s[1] | |||
| fmul v31.4s, v3.4s, v9.s[1] | |||
| ld1 {v12.2s, v13.2s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -191,25 +191,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL16x4_M1 | |||
| fmla v16.4s, v0.4s, v8.2s[0] | |||
| fmla v17.4s, v1.4s, v8.2s[0] | |||
| fmla v18.4s, v2.4s, v8.2s[0] | |||
| fmla v19.4s, v3.4s, v8.2s[0] | |||
| fmla v20.4s, v0.4s, v8.2s[1] | |||
| fmla v21.4s, v1.4s, v8.2s[1] | |||
| fmla v22.4s, v2.4s, v8.2s[1] | |||
| fmla v23.4s, v3.4s, v8.2s[1] | |||
| fmla v24.4s, v0.4s, v9.2s[0] | |||
| fmla v25.4s, v1.4s, v9.2s[0] | |||
| fmla v26.4s, v2.4s, v9.2s[0] | |||
| fmla v27.4s, v3.4s, v9.2s[0] | |||
| fmla v28.4s, v0.4s, v9.2s[1] | |||
| fmla v29.4s, v1.4s, v9.2s[1] | |||
| fmla v30.4s, v2.4s, v9.2s[1] | |||
| fmla v31.4s, v3.4s, v9.2s[1] | |||
| fmla v16.4s, v0.4s, v8.s[0] | |||
| fmla v17.4s, v1.4s, v8.s[0] | |||
| fmla v18.4s, v2.4s, v8.s[0] | |||
| fmla v19.4s, v3.4s, v8.s[0] | |||
| fmla v20.4s, v0.4s, v8.s[1] | |||
| fmla v21.4s, v1.4s, v8.s[1] | |||
| fmla v22.4s, v2.4s, v8.s[1] | |||
| fmla v23.4s, v3.4s, v8.s[1] | |||
| fmla v24.4s, v0.4s, v9.s[0] | |||
| fmla v25.4s, v1.4s, v9.s[0] | |||
| fmla v26.4s, v2.4s, v9.s[0] | |||
| fmla v27.4s, v3.4s, v9.s[0] | |||
| fmla v28.4s, v0.4s, v9.s[1] | |||
| fmla v29.4s, v1.4s, v9.s[1] | |||
| fmla v30.4s, v2.4s, v9.s[1] | |||
| fmla v31.4s, v3.4s, v9.s[1] | |||
| ld1 {v12.2s, v13.2s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -224,25 +224,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL16x4_M2 | |||
| fmla v16.4s, v4.4s, v12.2s[0] | |||
| fmla v17.4s, v5.4s, v12.2s[0] | |||
| fmla v18.4s, v6.4s, v12.2s[0] | |||
| fmla v19.4s, v7.4s, v12.2s[0] | |||
| fmla v20.4s, v4.4s, v12.2s[1] | |||
| fmla v21.4s, v5.4s, v12.2s[1] | |||
| fmla v22.4s, v6.4s, v12.2s[1] | |||
| fmla v23.4s, v7.4s, v12.2s[1] | |||
| fmla v24.4s, v4.4s, v13.2s[0] | |||
| fmla v25.4s, v5.4s, v13.2s[0] | |||
| fmla v26.4s, v6.4s, v13.2s[0] | |||
| fmla v27.4s, v7.4s, v13.2s[0] | |||
| fmla v28.4s, v4.4s, v13.2s[1] | |||
| fmla v29.4s, v5.4s, v13.2s[1] | |||
| fmla v30.4s, v6.4s, v13.2s[1] | |||
| fmla v31.4s, v7.4s, v13.2s[1] | |||
| fmla v16.4s, v4.4s, v12.s[0] | |||
| fmla v17.4s, v5.4s, v12.s[0] | |||
| fmla v18.4s, v6.4s, v12.s[0] | |||
| fmla v19.4s, v7.4s, v12.s[0] | |||
| fmla v20.4s, v4.4s, v12.s[1] | |||
| fmla v21.4s, v5.4s, v12.s[1] | |||
| fmla v22.4s, v6.4s, v12.s[1] | |||
| fmla v23.4s, v7.4s, v12.s[1] | |||
| fmla v24.4s, v4.4s, v13.s[0] | |||
| fmla v25.4s, v5.4s, v13.s[0] | |||
| fmla v26.4s, v6.4s, v13.s[0] | |||
| fmla v27.4s, v7.4s, v13.s[0] | |||
| fmla v28.4s, v4.4s, v13.s[1] | |||
| fmla v29.4s, v5.4s, v13.s[1] | |||
| fmla v30.4s, v6.4s, v13.s[1] | |||
| fmla v31.4s, v7.4s, v13.s[1] | |||
| ld1 {v8.2s, v9.2s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -257,25 +257,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL16x4_E | |||
| fmla v16.4s, v4.4s, v12.2s[0] | |||
| fmla v17.4s, v5.4s, v12.2s[0] | |||
| fmla v18.4s, v6.4s, v12.2s[0] | |||
| fmla v19.4s, v7.4s, v12.2s[0] | |||
| fmla v20.4s, v4.4s, v12.2s[1] | |||
| fmla v21.4s, v5.4s, v12.2s[1] | |||
| fmla v22.4s, v6.4s, v12.2s[1] | |||
| fmla v23.4s, v7.4s, v12.2s[1] | |||
| fmla v24.4s, v4.4s, v13.2s[0] | |||
| fmla v25.4s, v5.4s, v13.2s[0] | |||
| fmla v26.4s, v6.4s, v13.2s[0] | |||
| fmla v27.4s, v7.4s, v13.2s[0] | |||
| fmla v28.4s, v4.4s, v13.2s[1] | |||
| fmla v29.4s, v5.4s, v13.2s[1] | |||
| fmla v30.4s, v6.4s, v13.2s[1] | |||
| fmla v31.4s, v7.4s, v13.2s[1] | |||
| fmla v16.4s, v4.4s, v12.s[0] | |||
| fmla v17.4s, v5.4s, v12.s[0] | |||
| fmla v18.4s, v6.4s, v12.s[0] | |||
| fmla v19.4s, v7.4s, v12.s[0] | |||
| fmla v20.4s, v4.4s, v12.s[1] | |||
| fmla v21.4s, v5.4s, v12.s[1] | |||
| fmla v22.4s, v6.4s, v12.s[1] | |||
| fmla v23.4s, v7.4s, v12.s[1] | |||
| fmla v24.4s, v4.4s, v13.s[0] | |||
| fmla v25.4s, v5.4s, v13.s[0] | |||
| fmla v26.4s, v6.4s, v13.s[0] | |||
| fmla v27.4s, v7.4s, v13.s[0] | |||
| fmla v28.4s, v4.4s, v13.s[1] | |||
| fmla v29.4s, v5.4s, v13.s[1] | |||
| fmla v30.4s, v6.4s, v13.s[1] | |||
| fmla v31.4s, v7.4s, v13.s[1] | |||
| .endm | |||
| .macro KERNEL16x4_SUB | |||
| @@ -290,25 +290,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v3.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.4s, v0.4s, v8.2s[0] | |||
| fmla v17.4s, v1.4s, v8.2s[0] | |||
| fmla v18.4s, v2.4s, v8.2s[0] | |||
| fmla v19.4s, v3.4s, v8.2s[0] | |||
| fmla v20.4s, v0.4s, v8.2s[1] | |||
| fmla v21.4s, v1.4s, v8.2s[1] | |||
| fmla v22.4s, v2.4s, v8.2s[1] | |||
| fmla v23.4s, v3.4s, v8.2s[1] | |||
| fmla v24.4s, v0.4s, v9.2s[0] | |||
| fmla v25.4s, v1.4s, v9.2s[0] | |||
| fmla v26.4s, v2.4s, v9.2s[0] | |||
| fmla v27.4s, v3.4s, v9.2s[0] | |||
| fmla v28.4s, v0.4s, v9.2s[1] | |||
| fmla v29.4s, v1.4s, v9.2s[1] | |||
| fmla v30.4s, v2.4s, v9.2s[1] | |||
| fmla v31.4s, v3.4s, v9.2s[1] | |||
| fmla v16.4s, v0.4s, v8.s[0] | |||
| fmla v17.4s, v1.4s, v8.s[0] | |||
| fmla v18.4s, v2.4s, v8.s[0] | |||
| fmla v19.4s, v3.4s, v8.s[0] | |||
| fmla v20.4s, v0.4s, v8.s[1] | |||
| fmla v21.4s, v1.4s, v8.s[1] | |||
| fmla v22.4s, v2.4s, v8.s[1] | |||
| fmla v23.4s, v3.4s, v8.s[1] | |||
| fmla v24.4s, v0.4s, v9.s[0] | |||
| fmla v25.4s, v1.4s, v9.s[0] | |||
| fmla v26.4s, v2.4s, v9.s[0] | |||
| fmla v27.4s, v3.4s, v9.s[0] | |||
| fmla v28.4s, v0.4s, v9.s[1] | |||
| fmla v29.4s, v1.4s, v9.s[1] | |||
| fmla v30.4s, v2.4s, v9.s[1] | |||
| fmla v31.4s, v3.4s, v9.s[1] | |||
| .endm | |||
| .macro SAVE16x4 | |||
| @@ -370,14 +370,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v1.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmul v16.4s, v0.4s, v8.2s[0] | |||
| fmul v17.4s, v1.4s, v8.2s[0] | |||
| fmul v20.4s, v0.4s, v8.2s[1] | |||
| fmul v21.4s, v1.4s, v8.2s[1] | |||
| fmul v24.4s, v0.4s, v9.2s[0] | |||
| fmul v25.4s, v1.4s, v9.2s[0] | |||
| fmul v28.4s, v0.4s, v9.2s[1] | |||
| fmul v29.4s, v1.4s, v9.2s[1] | |||
| fmul v16.4s, v0.4s, v8.s[0] | |||
| fmul v17.4s, v1.4s, v8.s[0] | |||
| fmul v20.4s, v0.4s, v8.s[1] | |||
| fmul v21.4s, v1.4s, v8.s[1] | |||
| fmul v24.4s, v0.4s, v9.s[0] | |||
| fmul v25.4s, v1.4s, v9.s[0] | |||
| fmul v28.4s, v0.4s, v9.s[1] | |||
| fmul v29.4s, v1.4s, v9.s[1] | |||
| ld1 {v12.2s, v13.2s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -388,14 +388,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x4_M1 | |||
| fmla v16.4s, v0.4s, v8.2s[0] | |||
| fmla v17.4s, v1.4s, v8.2s[0] | |||
| fmla v20.4s, v0.4s, v8.2s[1] | |||
| fmla v21.4s, v1.4s, v8.2s[1] | |||
| fmla v24.4s, v0.4s, v9.2s[0] | |||
| fmla v25.4s, v1.4s, v9.2s[0] | |||
| fmla v28.4s, v0.4s, v9.2s[1] | |||
| fmla v29.4s, v1.4s, v9.2s[1] | |||
| fmla v16.4s, v0.4s, v8.s[0] | |||
| fmla v17.4s, v1.4s, v8.s[0] | |||
| fmla v20.4s, v0.4s, v8.s[1] | |||
| fmla v21.4s, v1.4s, v8.s[1] | |||
| fmla v24.4s, v0.4s, v9.s[0] | |||
| fmla v25.4s, v1.4s, v9.s[0] | |||
| fmla v28.4s, v0.4s, v9.s[1] | |||
| fmla v29.4s, v1.4s, v9.s[1] | |||
| ld1 {v12.2s, v13.2s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -406,14 +406,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x4_M2 | |||
| fmla v16.4s, v4.4s, v12.2s[0] | |||
| fmla v17.4s, v5.4s, v12.2s[0] | |||
| fmla v20.4s, v4.4s, v12.2s[1] | |||
| fmla v21.4s, v5.4s, v12.2s[1] | |||
| fmla v24.4s, v4.4s, v13.2s[0] | |||
| fmla v25.4s, v5.4s, v13.2s[0] | |||
| fmla v28.4s, v4.4s, v13.2s[1] | |||
| fmla v29.4s, v5.4s, v13.2s[1] | |||
| fmla v16.4s, v4.4s, v12.s[0] | |||
| fmla v17.4s, v5.4s, v12.s[0] | |||
| fmla v20.4s, v4.4s, v12.s[1] | |||
| fmla v21.4s, v5.4s, v12.s[1] | |||
| fmla v24.4s, v4.4s, v13.s[0] | |||
| fmla v25.4s, v5.4s, v13.s[0] | |||
| fmla v28.4s, v4.4s, v13.s[1] | |||
| fmla v29.4s, v5.4s, v13.s[1] | |||
| ld1 {v8.2s, v9.2s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -424,14 +424,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x4_E | |||
| fmla v16.4s, v4.4s, v12.2s[0] | |||
| fmla v17.4s, v5.4s, v12.2s[0] | |||
| fmla v20.4s, v4.4s, v12.2s[1] | |||
| fmla v21.4s, v5.4s, v12.2s[1] | |||
| fmla v24.4s, v4.4s, v13.2s[0] | |||
| fmla v25.4s, v5.4s, v13.2s[0] | |||
| fmla v28.4s, v4.4s, v13.2s[1] | |||
| fmla v29.4s, v5.4s, v13.2s[1] | |||
| fmla v16.4s, v4.4s, v12.s[0] | |||
| fmla v17.4s, v5.4s, v12.s[0] | |||
| fmla v20.4s, v4.4s, v12.s[1] | |||
| fmla v21.4s, v5.4s, v12.s[1] | |||
| fmla v24.4s, v4.4s, v13.s[0] | |||
| fmla v25.4s, v5.4s, v13.s[0] | |||
| fmla v28.4s, v4.4s, v13.s[1] | |||
| fmla v29.4s, v5.4s, v13.s[1] | |||
| .endm | |||
| .macro KERNEL8x4_SUB | |||
| @@ -442,14 +442,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v1.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.4s, v0.4s, v8.2s[0] | |||
| fmla v17.4s, v1.4s, v8.2s[0] | |||
| fmla v20.4s, v0.4s, v8.2s[1] | |||
| fmla v21.4s, v1.4s, v8.2s[1] | |||
| fmla v24.4s, v0.4s, v9.2s[0] | |||
| fmla v25.4s, v1.4s, v9.2s[0] | |||
| fmla v28.4s, v0.4s, v9.2s[1] | |||
| fmla v29.4s, v1.4s, v9.2s[1] | |||
| fmla v16.4s, v0.4s, v8.s[0] | |||
| fmla v17.4s, v1.4s, v8.s[0] | |||
| fmla v20.4s, v0.4s, v8.s[1] | |||
| fmla v21.4s, v1.4s, v8.s[1] | |||
| fmla v24.4s, v0.4s, v9.s[0] | |||
| fmla v25.4s, v1.4s, v9.s[0] | |||
| fmla v28.4s, v0.4s, v9.s[1] | |||
| fmla v29.4s, v1.4s, v9.s[1] | |||
| .endm | |||
| .macro SAVE8x4 | |||
| @@ -501,17 +501,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s, v1.2s}, [pA] | |||
| add pA, pA, #16 | |||
| fmul v16.2s, v0.2s, v8.2s[0] | |||
| fmul v29.2s, v1.2s, v9.2s[1] | |||
| fmul v16.2s, v0.2s, v8.s[0] | |||
| fmul v29.2s, v1.2s, v9.s[1] | |||
| fmul v20.2s, v0.2s, v8.2s[1] | |||
| fmul v25.2s, v1.2s, v9.2s[0] | |||
| fmul v20.2s, v0.2s, v8.s[1] | |||
| fmul v25.2s, v1.2s, v9.s[0] | |||
| fmul v24.2s, v0.2s, v9.2s[0] | |||
| fmul v21.2s, v1.2s, v8.2s[1] | |||
| fmul v24.2s, v0.2s, v9.s[0] | |||
| fmul v21.2s, v1.2s, v8.s[1] | |||
| fmul v28.2s, v0.2s, v9.2s[1] | |||
| fmul v17.2s, v1.2s, v8.2s[0] | |||
| fmul v28.2s, v0.2s, v9.s[1] | |||
| fmul v17.2s, v1.2s, v8.s[0] | |||
| ld1 {v12.2s, v13.2s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -520,61 +520,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x4_M1 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v29.2s, v1.2s, v9.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v29.2s, v1.2s, v9.s[1] | |||
| ld1 {v12.2s, v13.2s}, [pB] // For next round | |||
| add pB, pB, #16 | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v25.2s, v1.2s, v9.2s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| fmla v25.2s, v1.2s, v9.s[0] | |||
| ld1 {v4.2s, v5.2s}, [pA] // For next round | |||
| add pA, pA, #16 | |||
| fmla v24.2s, v0.2s, v9.2s[0] | |||
| fmla v21.2s, v1.2s, v8.2s[1] | |||
| fmla v24.2s, v0.2s, v9.s[0] | |||
| fmla v21.2s, v1.2s, v8.s[1] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| fmla v28.2s, v0.2s, v9.2s[1] | |||
| fmla v17.2s, v1.2s, v8.2s[0] | |||
| fmla v28.2s, v0.2s, v9.s[1] | |||
| fmla v17.2s, v1.2s, v8.s[0] | |||
| .endm | |||
| .macro KERNEL4x4_M2 | |||
| fmla v16.2s, v4.2s, v12.2s[0] | |||
| fmla v29.2s, v5.2s, v13.2s[1] | |||
| fmla v16.2s, v4.2s, v12.s[0] | |||
| fmla v29.2s, v5.2s, v13.s[1] | |||
| ld1 {v8.2s, v9.2s}, [pB] // For next round | |||
| add pB, pB, #16 | |||
| fmla v20.2s, v4.2s, v12.2s[1] | |||
| fmla v25.2s, v5.2s, v13.2s[0] | |||
| fmla v20.2s, v4.2s, v12.s[1] | |||
| fmla v25.2s, v5.2s, v13.s[0] | |||
| ld1 {v0.2s, v1.2s}, [pA] // For next round | |||
| add pA, pA, #16 | |||
| fmla v24.2s, v4.2s, v13.2s[0] | |||
| fmla v21.2s, v5.2s, v12.2s[1] | |||
| fmla v24.2s, v4.2s, v13.s[0] | |||
| fmla v21.2s, v5.2s, v12.s[1] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| fmla v28.2s, v4.2s, v13.2s[1] | |||
| fmla v17.2s, v5.2s, v12.2s[0] | |||
| fmla v28.2s, v4.2s, v13.s[1] | |||
| fmla v17.2s, v5.2s, v12.s[0] | |||
| .endm | |||
| .macro KERNEL4x4_E | |||
| fmla v16.2s, v4.2s, v12.2s[0] | |||
| fmla v29.2s, v5.2s, v13.2s[1] | |||
| fmla v16.2s, v4.2s, v12.s[0] | |||
| fmla v29.2s, v5.2s, v13.s[1] | |||
| fmla v20.2s, v4.2s, v12.2s[1] | |||
| fmla v25.2s, v5.2s, v13.2s[0] | |||
| fmla v20.2s, v4.2s, v12.s[1] | |||
| fmla v25.2s, v5.2s, v13.s[0] | |||
| fmla v24.2s, v4.2s, v13.2s[0] | |||
| fmla v21.2s, v5.2s, v12.2s[1] | |||
| fmla v24.2s, v4.2s, v13.s[0] | |||
| fmla v21.2s, v5.2s, v12.s[1] | |||
| fmla v28.2s, v4.2s, v13.2s[1] | |||
| fmla v17.2s, v5.2s, v12.2s[0] | |||
| fmla v28.2s, v4.2s, v13.s[1] | |||
| fmla v17.2s, v5.2s, v12.s[0] | |||
| .endm | |||
| .macro KERNEL4x4_SUB | |||
| @@ -583,17 +583,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s, v1.2s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v29.2s, v1.2s, v9.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v29.2s, v1.2s, v9.s[1] | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v25.2s, v1.2s, v9.2s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| fmla v25.2s, v1.2s, v9.s[0] | |||
| fmla v24.2s, v0.2s, v9.2s[0] | |||
| fmla v21.2s, v1.2s, v8.2s[1] | |||
| fmla v24.2s, v0.2s, v9.s[0] | |||
| fmla v21.2s, v1.2s, v8.s[1] | |||
| fmla v28.2s, v0.2s, v9.2s[1] | |||
| fmla v17.2s, v1.2s, v8.2s[0] | |||
| fmla v28.2s, v0.2s, v9.s[1] | |||
| fmla v17.2s, v1.2s, v8.s[0] | |||
| .endm | |||
| .macro SAVE4x4 | |||
| @@ -638,10 +638,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s}, [pA] | |||
| add pA, pA, #8 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v24.2s, v0.2s, v9.2s[0] | |||
| fmla v28.2s, v0.2s, v9.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| fmla v24.2s, v0.2s, v9.s[0] | |||
| fmla v28.2s, v0.2s, v9.s[1] | |||
| .endm | |||
| .macro SAVE2x4 | |||
| @@ -729,15 +729,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v3.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.4s, v0.4s, v8.2s[0] | |||
| fmla v17.4s, v1.4s, v8.2s[0] | |||
| fmla v18.4s, v2.4s, v8.2s[0] | |||
| fmla v19.4s, v3.4s, v8.2s[0] | |||
| fmla v16.4s, v0.4s, v8.s[0] | |||
| fmla v17.4s, v1.4s, v8.s[0] | |||
| fmla v18.4s, v2.4s, v8.s[0] | |||
| fmla v19.4s, v3.4s, v8.s[0] | |||
| fmla v20.4s, v0.4s, v8.2s[1] | |||
| fmla v21.4s, v1.4s, v8.2s[1] | |||
| fmla v22.4s, v2.4s, v8.2s[1] | |||
| fmla v23.4s, v3.4s, v8.2s[1] | |||
| fmla v20.4s, v0.4s, v8.s[1] | |||
| fmla v21.4s, v1.4s, v8.s[1] | |||
| fmla v22.4s, v2.4s, v8.s[1] | |||
| fmla v23.4s, v3.4s, v8.s[1] | |||
| .endm | |||
| .macro SAVE16x2 | |||
| @@ -777,11 +777,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v1.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.4s, v0.4s, v8.2s[0] | |||
| fmla v17.4s, v1.4s, v8.2s[0] | |||
| fmla v16.4s, v0.4s, v8.s[0] | |||
| fmla v17.4s, v1.4s, v8.s[0] | |||
| fmla v20.4s, v0.4s, v8.2s[1] | |||
| fmla v21.4s, v1.4s, v8.2s[1] | |||
| fmla v20.4s, v0.4s, v8.s[1] | |||
| fmla v21.4s, v1.4s, v8.s[1] | |||
| .endm | |||
| .macro SAVE8x2 | |||
| @@ -817,10 +817,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s, v1.2s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v17.2s, v1.2s, v8.2s[0] | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v21.2s, v1.2s, v8.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v17.2s, v1.2s, v8.s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| fmla v21.2s, v1.2s, v8.s[1] | |||
| .endm | |||
| .macro SAVE4x2 | |||
| @@ -852,8 +852,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s}, [pA] | |||
| add pA, pA, #8 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| .endm | |||
| .macro SAVE2x2 | |||
| @@ -882,7 +882,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ldr s0 , [pA] | |||
| add pA, pA, #4 | |||
| fmla v16.2s, v8.2s, v0.2s[0] | |||
| fmla v16.2s, v8.2s, v0.s[0] | |||
| .endm | |||
| .macro SAVE1x2 | |||
| @@ -918,10 +918,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v3.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.4s, v0.4s, v8.2s[0] | |||
| fmla v17.4s, v1.4s, v8.2s[0] | |||
| fmla v18.4s, v2.4s, v8.2s[0] | |||
| fmla v19.4s, v3.4s, v8.2s[0] | |||
| fmla v16.4s, v0.4s, v8.s[0] | |||
| fmla v17.4s, v1.4s, v8.s[0] | |||
| fmla v18.4s, v2.4s, v8.s[0] | |||
| fmla v19.4s, v3.4s, v8.s[0] | |||
| .endm | |||
| .macro SAVE16x1 | |||
| @@ -951,8 +951,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v1.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.4s, v0.4s, v8.2s[0] | |||
| fmla v17.4s, v1.4s, v8.2s[0] | |||
| fmla v16.4s, v0.4s, v8.s[0] | |||
| fmla v17.4s, v1.4s, v8.s[0] | |||
| .endm | |||
| .macro SAVE8x1 | |||
| @@ -978,8 +978,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s, v1.2s}, [pA] | |||
| add pA , pA, #16 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v17.2s, v1.2s, v8.2s[0] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v17.2s, v1.2s, v8.s[0] | |||
| .endm | |||
| .macro SAVE4x1 | |||
| @@ -1004,7 +1004,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s}, [pA] | |||
| add pA , pA, #8 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| .endm | |||
| .macro SAVE2x1 | |||
| @@ -192,164 +192,164 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.4s}, [pA_0] | |||
| add pA_0, pA_0, #16 | |||
| fmul v16.4s, v0.4s, v8.4s[0] | |||
| fmul v20.4s, v0.4s, v8.4s[1] | |||
| fmul v16.4s, v0.4s, v8.s[0] | |||
| fmul v20.4s, v0.4s, v8.s[1] | |||
| ld1 {v2.4s}, [pA_1] | |||
| add pA_1, pA_1, #16 | |||
| fmul v24.4s, v0.4s, v8.4s[2] | |||
| fmul v28.4s, v0.4s, v8.4s[3] | |||
| fmul v24.4s, v0.4s, v8.s[2] | |||
| fmul v28.4s, v0.4s, v8.s[3] | |||
| ld1 {v4.4s}, [pA_2] | |||
| add pA_2, pA_2, #16 | |||
| fmul v17.4s, v2.4s, v8.4s[0] | |||
| fmul v21.4s, v2.4s, v8.4s[1] | |||
| fmul v17.4s, v2.4s, v8.s[0] | |||
| fmul v21.4s, v2.4s, v8.s[1] | |||
| ld1 {v6.4s}, [pA_3] | |||
| add pA_3, pA_3, #16 | |||
| fmul v25.4s, v2.4s, v8.4s[2] | |||
| fmul v29.4s, v2.4s, v8.4s[3] | |||
| fmul v25.4s, v2.4s, v8.s[2] | |||
| fmul v29.4s, v2.4s, v8.s[3] | |||
| ld1 {v12.4s}, [pB] // for next round | |||
| add pB, pB, #16 | |||
| fmul v18.4s, v4.4s, v8.4s[0] | |||
| fmul v19.4s, v6.4s, v8.4s[0] | |||
| fmul v18.4s, v4.4s, v8.s[0] | |||
| fmul v19.4s, v6.4s, v8.s[0] | |||
| ld1 {v1.4s}, [pA_0] // for next round | |||
| add pA_0, pA_0, #16 | |||
| fmul v22.4s, v4.4s, v8.4s[1] | |||
| fmul v23.4s, v6.4s, v8.4s[1] | |||
| fmul v22.4s, v4.4s, v8.s[1] | |||
| fmul v23.4s, v6.4s, v8.s[1] | |||
| ld1 {v3.4s}, [pA_1] // for next round | |||
| add pA_1, pA_1, #16 | |||
| fmul v26.4s, v4.4s, v8.4s[2] | |||
| fmul v27.4s, v6.4s, v8.4s[2] | |||
| fmul v26.4s, v4.4s, v8.s[2] | |||
| fmul v27.4s, v6.4s, v8.s[2] | |||
| ld1 {v5.4s}, [pA_2] // for next round | |||
| add pA_2, pA_2, #16 | |||
| fmul v30.4s, v4.4s, v8.4s[3] | |||
| fmul v31.4s, v6.4s, v8.4s[3] | |||
| fmul v30.4s, v4.4s, v8.s[3] | |||
| fmul v31.4s, v6.4s, v8.s[3] | |||
| ld1 {v7.4s}, [pA_3] // for next round | |||
| add pA_3, pA_3, #16 | |||
| .endm | |||
| .macro KERNEL16x4_M2 | |||
| fmla v16.4s, v1.4s, v12.4s[0] | |||
| fmla v17.4s, v3.4s, v12.4s[0] | |||
| fmla v16.4s, v1.4s, v12.s[0] | |||
| fmla v17.4s, v3.4s, v12.s[0] | |||
| ld1 {v8.4s}, [pB] // for next round | |||
| add pB, pB, #16 | |||
| fmla v18.4s, v5.4s, v12.4s[0] | |||
| fmla v19.4s, v7.4s, v12.4s[0] | |||
| fmla v18.4s, v5.4s, v12.s[0] | |||
| fmla v19.4s, v7.4s, v12.s[0] | |||
| ld1 {v0.4s}, [pA_0] // for next round | |||
| add pA_0, pA_0, #16 | |||
| fmla v20.4s, v1.4s, v12.4s[1] | |||
| fmla v21.4s, v3.4s, v12.4s[1] | |||
| fmla v20.4s, v1.4s, v12.s[1] | |||
| fmla v21.4s, v3.4s, v12.s[1] | |||
| ld1 {v2.4s}, [pA_1] // for next round | |||
| add pA_1, pA_1, #16 | |||
| fmla v22.4s, v5.4s, v12.4s[1] | |||
| fmla v23.4s, v7.4s, v12.4s[1] | |||
| fmla v22.4s, v5.4s, v12.s[1] | |||
| fmla v23.4s, v7.4s, v12.s[1] | |||
| ld1 {v4.4s}, [pA_2] // for next round | |||
| add pA_2, pA_2, #16 | |||
| fmla v24.4s, v1.4s, v12.4s[2] | |||
| fmla v25.4s, v3.4s, v12.4s[2] | |||
| fmla v24.4s, v1.4s, v12.s[2] | |||
| fmla v25.4s, v3.4s, v12.s[2] | |||
| ld1 {v6.4s}, [pA_3] // for next round | |||
| add pA_3, pA_3, #16 | |||
| fmla v26.4s, v5.4s, v12.4s[2] | |||
| fmla v27.4s, v7.4s, v12.4s[2] | |||
| fmla v26.4s, v5.4s, v12.s[2] | |||
| fmla v27.4s, v7.4s, v12.s[2] | |||
| prfm PLDL1KEEP, [pA_2, #512] | |||
| fmla v28.4s, v1.4s, v12.4s[3] | |||
| fmla v29.4s, v3.4s, v12.4s[3] | |||
| fmla v28.4s, v1.4s, v12.s[3] | |||
| fmla v29.4s, v3.4s, v12.s[3] | |||
| prfm PLDL1KEEP, [pA_3, #512] | |||
| fmla v30.4s, v5.4s, v12.4s[3] | |||
| fmla v31.4s, v7.4s, v12.4s[3] | |||
| fmla v30.4s, v5.4s, v12.s[3] | |||
| fmla v31.4s, v7.4s, v12.s[3] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| .endm | |||
| .macro KERNEL16x4_M1 | |||
| fmla v16.4s, v0.4s, v8.4s[0] | |||
| fmla v17.4s, v2.4s, v8.4s[0] | |||
| fmla v16.4s, v0.4s, v8.s[0] | |||
| fmla v17.4s, v2.4s, v8.s[0] | |||
| ld1 {v12.4s}, [pB] // for next round | |||
| add pB, pB, #16 | |||
| fmla v18.4s, v4.4s, v8.4s[0] | |||
| fmla v19.4s, v6.4s, v8.4s[0] | |||
| fmla v18.4s, v4.4s, v8.s[0] | |||
| fmla v19.4s, v6.4s, v8.s[0] | |||
| ld1 {v1.4s}, [pA_0] // for next round | |||
| add pA_0, pA_0, #16 | |||
| fmla v20.4s, v0.4s, v8.4s[1] | |||
| fmla v21.4s, v2.4s, v8.4s[1] | |||
| fmla v20.4s, v0.4s, v8.s[1] | |||
| fmla v21.4s, v2.4s, v8.s[1] | |||
| ld1 {v3.4s}, [pA_1] // for next round | |||
| add pA_1, pA_1, #16 | |||
| fmla v22.4s, v4.4s, v8.4s[1] | |||
| fmla v23.4s, v6.4s, v8.4s[1] | |||
| fmla v22.4s, v4.4s, v8.s[1] | |||
| fmla v23.4s, v6.4s, v8.s[1] | |||
| ld1 {v5.4s}, [pA_2] // for next round | |||
| add pA_2, pA_2, #16 | |||
| fmla v24.4s, v0.4s, v8.4s[2] | |||
| fmla v25.4s, v2.4s, v8.4s[2] | |||
| fmla v24.4s, v0.4s, v8.s[2] | |||
| fmla v25.4s, v2.4s, v8.s[2] | |||
| ld1 {v7.4s}, [pA_3] // for next round | |||
| add pA_3, pA_3, #16 | |||
| fmla v26.4s, v4.4s, v8.4s[2] | |||
| fmla v27.4s, v6.4s, v8.4s[2] | |||
| fmla v26.4s, v4.4s, v8.s[2] | |||
| fmla v27.4s, v6.4s, v8.s[2] | |||
| prfm PLDL1KEEP, [pA_0, #512] | |||
| fmla v28.4s, v0.4s, v8.4s[3] | |||
| fmla v29.4s, v2.4s, v8.4s[3] | |||
| fmla v28.4s, v0.4s, v8.s[3] | |||
| fmla v29.4s, v2.4s, v8.s[3] | |||
| prfm PLDL1KEEP, [pA_1, #512] | |||
| fmla v30.4s, v4.4s, v8.4s[3] | |||
| fmla v31.4s, v6.4s, v8.4s[3] | |||
| fmla v30.4s, v4.4s, v8.s[3] | |||
| fmla v31.4s, v6.4s, v8.s[3] | |||
| .endm | |||
| .macro KERNEL16x4_E | |||
| fmla v16.4s, v1.4s, v12.4s[0] | |||
| fmla v17.4s, v3.4s, v12.4s[0] | |||
| fmla v18.4s, v5.4s, v12.4s[0] | |||
| fmla v19.4s, v7.4s, v12.4s[0] | |||
| fmla v20.4s, v1.4s, v12.4s[1] | |||
| fmla v21.4s, v3.4s, v12.4s[1] | |||
| fmla v22.4s, v5.4s, v12.4s[1] | |||
| fmla v23.4s, v7.4s, v12.4s[1] | |||
| fmla v24.4s, v1.4s, v12.4s[2] | |||
| fmla v25.4s, v3.4s, v12.4s[2] | |||
| fmla v26.4s, v5.4s, v12.4s[2] | |||
| fmla v27.4s, v7.4s, v12.4s[2] | |||
| fmla v28.4s, v1.4s, v12.4s[3] | |||
| fmla v29.4s, v3.4s, v12.4s[3] | |||
| fmla v30.4s, v5.4s, v12.4s[3] | |||
| fmla v31.4s, v7.4s, v12.4s[3] | |||
| fmla v16.4s, v1.4s, v12.s[0] | |||
| fmla v17.4s, v3.4s, v12.s[0] | |||
| fmla v18.4s, v5.4s, v12.s[0] | |||
| fmla v19.4s, v7.4s, v12.s[0] | |||
| fmla v20.4s, v1.4s, v12.s[1] | |||
| fmla v21.4s, v3.4s, v12.s[1] | |||
| fmla v22.4s, v5.4s, v12.s[1] | |||
| fmla v23.4s, v7.4s, v12.s[1] | |||
| fmla v24.4s, v1.4s, v12.s[2] | |||
| fmla v25.4s, v3.4s, v12.s[2] | |||
| fmla v26.4s, v5.4s, v12.s[2] | |||
| fmla v27.4s, v7.4s, v12.s[2] | |||
| fmla v28.4s, v1.4s, v12.s[3] | |||
| fmla v29.4s, v3.4s, v12.s[3] | |||
| fmla v30.4s, v5.4s, v12.s[3] | |||
| fmla v31.4s, v7.4s, v12.s[3] | |||
| .endm | |||
| .macro KERNEL16x4_SUB | |||
| @@ -359,34 +359,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.4s}, [pA_0] | |||
| add pA_0, pA_0, #16 | |||
| fmla v16.4s, v0.4s, v8.4s[0] | |||
| fmla v20.4s, v0.4s, v8.4s[1] | |||
| fmla v24.4s, v0.4s, v8.4s[2] | |||
| fmla v28.4s, v0.4s, v8.4s[3] | |||
| fmla v16.4s, v0.4s, v8.s[0] | |||
| fmla v20.4s, v0.4s, v8.s[1] | |||
| fmla v24.4s, v0.4s, v8.s[2] | |||
| fmla v28.4s, v0.4s, v8.s[3] | |||
| ld1 {v2.4s}, [pA_1] | |||
| add pA_1, pA_1, #16 | |||
| fmla v17.4s, v2.4s, v8.4s[0] | |||
| fmla v21.4s, v2.4s, v8.4s[1] | |||
| fmla v25.4s, v2.4s, v8.4s[2] | |||
| fmla v29.4s, v2.4s, v8.4s[3] | |||
| fmla v17.4s, v2.4s, v8.s[0] | |||
| fmla v21.4s, v2.4s, v8.s[1] | |||
| fmla v25.4s, v2.4s, v8.s[2] | |||
| fmla v29.4s, v2.4s, v8.s[3] | |||
| ld1 {v4.4s}, [pA_2] | |||
| add pA_2, pA_2, #16 | |||
| fmla v18.4s, v4.4s, v8.4s[0] | |||
| fmla v22.4s, v4.4s, v8.4s[1] | |||
| fmla v26.4s, v4.4s, v8.4s[2] | |||
| fmla v30.4s, v4.4s, v8.4s[3] | |||
| fmla v18.4s, v4.4s, v8.s[0] | |||
| fmla v22.4s, v4.4s, v8.s[1] | |||
| fmla v26.4s, v4.4s, v8.s[2] | |||
| fmla v30.4s, v4.4s, v8.s[3] | |||
| ld1 {v6.4s}, [pA_3] | |||
| add pA_3, pA_3, #16 | |||
| fmla v19.4s, v6.4s, v8.4s[0] | |||
| fmla v23.4s, v6.4s, v8.4s[1] | |||
| fmla v27.4s, v6.4s, v8.4s[2] | |||
| fmla v31.4s, v6.4s, v8.4s[3] | |||
| fmla v19.4s, v6.4s, v8.s[0] | |||
| fmla v23.4s, v6.4s, v8.s[1] | |||
| fmla v27.4s, v6.4s, v8.s[2] | |||
| fmla v31.4s, v6.4s, v8.s[3] | |||
| .endm | |||
| .macro SAVE16x4 | |||
| @@ -456,28 +456,28 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s, v1.2s}, [pA_0] | |||
| add pA_0, pA_0, #16 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v29.2s, v1.2s, v9.2s[1] | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v25.2s, v1.2s, v9.2s[0] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v29.2s, v1.2s, v9.s[1] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| fmla v25.2s, v1.2s, v9.s[0] | |||
| ld1 {v2.2s, v3.2s}, [pA_1] | |||
| add pA_1, pA_1, #16 | |||
| fmla v24.2s, v0.2s, v9.2s[0] | |||
| fmla v21.2s, v1.2s, v8.2s[1] | |||
| fmla v28.2s, v0.2s, v9.2s[1] | |||
| fmla v17.2s, v1.2s, v8.2s[0] | |||
| fmla v24.2s, v0.2s, v9.s[0] | |||
| fmla v21.2s, v1.2s, v8.s[1] | |||
| fmla v28.2s, v0.2s, v9.s[1] | |||
| fmla v17.2s, v1.2s, v8.s[0] | |||
| fmla v18.2s, v2.2s, v8.2s[0] | |||
| fmla v31.2s, v3.2s, v9.2s[1] | |||
| fmla v22.2s, v2.2s, v8.2s[1] | |||
| fmla v27.2s, v3.2s, v9.2s[0] | |||
| fmla v18.2s, v2.2s, v8.s[0] | |||
| fmla v31.2s, v3.2s, v9.s[1] | |||
| fmla v22.2s, v2.2s, v8.s[1] | |||
| fmla v27.2s, v3.2s, v9.s[0] | |||
| fmla v26.2s, v2.2s, v9.2s[0] | |||
| fmla v23.2s, v3.2s, v8.2s[1] | |||
| fmla v30.2s, v2.2s, v9.2s[1] | |||
| fmla v19.2s, v3.2s, v8.2s[0] | |||
| fmla v26.2s, v2.2s, v9.s[0] | |||
| fmla v23.2s, v3.2s, v8.s[1] | |||
| fmla v30.2s, v2.2s, v9.s[1] | |||
| fmla v19.2s, v3.2s, v8.s[0] | |||
| .endm | |||
| .macro SAVE8x4 | |||
| @@ -556,17 +556,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s, v1.2s}, [pA_0] | |||
| add pA_0, pA_0, #16 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v29.2s, v1.2s, v9.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v29.2s, v1.2s, v9.s[1] | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v25.2s, v1.2s, v9.2s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| fmla v25.2s, v1.2s, v9.s[0] | |||
| fmla v24.2s, v0.2s, v9.2s[0] | |||
| fmla v21.2s, v1.2s, v8.2s[1] | |||
| fmla v24.2s, v0.2s, v9.s[0] | |||
| fmla v21.2s, v1.2s, v8.s[1] | |||
| fmla v28.2s, v0.2s, v9.2s[1] | |||
| fmla v17.2s, v1.2s, v8.2s[0] | |||
| fmla v28.2s, v0.2s, v9.s[1] | |||
| fmla v17.2s, v1.2s, v8.s[0] | |||
| .endm | |||
| .macro SAVE4x4 | |||
| @@ -614,10 +614,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s}, [pA_0] | |||
| add pA_0, pA_0, #8 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v24.2s, v0.2s, v9.2s[0] | |||
| fmla v28.2s, v0.2s, v9.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| fmla v24.2s, v0.2s, v9.s[0] | |||
| fmla v28.2s, v0.2s, v9.s[1] | |||
| .endm | |||
| .macro SAVE2x4 | |||
| @@ -700,10 +700,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s, v1.2s}, [pA_0] | |||
| add pA_0, pA_0, #16 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v17.2s, v1.2s, v8.2s[0] | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v21.2s, v1.2s, v8.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v17.2s, v1.2s, v8.s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| fmla v21.2s, v1.2s, v8.s[1] | |||
| .endm | |||
| .macro SAVE4x2 | |||
| @@ -736,8 +736,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s}, [pA_0] | |||
| add pA_0, pA_0, #8 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| .endm | |||
| .macro SAVE2x2 | |||
| @@ -767,7 +767,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ldr s0 , [pA_0] | |||
| add pA_0, pA_0, #4 | |||
| fmla v16.2s, v8.2s, v0.2s[0] | |||
| fmla v16.2s, v8.2s, v0.s[0] | |||
| .endm | |||
| .macro SAVE1x2 | |||
| @@ -796,8 +796,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s, v1.2s}, [pA_0] | |||
| add pA_0 , pA_0, #16 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v17.2s, v1.2s, v8.2s[0] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v17.2s, v1.2s, v8.s[0] | |||
| .endm | |||
| .macro SAVE4x1 | |||
| @@ -825,7 +825,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s}, [pA_0] | |||
| add pA_0 , pA_0, #8 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| .endm | |||
| .macro SAVE2x1 | |||
| @@ -157,22 +157,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v1.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmul v16.4s, v0.4s, v4.4s[0] | |||
| fmul v17.4s, v1.4s, v4.4s[0] | |||
| fmul v18.4s, v0.4s, v4.4s[1] | |||
| fmul v19.4s, v1.4s, v4.4s[1] | |||
| fmul v20.4s, v0.4s, v4.4s[2] | |||
| fmul v21.4s, v1.4s, v4.4s[2] | |||
| fmul v22.4s, v0.4s, v4.4s[3] | |||
| fmul v23.4s, v1.4s, v4.4s[3] | |||
| fmul v24.4s, v0.4s, v5.4s[0] | |||
| fmul v25.4s, v1.4s, v5.4s[0] | |||
| fmul v26.4s, v0.4s, v5.4s[1] | |||
| fmul v27.4s, v1.4s, v5.4s[1] | |||
| fmul v28.4s, v0.4s, v5.4s[2] | |||
| fmul v29.4s, v1.4s, v5.4s[2] | |||
| fmul v30.4s, v0.4s, v5.4s[3] | |||
| fmul v31.4s, v1.4s, v5.4s[3] | |||
| fmul v16.4s, v0.4s, v4.s[0] | |||
| fmul v17.4s, v1.4s, v4.s[0] | |||
| fmul v18.4s, v0.4s, v4.s[1] | |||
| fmul v19.4s, v1.4s, v4.s[1] | |||
| fmul v20.4s, v0.4s, v4.s[2] | |||
| fmul v21.4s, v1.4s, v4.s[2] | |||
| fmul v22.4s, v0.4s, v4.s[3] | |||
| fmul v23.4s, v1.4s, v4.s[3] | |||
| fmul v24.4s, v0.4s, v5.s[0] | |||
| fmul v25.4s, v1.4s, v5.s[0] | |||
| fmul v26.4s, v0.4s, v5.s[1] | |||
| fmul v27.4s, v1.4s, v5.s[1] | |||
| fmul v28.4s, v0.4s, v5.s[2] | |||
| fmul v29.4s, v1.4s, v5.s[2] | |||
| fmul v30.4s, v0.4s, v5.s[3] | |||
| fmul v31.4s, v1.4s, v5.s[3] | |||
| ld1 {v6.4s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -185,22 +185,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x8_M1 | |||
| fmla v16.4s, v0.4s, v4.4s[0] | |||
| fmla v17.4s, v1.4s, v4.4s[0] | |||
| fmla v18.4s, v0.4s, v4.4s[1] | |||
| fmla v19.4s, v1.4s, v4.4s[1] | |||
| fmla v20.4s, v0.4s, v4.4s[2] | |||
| fmla v21.4s, v1.4s, v4.4s[2] | |||
| fmla v22.4s, v0.4s, v4.4s[3] | |||
| fmla v23.4s, v1.4s, v4.4s[3] | |||
| fmla v24.4s, v0.4s, v5.4s[0] | |||
| fmla v25.4s, v1.4s, v5.4s[0] | |||
| fmla v26.4s, v0.4s, v5.4s[1] | |||
| fmla v27.4s, v1.4s, v5.4s[1] | |||
| fmla v28.4s, v0.4s, v5.4s[2] | |||
| fmla v29.4s, v1.4s, v5.4s[2] | |||
| fmla v30.4s, v0.4s, v5.4s[3] | |||
| fmla v31.4s, v1.4s, v5.4s[3] | |||
| fmla v16.4s, v0.4s, v4.s[0] | |||
| fmla v17.4s, v1.4s, v4.s[0] | |||
| fmla v18.4s, v0.4s, v4.s[1] | |||
| fmla v19.4s, v1.4s, v4.s[1] | |||
| fmla v20.4s, v0.4s, v4.s[2] | |||
| fmla v21.4s, v1.4s, v4.s[2] | |||
| fmla v22.4s, v0.4s, v4.s[3] | |||
| fmla v23.4s, v1.4s, v4.s[3] | |||
| fmla v24.4s, v0.4s, v5.s[0] | |||
| fmla v25.4s, v1.4s, v5.s[0] | |||
| fmla v26.4s, v0.4s, v5.s[1] | |||
| fmla v27.4s, v1.4s, v5.s[1] | |||
| fmla v28.4s, v0.4s, v5.s[2] | |||
| fmla v29.4s, v1.4s, v5.s[2] | |||
| fmla v30.4s, v0.4s, v5.s[3] | |||
| fmla v31.4s, v1.4s, v5.s[3] | |||
| ld1 {v6.4s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -213,22 +213,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x8_M2 | |||
| fmla v16.4s, v2.4s, v6.4s[0] | |||
| fmla v17.4s, v3.4s, v6.4s[0] | |||
| fmla v18.4s, v2.4s, v6.4s[1] | |||
| fmla v19.4s, v3.4s, v6.4s[1] | |||
| fmla v20.4s, v2.4s, v6.4s[2] | |||
| fmla v21.4s, v3.4s, v6.4s[2] | |||
| fmla v22.4s, v2.4s, v6.4s[3] | |||
| fmla v23.4s, v3.4s, v6.4s[3] | |||
| fmla v24.4s, v2.4s, v7.4s[0] | |||
| fmla v25.4s, v3.4s, v7.4s[0] | |||
| fmla v26.4s, v2.4s, v7.4s[1] | |||
| fmla v27.4s, v3.4s, v7.4s[1] | |||
| fmla v28.4s, v2.4s, v7.4s[2] | |||
| fmla v29.4s, v3.4s, v7.4s[2] | |||
| fmla v30.4s, v2.4s, v7.4s[3] | |||
| fmla v31.4s, v3.4s, v7.4s[3] | |||
| fmla v16.4s, v2.4s, v6.s[0] | |||
| fmla v17.4s, v3.4s, v6.s[0] | |||
| fmla v18.4s, v2.4s, v6.s[1] | |||
| fmla v19.4s, v3.4s, v6.s[1] | |||
| fmla v20.4s, v2.4s, v6.s[2] | |||
| fmla v21.4s, v3.4s, v6.s[2] | |||
| fmla v22.4s, v2.4s, v6.s[3] | |||
| fmla v23.4s, v3.4s, v6.s[3] | |||
| fmla v24.4s, v2.4s, v7.s[0] | |||
| fmla v25.4s, v3.4s, v7.s[0] | |||
| fmla v26.4s, v2.4s, v7.s[1] | |||
| fmla v27.4s, v3.4s, v7.s[1] | |||
| fmla v28.4s, v2.4s, v7.s[2] | |||
| fmla v29.4s, v3.4s, v7.s[2] | |||
| fmla v30.4s, v2.4s, v7.s[3] | |||
| fmla v31.4s, v3.4s, v7.s[3] | |||
| ld1 {v4.4s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -241,22 +241,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x8_E | |||
| fmla v16.4s, v2.4s, v6.4s[0] | |||
| fmla v17.4s, v3.4s, v6.4s[0] | |||
| fmla v18.4s, v2.4s, v6.4s[1] | |||
| fmla v19.4s, v3.4s, v6.4s[1] | |||
| fmla v20.4s, v2.4s, v6.4s[2] | |||
| fmla v21.4s, v3.4s, v6.4s[2] | |||
| fmla v22.4s, v2.4s, v6.4s[3] | |||
| fmla v23.4s, v3.4s, v6.4s[3] | |||
| fmla v24.4s, v2.4s, v7.4s[0] | |||
| fmla v25.4s, v3.4s, v7.4s[0] | |||
| fmla v26.4s, v2.4s, v7.4s[1] | |||
| fmla v27.4s, v3.4s, v7.4s[1] | |||
| fmla v28.4s, v2.4s, v7.4s[2] | |||
| fmla v29.4s, v3.4s, v7.4s[2] | |||
| fmla v30.4s, v2.4s, v7.4s[3] | |||
| fmla v31.4s, v3.4s, v7.4s[3] | |||
| fmla v16.4s, v2.4s, v6.s[0] | |||
| fmla v17.4s, v3.4s, v6.s[0] | |||
| fmla v18.4s, v2.4s, v6.s[1] | |||
| fmla v19.4s, v3.4s, v6.s[1] | |||
| fmla v20.4s, v2.4s, v6.s[2] | |||
| fmla v21.4s, v3.4s, v6.s[2] | |||
| fmla v22.4s, v2.4s, v6.s[3] | |||
| fmla v23.4s, v3.4s, v6.s[3] | |||
| fmla v24.4s, v2.4s, v7.s[0] | |||
| fmla v25.4s, v3.4s, v7.s[0] | |||
| fmla v26.4s, v2.4s, v7.s[1] | |||
| fmla v27.4s, v3.4s, v7.s[1] | |||
| fmla v28.4s, v2.4s, v7.s[2] | |||
| fmla v29.4s, v3.4s, v7.s[2] | |||
| fmla v30.4s, v2.4s, v7.s[3] | |||
| fmla v31.4s, v3.4s, v7.s[3] | |||
| .endm | |||
| .macro KERNEL8x8_SUB | |||
| @@ -269,22 +269,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v1.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.4s, v0.4s, v4.4s[0] | |||
| fmla v17.4s, v1.4s, v4.4s[0] | |||
| fmla v18.4s, v0.4s, v4.4s[1] | |||
| fmla v19.4s, v1.4s, v4.4s[1] | |||
| fmla v20.4s, v0.4s, v4.4s[2] | |||
| fmla v21.4s, v1.4s, v4.4s[2] | |||
| fmla v22.4s, v0.4s, v4.4s[3] | |||
| fmla v23.4s, v1.4s, v4.4s[3] | |||
| fmla v24.4s, v0.4s, v5.4s[0] | |||
| fmla v25.4s, v1.4s, v5.4s[0] | |||
| fmla v26.4s, v0.4s, v5.4s[1] | |||
| fmla v27.4s, v1.4s, v5.4s[1] | |||
| fmla v28.4s, v0.4s, v5.4s[2] | |||
| fmla v29.4s, v1.4s, v5.4s[2] | |||
| fmla v30.4s, v0.4s, v5.4s[3] | |||
| fmla v31.4s, v1.4s, v5.4s[3] | |||
| fmla v16.4s, v0.4s, v4.s[0] | |||
| fmla v17.4s, v1.4s, v4.s[0] | |||
| fmla v18.4s, v0.4s, v4.s[1] | |||
| fmla v19.4s, v1.4s, v4.s[1] | |||
| fmla v20.4s, v0.4s, v4.s[2] | |||
| fmla v21.4s, v1.4s, v4.s[2] | |||
| fmla v22.4s, v0.4s, v4.s[3] | |||
| fmla v23.4s, v1.4s, v4.s[3] | |||
| fmla v24.4s, v0.4s, v5.s[0] | |||
| fmla v25.4s, v1.4s, v5.s[0] | |||
| fmla v26.4s, v0.4s, v5.s[1] | |||
| fmla v27.4s, v1.4s, v5.s[1] | |||
| fmla v28.4s, v0.4s, v5.s[2] | |||
| fmla v29.4s, v1.4s, v5.s[2] | |||
| fmla v30.4s, v0.4s, v5.s[3] | |||
| fmla v31.4s, v1.4s, v5.s[3] | |||
| .endm | |||
| .macro SAVE8x8 | |||
| @@ -367,14 +367,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmul v16.4s, v0.4s, v4.4s[0] | |||
| fmul v18.4s, v0.4s, v4.4s[1] | |||
| fmul v20.4s, v0.4s, v4.4s[2] | |||
| fmul v22.4s, v0.4s, v4.4s[3] | |||
| fmul v24.4s, v0.4s, v5.4s[0] | |||
| fmul v26.4s, v0.4s, v5.4s[1] | |||
| fmul v28.4s, v0.4s, v5.4s[2] | |||
| fmul v30.4s, v0.4s, v5.4s[3] | |||
| fmul v16.4s, v0.4s, v4.s[0] | |||
| fmul v18.4s, v0.4s, v4.s[1] | |||
| fmul v20.4s, v0.4s, v4.s[2] | |||
| fmul v22.4s, v0.4s, v4.s[3] | |||
| fmul v24.4s, v0.4s, v5.s[0] | |||
| fmul v26.4s, v0.4s, v5.s[1] | |||
| fmul v28.4s, v0.4s, v5.s[2] | |||
| fmul v30.4s, v0.4s, v5.s[3] | |||
| ld1 {v6.4s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -385,14 +385,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x8_M1 | |||
| fmla v16.4s, v0.4s, v4.4s[0] | |||
| fmla v18.4s, v0.4s, v4.4s[1] | |||
| fmla v20.4s, v0.4s, v4.4s[2] | |||
| fmla v22.4s, v0.4s, v4.4s[3] | |||
| fmla v24.4s, v0.4s, v5.4s[0] | |||
| fmla v26.4s, v0.4s, v5.4s[1] | |||
| fmla v28.4s, v0.4s, v5.4s[2] | |||
| fmla v30.4s, v0.4s, v5.4s[3] | |||
| fmla v16.4s, v0.4s, v4.s[0] | |||
| fmla v18.4s, v0.4s, v4.s[1] | |||
| fmla v20.4s, v0.4s, v4.s[2] | |||
| fmla v22.4s, v0.4s, v4.s[3] | |||
| fmla v24.4s, v0.4s, v5.s[0] | |||
| fmla v26.4s, v0.4s, v5.s[1] | |||
| fmla v28.4s, v0.4s, v5.s[2] | |||
| fmla v30.4s, v0.4s, v5.s[3] | |||
| ld1 {v6.4s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -403,14 +403,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x8_M2 | |||
| fmla v16.4s, v2.4s, v6.4s[0] | |||
| fmla v18.4s, v2.4s, v6.4s[1] | |||
| fmla v20.4s, v2.4s, v6.4s[2] | |||
| fmla v22.4s, v2.4s, v6.4s[3] | |||
| fmla v24.4s, v2.4s, v7.4s[0] | |||
| fmla v26.4s, v2.4s, v7.4s[1] | |||
| fmla v28.4s, v2.4s, v7.4s[2] | |||
| fmla v30.4s, v2.4s, v7.4s[3] | |||
| fmla v16.4s, v2.4s, v6.s[0] | |||
| fmla v18.4s, v2.4s, v6.s[1] | |||
| fmla v20.4s, v2.4s, v6.s[2] | |||
| fmla v22.4s, v2.4s, v6.s[3] | |||
| fmla v24.4s, v2.4s, v7.s[0] | |||
| fmla v26.4s, v2.4s, v7.s[1] | |||
| fmla v28.4s, v2.4s, v7.s[2] | |||
| fmla v30.4s, v2.4s, v7.s[3] | |||
| ld1 {v4.4s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -421,14 +421,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x8_E | |||
| fmla v16.4s, v2.4s, v6.4s[0] | |||
| fmla v18.4s, v2.4s, v6.4s[1] | |||
| fmla v20.4s, v2.4s, v6.4s[2] | |||
| fmla v22.4s, v2.4s, v6.4s[3] | |||
| fmla v24.4s, v2.4s, v7.4s[0] | |||
| fmla v26.4s, v2.4s, v7.4s[1] | |||
| fmla v28.4s, v2.4s, v7.4s[2] | |||
| fmla v30.4s, v2.4s, v7.4s[3] | |||
| fmla v16.4s, v2.4s, v6.s[0] | |||
| fmla v18.4s, v2.4s, v6.s[1] | |||
| fmla v20.4s, v2.4s, v6.s[2] | |||
| fmla v22.4s, v2.4s, v6.s[3] | |||
| fmla v24.4s, v2.4s, v7.s[0] | |||
| fmla v26.4s, v2.4s, v7.s[1] | |||
| fmla v28.4s, v2.4s, v7.s[2] | |||
| fmla v30.4s, v2.4s, v7.s[3] | |||
| .endm | |||
| .macro KERNEL4x8_SUB | |||
| @@ -439,14 +439,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.4s, v0.4s, v4.4s[0] | |||
| fmla v18.4s, v0.4s, v4.4s[1] | |||
| fmla v20.4s, v0.4s, v4.4s[2] | |||
| fmla v22.4s, v0.4s, v4.4s[3] | |||
| fmla v24.4s, v0.4s, v5.4s[0] | |||
| fmla v26.4s, v0.4s, v5.4s[1] | |||
| fmla v28.4s, v0.4s, v5.4s[2] | |||
| fmla v30.4s, v0.4s, v5.4s[3] | |||
| fmla v16.4s, v0.4s, v4.s[0] | |||
| fmla v18.4s, v0.4s, v4.s[1] | |||
| fmla v20.4s, v0.4s, v4.s[2] | |||
| fmla v22.4s, v0.4s, v4.s[3] | |||
| fmla v24.4s, v0.4s, v5.s[0] | |||
| fmla v26.4s, v0.4s, v5.s[1] | |||
| fmla v28.4s, v0.4s, v5.s[2] | |||
| fmla v30.4s, v0.4s, v5.s[3] | |||
| .endm | |||
| .macro SAVE4x8 | |||
| @@ -520,14 +520,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s}, [pA] | |||
| add pA, pA, #8 | |||
| fmla v16.2s, v0.2s, v4.4s[0] | |||
| fmla v18.2s, v0.2s, v4.4s[1] | |||
| fmla v20.2s, v0.2s, v4.4s[2] | |||
| fmla v22.2s, v0.2s, v4.4s[3] | |||
| fmla v24.2s, v0.2s, v5.4s[0] | |||
| fmla v26.2s, v0.2s, v5.4s[1] | |||
| fmla v28.2s, v0.2s, v5.4s[2] | |||
| fmla v30.2s, v0.2s, v5.4s[3] | |||
| fmla v16.2s, v0.2s, v4.s[0] | |||
| fmla v18.2s, v0.2s, v4.s[1] | |||
| fmla v20.2s, v0.2s, v4.s[2] | |||
| fmla v22.2s, v0.2s, v4.s[3] | |||
| fmla v24.2s, v0.2s, v5.s[0] | |||
| fmla v26.2s, v0.2s, v5.s[1] | |||
| fmla v28.2s, v0.2s, v5.s[2] | |||
| fmla v30.2s, v0.2s, v5.s[3] | |||
| .endm | |||
| .macro SAVE2x8 | |||
| @@ -601,14 +601,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ldr s0, [pA] | |||
| add pA, pA, #4 | |||
| fmla s16, s0, v4.4s[0] | |||
| fmla s18, s0, v4.4s[1] | |||
| fmla s20, s0, v4.4s[2] | |||
| fmla s22, s0, v4.4s[3] | |||
| fmla s24, s0, v5.4s[0] | |||
| fmla s26, s0, v5.4s[1] | |||
| fmla s28, s0, v5.4s[2] | |||
| fmla s30, s0, v5.4s[3] | |||
| fmla s16, s0, v4.s[0] | |||
| fmla s18, s0, v4.s[1] | |||
| fmla s20, s0, v4.s[2] | |||
| fmla s22, s0, v4.s[3] | |||
| fmla s24, s0, v5.s[0] | |||
| fmla s26, s0, v5.s[1] | |||
| fmla s28, s0, v5.s[2] | |||
| fmla s30, s0, v5.s[3] | |||
| .endm | |||
| .macro SAVE1x8 | |||
| @@ -682,14 +682,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v1.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmul v16.4s, v0.4s, v8.2s[0] | |||
| fmul v17.4s, v1.4s, v8.2s[0] | |||
| fmul v20.4s, v0.4s, v8.2s[1] | |||
| fmul v21.4s, v1.4s, v8.2s[1] | |||
| fmul v24.4s, v0.4s, v9.2s[0] | |||
| fmul v25.4s, v1.4s, v9.2s[0] | |||
| fmul v28.4s, v0.4s, v9.2s[1] | |||
| fmul v29.4s, v1.4s, v9.2s[1] | |||
| fmul v16.4s, v0.4s, v8.s[0] | |||
| fmul v17.4s, v1.4s, v8.s[0] | |||
| fmul v20.4s, v0.4s, v8.s[1] | |||
| fmul v21.4s, v1.4s, v8.s[1] | |||
| fmul v24.4s, v0.4s, v9.s[0] | |||
| fmul v25.4s, v1.4s, v9.s[0] | |||
| fmul v28.4s, v0.4s, v9.s[1] | |||
| fmul v29.4s, v1.4s, v9.s[1] | |||
| ld1 {v12.2s, v13.2s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -700,14 +700,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x4_M1 | |||
| fmla v16.4s, v0.4s, v8.2s[0] | |||
| fmla v17.4s, v1.4s, v8.2s[0] | |||
| fmla v20.4s, v0.4s, v8.2s[1] | |||
| fmla v21.4s, v1.4s, v8.2s[1] | |||
| fmla v24.4s, v0.4s, v9.2s[0] | |||
| fmla v25.4s, v1.4s, v9.2s[0] | |||
| fmla v28.4s, v0.4s, v9.2s[1] | |||
| fmla v29.4s, v1.4s, v9.2s[1] | |||
| fmla v16.4s, v0.4s, v8.s[0] | |||
| fmla v17.4s, v1.4s, v8.s[0] | |||
| fmla v20.4s, v0.4s, v8.s[1] | |||
| fmla v21.4s, v1.4s, v8.s[1] | |||
| fmla v24.4s, v0.4s, v9.s[0] | |||
| fmla v25.4s, v1.4s, v9.s[0] | |||
| fmla v28.4s, v0.4s, v9.s[1] | |||
| fmla v29.4s, v1.4s, v9.s[1] | |||
| ld1 {v12.2s, v13.2s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -718,14 +718,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x4_M2 | |||
| fmla v16.4s, v4.4s, v12.2s[0] | |||
| fmla v17.4s, v5.4s, v12.2s[0] | |||
| fmla v20.4s, v4.4s, v12.2s[1] | |||
| fmla v21.4s, v5.4s, v12.2s[1] | |||
| fmla v24.4s, v4.4s, v13.2s[0] | |||
| fmla v25.4s, v5.4s, v13.2s[0] | |||
| fmla v28.4s, v4.4s, v13.2s[1] | |||
| fmla v29.4s, v5.4s, v13.2s[1] | |||
| fmla v16.4s, v4.4s, v12.s[0] | |||
| fmla v17.4s, v5.4s, v12.s[0] | |||
| fmla v20.4s, v4.4s, v12.s[1] | |||
| fmla v21.4s, v5.4s, v12.s[1] | |||
| fmla v24.4s, v4.4s, v13.s[0] | |||
| fmla v25.4s, v5.4s, v13.s[0] | |||
| fmla v28.4s, v4.4s, v13.s[1] | |||
| fmla v29.4s, v5.4s, v13.s[1] | |||
| ld1 {v8.2s, v9.2s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -736,14 +736,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x4_E | |||
| fmla v16.4s, v4.4s, v12.2s[0] | |||
| fmla v17.4s, v5.4s, v12.2s[0] | |||
| fmla v20.4s, v4.4s, v12.2s[1] | |||
| fmla v21.4s, v5.4s, v12.2s[1] | |||
| fmla v24.4s, v4.4s, v13.2s[0] | |||
| fmla v25.4s, v5.4s, v13.2s[0] | |||
| fmla v28.4s, v4.4s, v13.2s[1] | |||
| fmla v29.4s, v5.4s, v13.2s[1] | |||
| fmla v16.4s, v4.4s, v12.s[0] | |||
| fmla v17.4s, v5.4s, v12.s[0] | |||
| fmla v20.4s, v4.4s, v12.s[1] | |||
| fmla v21.4s, v5.4s, v12.s[1] | |||
| fmla v24.4s, v4.4s, v13.s[0] | |||
| fmla v25.4s, v5.4s, v13.s[0] | |||
| fmla v28.4s, v4.4s, v13.s[1] | |||
| fmla v29.4s, v5.4s, v13.s[1] | |||
| .endm | |||
| .macro KERNEL8x4_SUB | |||
| @@ -754,14 +754,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v1.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.4s, v0.4s, v8.2s[0] | |||
| fmla v17.4s, v1.4s, v8.2s[0] | |||
| fmla v20.4s, v0.4s, v8.2s[1] | |||
| fmla v21.4s, v1.4s, v8.2s[1] | |||
| fmla v24.4s, v0.4s, v9.2s[0] | |||
| fmla v25.4s, v1.4s, v9.2s[0] | |||
| fmla v28.4s, v0.4s, v9.2s[1] | |||
| fmla v29.4s, v1.4s, v9.2s[1] | |||
| fmla v16.4s, v0.4s, v8.s[0] | |||
| fmla v17.4s, v1.4s, v8.s[0] | |||
| fmla v20.4s, v0.4s, v8.s[1] | |||
| fmla v21.4s, v1.4s, v8.s[1] | |||
| fmla v24.4s, v0.4s, v9.s[0] | |||
| fmla v25.4s, v1.4s, v9.s[0] | |||
| fmla v28.4s, v0.4s, v9.s[1] | |||
| fmla v29.4s, v1.4s, v9.s[1] | |||
| .endm | |||
| .macro SAVE8x4 | |||
| @@ -814,17 +814,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s, v1.2s}, [pA] | |||
| add pA, pA, #16 | |||
| fmul v16.2s, v0.2s, v8.2s[0] | |||
| fmul v29.2s, v1.2s, v9.2s[1] | |||
| fmul v16.2s, v0.2s, v8.s[0] | |||
| fmul v29.2s, v1.2s, v9.s[1] | |||
| fmul v20.2s, v0.2s, v8.2s[1] | |||
| fmul v25.2s, v1.2s, v9.2s[0] | |||
| fmul v20.2s, v0.2s, v8.s[1] | |||
| fmul v25.2s, v1.2s, v9.s[0] | |||
| fmul v24.2s, v0.2s, v9.2s[0] | |||
| fmul v21.2s, v1.2s, v8.2s[1] | |||
| fmul v24.2s, v0.2s, v9.s[0] | |||
| fmul v21.2s, v1.2s, v8.s[1] | |||
| fmul v28.2s, v0.2s, v9.2s[1] | |||
| fmul v17.2s, v1.2s, v8.2s[0] | |||
| fmul v28.2s, v0.2s, v9.s[1] | |||
| fmul v17.2s, v1.2s, v8.s[0] | |||
| ld1 {v12.2s, v13.2s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -833,61 +833,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x4_M1 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v29.2s, v1.2s, v9.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v29.2s, v1.2s, v9.s[1] | |||
| ld1 {v12.2s, v13.2s}, [pB] // For next round | |||
| add pB, pB, #16 | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v25.2s, v1.2s, v9.2s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| fmla v25.2s, v1.2s, v9.s[0] | |||
| ld1 {v4.2s, v5.2s}, [pA] // For next round | |||
| add pA, pA, #16 | |||
| fmla v24.2s, v0.2s, v9.2s[0] | |||
| fmla v21.2s, v1.2s, v8.2s[1] | |||
| fmla v24.2s, v0.2s, v9.s[0] | |||
| fmla v21.2s, v1.2s, v8.s[1] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| fmla v28.2s, v0.2s, v9.2s[1] | |||
| fmla v17.2s, v1.2s, v8.2s[0] | |||
| fmla v28.2s, v0.2s, v9.s[1] | |||
| fmla v17.2s, v1.2s, v8.s[0] | |||
| .endm | |||
| .macro KERNEL4x4_M2 | |||
| fmla v16.2s, v4.2s, v12.2s[0] | |||
| fmla v29.2s, v5.2s, v13.2s[1] | |||
| fmla v16.2s, v4.2s, v12.s[0] | |||
| fmla v29.2s, v5.2s, v13.s[1] | |||
| ld1 {v8.2s, v9.2s}, [pB] // For next round | |||
| add pB, pB, #16 | |||
| fmla v20.2s, v4.2s, v12.2s[1] | |||
| fmla v25.2s, v5.2s, v13.2s[0] | |||
| fmla v20.2s, v4.2s, v12.s[1] | |||
| fmla v25.2s, v5.2s, v13.s[0] | |||
| ld1 {v0.2s, v1.2s}, [pA] // For next round | |||
| add pA, pA, #16 | |||
| fmla v24.2s, v4.2s, v13.2s[0] | |||
| fmla v21.2s, v5.2s, v12.2s[1] | |||
| fmla v24.2s, v4.2s, v13.s[0] | |||
| fmla v21.2s, v5.2s, v12.s[1] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| fmla v28.2s, v4.2s, v13.2s[1] | |||
| fmla v17.2s, v5.2s, v12.2s[0] | |||
| fmla v28.2s, v4.2s, v13.s[1] | |||
| fmla v17.2s, v5.2s, v12.s[0] | |||
| .endm | |||
| .macro KERNEL4x4_E | |||
| fmla v16.2s, v4.2s, v12.2s[0] | |||
| fmla v29.2s, v5.2s, v13.2s[1] | |||
| fmla v16.2s, v4.2s, v12.s[0] | |||
| fmla v29.2s, v5.2s, v13.s[1] | |||
| fmla v20.2s, v4.2s, v12.2s[1] | |||
| fmla v25.2s, v5.2s, v13.2s[0] | |||
| fmla v20.2s, v4.2s, v12.s[1] | |||
| fmla v25.2s, v5.2s, v13.s[0] | |||
| fmla v24.2s, v4.2s, v13.2s[0] | |||
| fmla v21.2s, v5.2s, v12.2s[1] | |||
| fmla v24.2s, v4.2s, v13.s[0] | |||
| fmla v21.2s, v5.2s, v12.s[1] | |||
| fmla v28.2s, v4.2s, v13.2s[1] | |||
| fmla v17.2s, v5.2s, v12.2s[0] | |||
| fmla v28.2s, v4.2s, v13.s[1] | |||
| fmla v17.2s, v5.2s, v12.s[0] | |||
| .endm | |||
| .macro KERNEL4x4_SUB | |||
| @@ -896,17 +896,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s, v1.2s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v29.2s, v1.2s, v9.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v29.2s, v1.2s, v9.s[1] | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v25.2s, v1.2s, v9.2s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| fmla v25.2s, v1.2s, v9.s[0] | |||
| fmla v24.2s, v0.2s, v9.2s[0] | |||
| fmla v21.2s, v1.2s, v8.2s[1] | |||
| fmla v24.2s, v0.2s, v9.s[0] | |||
| fmla v21.2s, v1.2s, v8.s[1] | |||
| fmla v28.2s, v0.2s, v9.2s[1] | |||
| fmla v17.2s, v1.2s, v8.2s[0] | |||
| fmla v28.2s, v0.2s, v9.s[1] | |||
| fmla v17.2s, v1.2s, v8.s[0] | |||
| .endm | |||
| .macro SAVE4x4 | |||
| @@ -951,10 +951,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s}, [pA] | |||
| add pA, pA, #8 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v24.2s, v0.2s, v9.2s[0] | |||
| fmla v28.2s, v0.2s, v9.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| fmla v24.2s, v0.2s, v9.s[0] | |||
| fmla v28.2s, v0.2s, v9.s[1] | |||
| .endm | |||
| .macro SAVE2x4 | |||
| @@ -1034,11 +1034,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v1.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.4s, v0.4s, v8.2s[0] | |||
| fmla v17.4s, v1.4s, v8.2s[0] | |||
| fmla v16.4s, v0.4s, v8.s[0] | |||
| fmla v17.4s, v1.4s, v8.s[0] | |||
| fmla v20.4s, v0.4s, v8.2s[1] | |||
| fmla v21.4s, v1.4s, v8.2s[1] | |||
| fmla v20.4s, v0.4s, v8.s[1] | |||
| fmla v21.4s, v1.4s, v8.s[1] | |||
| .endm | |||
| .macro SAVE8x2 | |||
| @@ -1074,10 +1074,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s, v1.2s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v17.2s, v1.2s, v8.2s[0] | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v21.2s, v1.2s, v8.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v17.2s, v1.2s, v8.s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| fmla v21.2s, v1.2s, v8.s[1] | |||
| .endm | |||
| .macro SAVE4x2 | |||
| @@ -1109,8 +1109,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s}, [pA] | |||
| add pA, pA, #8 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| .endm | |||
| .macro SAVE2x2 | |||
| @@ -1139,7 +1139,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ldr s0 , [pA] | |||
| add pA, pA, #4 | |||
| fmla v16.2s, v8.2s, v0.2s[0] | |||
| fmla v16.2s, v8.2s, v0.s[0] | |||
| .endm | |||
| .macro SAVE1x2 | |||
| @@ -1169,8 +1169,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v1.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.4s, v0.4s, v8.2s[0] | |||
| fmla v17.4s, v1.4s, v8.2s[0] | |||
| fmla v16.4s, v0.4s, v8.s[0] | |||
| fmla v17.4s, v1.4s, v8.s[0] | |||
| .endm | |||
| .macro SAVE8x1 | |||
| @@ -1196,8 +1196,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s, v1.2s}, [pA] | |||
| add pA , pA, #16 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v17.2s, v1.2s, v8.2s[0] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v17.2s, v1.2s, v8.s[0] | |||
| .endm | |||
| .macro SAVE4x1 | |||
| @@ -1222,7 +1222,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s}, [pA] | |||
| add pA , pA, #8 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| .endm | |||
| .macro SAVE2x1 | |||
| @@ -161,25 +161,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v3.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmul v16.4s, v0.4s, v8.2s[0] | |||
| fmul v17.4s, v1.4s, v8.2s[0] | |||
| fmul v18.4s, v2.4s, v8.2s[0] | |||
| fmul v19.4s, v3.4s, v8.2s[0] | |||
| fmul v20.4s, v0.4s, v8.2s[1] | |||
| fmul v21.4s, v1.4s, v8.2s[1] | |||
| fmul v22.4s, v2.4s, v8.2s[1] | |||
| fmul v23.4s, v3.4s, v8.2s[1] | |||
| fmul v24.4s, v0.4s, v9.2s[0] | |||
| fmul v25.4s, v1.4s, v9.2s[0] | |||
| fmul v26.4s, v2.4s, v9.2s[0] | |||
| fmul v27.4s, v3.4s, v9.2s[0] | |||
| fmul v28.4s, v0.4s, v9.2s[1] | |||
| fmul v29.4s, v1.4s, v9.2s[1] | |||
| fmul v30.4s, v2.4s, v9.2s[1] | |||
| fmul v31.4s, v3.4s, v9.2s[1] | |||
| fmul v16.4s, v0.4s, v8.s[0] | |||
| fmul v17.4s, v1.4s, v8.s[0] | |||
| fmul v18.4s, v2.4s, v8.s[0] | |||
| fmul v19.4s, v3.4s, v8.s[0] | |||
| fmul v20.4s, v0.4s, v8.s[1] | |||
| fmul v21.4s, v1.4s, v8.s[1] | |||
| fmul v22.4s, v2.4s, v8.s[1] | |||
| fmul v23.4s, v3.4s, v8.s[1] | |||
| fmul v24.4s, v0.4s, v9.s[0] | |||
| fmul v25.4s, v1.4s, v9.s[0] | |||
| fmul v26.4s, v2.4s, v9.s[0] | |||
| fmul v27.4s, v3.4s, v9.s[0] | |||
| fmul v28.4s, v0.4s, v9.s[1] | |||
| fmul v29.4s, v1.4s, v9.s[1] | |||
| fmul v30.4s, v2.4s, v9.s[1] | |||
| fmul v31.4s, v3.4s, v9.s[1] | |||
| ld1 {v12.2s, v13.2s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -194,25 +194,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL16x4_M1 | |||
| fmla v16.4s, v0.4s, v8.2s[0] | |||
| fmla v17.4s, v1.4s, v8.2s[0] | |||
| fmla v18.4s, v2.4s, v8.2s[0] | |||
| fmla v19.4s, v3.4s, v8.2s[0] | |||
| fmla v20.4s, v0.4s, v8.2s[1] | |||
| fmla v21.4s, v1.4s, v8.2s[1] | |||
| fmla v22.4s, v2.4s, v8.2s[1] | |||
| fmla v23.4s, v3.4s, v8.2s[1] | |||
| fmla v24.4s, v0.4s, v9.2s[0] | |||
| fmla v25.4s, v1.4s, v9.2s[0] | |||
| fmla v26.4s, v2.4s, v9.2s[0] | |||
| fmla v27.4s, v3.4s, v9.2s[0] | |||
| fmla v28.4s, v0.4s, v9.2s[1] | |||
| fmla v29.4s, v1.4s, v9.2s[1] | |||
| fmla v30.4s, v2.4s, v9.2s[1] | |||
| fmla v31.4s, v3.4s, v9.2s[1] | |||
| fmla v16.4s, v0.4s, v8.s[0] | |||
| fmla v17.4s, v1.4s, v8.s[0] | |||
| fmla v18.4s, v2.4s, v8.s[0] | |||
| fmla v19.4s, v3.4s, v8.s[0] | |||
| fmla v20.4s, v0.4s, v8.s[1] | |||
| fmla v21.4s, v1.4s, v8.s[1] | |||
| fmla v22.4s, v2.4s, v8.s[1] | |||
| fmla v23.4s, v3.4s, v8.s[1] | |||
| fmla v24.4s, v0.4s, v9.s[0] | |||
| fmla v25.4s, v1.4s, v9.s[0] | |||
| fmla v26.4s, v2.4s, v9.s[0] | |||
| fmla v27.4s, v3.4s, v9.s[0] | |||
| fmla v28.4s, v0.4s, v9.s[1] | |||
| fmla v29.4s, v1.4s, v9.s[1] | |||
| fmla v30.4s, v2.4s, v9.s[1] | |||
| fmla v31.4s, v3.4s, v9.s[1] | |||
| ld1 {v12.2s, v13.2s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -227,25 +227,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL16x4_M2 | |||
| fmla v16.4s, v4.4s, v12.2s[0] | |||
| fmla v17.4s, v5.4s, v12.2s[0] | |||
| fmla v18.4s, v6.4s, v12.2s[0] | |||
| fmla v19.4s, v7.4s, v12.2s[0] | |||
| fmla v20.4s, v4.4s, v12.2s[1] | |||
| fmla v21.4s, v5.4s, v12.2s[1] | |||
| fmla v22.4s, v6.4s, v12.2s[1] | |||
| fmla v23.4s, v7.4s, v12.2s[1] | |||
| fmla v24.4s, v4.4s, v13.2s[0] | |||
| fmla v25.4s, v5.4s, v13.2s[0] | |||
| fmla v26.4s, v6.4s, v13.2s[0] | |||
| fmla v27.4s, v7.4s, v13.2s[0] | |||
| fmla v28.4s, v4.4s, v13.2s[1] | |||
| fmla v29.4s, v5.4s, v13.2s[1] | |||
| fmla v30.4s, v6.4s, v13.2s[1] | |||
| fmla v31.4s, v7.4s, v13.2s[1] | |||
| fmla v16.4s, v4.4s, v12.s[0] | |||
| fmla v17.4s, v5.4s, v12.s[0] | |||
| fmla v18.4s, v6.4s, v12.s[0] | |||
| fmla v19.4s, v7.4s, v12.s[0] | |||
| fmla v20.4s, v4.4s, v12.s[1] | |||
| fmla v21.4s, v5.4s, v12.s[1] | |||
| fmla v22.4s, v6.4s, v12.s[1] | |||
| fmla v23.4s, v7.4s, v12.s[1] | |||
| fmla v24.4s, v4.4s, v13.s[0] | |||
| fmla v25.4s, v5.4s, v13.s[0] | |||
| fmla v26.4s, v6.4s, v13.s[0] | |||
| fmla v27.4s, v7.4s, v13.s[0] | |||
| fmla v28.4s, v4.4s, v13.s[1] | |||
| fmla v29.4s, v5.4s, v13.s[1] | |||
| fmla v30.4s, v6.4s, v13.s[1] | |||
| fmla v31.4s, v7.4s, v13.s[1] | |||
| ld1 {v8.2s, v9.2s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -260,25 +260,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL16x4_E | |||
| fmla v16.4s, v4.4s, v12.2s[0] | |||
| fmla v17.4s, v5.4s, v12.2s[0] | |||
| fmla v18.4s, v6.4s, v12.2s[0] | |||
| fmla v19.4s, v7.4s, v12.2s[0] | |||
| fmla v20.4s, v4.4s, v12.2s[1] | |||
| fmla v21.4s, v5.4s, v12.2s[1] | |||
| fmla v22.4s, v6.4s, v12.2s[1] | |||
| fmla v23.4s, v7.4s, v12.2s[1] | |||
| fmla v24.4s, v4.4s, v13.2s[0] | |||
| fmla v25.4s, v5.4s, v13.2s[0] | |||
| fmla v26.4s, v6.4s, v13.2s[0] | |||
| fmla v27.4s, v7.4s, v13.2s[0] | |||
| fmla v28.4s, v4.4s, v13.2s[1] | |||
| fmla v29.4s, v5.4s, v13.2s[1] | |||
| fmla v30.4s, v6.4s, v13.2s[1] | |||
| fmla v31.4s, v7.4s, v13.2s[1] | |||
| fmla v16.4s, v4.4s, v12.s[0] | |||
| fmla v17.4s, v5.4s, v12.s[0] | |||
| fmla v18.4s, v6.4s, v12.s[0] | |||
| fmla v19.4s, v7.4s, v12.s[0] | |||
| fmla v20.4s, v4.4s, v12.s[1] | |||
| fmla v21.4s, v5.4s, v12.s[1] | |||
| fmla v22.4s, v6.4s, v12.s[1] | |||
| fmla v23.4s, v7.4s, v12.s[1] | |||
| fmla v24.4s, v4.4s, v13.s[0] | |||
| fmla v25.4s, v5.4s, v13.s[0] | |||
| fmla v26.4s, v6.4s, v13.s[0] | |||
| fmla v27.4s, v7.4s, v13.s[0] | |||
| fmla v28.4s, v4.4s, v13.s[1] | |||
| fmla v29.4s, v5.4s, v13.s[1] | |||
| fmla v30.4s, v6.4s, v13.s[1] | |||
| fmla v31.4s, v7.4s, v13.s[1] | |||
| .endm | |||
| .macro KERNEL16x4_SUB | |||
| @@ -293,25 +293,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v3.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.4s, v0.4s, v8.2s[0] | |||
| fmla v17.4s, v1.4s, v8.2s[0] | |||
| fmla v18.4s, v2.4s, v8.2s[0] | |||
| fmla v19.4s, v3.4s, v8.2s[0] | |||
| fmla v20.4s, v0.4s, v8.2s[1] | |||
| fmla v21.4s, v1.4s, v8.2s[1] | |||
| fmla v22.4s, v2.4s, v8.2s[1] | |||
| fmla v23.4s, v3.4s, v8.2s[1] | |||
| fmla v24.4s, v0.4s, v9.2s[0] | |||
| fmla v25.4s, v1.4s, v9.2s[0] | |||
| fmla v26.4s, v2.4s, v9.2s[0] | |||
| fmla v27.4s, v3.4s, v9.2s[0] | |||
| fmla v28.4s, v0.4s, v9.2s[1] | |||
| fmla v29.4s, v1.4s, v9.2s[1] | |||
| fmla v30.4s, v2.4s, v9.2s[1] | |||
| fmla v31.4s, v3.4s, v9.2s[1] | |||
| fmla v16.4s, v0.4s, v8.s[0] | |||
| fmla v17.4s, v1.4s, v8.s[0] | |||
| fmla v18.4s, v2.4s, v8.s[0] | |||
| fmla v19.4s, v3.4s, v8.s[0] | |||
| fmla v20.4s, v0.4s, v8.s[1] | |||
| fmla v21.4s, v1.4s, v8.s[1] | |||
| fmla v22.4s, v2.4s, v8.s[1] | |||
| fmla v23.4s, v3.4s, v8.s[1] | |||
| fmla v24.4s, v0.4s, v9.s[0] | |||
| fmla v25.4s, v1.4s, v9.s[0] | |||
| fmla v26.4s, v2.4s, v9.s[0] | |||
| fmla v27.4s, v3.4s, v9.s[0] | |||
| fmla v28.4s, v0.4s, v9.s[1] | |||
| fmla v29.4s, v1.4s, v9.s[1] | |||
| fmla v30.4s, v2.4s, v9.s[1] | |||
| fmla v31.4s, v3.4s, v9.s[1] | |||
| .endm | |||
| .macro SAVE16x4 | |||
| @@ -369,14 +369,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v1.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmul v16.4s, v0.4s, v8.2s[0] | |||
| fmul v17.4s, v1.4s, v8.2s[0] | |||
| fmul v20.4s, v0.4s, v8.2s[1] | |||
| fmul v21.4s, v1.4s, v8.2s[1] | |||
| fmul v24.4s, v0.4s, v9.2s[0] | |||
| fmul v25.4s, v1.4s, v9.2s[0] | |||
| fmul v28.4s, v0.4s, v9.2s[1] | |||
| fmul v29.4s, v1.4s, v9.2s[1] | |||
| fmul v16.4s, v0.4s, v8.s[0] | |||
| fmul v17.4s, v1.4s, v8.s[0] | |||
| fmul v20.4s, v0.4s, v8.s[1] | |||
| fmul v21.4s, v1.4s, v8.s[1] | |||
| fmul v24.4s, v0.4s, v9.s[0] | |||
| fmul v25.4s, v1.4s, v9.s[0] | |||
| fmul v28.4s, v0.4s, v9.s[1] | |||
| fmul v29.4s, v1.4s, v9.s[1] | |||
| ld1 {v12.2s, v13.2s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -387,14 +387,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x4_M1 | |||
| fmla v16.4s, v0.4s, v8.2s[0] | |||
| fmla v17.4s, v1.4s, v8.2s[0] | |||
| fmla v20.4s, v0.4s, v8.2s[1] | |||
| fmla v21.4s, v1.4s, v8.2s[1] | |||
| fmla v24.4s, v0.4s, v9.2s[0] | |||
| fmla v25.4s, v1.4s, v9.2s[0] | |||
| fmla v28.4s, v0.4s, v9.2s[1] | |||
| fmla v29.4s, v1.4s, v9.2s[1] | |||
| fmla v16.4s, v0.4s, v8.s[0] | |||
| fmla v17.4s, v1.4s, v8.s[0] | |||
| fmla v20.4s, v0.4s, v8.s[1] | |||
| fmla v21.4s, v1.4s, v8.s[1] | |||
| fmla v24.4s, v0.4s, v9.s[0] | |||
| fmla v25.4s, v1.4s, v9.s[0] | |||
| fmla v28.4s, v0.4s, v9.s[1] | |||
| fmla v29.4s, v1.4s, v9.s[1] | |||
| ld1 {v12.2s, v13.2s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -405,14 +405,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x4_M2 | |||
| fmla v16.4s, v4.4s, v12.2s[0] | |||
| fmla v17.4s, v5.4s, v12.2s[0] | |||
| fmla v20.4s, v4.4s, v12.2s[1] | |||
| fmla v21.4s, v5.4s, v12.2s[1] | |||
| fmla v24.4s, v4.4s, v13.2s[0] | |||
| fmla v25.4s, v5.4s, v13.2s[0] | |||
| fmla v28.4s, v4.4s, v13.2s[1] | |||
| fmla v29.4s, v5.4s, v13.2s[1] | |||
| fmla v16.4s, v4.4s, v12.s[0] | |||
| fmla v17.4s, v5.4s, v12.s[0] | |||
| fmla v20.4s, v4.4s, v12.s[1] | |||
| fmla v21.4s, v5.4s, v12.s[1] | |||
| fmla v24.4s, v4.4s, v13.s[0] | |||
| fmla v25.4s, v5.4s, v13.s[0] | |||
| fmla v28.4s, v4.4s, v13.s[1] | |||
| fmla v29.4s, v5.4s, v13.s[1] | |||
| ld1 {v8.2s, v9.2s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -423,14 +423,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x4_E | |||
| fmla v16.4s, v4.4s, v12.2s[0] | |||
| fmla v17.4s, v5.4s, v12.2s[0] | |||
| fmla v20.4s, v4.4s, v12.2s[1] | |||
| fmla v21.4s, v5.4s, v12.2s[1] | |||
| fmla v24.4s, v4.4s, v13.2s[0] | |||
| fmla v25.4s, v5.4s, v13.2s[0] | |||
| fmla v28.4s, v4.4s, v13.2s[1] | |||
| fmla v29.4s, v5.4s, v13.2s[1] | |||
| fmla v16.4s, v4.4s, v12.s[0] | |||
| fmla v17.4s, v5.4s, v12.s[0] | |||
| fmla v20.4s, v4.4s, v12.s[1] | |||
| fmla v21.4s, v5.4s, v12.s[1] | |||
| fmla v24.4s, v4.4s, v13.s[0] | |||
| fmla v25.4s, v5.4s, v13.s[0] | |||
| fmla v28.4s, v4.4s, v13.s[1] | |||
| fmla v29.4s, v5.4s, v13.s[1] | |||
| .endm | |||
| .macro KERNEL8x4_SUB | |||
| @@ -441,14 +441,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v1.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.4s, v0.4s, v8.2s[0] | |||
| fmla v17.4s, v1.4s, v8.2s[0] | |||
| fmla v20.4s, v0.4s, v8.2s[1] | |||
| fmla v21.4s, v1.4s, v8.2s[1] | |||
| fmla v24.4s, v0.4s, v9.2s[0] | |||
| fmla v25.4s, v1.4s, v9.2s[0] | |||
| fmla v28.4s, v0.4s, v9.2s[1] | |||
| fmla v29.4s, v1.4s, v9.2s[1] | |||
| fmla v16.4s, v0.4s, v8.s[0] | |||
| fmla v17.4s, v1.4s, v8.s[0] | |||
| fmla v20.4s, v0.4s, v8.s[1] | |||
| fmla v21.4s, v1.4s, v8.s[1] | |||
| fmla v24.4s, v0.4s, v9.s[0] | |||
| fmla v25.4s, v1.4s, v9.s[0] | |||
| fmla v28.4s, v0.4s, v9.s[1] | |||
| fmla v29.4s, v1.4s, v9.s[1] | |||
| .endm | |||
| .macro SAVE8x4 | |||
| @@ -496,17 +496,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s, v1.2s}, [pA] | |||
| add pA, pA, #16 | |||
| fmul v16.2s, v0.2s, v8.2s[0] | |||
| fmul v29.2s, v1.2s, v9.2s[1] | |||
| fmul v16.2s, v0.2s, v8.s[0] | |||
| fmul v29.2s, v1.2s, v9.s[1] | |||
| fmul v20.2s, v0.2s, v8.2s[1] | |||
| fmul v25.2s, v1.2s, v9.2s[0] | |||
| fmul v20.2s, v0.2s, v8.s[1] | |||
| fmul v25.2s, v1.2s, v9.s[0] | |||
| fmul v24.2s, v0.2s, v9.2s[0] | |||
| fmul v21.2s, v1.2s, v8.2s[1] | |||
| fmul v24.2s, v0.2s, v9.s[0] | |||
| fmul v21.2s, v1.2s, v8.s[1] | |||
| fmul v28.2s, v0.2s, v9.2s[1] | |||
| fmul v17.2s, v1.2s, v8.2s[0] | |||
| fmul v28.2s, v0.2s, v9.s[1] | |||
| fmul v17.2s, v1.2s, v8.s[0] | |||
| ld1 {v12.2s, v13.2s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -515,61 +515,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x4_M1 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v29.2s, v1.2s, v9.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v29.2s, v1.2s, v9.s[1] | |||
| ld1 {v12.2s, v13.2s}, [pB] // For next round | |||
| add pB, pB, #16 | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v25.2s, v1.2s, v9.2s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| fmla v25.2s, v1.2s, v9.s[0] | |||
| ld1 {v4.2s, v5.2s}, [pA] // For next round | |||
| add pA, pA, #16 | |||
| fmla v24.2s, v0.2s, v9.2s[0] | |||
| fmla v21.2s, v1.2s, v8.2s[1] | |||
| fmla v24.2s, v0.2s, v9.s[0] | |||
| fmla v21.2s, v1.2s, v8.s[1] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| fmla v28.2s, v0.2s, v9.2s[1] | |||
| fmla v17.2s, v1.2s, v8.2s[0] | |||
| fmla v28.2s, v0.2s, v9.s[1] | |||
| fmla v17.2s, v1.2s, v8.s[0] | |||
| .endm | |||
| .macro KERNEL4x4_M2 | |||
| fmla v16.2s, v4.2s, v12.2s[0] | |||
| fmla v29.2s, v5.2s, v13.2s[1] | |||
| fmla v16.2s, v4.2s, v12.s[0] | |||
| fmla v29.2s, v5.2s, v13.s[1] | |||
| ld1 {v8.2s, v9.2s}, [pB] // For next round | |||
| add pB, pB, #16 | |||
| fmla v20.2s, v4.2s, v12.2s[1] | |||
| fmla v25.2s, v5.2s, v13.2s[0] | |||
| fmla v20.2s, v4.2s, v12.s[1] | |||
| fmla v25.2s, v5.2s, v13.s[0] | |||
| ld1 {v0.2s, v1.2s}, [pA] // For next round | |||
| add pA, pA, #16 | |||
| fmla v24.2s, v4.2s, v13.2s[0] | |||
| fmla v21.2s, v5.2s, v12.2s[1] | |||
| fmla v24.2s, v4.2s, v13.s[0] | |||
| fmla v21.2s, v5.2s, v12.s[1] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| fmla v28.2s, v4.2s, v13.2s[1] | |||
| fmla v17.2s, v5.2s, v12.2s[0] | |||
| fmla v28.2s, v4.2s, v13.s[1] | |||
| fmla v17.2s, v5.2s, v12.s[0] | |||
| .endm | |||
| .macro KERNEL4x4_E | |||
| fmla v16.2s, v4.2s, v12.2s[0] | |||
| fmla v29.2s, v5.2s, v13.2s[1] | |||
| fmla v16.2s, v4.2s, v12.s[0] | |||
| fmla v29.2s, v5.2s, v13.s[1] | |||
| fmla v20.2s, v4.2s, v12.2s[1] | |||
| fmla v25.2s, v5.2s, v13.2s[0] | |||
| fmla v20.2s, v4.2s, v12.s[1] | |||
| fmla v25.2s, v5.2s, v13.s[0] | |||
| fmla v24.2s, v4.2s, v13.2s[0] | |||
| fmla v21.2s, v5.2s, v12.2s[1] | |||
| fmla v24.2s, v4.2s, v13.s[0] | |||
| fmla v21.2s, v5.2s, v12.s[1] | |||
| fmla v28.2s, v4.2s, v13.2s[1] | |||
| fmla v17.2s, v5.2s, v12.2s[0] | |||
| fmla v28.2s, v4.2s, v13.s[1] | |||
| fmla v17.2s, v5.2s, v12.s[0] | |||
| .endm | |||
| .macro KERNEL4x4_SUB | |||
| @@ -578,17 +578,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s, v1.2s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v29.2s, v1.2s, v9.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v29.2s, v1.2s, v9.s[1] | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v25.2s, v1.2s, v9.2s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| fmla v25.2s, v1.2s, v9.s[0] | |||
| fmla v24.2s, v0.2s, v9.2s[0] | |||
| fmla v21.2s, v1.2s, v8.2s[1] | |||
| fmla v24.2s, v0.2s, v9.s[0] | |||
| fmla v21.2s, v1.2s, v8.s[1] | |||
| fmla v28.2s, v0.2s, v9.2s[1] | |||
| fmla v17.2s, v1.2s, v8.2s[0] | |||
| fmla v28.2s, v0.2s, v9.s[1] | |||
| fmla v17.2s, v1.2s, v8.s[0] | |||
| .endm | |||
| .macro SAVE4x4 | |||
| @@ -633,10 +633,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s}, [pA] | |||
| add pA, pA, #8 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v24.2s, v0.2s, v9.2s[0] | |||
| fmla v28.2s, v0.2s, v9.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| fmla v24.2s, v0.2s, v9.s[0] | |||
| fmla v28.2s, v0.2s, v9.s[1] | |||
| .endm | |||
| .macro SAVE2x4 | |||
| @@ -718,15 +718,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v3.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.4s, v0.4s, v8.2s[0] | |||
| fmla v17.4s, v1.4s, v8.2s[0] | |||
| fmla v18.4s, v2.4s, v8.2s[0] | |||
| fmla v19.4s, v3.4s, v8.2s[0] | |||
| fmla v16.4s, v0.4s, v8.s[0] | |||
| fmla v17.4s, v1.4s, v8.s[0] | |||
| fmla v18.4s, v2.4s, v8.s[0] | |||
| fmla v19.4s, v3.4s, v8.s[0] | |||
| fmla v20.4s, v0.4s, v8.2s[1] | |||
| fmla v21.4s, v1.4s, v8.2s[1] | |||
| fmla v22.4s, v2.4s, v8.2s[1] | |||
| fmla v23.4s, v3.4s, v8.2s[1] | |||
| fmla v20.4s, v0.4s, v8.s[1] | |||
| fmla v21.4s, v1.4s, v8.s[1] | |||
| fmla v22.4s, v2.4s, v8.s[1] | |||
| fmla v23.4s, v3.4s, v8.s[1] | |||
| .endm | |||
| .macro SAVE16x2 | |||
| @@ -764,11 +764,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v1.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.4s, v0.4s, v8.2s[0] | |||
| fmla v17.4s, v1.4s, v8.2s[0] | |||
| fmla v16.4s, v0.4s, v8.s[0] | |||
| fmla v17.4s, v1.4s, v8.s[0] | |||
| fmla v20.4s, v0.4s, v8.2s[1] | |||
| fmla v21.4s, v1.4s, v8.2s[1] | |||
| fmla v20.4s, v0.4s, v8.s[1] | |||
| fmla v21.4s, v1.4s, v8.s[1] | |||
| .endm | |||
| .macro SAVE8x2 | |||
| @@ -802,10 +802,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s, v1.2s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v17.2s, v1.2s, v8.2s[0] | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v21.2s, v1.2s, v8.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v17.2s, v1.2s, v8.s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| fmla v21.2s, v1.2s, v8.s[1] | |||
| .endm | |||
| .macro SAVE4x2 | |||
| @@ -837,8 +837,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s}, [pA] | |||
| add pA, pA, #8 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| .endm | |||
| .macro SAVE2x2 | |||
| @@ -866,7 +866,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ldr s0 , [pA] | |||
| add pA, pA, #4 | |||
| fmla v16.2s, v8.2s, v0.2s[0] | |||
| fmla v16.2s, v8.2s, v0.s[0] | |||
| .endm | |||
| .macro SAVE1x2 | |||
| @@ -901,10 +901,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v3.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.4s, v0.4s, v8.2s[0] | |||
| fmla v17.4s, v1.4s, v8.2s[0] | |||
| fmla v18.4s, v2.4s, v8.2s[0] | |||
| fmla v19.4s, v3.4s, v8.2s[0] | |||
| fmla v16.4s, v0.4s, v8.s[0] | |||
| fmla v17.4s, v1.4s, v8.s[0] | |||
| fmla v18.4s, v2.4s, v8.s[0] | |||
| fmla v19.4s, v3.4s, v8.s[0] | |||
| .endm | |||
| .macro SAVE16x1 | |||
| @@ -934,8 +934,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v1.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.4s, v0.4s, v8.2s[0] | |||
| fmla v17.4s, v1.4s, v8.2s[0] | |||
| fmla v16.4s, v0.4s, v8.s[0] | |||
| fmla v17.4s, v1.4s, v8.s[0] | |||
| .endm | |||
| .macro SAVE8x1 | |||
| @@ -961,8 +961,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s, v1.2s}, [pA] | |||
| add pA , pA, #16 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v17.2s, v1.2s, v8.2s[0] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v17.2s, v1.2s, v8.s[0] | |||
| .endm | |||
| .macro SAVE4x1 | |||
| @@ -987,7 +987,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s}, [pA] | |||
| add pA , pA, #8 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| .endm | |||
| .macro SAVE2x1 | |||
| @@ -147,17 +147,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s, v1.2s}, [pA] | |||
| add pA, pA, #16 | |||
| fmul v16.2s, v0.2s, v8.2s[0] | |||
| fmul v29.2s, v1.2s, v9.2s[1] | |||
| fmul v16.2s, v0.2s, v8.s[0] | |||
| fmul v29.2s, v1.2s, v9.s[1] | |||
| fmul v20.2s, v0.2s, v8.2s[1] | |||
| fmul v25.2s, v1.2s, v9.2s[0] | |||
| fmul v20.2s, v0.2s, v8.s[1] | |||
| fmul v25.2s, v1.2s, v9.s[0] | |||
| fmul v24.2s, v0.2s, v9.2s[0] | |||
| fmul v21.2s, v1.2s, v8.2s[1] | |||
| fmul v24.2s, v0.2s, v9.s[0] | |||
| fmul v21.2s, v1.2s, v8.s[1] | |||
| fmul v28.2s, v0.2s, v9.2s[1] | |||
| fmul v17.2s, v1.2s, v8.2s[0] | |||
| fmul v28.2s, v0.2s, v9.s[1] | |||
| fmul v17.2s, v1.2s, v8.s[0] | |||
| ld1 {v12.2s, v13.2s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -166,61 +166,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x4_M1 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v29.2s, v1.2s, v9.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v29.2s, v1.2s, v9.s[1] | |||
| ld1 {v12.2s, v13.2s}, [pB] // For next round | |||
| add pB, pB, #16 | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v25.2s, v1.2s, v9.2s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| fmla v25.2s, v1.2s, v9.s[0] | |||
| ld1 {v4.2s, v5.2s}, [pA] // For next round | |||
| add pA, pA, #16 | |||
| fmla v24.2s, v0.2s, v9.2s[0] | |||
| fmla v21.2s, v1.2s, v8.2s[1] | |||
| fmla v24.2s, v0.2s, v9.s[0] | |||
| fmla v21.2s, v1.2s, v8.s[1] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| fmla v28.2s, v0.2s, v9.2s[1] | |||
| fmla v17.2s, v1.2s, v8.2s[0] | |||
| fmla v28.2s, v0.2s, v9.s[1] | |||
| fmla v17.2s, v1.2s, v8.s[0] | |||
| .endm | |||
| .macro KERNEL4x4_M2 | |||
| fmla v16.2s, v4.2s, v12.2s[0] | |||
| fmla v29.2s, v5.2s, v13.2s[1] | |||
| fmla v16.2s, v4.2s, v12.s[0] | |||
| fmla v29.2s, v5.2s, v13.s[1] | |||
| ld1 {v8.2s, v9.2s}, [pB] // For next round | |||
| add pB, pB, #16 | |||
| fmla v20.2s, v4.2s, v12.2s[1] | |||
| fmla v25.2s, v5.2s, v13.2s[0] | |||
| fmla v20.2s, v4.2s, v12.s[1] | |||
| fmla v25.2s, v5.2s, v13.s[0] | |||
| ld1 {v0.2s, v1.2s}, [pA] // For next round | |||
| add pA, pA, #16 | |||
| fmla v24.2s, v4.2s, v13.2s[0] | |||
| fmla v21.2s, v5.2s, v12.2s[1] | |||
| fmla v24.2s, v4.2s, v13.s[0] | |||
| fmla v21.2s, v5.2s, v12.s[1] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| fmla v28.2s, v4.2s, v13.2s[1] | |||
| fmla v17.2s, v5.2s, v12.2s[0] | |||
| fmla v28.2s, v4.2s, v13.s[1] | |||
| fmla v17.2s, v5.2s, v12.s[0] | |||
| .endm | |||
| .macro KERNEL4x4_E | |||
| fmla v16.2s, v4.2s, v12.2s[0] | |||
| fmla v29.2s, v5.2s, v13.2s[1] | |||
| fmla v16.2s, v4.2s, v12.s[0] | |||
| fmla v29.2s, v5.2s, v13.s[1] | |||
| fmla v20.2s, v4.2s, v12.2s[1] | |||
| fmla v25.2s, v5.2s, v13.2s[0] | |||
| fmla v20.2s, v4.2s, v12.s[1] | |||
| fmla v25.2s, v5.2s, v13.s[0] | |||
| fmla v24.2s, v4.2s, v13.2s[0] | |||
| fmla v21.2s, v5.2s, v12.2s[1] | |||
| fmla v24.2s, v4.2s, v13.s[0] | |||
| fmla v21.2s, v5.2s, v12.s[1] | |||
| fmla v28.2s, v4.2s, v13.2s[1] | |||
| fmla v17.2s, v5.2s, v12.2s[0] | |||
| fmla v28.2s, v4.2s, v13.s[1] | |||
| fmla v17.2s, v5.2s, v12.s[0] | |||
| .endm | |||
| .macro KERNEL4x4_SUB | |||
| @@ -229,17 +229,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s, v1.2s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v29.2s, v1.2s, v9.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v29.2s, v1.2s, v9.s[1] | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v25.2s, v1.2s, v9.2s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| fmla v25.2s, v1.2s, v9.s[0] | |||
| fmla v24.2s, v0.2s, v9.2s[0] | |||
| fmla v21.2s, v1.2s, v8.2s[1] | |||
| fmla v24.2s, v0.2s, v9.s[0] | |||
| fmla v21.2s, v1.2s, v8.s[1] | |||
| fmla v28.2s, v0.2s, v9.2s[1] | |||
| fmla v17.2s, v1.2s, v8.2s[0] | |||
| fmla v28.2s, v0.2s, v9.s[1] | |||
| fmla v17.2s, v1.2s, v8.s[0] | |||
| .endm | |||
| .macro SAVE4x4 | |||
| @@ -280,10 +280,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s}, [pA] | |||
| add pA, pA, #8 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v24.2s, v0.2s, v9.2s[0] | |||
| fmla v28.2s, v0.2s, v9.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| fmla v24.2s, v0.2s, v9.s[0] | |||
| fmla v28.2s, v0.2s, v9.s[1] | |||
| .endm | |||
| .macro SAVE2x4 | |||
| @@ -353,10 +353,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s, v1.2s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v17.2s, v1.2s, v8.2s[0] | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v21.2s, v1.2s, v8.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v17.2s, v1.2s, v8.s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| fmla v21.2s, v1.2s, v8.s[1] | |||
| .endm | |||
| .macro SAVE4x2 | |||
| @@ -386,8 +386,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s}, [pA] | |||
| add pA, pA, #8 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| .endm | |||
| .macro SAVE2x2 | |||
| @@ -414,7 +414,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ldr s0 , [pA] | |||
| add pA, pA, #4 | |||
| fmla v16.2s, v8.2s, v0.2s[0] | |||
| fmla v16.2s, v8.2s, v0.s[0] | |||
| .endm | |||
| .macro SAVE1x2 | |||
| @@ -440,8 +440,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s, v1.2s}, [pA] | |||
| add pA , pA, #16 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v17.2s, v1.2s, v8.2s[0] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v17.2s, v1.2s, v8.s[0] | |||
| .endm | |||
| .macro SAVE4x1 | |||
| @@ -468,7 +468,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s}, [pA] | |||
| add pA , pA, #8 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| .endm | |||
| .macro SAVE2x1 | |||
| @@ -159,22 +159,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v1.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmul v16.4s, v0.4s, v4.4s[0] | |||
| fmul v17.4s, v1.4s, v4.4s[0] | |||
| fmul v18.4s, v0.4s, v4.4s[1] | |||
| fmul v19.4s, v1.4s, v4.4s[1] | |||
| fmul v20.4s, v0.4s, v4.4s[2] | |||
| fmul v21.4s, v1.4s, v4.4s[2] | |||
| fmul v22.4s, v0.4s, v4.4s[3] | |||
| fmul v23.4s, v1.4s, v4.4s[3] | |||
| fmul v24.4s, v0.4s, v5.4s[0] | |||
| fmul v25.4s, v1.4s, v5.4s[0] | |||
| fmul v26.4s, v0.4s, v5.4s[1] | |||
| fmul v27.4s, v1.4s, v5.4s[1] | |||
| fmul v28.4s, v0.4s, v5.4s[2] | |||
| fmul v29.4s, v1.4s, v5.4s[2] | |||
| fmul v30.4s, v0.4s, v5.4s[3] | |||
| fmul v31.4s, v1.4s, v5.4s[3] | |||
| fmul v16.4s, v0.4s, v4.s[0] | |||
| fmul v17.4s, v1.4s, v4.s[0] | |||
| fmul v18.4s, v0.4s, v4.s[1] | |||
| fmul v19.4s, v1.4s, v4.s[1] | |||
| fmul v20.4s, v0.4s, v4.s[2] | |||
| fmul v21.4s, v1.4s, v4.s[2] | |||
| fmul v22.4s, v0.4s, v4.s[3] | |||
| fmul v23.4s, v1.4s, v4.s[3] | |||
| fmul v24.4s, v0.4s, v5.s[0] | |||
| fmul v25.4s, v1.4s, v5.s[0] | |||
| fmul v26.4s, v0.4s, v5.s[1] | |||
| fmul v27.4s, v1.4s, v5.s[1] | |||
| fmul v28.4s, v0.4s, v5.s[2] | |||
| fmul v29.4s, v1.4s, v5.s[2] | |||
| fmul v30.4s, v0.4s, v5.s[3] | |||
| fmul v31.4s, v1.4s, v5.s[3] | |||
| ld1 {v6.4s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -187,22 +187,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x8_M1 | |||
| fmla v16.4s, v0.4s, v4.4s[0] | |||
| fmla v17.4s, v1.4s, v4.4s[0] | |||
| fmla v18.4s, v0.4s, v4.4s[1] | |||
| fmla v19.4s, v1.4s, v4.4s[1] | |||
| fmla v20.4s, v0.4s, v4.4s[2] | |||
| fmla v21.4s, v1.4s, v4.4s[2] | |||
| fmla v22.4s, v0.4s, v4.4s[3] | |||
| fmla v23.4s, v1.4s, v4.4s[3] | |||
| fmla v24.4s, v0.4s, v5.4s[0] | |||
| fmla v25.4s, v1.4s, v5.4s[0] | |||
| fmla v26.4s, v0.4s, v5.4s[1] | |||
| fmla v27.4s, v1.4s, v5.4s[1] | |||
| fmla v28.4s, v0.4s, v5.4s[2] | |||
| fmla v29.4s, v1.4s, v5.4s[2] | |||
| fmla v30.4s, v0.4s, v5.4s[3] | |||
| fmla v31.4s, v1.4s, v5.4s[3] | |||
| fmla v16.4s, v0.4s, v4.s[0] | |||
| fmla v17.4s, v1.4s, v4.s[0] | |||
| fmla v18.4s, v0.4s, v4.s[1] | |||
| fmla v19.4s, v1.4s, v4.s[1] | |||
| fmla v20.4s, v0.4s, v4.s[2] | |||
| fmla v21.4s, v1.4s, v4.s[2] | |||
| fmla v22.4s, v0.4s, v4.s[3] | |||
| fmla v23.4s, v1.4s, v4.s[3] | |||
| fmla v24.4s, v0.4s, v5.s[0] | |||
| fmla v25.4s, v1.4s, v5.s[0] | |||
| fmla v26.4s, v0.4s, v5.s[1] | |||
| fmla v27.4s, v1.4s, v5.s[1] | |||
| fmla v28.4s, v0.4s, v5.s[2] | |||
| fmla v29.4s, v1.4s, v5.s[2] | |||
| fmla v30.4s, v0.4s, v5.s[3] | |||
| fmla v31.4s, v1.4s, v5.s[3] | |||
| ld1 {v6.4s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -215,22 +215,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x8_M2 | |||
| fmla v16.4s, v2.4s, v6.4s[0] | |||
| fmla v17.4s, v3.4s, v6.4s[0] | |||
| fmla v18.4s, v2.4s, v6.4s[1] | |||
| fmla v19.4s, v3.4s, v6.4s[1] | |||
| fmla v20.4s, v2.4s, v6.4s[2] | |||
| fmla v21.4s, v3.4s, v6.4s[2] | |||
| fmla v22.4s, v2.4s, v6.4s[3] | |||
| fmla v23.4s, v3.4s, v6.4s[3] | |||
| fmla v24.4s, v2.4s, v7.4s[0] | |||
| fmla v25.4s, v3.4s, v7.4s[0] | |||
| fmla v26.4s, v2.4s, v7.4s[1] | |||
| fmla v27.4s, v3.4s, v7.4s[1] | |||
| fmla v28.4s, v2.4s, v7.4s[2] | |||
| fmla v29.4s, v3.4s, v7.4s[2] | |||
| fmla v30.4s, v2.4s, v7.4s[3] | |||
| fmla v31.4s, v3.4s, v7.4s[3] | |||
| fmla v16.4s, v2.4s, v6.s[0] | |||
| fmla v17.4s, v3.4s, v6.s[0] | |||
| fmla v18.4s, v2.4s, v6.s[1] | |||
| fmla v19.4s, v3.4s, v6.s[1] | |||
| fmla v20.4s, v2.4s, v6.s[2] | |||
| fmla v21.4s, v3.4s, v6.s[2] | |||
| fmla v22.4s, v2.4s, v6.s[3] | |||
| fmla v23.4s, v3.4s, v6.s[3] | |||
| fmla v24.4s, v2.4s, v7.s[0] | |||
| fmla v25.4s, v3.4s, v7.s[0] | |||
| fmla v26.4s, v2.4s, v7.s[1] | |||
| fmla v27.4s, v3.4s, v7.s[1] | |||
| fmla v28.4s, v2.4s, v7.s[2] | |||
| fmla v29.4s, v3.4s, v7.s[2] | |||
| fmla v30.4s, v2.4s, v7.s[3] | |||
| fmla v31.4s, v3.4s, v7.s[3] | |||
| ld1 {v4.4s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -243,22 +243,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x8_E | |||
| fmla v16.4s, v2.4s, v6.4s[0] | |||
| fmla v17.4s, v3.4s, v6.4s[0] | |||
| fmla v18.4s, v2.4s, v6.4s[1] | |||
| fmla v19.4s, v3.4s, v6.4s[1] | |||
| fmla v20.4s, v2.4s, v6.4s[2] | |||
| fmla v21.4s, v3.4s, v6.4s[2] | |||
| fmla v22.4s, v2.4s, v6.4s[3] | |||
| fmla v23.4s, v3.4s, v6.4s[3] | |||
| fmla v24.4s, v2.4s, v7.4s[0] | |||
| fmla v25.4s, v3.4s, v7.4s[0] | |||
| fmla v26.4s, v2.4s, v7.4s[1] | |||
| fmla v27.4s, v3.4s, v7.4s[1] | |||
| fmla v28.4s, v2.4s, v7.4s[2] | |||
| fmla v29.4s, v3.4s, v7.4s[2] | |||
| fmla v30.4s, v2.4s, v7.4s[3] | |||
| fmla v31.4s, v3.4s, v7.4s[3] | |||
| fmla v16.4s, v2.4s, v6.s[0] | |||
| fmla v17.4s, v3.4s, v6.s[0] | |||
| fmla v18.4s, v2.4s, v6.s[1] | |||
| fmla v19.4s, v3.4s, v6.s[1] | |||
| fmla v20.4s, v2.4s, v6.s[2] | |||
| fmla v21.4s, v3.4s, v6.s[2] | |||
| fmla v22.4s, v2.4s, v6.s[3] | |||
| fmla v23.4s, v3.4s, v6.s[3] | |||
| fmla v24.4s, v2.4s, v7.s[0] | |||
| fmla v25.4s, v3.4s, v7.s[0] | |||
| fmla v26.4s, v2.4s, v7.s[1] | |||
| fmla v27.4s, v3.4s, v7.s[1] | |||
| fmla v28.4s, v2.4s, v7.s[2] | |||
| fmla v29.4s, v3.4s, v7.s[2] | |||
| fmla v30.4s, v2.4s, v7.s[3] | |||
| fmla v31.4s, v3.4s, v7.s[3] | |||
| .endm | |||
| .macro KERNEL8x8_SUB | |||
| @@ -271,22 +271,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v1.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.4s, v0.4s, v4.4s[0] | |||
| fmla v17.4s, v1.4s, v4.4s[0] | |||
| fmla v18.4s, v0.4s, v4.4s[1] | |||
| fmla v19.4s, v1.4s, v4.4s[1] | |||
| fmla v20.4s, v0.4s, v4.4s[2] | |||
| fmla v21.4s, v1.4s, v4.4s[2] | |||
| fmla v22.4s, v0.4s, v4.4s[3] | |||
| fmla v23.4s, v1.4s, v4.4s[3] | |||
| fmla v24.4s, v0.4s, v5.4s[0] | |||
| fmla v25.4s, v1.4s, v5.4s[0] | |||
| fmla v26.4s, v0.4s, v5.4s[1] | |||
| fmla v27.4s, v1.4s, v5.4s[1] | |||
| fmla v28.4s, v0.4s, v5.4s[2] | |||
| fmla v29.4s, v1.4s, v5.4s[2] | |||
| fmla v30.4s, v0.4s, v5.4s[3] | |||
| fmla v31.4s, v1.4s, v5.4s[3] | |||
| fmla v16.4s, v0.4s, v4.s[0] | |||
| fmla v17.4s, v1.4s, v4.s[0] | |||
| fmla v18.4s, v0.4s, v4.s[1] | |||
| fmla v19.4s, v1.4s, v4.s[1] | |||
| fmla v20.4s, v0.4s, v4.s[2] | |||
| fmla v21.4s, v1.4s, v4.s[2] | |||
| fmla v22.4s, v0.4s, v4.s[3] | |||
| fmla v23.4s, v1.4s, v4.s[3] | |||
| fmla v24.4s, v0.4s, v5.s[0] | |||
| fmla v25.4s, v1.4s, v5.s[0] | |||
| fmla v26.4s, v0.4s, v5.s[1] | |||
| fmla v27.4s, v1.4s, v5.s[1] | |||
| fmla v28.4s, v0.4s, v5.s[2] | |||
| fmla v29.4s, v1.4s, v5.s[2] | |||
| fmla v30.4s, v0.4s, v5.s[3] | |||
| fmla v31.4s, v1.4s, v5.s[3] | |||
| .endm | |||
| .macro SAVE8x8 | |||
| @@ -361,14 +361,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmul v16.4s, v0.4s, v4.4s[0] | |||
| fmul v18.4s, v0.4s, v4.4s[1] | |||
| fmul v20.4s, v0.4s, v4.4s[2] | |||
| fmul v22.4s, v0.4s, v4.4s[3] | |||
| fmul v24.4s, v0.4s, v5.4s[0] | |||
| fmul v26.4s, v0.4s, v5.4s[1] | |||
| fmul v28.4s, v0.4s, v5.4s[2] | |||
| fmul v30.4s, v0.4s, v5.4s[3] | |||
| fmul v16.4s, v0.4s, v4.s[0] | |||
| fmul v18.4s, v0.4s, v4.s[1] | |||
| fmul v20.4s, v0.4s, v4.s[2] | |||
| fmul v22.4s, v0.4s, v4.s[3] | |||
| fmul v24.4s, v0.4s, v5.s[0] | |||
| fmul v26.4s, v0.4s, v5.s[1] | |||
| fmul v28.4s, v0.4s, v5.s[2] | |||
| fmul v30.4s, v0.4s, v5.s[3] | |||
| ld1 {v6.4s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -379,14 +379,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x8_M1 | |||
| fmla v16.4s, v0.4s, v4.4s[0] | |||
| fmla v18.4s, v0.4s, v4.4s[1] | |||
| fmla v20.4s, v0.4s, v4.4s[2] | |||
| fmla v22.4s, v0.4s, v4.4s[3] | |||
| fmla v24.4s, v0.4s, v5.4s[0] | |||
| fmla v26.4s, v0.4s, v5.4s[1] | |||
| fmla v28.4s, v0.4s, v5.4s[2] | |||
| fmla v30.4s, v0.4s, v5.4s[3] | |||
| fmla v16.4s, v0.4s, v4.s[0] | |||
| fmla v18.4s, v0.4s, v4.s[1] | |||
| fmla v20.4s, v0.4s, v4.s[2] | |||
| fmla v22.4s, v0.4s, v4.s[3] | |||
| fmla v24.4s, v0.4s, v5.s[0] | |||
| fmla v26.4s, v0.4s, v5.s[1] | |||
| fmla v28.4s, v0.4s, v5.s[2] | |||
| fmla v30.4s, v0.4s, v5.s[3] | |||
| ld1 {v6.4s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -397,14 +397,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x8_M2 | |||
| fmla v16.4s, v2.4s, v6.4s[0] | |||
| fmla v18.4s, v2.4s, v6.4s[1] | |||
| fmla v20.4s, v2.4s, v6.4s[2] | |||
| fmla v22.4s, v2.4s, v6.4s[3] | |||
| fmla v24.4s, v2.4s, v7.4s[0] | |||
| fmla v26.4s, v2.4s, v7.4s[1] | |||
| fmla v28.4s, v2.4s, v7.4s[2] | |||
| fmla v30.4s, v2.4s, v7.4s[3] | |||
| fmla v16.4s, v2.4s, v6.s[0] | |||
| fmla v18.4s, v2.4s, v6.s[1] | |||
| fmla v20.4s, v2.4s, v6.s[2] | |||
| fmla v22.4s, v2.4s, v6.s[3] | |||
| fmla v24.4s, v2.4s, v7.s[0] | |||
| fmla v26.4s, v2.4s, v7.s[1] | |||
| fmla v28.4s, v2.4s, v7.s[2] | |||
| fmla v30.4s, v2.4s, v7.s[3] | |||
| ld1 {v4.4s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -415,14 +415,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x8_E | |||
| fmla v16.4s, v2.4s, v6.4s[0] | |||
| fmla v18.4s, v2.4s, v6.4s[1] | |||
| fmla v20.4s, v2.4s, v6.4s[2] | |||
| fmla v22.4s, v2.4s, v6.4s[3] | |||
| fmla v24.4s, v2.4s, v7.4s[0] | |||
| fmla v26.4s, v2.4s, v7.4s[1] | |||
| fmla v28.4s, v2.4s, v7.4s[2] | |||
| fmla v30.4s, v2.4s, v7.4s[3] | |||
| fmla v16.4s, v2.4s, v6.s[0] | |||
| fmla v18.4s, v2.4s, v6.s[1] | |||
| fmla v20.4s, v2.4s, v6.s[2] | |||
| fmla v22.4s, v2.4s, v6.s[3] | |||
| fmla v24.4s, v2.4s, v7.s[0] | |||
| fmla v26.4s, v2.4s, v7.s[1] | |||
| fmla v28.4s, v2.4s, v7.s[2] | |||
| fmla v30.4s, v2.4s, v7.s[3] | |||
| .endm | |||
| .macro KERNEL4x8_SUB | |||
| @@ -433,14 +433,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.4s, v0.4s, v4.4s[0] | |||
| fmla v18.4s, v0.4s, v4.4s[1] | |||
| fmla v20.4s, v0.4s, v4.4s[2] | |||
| fmla v22.4s, v0.4s, v4.4s[3] | |||
| fmla v24.4s, v0.4s, v5.4s[0] | |||
| fmla v26.4s, v0.4s, v5.4s[1] | |||
| fmla v28.4s, v0.4s, v5.4s[2] | |||
| fmla v30.4s, v0.4s, v5.4s[3] | |||
| fmla v16.4s, v0.4s, v4.s[0] | |||
| fmla v18.4s, v0.4s, v4.s[1] | |||
| fmla v20.4s, v0.4s, v4.s[2] | |||
| fmla v22.4s, v0.4s, v4.s[3] | |||
| fmla v24.4s, v0.4s, v5.s[0] | |||
| fmla v26.4s, v0.4s, v5.s[1] | |||
| fmla v28.4s, v0.4s, v5.s[2] | |||
| fmla v30.4s, v0.4s, v5.s[3] | |||
| .endm | |||
| .macro SAVE4x8 | |||
| @@ -514,14 +514,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s}, [pA] | |||
| add pA, pA, #8 | |||
| fmla v16.2s, v0.2s, v4.4s[0] | |||
| fmla v18.2s, v0.2s, v4.4s[1] | |||
| fmla v20.2s, v0.2s, v4.4s[2] | |||
| fmla v22.2s, v0.2s, v4.4s[3] | |||
| fmla v24.2s, v0.2s, v5.4s[0] | |||
| fmla v26.2s, v0.2s, v5.4s[1] | |||
| fmla v28.2s, v0.2s, v5.4s[2] | |||
| fmla v30.2s, v0.2s, v5.4s[3] | |||
| fmla v16.2s, v0.2s, v4.s[0] | |||
| fmla v18.2s, v0.2s, v4.s[1] | |||
| fmla v20.2s, v0.2s, v4.s[2] | |||
| fmla v22.2s, v0.2s, v4.s[3] | |||
| fmla v24.2s, v0.2s, v5.s[0] | |||
| fmla v26.2s, v0.2s, v5.s[1] | |||
| fmla v28.2s, v0.2s, v5.s[2] | |||
| fmla v30.2s, v0.2s, v5.s[3] | |||
| .endm | |||
| .macro SAVE2x8 | |||
| @@ -595,14 +595,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ldr s0, [pA] | |||
| add pA, pA, #4 | |||
| fmla s16, s0, v4.4s[0] | |||
| fmla s18, s0, v4.4s[1] | |||
| fmla s20, s0, v4.4s[2] | |||
| fmla s22, s0, v4.4s[3] | |||
| fmla s24, s0, v5.4s[0] | |||
| fmla s26, s0, v5.4s[1] | |||
| fmla s28, s0, v5.4s[2] | |||
| fmla s30, s0, v5.4s[3] | |||
| fmla s16, s0, v4.s[0] | |||
| fmla s18, s0, v4.s[1] | |||
| fmla s20, s0, v4.s[2] | |||
| fmla s22, s0, v4.s[3] | |||
| fmla s24, s0, v5.s[0] | |||
| fmla s26, s0, v5.s[1] | |||
| fmla s28, s0, v5.s[2] | |||
| fmla s30, s0, v5.s[3] | |||
| .endm | |||
| .macro SAVE1x8 | |||
| @@ -676,14 +676,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v1.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmul v16.4s, v0.4s, v8.2s[0] | |||
| fmul v17.4s, v1.4s, v8.2s[0] | |||
| fmul v20.4s, v0.4s, v8.2s[1] | |||
| fmul v21.4s, v1.4s, v8.2s[1] | |||
| fmul v24.4s, v0.4s, v9.2s[0] | |||
| fmul v25.4s, v1.4s, v9.2s[0] | |||
| fmul v28.4s, v0.4s, v9.2s[1] | |||
| fmul v29.4s, v1.4s, v9.2s[1] | |||
| fmul v16.4s, v0.4s, v8.s[0] | |||
| fmul v17.4s, v1.4s, v8.s[0] | |||
| fmul v20.4s, v0.4s, v8.s[1] | |||
| fmul v21.4s, v1.4s, v8.s[1] | |||
| fmul v24.4s, v0.4s, v9.s[0] | |||
| fmul v25.4s, v1.4s, v9.s[0] | |||
| fmul v28.4s, v0.4s, v9.s[1] | |||
| fmul v29.4s, v1.4s, v9.s[1] | |||
| ld1 {v12.2s, v13.2s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -694,14 +694,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x4_M1 | |||
| fmla v16.4s, v0.4s, v8.2s[0] | |||
| fmla v17.4s, v1.4s, v8.2s[0] | |||
| fmla v20.4s, v0.4s, v8.2s[1] | |||
| fmla v21.4s, v1.4s, v8.2s[1] | |||
| fmla v24.4s, v0.4s, v9.2s[0] | |||
| fmla v25.4s, v1.4s, v9.2s[0] | |||
| fmla v28.4s, v0.4s, v9.2s[1] | |||
| fmla v29.4s, v1.4s, v9.2s[1] | |||
| fmla v16.4s, v0.4s, v8.s[0] | |||
| fmla v17.4s, v1.4s, v8.s[0] | |||
| fmla v20.4s, v0.4s, v8.s[1] | |||
| fmla v21.4s, v1.4s, v8.s[1] | |||
| fmla v24.4s, v0.4s, v9.s[0] | |||
| fmla v25.4s, v1.4s, v9.s[0] | |||
| fmla v28.4s, v0.4s, v9.s[1] | |||
| fmla v29.4s, v1.4s, v9.s[1] | |||
| ld1 {v12.2s, v13.2s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -712,14 +712,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x4_M2 | |||
| fmla v16.4s, v4.4s, v12.2s[0] | |||
| fmla v17.4s, v5.4s, v12.2s[0] | |||
| fmla v20.4s, v4.4s, v12.2s[1] | |||
| fmla v21.4s, v5.4s, v12.2s[1] | |||
| fmla v24.4s, v4.4s, v13.2s[0] | |||
| fmla v25.4s, v5.4s, v13.2s[0] | |||
| fmla v28.4s, v4.4s, v13.2s[1] | |||
| fmla v29.4s, v5.4s, v13.2s[1] | |||
| fmla v16.4s, v4.4s, v12.s[0] | |||
| fmla v17.4s, v5.4s, v12.s[0] | |||
| fmla v20.4s, v4.4s, v12.s[1] | |||
| fmla v21.4s, v5.4s, v12.s[1] | |||
| fmla v24.4s, v4.4s, v13.s[0] | |||
| fmla v25.4s, v5.4s, v13.s[0] | |||
| fmla v28.4s, v4.4s, v13.s[1] | |||
| fmla v29.4s, v5.4s, v13.s[1] | |||
| ld1 {v8.2s, v9.2s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -730,14 +730,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL8x4_E | |||
| fmla v16.4s, v4.4s, v12.2s[0] | |||
| fmla v17.4s, v5.4s, v12.2s[0] | |||
| fmla v20.4s, v4.4s, v12.2s[1] | |||
| fmla v21.4s, v5.4s, v12.2s[1] | |||
| fmla v24.4s, v4.4s, v13.2s[0] | |||
| fmla v25.4s, v5.4s, v13.2s[0] | |||
| fmla v28.4s, v4.4s, v13.2s[1] | |||
| fmla v29.4s, v5.4s, v13.2s[1] | |||
| fmla v16.4s, v4.4s, v12.s[0] | |||
| fmla v17.4s, v5.4s, v12.s[0] | |||
| fmla v20.4s, v4.4s, v12.s[1] | |||
| fmla v21.4s, v5.4s, v12.s[1] | |||
| fmla v24.4s, v4.4s, v13.s[0] | |||
| fmla v25.4s, v5.4s, v13.s[0] | |||
| fmla v28.4s, v4.4s, v13.s[1] | |||
| fmla v29.4s, v5.4s, v13.s[1] | |||
| .endm | |||
| .macro KERNEL8x4_SUB | |||
| @@ -748,14 +748,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v1.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.4s, v0.4s, v8.2s[0] | |||
| fmla v17.4s, v1.4s, v8.2s[0] | |||
| fmla v20.4s, v0.4s, v8.2s[1] | |||
| fmla v21.4s, v1.4s, v8.2s[1] | |||
| fmla v24.4s, v0.4s, v9.2s[0] | |||
| fmla v25.4s, v1.4s, v9.2s[0] | |||
| fmla v28.4s, v0.4s, v9.2s[1] | |||
| fmla v29.4s, v1.4s, v9.2s[1] | |||
| fmla v16.4s, v0.4s, v8.s[0] | |||
| fmla v17.4s, v1.4s, v8.s[0] | |||
| fmla v20.4s, v0.4s, v8.s[1] | |||
| fmla v21.4s, v1.4s, v8.s[1] | |||
| fmla v24.4s, v0.4s, v9.s[0] | |||
| fmla v25.4s, v1.4s, v9.s[0] | |||
| fmla v28.4s, v0.4s, v9.s[1] | |||
| fmla v29.4s, v1.4s, v9.s[1] | |||
| .endm | |||
| .macro SAVE8x4 | |||
| @@ -808,17 +808,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s, v1.2s}, [pA] | |||
| add pA, pA, #16 | |||
| fmul v16.2s, v0.2s, v8.2s[0] | |||
| fmul v29.2s, v1.2s, v9.2s[1] | |||
| fmul v16.2s, v0.2s, v8.s[0] | |||
| fmul v29.2s, v1.2s, v9.s[1] | |||
| fmul v20.2s, v0.2s, v8.2s[1] | |||
| fmul v25.2s, v1.2s, v9.2s[0] | |||
| fmul v20.2s, v0.2s, v8.s[1] | |||
| fmul v25.2s, v1.2s, v9.s[0] | |||
| fmul v24.2s, v0.2s, v9.2s[0] | |||
| fmul v21.2s, v1.2s, v8.2s[1] | |||
| fmul v24.2s, v0.2s, v9.s[0] | |||
| fmul v21.2s, v1.2s, v8.s[1] | |||
| fmul v28.2s, v0.2s, v9.2s[1] | |||
| fmul v17.2s, v1.2s, v8.2s[0] | |||
| fmul v28.2s, v0.2s, v9.s[1] | |||
| fmul v17.2s, v1.2s, v8.s[0] | |||
| ld1 {v12.2s, v13.2s}, [pB] | |||
| add pB, pB, #16 | |||
| @@ -827,61 +827,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x4_M1 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v29.2s, v1.2s, v9.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v29.2s, v1.2s, v9.s[1] | |||
| ld1 {v12.2s, v13.2s}, [pB] // For next round | |||
| add pB, pB, #16 | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v25.2s, v1.2s, v9.2s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| fmla v25.2s, v1.2s, v9.s[0] | |||
| ld1 {v4.2s, v5.2s}, [pA] // For next round | |||
| add pA, pA, #16 | |||
| fmla v24.2s, v0.2s, v9.2s[0] | |||
| fmla v21.2s, v1.2s, v8.2s[1] | |||
| fmla v24.2s, v0.2s, v9.s[0] | |||
| fmla v21.2s, v1.2s, v8.s[1] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| fmla v28.2s, v0.2s, v9.2s[1] | |||
| fmla v17.2s, v1.2s, v8.2s[0] | |||
| fmla v28.2s, v0.2s, v9.s[1] | |||
| fmla v17.2s, v1.2s, v8.s[0] | |||
| .endm | |||
| .macro KERNEL4x4_M2 | |||
| fmla v16.2s, v4.2s, v12.2s[0] | |||
| fmla v29.2s, v5.2s, v13.2s[1] | |||
| fmla v16.2s, v4.2s, v12.s[0] | |||
| fmla v29.2s, v5.2s, v13.s[1] | |||
| ld1 {v8.2s, v9.2s}, [pB] // For next round | |||
| add pB, pB, #16 | |||
| fmla v20.2s, v4.2s, v12.2s[1] | |||
| fmla v25.2s, v5.2s, v13.2s[0] | |||
| fmla v20.2s, v4.2s, v12.s[1] | |||
| fmla v25.2s, v5.2s, v13.s[0] | |||
| ld1 {v0.2s, v1.2s}, [pA] // For next round | |||
| add pA, pA, #16 | |||
| fmla v24.2s, v4.2s, v13.2s[0] | |||
| fmla v21.2s, v5.2s, v12.2s[1] | |||
| fmla v24.2s, v4.2s, v13.s[0] | |||
| fmla v21.2s, v5.2s, v12.s[1] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| fmla v28.2s, v4.2s, v13.2s[1] | |||
| fmla v17.2s, v5.2s, v12.2s[0] | |||
| fmla v28.2s, v4.2s, v13.s[1] | |||
| fmla v17.2s, v5.2s, v12.s[0] | |||
| .endm | |||
| .macro KERNEL4x4_E | |||
| fmla v16.2s, v4.2s, v12.2s[0] | |||
| fmla v29.2s, v5.2s, v13.2s[1] | |||
| fmla v16.2s, v4.2s, v12.s[0] | |||
| fmla v29.2s, v5.2s, v13.s[1] | |||
| fmla v20.2s, v4.2s, v12.2s[1] | |||
| fmla v25.2s, v5.2s, v13.2s[0] | |||
| fmla v20.2s, v4.2s, v12.s[1] | |||
| fmla v25.2s, v5.2s, v13.s[0] | |||
| fmla v24.2s, v4.2s, v13.2s[0] | |||
| fmla v21.2s, v5.2s, v12.2s[1] | |||
| fmla v24.2s, v4.2s, v13.s[0] | |||
| fmla v21.2s, v5.2s, v12.s[1] | |||
| fmla v28.2s, v4.2s, v13.2s[1] | |||
| fmla v17.2s, v5.2s, v12.2s[0] | |||
| fmla v28.2s, v4.2s, v13.s[1] | |||
| fmla v17.2s, v5.2s, v12.s[0] | |||
| .endm | |||
| .macro KERNEL4x4_SUB | |||
| @@ -890,17 +890,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s, v1.2s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v29.2s, v1.2s, v9.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v29.2s, v1.2s, v9.s[1] | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v25.2s, v1.2s, v9.2s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| fmla v25.2s, v1.2s, v9.s[0] | |||
| fmla v24.2s, v0.2s, v9.2s[0] | |||
| fmla v21.2s, v1.2s, v8.2s[1] | |||
| fmla v24.2s, v0.2s, v9.s[0] | |||
| fmla v21.2s, v1.2s, v8.s[1] | |||
| fmla v28.2s, v0.2s, v9.2s[1] | |||
| fmla v17.2s, v1.2s, v8.2s[0] | |||
| fmla v28.2s, v0.2s, v9.s[1] | |||
| fmla v17.2s, v1.2s, v8.s[0] | |||
| .endm | |||
| .macro SAVE4x4 | |||
| @@ -945,10 +945,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s}, [pA] | |||
| add pA, pA, #8 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v24.2s, v0.2s, v9.2s[0] | |||
| fmla v28.2s, v0.2s, v9.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| fmla v24.2s, v0.2s, v9.s[0] | |||
| fmla v28.2s, v0.2s, v9.s[1] | |||
| .endm | |||
| .macro SAVE2x4 | |||
| @@ -1028,11 +1028,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v1.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.4s, v0.4s, v8.2s[0] | |||
| fmla v17.4s, v1.4s, v8.2s[0] | |||
| fmla v16.4s, v0.4s, v8.s[0] | |||
| fmla v17.4s, v1.4s, v8.s[0] | |||
| fmla v20.4s, v0.4s, v8.2s[1] | |||
| fmla v21.4s, v1.4s, v8.2s[1] | |||
| fmla v20.4s, v0.4s, v8.s[1] | |||
| fmla v21.4s, v1.4s, v8.s[1] | |||
| .endm | |||
| .macro SAVE8x2 | |||
| @@ -1068,10 +1068,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s, v1.2s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v17.2s, v1.2s, v8.2s[0] | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v21.2s, v1.2s, v8.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v17.2s, v1.2s, v8.s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| fmla v21.2s, v1.2s, v8.s[1] | |||
| .endm | |||
| .macro SAVE4x2 | |||
| @@ -1103,8 +1103,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s}, [pA] | |||
| add pA, pA, #8 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v20.2s, v0.2s, v8.2s[1] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v20.2s, v0.2s, v8.s[1] | |||
| .endm | |||
| .macro SAVE2x2 | |||
| @@ -1133,7 +1133,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ldr s0 , [pA] | |||
| add pA, pA, #4 | |||
| fmla v16.2s, v8.2s, v0.2s[0] | |||
| fmla v16.2s, v8.2s, v0.s[0] | |||
| .endm | |||
| .macro SAVE1x2 | |||
| @@ -1163,8 +1163,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v1.4s}, [pA] | |||
| add pA, pA, #16 | |||
| fmla v16.4s, v0.4s, v8.2s[0] | |||
| fmla v17.4s, v1.4s, v8.2s[0] | |||
| fmla v16.4s, v0.4s, v8.s[0] | |||
| fmla v17.4s, v1.4s, v8.s[0] | |||
| .endm | |||
| .macro SAVE8x1 | |||
| @@ -1190,8 +1190,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s, v1.2s}, [pA] | |||
| add pA , pA, #16 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v17.2s, v1.2s, v8.2s[0] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| fmla v17.2s, v1.2s, v8.s[0] | |||
| .endm | |||
| .macro SAVE4x1 | |||
| @@ -1216,7 +1216,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld1 {v0.2s}, [pA] | |||
| add pA , pA, #8 | |||
| fmla v16.2s, v0.2s, v8.2s[0] | |||
| fmla v16.2s, v0.2s, v8.s[0] | |||
| .endm | |||
| .macro SAVE2x1 | |||
| @@ -182,93 +182,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmul v16.2d, v0.2d, v8.2d[0] | |||
| OP_ii v16.2d, v1.2d, v9.2d[0] | |||
| fmul v16.2d, v0.2d, v8.d[0] | |||
| OP_ii v16.2d, v1.2d, v9.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v17.16b, v17.16b, v17.16b | |||
| fmls v17.2d, v0.2d, v9.2d[0] | |||
| fmls v17.2d, v0.2d, v9.d[0] | |||
| #else | |||
| fmul v17.2d, v0.2d, v9.2d[0] | |||
| fmul v17.2d, v0.2d, v9.d[0] | |||
| #endif | |||
| OP_ir v17.2d, v1.2d, v8.2d[0] | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| fmul v18.2d, v2.2d, v8.2d[0] | |||
| OP_ii v18.2d, v3.2d, v9.2d[0] | |||
| fmul v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v19.16b, v19.16b, v19.16b | |||
| fmls v19.2d, v2.2d, v9.2d[0] | |||
| fmls v19.2d, v2.2d, v9.d[0] | |||
| #else | |||
| fmul v19.2d, v2.2d, v9.2d[0] | |||
| fmul v19.2d, v2.2d, v9.d[0] | |||
| #endif | |||
| OP_ir v19.2d, v3.2d, v8.2d[0] | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| fmul v20.2d, v0.2d, v8.2d[1] | |||
| OP_ii v20.2d, v1.2d, v9.2d[1] | |||
| fmul v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v21.16b, v21.16b, v21.16b | |||
| fmls v21.2d, v0.2d, v9.2d[1] | |||
| fmls v21.2d, v0.2d, v9.d[1] | |||
| #else | |||
| fmul v21.2d, v0.2d, v9.2d[1] | |||
| fmul v21.2d, v0.2d, v9.d[1] | |||
| #endif | |||
| OP_ir v21.2d, v1.2d, v8.2d[1] | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| fmul v22.2d, v2.2d, v8.2d[1] | |||
| OP_ii v22.2d, v3.2d, v9.2d[1] | |||
| fmul v22.2d, v2.2d, v8.d[1] | |||
| OP_ii v22.2d, v3.2d, v9.d[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v23.16b, v23.16b, v23.16b | |||
| fmls v23.2d, v2.2d, v9.2d[1] | |||
| fmls v23.2d, v2.2d, v9.d[1] | |||
| #else | |||
| fmul v23.2d, v2.2d, v9.2d[1] | |||
| fmul v23.2d, v2.2d, v9.d[1] | |||
| #endif | |||
| OP_ir v23.2d, v3.2d, v8.2d[1] | |||
| OP_ir v23.2d, v3.2d, v8.d[1] | |||
| fmul v24.2d, v0.2d, v10.2d[0] | |||
| OP_ii v24.2d, v1.2d, v11.2d[0] | |||
| fmul v24.2d, v0.2d, v10.d[0] | |||
| OP_ii v24.2d, v1.2d, v11.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fmls v25.2d, v0.2d, v11.2d[0] | |||
| fmls v25.2d, v0.2d, v11.d[0] | |||
| #else | |||
| fmul v25.2d, v0.2d, v11.2d[0] | |||
| fmul v25.2d, v0.2d, v11.d[0] | |||
| #endif | |||
| OP_ir v25.2d, v1.2d, v10.2d[0] | |||
| OP_ir v25.2d, v1.2d, v10.d[0] | |||
| fmul v26.2d, v2.2d, v10.2d[0] | |||
| OP_ii v26.2d, v3.2d, v11.2d[0] | |||
| fmul v26.2d, v2.2d, v10.d[0] | |||
| OP_ii v26.2d, v3.2d, v11.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v27.16b, v27.16b, v27.16b | |||
| fmls v27.2d, v2.2d, v11.2d[0] | |||
| fmls v27.2d, v2.2d, v11.d[0] | |||
| #else | |||
| fmul v27.2d, v2.2d, v11.2d[0] | |||
| fmul v27.2d, v2.2d, v11.d[0] | |||
| #endif | |||
| OP_ir v27.2d, v3.2d, v10.2d[0] | |||
| OP_ir v27.2d, v3.2d, v10.d[0] | |||
| fmul v28.2d, v0.2d, v10.2d[1] | |||
| OP_ii v28.2d, v1.2d, v11.2d[1] | |||
| fmul v28.2d, v0.2d, v10.d[1] | |||
| OP_ii v28.2d, v1.2d, v11.d[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v29.16b, v29.16b, v29.16b | |||
| fmls v29.2d, v0.2d, v11.2d[1] | |||
| fmls v29.2d, v0.2d, v11.d[1] | |||
| #else | |||
| fmul v29.2d, v0.2d, v11.2d[1] | |||
| fmul v29.2d, v0.2d, v11.d[1] | |||
| #endif | |||
| OP_ir v29.2d, v1.2d, v10.2d[1] | |||
| OP_ir v29.2d, v1.2d, v10.d[1] | |||
| fmul v30.2d, v2.2d, v10.2d[1] | |||
| OP_ii v30.2d, v3.2d, v11.2d[1] | |||
| fmul v30.2d, v2.2d, v10.d[1] | |||
| OP_ii v30.2d, v3.2d, v11.d[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v31.16b, v31.16b, v31.16b | |||
| fmls v31.2d, v2.2d, v11.2d[1] | |||
| fmls v31.2d, v2.2d, v11.d[1] | |||
| #else | |||
| fmul v31.2d, v2.2d, v11.2d[1] | |||
| fmul v31.2d, v2.2d, v11.d[1] | |||
| #endif | |||
| OP_ir v31.2d, v3.2d, v10.2d[1] | |||
| OP_ir v31.2d, v3.2d, v10.d[1] | |||
| ld2 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| @@ -281,161 +281,161 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x4_M1 | |||
| OP_rr v16.2d, v0.2d, v8.2d[0] | |||
| OP_ii v16.2d, v1.2d, v9.2d[0] | |||
| OP_ri v17.2d, v0.2d, v9.2d[0] | |||
| OP_ir v17.2d, v1.2d, v8.2d[0] | |||
| OP_rr v16.2d, v0.2d, v8.d[0] | |||
| OP_ii v16.2d, v1.2d, v9.d[0] | |||
| OP_ri v17.2d, v0.2d, v9.d[0] | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| ld2 {v12.2d, v13.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| OP_rr v18.2d, v2.2d, v8.2d[0] | |||
| OP_ii v18.2d, v3.2d, v9.2d[0] | |||
| OP_ri v19.2d, v2.2d, v9.2d[0] | |||
| OP_ir v19.2d, v3.2d, v8.2d[0] | |||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| ld2 {v14.2d, v15.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| OP_rr v20.2d, v0.2d, v8.2d[1] | |||
| OP_ii v20.2d, v1.2d, v9.2d[1] | |||
| OP_ri v21.2d, v0.2d, v9.2d[1] | |||
| OP_ir v21.2d, v1.2d, v8.2d[1] | |||
| OP_rr v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| OP_ri v21.2d, v0.2d, v9.d[1] | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| ld2 {v4.2d, v5.2d} , [pA] // For next round | |||
| add pA, pA, #32 | |||
| OP_rr v22.2d, v2.2d, v8.2d[1] | |||
| OP_ii v22.2d, v3.2d, v9.2d[1] | |||
| OP_ri v23.2d, v2.2d, v9.2d[1] | |||
| OP_ir v23.2d, v3.2d, v8.2d[1] | |||
| OP_rr v22.2d, v2.2d, v8.d[1] | |||
| OP_ii v22.2d, v3.2d, v9.d[1] | |||
| OP_ri v23.2d, v2.2d, v9.d[1] | |||
| OP_ir v23.2d, v3.2d, v8.d[1] | |||
| ld2 {v6.2d, v7.2d} , [pA] // For next round | |||
| add pA, pA, #32 | |||
| OP_rr v24.2d, v0.2d, v10.2d[0] | |||
| OP_ii v24.2d, v1.2d, v11.2d[0] | |||
| OP_ri v25.2d, v0.2d, v11.2d[0] | |||
| OP_ir v25.2d, v1.2d, v10.2d[0] | |||
| OP_rr v24.2d, v0.2d, v10.d[0] | |||
| OP_ii v24.2d, v1.2d, v11.d[0] | |||
| OP_ri v25.2d, v0.2d, v11.d[0] | |||
| OP_ir v25.2d, v1.2d, v10.d[0] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| OP_rr v26.2d, v2.2d, v10.2d[0] | |||
| OP_ii v26.2d, v3.2d, v11.2d[0] | |||
| OP_ri v27.2d, v2.2d, v11.2d[0] | |||
| OP_ir v27.2d, v3.2d, v10.2d[0] | |||
| OP_rr v26.2d, v2.2d, v10.d[0] | |||
| OP_ii v26.2d, v3.2d, v11.d[0] | |||
| OP_ri v27.2d, v2.2d, v11.d[0] | |||
| OP_ir v27.2d, v3.2d, v10.d[0] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| OP_rr v28.2d, v0.2d, v10.2d[1] | |||
| OP_ii v28.2d, v1.2d, v11.2d[1] | |||
| OP_ri v29.2d, v0.2d, v11.2d[1] | |||
| OP_ir v29.2d, v1.2d, v10.2d[1] | |||
| OP_rr v28.2d, v0.2d, v10.d[1] | |||
| OP_ii v28.2d, v1.2d, v11.d[1] | |||
| OP_ri v29.2d, v0.2d, v11.d[1] | |||
| OP_ir v29.2d, v1.2d, v10.d[1] | |||
| OP_rr v30.2d, v2.2d, v10.2d[1] | |||
| OP_ii v30.2d, v3.2d, v11.2d[1] | |||
| OP_ri v31.2d, v2.2d, v11.2d[1] | |||
| OP_ir v31.2d, v3.2d, v10.2d[1] | |||
| OP_rr v30.2d, v2.2d, v10.d[1] | |||
| OP_ii v30.2d, v3.2d, v11.d[1] | |||
| OP_ri v31.2d, v2.2d, v11.d[1] | |||
| OP_ir v31.2d, v3.2d, v10.d[1] | |||
| .endm | |||
| .macro KERNEL4x4_M2 | |||
| OP_rr v16.2d, v4.2d, v12.2d[0] | |||
| OP_ii v16.2d, v5.2d, v13.2d[0] | |||
| OP_ri v17.2d, v4.2d, v13.2d[0] | |||
| OP_ir v17.2d, v5.2d, v12.2d[0] | |||
| OP_rr v16.2d, v4.2d, v12.d[0] | |||
| OP_ii v16.2d, v5.2d, v13.d[0] | |||
| OP_ri v17.2d, v4.2d, v13.d[0] | |||
| OP_ir v17.2d, v5.2d, v12.d[0] | |||
| ld2 {v8.2d, v9.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| OP_rr v18.2d, v6.2d, v12.2d[0] | |||
| OP_ii v18.2d, v7.2d, v13.2d[0] | |||
| OP_ri v19.2d, v6.2d, v13.2d[0] | |||
| OP_ir v19.2d, v7.2d, v12.2d[0] | |||
| OP_rr v18.2d, v6.2d, v12.d[0] | |||
| OP_ii v18.2d, v7.2d, v13.d[0] | |||
| OP_ri v19.2d, v6.2d, v13.d[0] | |||
| OP_ir v19.2d, v7.2d, v12.d[0] | |||
| ld2 {v10.2d, v11.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| OP_rr v20.2d, v4.2d, v12.2d[1] | |||
| OP_ii v20.2d, v5.2d, v13.2d[1] | |||
| OP_ri v21.2d, v4.2d, v13.2d[1] | |||
| OP_ir v21.2d, v5.2d, v12.2d[1] | |||
| OP_rr v20.2d, v4.2d, v12.d[1] | |||
| OP_ii v20.2d, v5.2d, v13.d[1] | |||
| OP_ri v21.2d, v4.2d, v13.d[1] | |||
| OP_ir v21.2d, v5.2d, v12.d[1] | |||
| ld2 {v0.2d, v1.2d}, [pA] // For next round | |||
| add pA, pA, #32 | |||
| OP_rr v22.2d, v6.2d, v12.2d[1] | |||
| OP_ii v22.2d, v7.2d, v13.2d[1] | |||
| OP_ri v23.2d, v6.2d, v13.2d[1] | |||
| OP_ir v23.2d, v7.2d, v12.2d[1] | |||
| OP_rr v22.2d, v6.2d, v12.d[1] | |||
| OP_ii v22.2d, v7.2d, v13.d[1] | |||
| OP_ri v23.2d, v6.2d, v13.d[1] | |||
| OP_ir v23.2d, v7.2d, v12.d[1] | |||
| ld2 {v2.2d, v3.2d}, [pA] // For next round | |||
| add pA, pA, #32 | |||
| OP_rr v24.2d, v4.2d, v14.2d[0] | |||
| OP_ii v24.2d, v5.2d, v15.2d[0] | |||
| OP_ri v25.2d, v4.2d, v15.2d[0] | |||
| OP_ir v25.2d, v5.2d, v14.2d[0] | |||
| OP_rr v24.2d, v4.2d, v14.d[0] | |||
| OP_ii v24.2d, v5.2d, v15.d[0] | |||
| OP_ri v25.2d, v4.2d, v15.d[0] | |||
| OP_ir v25.2d, v5.2d, v14.d[0] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| OP_rr v26.2d, v6.2d, v14.2d[0] | |||
| OP_ii v26.2d, v7.2d, v15.2d[0] | |||
| OP_ri v27.2d, v6.2d, v15.2d[0] | |||
| OP_ir v27.2d, v7.2d, v14.2d[0] | |||
| OP_rr v26.2d, v6.2d, v14.d[0] | |||
| OP_ii v26.2d, v7.2d, v15.d[0] | |||
| OP_ri v27.2d, v6.2d, v15.d[0] | |||
| OP_ir v27.2d, v7.2d, v14.d[0] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| OP_rr v28.2d, v4.2d, v14.2d[1] | |||
| OP_ii v28.2d, v5.2d, v15.2d[1] | |||
| OP_ri v29.2d, v4.2d, v15.2d[1] | |||
| OP_ir v29.2d, v5.2d, v14.2d[1] | |||
| OP_rr v28.2d, v4.2d, v14.d[1] | |||
| OP_ii v28.2d, v5.2d, v15.d[1] | |||
| OP_ri v29.2d, v4.2d, v15.d[1] | |||
| OP_ir v29.2d, v5.2d, v14.d[1] | |||
| OP_rr v30.2d, v6.2d, v14.2d[1] | |||
| OP_ii v30.2d, v7.2d, v15.2d[1] | |||
| OP_ri v31.2d, v6.2d, v15.2d[1] | |||
| OP_ir v31.2d, v7.2d, v14.2d[1] | |||
| OP_rr v30.2d, v6.2d, v14.d[1] | |||
| OP_ii v30.2d, v7.2d, v15.d[1] | |||
| OP_ri v31.2d, v6.2d, v15.d[1] | |||
| OP_ir v31.2d, v7.2d, v14.d[1] | |||
| .endm | |||
| .macro KERNEL4x4_E | |||
| OP_rr v16.2d, v4.2d, v12.2d[0] | |||
| OP_ii v16.2d, v5.2d, v13.2d[0] | |||
| OP_ri v17.2d, v4.2d, v13.2d[0] | |||
| OP_ir v17.2d, v5.2d, v12.2d[0] | |||
| OP_rr v18.2d, v6.2d, v12.2d[0] | |||
| OP_ii v18.2d, v7.2d, v13.2d[0] | |||
| OP_ri v19.2d, v6.2d, v13.2d[0] | |||
| OP_ir v19.2d, v7.2d, v12.2d[0] | |||
| OP_rr v20.2d, v4.2d, v12.2d[1] | |||
| OP_ii v20.2d, v5.2d, v13.2d[1] | |||
| OP_ri v21.2d, v4.2d, v13.2d[1] | |||
| OP_ir v21.2d, v5.2d, v12.2d[1] | |||
| OP_rr v22.2d, v6.2d, v12.2d[1] | |||
| OP_ii v22.2d, v7.2d, v13.2d[1] | |||
| OP_ri v23.2d, v6.2d, v13.2d[1] | |||
| OP_ir v23.2d, v7.2d, v12.2d[1] | |||
| OP_rr v24.2d, v4.2d, v14.2d[0] | |||
| OP_ii v24.2d, v5.2d, v15.2d[0] | |||
| OP_ri v25.2d, v4.2d, v15.2d[0] | |||
| OP_ir v25.2d, v5.2d, v14.2d[0] | |||
| OP_rr v26.2d, v6.2d, v14.2d[0] | |||
| OP_ii v26.2d, v7.2d, v15.2d[0] | |||
| OP_ri v27.2d, v6.2d, v15.2d[0] | |||
| OP_ir v27.2d, v7.2d, v14.2d[0] | |||
| OP_rr v28.2d, v4.2d, v14.2d[1] | |||
| OP_ii v28.2d, v5.2d, v15.2d[1] | |||
| OP_ri v29.2d, v4.2d, v15.2d[1] | |||
| OP_ir v29.2d, v5.2d, v14.2d[1] | |||
| OP_rr v30.2d, v6.2d, v14.2d[1] | |||
| OP_ii v30.2d, v7.2d, v15.2d[1] | |||
| OP_ri v31.2d, v6.2d, v15.2d[1] | |||
| OP_ir v31.2d, v7.2d, v14.2d[1] | |||
| OP_rr v16.2d, v4.2d, v12.d[0] | |||
| OP_ii v16.2d, v5.2d, v13.d[0] | |||
| OP_ri v17.2d, v4.2d, v13.d[0] | |||
| OP_ir v17.2d, v5.2d, v12.d[0] | |||
| OP_rr v18.2d, v6.2d, v12.d[0] | |||
| OP_ii v18.2d, v7.2d, v13.d[0] | |||
| OP_ri v19.2d, v6.2d, v13.d[0] | |||
| OP_ir v19.2d, v7.2d, v12.d[0] | |||
| OP_rr v20.2d, v4.2d, v12.d[1] | |||
| OP_ii v20.2d, v5.2d, v13.d[1] | |||
| OP_ri v21.2d, v4.2d, v13.d[1] | |||
| OP_ir v21.2d, v5.2d, v12.d[1] | |||
| OP_rr v22.2d, v6.2d, v12.d[1] | |||
| OP_ii v22.2d, v7.2d, v13.d[1] | |||
| OP_ri v23.2d, v6.2d, v13.d[1] | |||
| OP_ir v23.2d, v7.2d, v12.d[1] | |||
| OP_rr v24.2d, v4.2d, v14.d[0] | |||
| OP_ii v24.2d, v5.2d, v15.d[0] | |||
| OP_ri v25.2d, v4.2d, v15.d[0] | |||
| OP_ir v25.2d, v5.2d, v14.d[0] | |||
| OP_rr v26.2d, v6.2d, v14.d[0] | |||
| OP_ii v26.2d, v7.2d, v15.d[0] | |||
| OP_ri v27.2d, v6.2d, v15.d[0] | |||
| OP_ir v27.2d, v7.2d, v14.d[0] | |||
| OP_rr v28.2d, v4.2d, v14.d[1] | |||
| OP_ii v28.2d, v5.2d, v15.d[1] | |||
| OP_ri v29.2d, v4.2d, v15.d[1] | |||
| OP_ir v29.2d, v5.2d, v14.d[1] | |||
| OP_rr v30.2d, v6.2d, v14.d[1] | |||
| OP_ii v30.2d, v7.2d, v15.d[1] | |||
| OP_ri v31.2d, v6.2d, v15.d[1] | |||
| OP_ir v31.2d, v7.2d, v14.d[1] | |||
| .endm | |||
| .macro KERNEL4x4_SUB | |||
| @@ -448,45 +448,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.2d, v0.2d, v8.2d[0] | |||
| OP_ii v16.2d, v1.2d, v9.2d[0] | |||
| OP_ri v17.2d, v0.2d, v9.2d[0] | |||
| OP_ir v17.2d, v1.2d, v8.2d[0] | |||
| OP_rr v18.2d, v2.2d, v8.2d[0] | |||
| OP_ii v18.2d, v3.2d, v9.2d[0] | |||
| OP_ri v19.2d, v2.2d, v9.2d[0] | |||
| OP_ir v19.2d, v3.2d, v8.2d[0] | |||
| OP_rr v20.2d, v0.2d, v8.2d[1] | |||
| OP_ii v20.2d, v1.2d, v9.2d[1] | |||
| OP_ri v21.2d, v0.2d, v9.2d[1] | |||
| OP_ir v21.2d, v1.2d, v8.2d[1] | |||
| OP_rr v22.2d, v2.2d, v8.2d[1] | |||
| OP_ii v22.2d, v3.2d, v9.2d[1] | |||
| OP_ri v23.2d, v2.2d, v9.2d[1] | |||
| OP_ir v23.2d, v3.2d, v8.2d[1] | |||
| OP_rr v24.2d, v0.2d, v10.2d[0] | |||
| OP_ii v24.2d, v1.2d, v11.2d[0] | |||
| OP_ri v25.2d, v0.2d, v11.2d[0] | |||
| OP_ir v25.2d, v1.2d, v10.2d[0] | |||
| OP_rr v26.2d, v2.2d, v10.2d[0] | |||
| OP_ii v26.2d, v3.2d, v11.2d[0] | |||
| OP_ri v27.2d, v2.2d, v11.2d[0] | |||
| OP_ir v27.2d, v3.2d, v10.2d[0] | |||
| OP_rr v28.2d, v0.2d, v10.2d[1] | |||
| OP_ii v28.2d, v1.2d, v11.2d[1] | |||
| OP_ri v29.2d, v0.2d, v11.2d[1] | |||
| OP_ir v29.2d, v1.2d, v10.2d[1] | |||
| OP_rr v30.2d, v2.2d, v10.2d[1] | |||
| OP_ii v30.2d, v3.2d, v11.2d[1] | |||
| OP_ri v31.2d, v2.2d, v11.2d[1] | |||
| OP_ir v31.2d, v3.2d, v10.2d[1] | |||
| OP_rr v16.2d, v0.2d, v8.d[0] | |||
| OP_ii v16.2d, v1.2d, v9.d[0] | |||
| OP_ri v17.2d, v0.2d, v9.d[0] | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| OP_rr v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| OP_ri v21.2d, v0.2d, v9.d[1] | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| OP_rr v22.2d, v2.2d, v8.d[1] | |||
| OP_ii v22.2d, v3.2d, v9.d[1] | |||
| OP_ri v23.2d, v2.2d, v9.d[1] | |||
| OP_ir v23.2d, v3.2d, v8.d[1] | |||
| OP_rr v24.2d, v0.2d, v10.d[0] | |||
| OP_ii v24.2d, v1.2d, v11.d[0] | |||
| OP_ri v25.2d, v0.2d, v11.d[0] | |||
| OP_ir v25.2d, v1.2d, v10.d[0] | |||
| OP_rr v26.2d, v2.2d, v10.d[0] | |||
| OP_ii v26.2d, v3.2d, v11.d[0] | |||
| OP_ri v27.2d, v2.2d, v11.d[0] | |||
| OP_ir v27.2d, v3.2d, v10.d[0] | |||
| OP_rr v28.2d, v0.2d, v10.d[1] | |||
| OP_ii v28.2d, v1.2d, v11.d[1] | |||
| OP_ri v29.2d, v0.2d, v11.d[1] | |||
| OP_ir v29.2d, v1.2d, v10.d[1] | |||
| OP_rr v30.2d, v2.2d, v10.d[1] | |||
| OP_ii v30.2d, v3.2d, v11.d[1] | |||
| OP_ri v31.2d, v2.2d, v11.d[1] | |||
| OP_ir v31.2d, v3.2d, v10.d[1] | |||
| .endm | |||
| .macro SAVE4x4 | |||
| @@ -582,25 +582,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.2d, v0.2d, v8.2d[0] | |||
| OP_ii v16.2d, v1.2d, v9.2d[0] | |||
| OP_ri v17.2d, v0.2d, v9.2d[0] | |||
| OP_ir v17.2d, v1.2d, v8.2d[0] | |||
| OP_rr v20.2d, v0.2d, v8.2d[1] | |||
| OP_ii v20.2d, v1.2d, v9.2d[1] | |||
| OP_ri v21.2d, v0.2d, v9.2d[1] | |||
| OP_ir v21.2d, v1.2d, v8.2d[1] | |||
| OP_rr v24.2d, v0.2d, v10.2d[0] | |||
| OP_ii v24.2d, v1.2d, v11.2d[0] | |||
| OP_ri v25.2d, v0.2d, v11.2d[0] | |||
| OP_ir v25.2d, v1.2d, v10.2d[0] | |||
| OP_rr v28.2d, v0.2d, v10.2d[1] | |||
| OP_ii v28.2d, v1.2d, v11.2d[1] | |||
| OP_ri v29.2d, v0.2d, v11.2d[1] | |||
| OP_ir v29.2d, v1.2d, v10.2d[1] | |||
| OP_rr v16.2d, v0.2d, v8.d[0] | |||
| OP_ii v16.2d, v1.2d, v9.d[0] | |||
| OP_ri v17.2d, v0.2d, v9.d[0] | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| OP_rr v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| OP_ri v21.2d, v0.2d, v9.d[1] | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| OP_rr v24.2d, v0.2d, v10.d[0] | |||
| OP_ii v24.2d, v1.2d, v11.d[0] | |||
| OP_ri v25.2d, v0.2d, v11.d[0] | |||
| OP_ir v25.2d, v1.2d, v10.d[0] | |||
| OP_rr v28.2d, v0.2d, v10.d[1] | |||
| OP_ii v28.2d, v1.2d, v11.d[1] | |||
| OP_ri v29.2d, v0.2d, v11.d[1] | |||
| OP_ir v29.2d, v1.2d, v10.d[1] | |||
| .endm | |||
| .macro SAVE2x4 | |||
| @@ -669,25 +669,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.d, v1.d}[0], [pA] | |||
| add pA, pA, #16 | |||
| OP_rr d16, d0, v8.2d[0] | |||
| OP_ii d16, d1, v9.2d[0] | |||
| OP_ri d17, d0, v9.2d[0] | |||
| OP_ir d17, d1, v8.2d[0] | |||
| OP_rr d20, d0, v8.2d[1] | |||
| OP_ii d20, d1, v9.2d[1] | |||
| OP_ri d21, d0, v9.2d[1] | |||
| OP_ir d21, d1, v8.2d[1] | |||
| OP_rr d24, d0, v10.2d[0] | |||
| OP_ii d24, d1, v11.2d[0] | |||
| OP_ri d25, d0, v11.2d[0] | |||
| OP_ir d25, d1, v10.2d[0] | |||
| OP_rr d28, d0, v10.2d[1] | |||
| OP_ii d28, d1, v11.2d[1] | |||
| OP_ri d29, d0, v11.2d[1] | |||
| OP_ir d29, d1, v10.2d[1] | |||
| OP_rr d16, d0, v8.d[0] | |||
| OP_ii d16, d1, v9.d[0] | |||
| OP_ri d17, d0, v9.d[0] | |||
| OP_ir d17, d1, v8.d[0] | |||
| OP_rr d20, d0, v8.d[1] | |||
| OP_ii d20, d1, v9.d[1] | |||
| OP_ri d21, d0, v9.d[1] | |||
| OP_ir d21, d1, v8.d[1] | |||
| OP_rr d24, d0, v10.d[0] | |||
| OP_ii d24, d1, v11.d[0] | |||
| OP_ri d25, d0, v11.d[0] | |||
| OP_ir d25, d1, v10.d[0] | |||
| OP_rr d28, d0, v10.d[1] | |||
| OP_ii d28, d1, v11.d[1] | |||
| OP_ri d29, d0, v11.d[1] | |||
| OP_ir d29, d1, v10.d[1] | |||
| .endm | |||
| .macro SAVE1x4 | |||
| @@ -756,25 +756,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.2d, v0.2d, v8.2d[0] | |||
| OP_ii v16.2d, v1.2d, v9.2d[0] | |||
| OP_ri v17.2d, v0.2d, v9.2d[0] | |||
| OP_ir v17.2d, v1.2d, v8.2d[0] | |||
| OP_rr v18.2d, v2.2d, v8.2d[0] | |||
| OP_ii v18.2d, v3.2d, v9.2d[0] | |||
| OP_ri v19.2d, v2.2d, v9.2d[0] | |||
| OP_ir v19.2d, v3.2d, v8.2d[0] | |||
| OP_rr v20.2d, v0.2d, v8.2d[1] | |||
| OP_ii v20.2d, v1.2d, v9.2d[1] | |||
| OP_ri v21.2d, v0.2d, v9.2d[1] | |||
| OP_ir v21.2d, v1.2d, v8.2d[1] | |||
| OP_rr v22.2d, v2.2d, v8.2d[1] | |||
| OP_ii v22.2d, v3.2d, v9.2d[1] | |||
| OP_ri v23.2d, v2.2d, v9.2d[1] | |||
| OP_ir v23.2d, v3.2d, v8.2d[1] | |||
| OP_rr v16.2d, v0.2d, v8.d[0] | |||
| OP_ii v16.2d, v1.2d, v9.d[0] | |||
| OP_ri v17.2d, v0.2d, v9.d[0] | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| OP_rr v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| OP_ri v21.2d, v0.2d, v9.d[1] | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| OP_rr v22.2d, v2.2d, v8.d[1] | |||
| OP_ii v22.2d, v3.2d, v9.d[1] | |||
| OP_ri v23.2d, v2.2d, v9.d[1] | |||
| OP_ir v23.2d, v3.2d, v8.d[1] | |||
| .endm | |||
| .macro SAVE4x2 | |||
| @@ -833,15 +833,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.2d, v0.2d, v8.2d[0] | |||
| OP_ii v16.2d, v1.2d, v9.2d[0] | |||
| OP_ri v17.2d, v0.2d, v9.2d[0] | |||
| OP_ir v17.2d, v1.2d, v8.2d[0] | |||
| OP_rr v16.2d, v0.2d, v8.d[0] | |||
| OP_ii v16.2d, v1.2d, v9.d[0] | |||
| OP_ri v17.2d, v0.2d, v9.d[0] | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| OP_rr v20.2d, v0.2d, v8.2d[1] | |||
| OP_ii v20.2d, v1.2d, v9.2d[1] | |||
| OP_ri v21.2d, v0.2d, v9.2d[1] | |||
| OP_ir v21.2d, v1.2d, v8.2d[1] | |||
| OP_rr v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| OP_ri v21.2d, v0.2d, v9.d[1] | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| .endm | |||
| .macro SAVE2x2 | |||
| @@ -886,15 +886,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.d, v1.d}[0], [pA] | |||
| add pA, pA, #16 | |||
| OP_rr d16, d0, v8.2d[0] | |||
| OP_ii d16, d1, v9.2d[0] | |||
| OP_ri d17, d0, v9.2d[0] | |||
| OP_ir d17, d1, v8.2d[0] | |||
| OP_rr d16, d0, v8.d[0] | |||
| OP_ii d16, d1, v9.d[0] | |||
| OP_ri d17, d0, v9.d[0] | |||
| OP_ir d17, d1, v8.d[0] | |||
| OP_rr d20, d0, v8.2d[1] | |||
| OP_ii d20, d1, v9.2d[1] | |||
| OP_ri d21, d0, v9.2d[1] | |||
| OP_ir d21, d1, v8.2d[1] | |||
| OP_rr d20, d0, v8.d[1] | |||
| OP_ii d20, d1, v9.d[1] | |||
| OP_ri d21, d0, v9.d[1] | |||
| OP_ir d21, d1, v8.d[1] | |||
| .endm | |||
| .macro SAVE1x2 | |||
| @@ -185,93 +185,93 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| fmul v16.2d, v0.2d, v8.2d[0] | |||
| OP_ii v16.2d, v1.2d, v9.2d[0] | |||
| fmul v16.2d, v0.2d, v8.d[0] | |||
| OP_ii v16.2d, v1.2d, v9.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v17.16b, v17.16b, v17.16b | |||
| fmls v17.2d, v0.2d, v9.2d[0] | |||
| fmls v17.2d, v0.2d, v9.d[0] | |||
| #else | |||
| fmul v17.2d, v0.2d, v9.2d[0] | |||
| fmul v17.2d, v0.2d, v9.d[0] | |||
| #endif | |||
| OP_ir v17.2d, v1.2d, v8.2d[0] | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| fmul v18.2d, v2.2d, v8.2d[0] | |||
| OP_ii v18.2d, v3.2d, v9.2d[0] | |||
| fmul v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v19.16b, v19.16b, v19.16b | |||
| fmls v19.2d, v2.2d, v9.2d[0] | |||
| fmls v19.2d, v2.2d, v9.d[0] | |||
| #else | |||
| fmul v19.2d, v2.2d, v9.2d[0] | |||
| fmul v19.2d, v2.2d, v9.d[0] | |||
| #endif | |||
| OP_ir v19.2d, v3.2d, v8.2d[0] | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| fmul v20.2d, v0.2d, v8.2d[1] | |||
| OP_ii v20.2d, v1.2d, v9.2d[1] | |||
| fmul v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v21.16b, v21.16b, v21.16b | |||
| fmls v21.2d, v0.2d, v9.2d[1] | |||
| fmls v21.2d, v0.2d, v9.d[1] | |||
| #else | |||
| fmul v21.2d, v0.2d, v9.2d[1] | |||
| fmul v21.2d, v0.2d, v9.d[1] | |||
| #endif | |||
| OP_ir v21.2d, v1.2d, v8.2d[1] | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| fmul v22.2d, v2.2d, v8.2d[1] | |||
| OP_ii v22.2d, v3.2d, v9.2d[1] | |||
| fmul v22.2d, v2.2d, v8.d[1] | |||
| OP_ii v22.2d, v3.2d, v9.d[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v23.16b, v23.16b, v23.16b | |||
| fmls v23.2d, v2.2d, v9.2d[1] | |||
| fmls v23.2d, v2.2d, v9.d[1] | |||
| #else | |||
| fmul v23.2d, v2.2d, v9.2d[1] | |||
| fmul v23.2d, v2.2d, v9.d[1] | |||
| #endif | |||
| OP_ir v23.2d, v3.2d, v8.2d[1] | |||
| OP_ir v23.2d, v3.2d, v8.d[1] | |||
| fmul v24.2d, v0.2d, v10.2d[0] | |||
| OP_ii v24.2d, v1.2d, v11.2d[0] | |||
| fmul v24.2d, v0.2d, v10.d[0] | |||
| OP_ii v24.2d, v1.2d, v11.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v25.16b, v25.16b, v25.16b | |||
| fmls v25.2d, v0.2d, v11.2d[0] | |||
| fmls v25.2d, v0.2d, v11.d[0] | |||
| #else | |||
| fmul v25.2d, v0.2d, v11.2d[0] | |||
| fmul v25.2d, v0.2d, v11.d[0] | |||
| #endif | |||
| OP_ir v25.2d, v1.2d, v10.2d[0] | |||
| OP_ir v25.2d, v1.2d, v10.d[0] | |||
| fmul v26.2d, v2.2d, v10.2d[0] | |||
| OP_ii v26.2d, v3.2d, v11.2d[0] | |||
| fmul v26.2d, v2.2d, v10.d[0] | |||
| OP_ii v26.2d, v3.2d, v11.d[0] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v27.16b, v27.16b, v27.16b | |||
| fmls v27.2d, v2.2d, v11.2d[0] | |||
| fmls v27.2d, v2.2d, v11.d[0] | |||
| #else | |||
| fmul v27.2d, v2.2d, v11.2d[0] | |||
| fmul v27.2d, v2.2d, v11.d[0] | |||
| #endif | |||
| OP_ir v27.2d, v3.2d, v10.2d[0] | |||
| OP_ir v27.2d, v3.2d, v10.d[0] | |||
| fmul v28.2d, v0.2d, v10.2d[1] | |||
| OP_ii v28.2d, v1.2d, v11.2d[1] | |||
| fmul v28.2d, v0.2d, v10.d[1] | |||
| OP_ii v28.2d, v1.2d, v11.d[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v29.16b, v29.16b, v29.16b | |||
| fmls v29.2d, v0.2d, v11.2d[1] | |||
| fmls v29.2d, v0.2d, v11.d[1] | |||
| #else | |||
| fmul v29.2d, v0.2d, v11.2d[1] | |||
| fmul v29.2d, v0.2d, v11.d[1] | |||
| #endif | |||
| OP_ir v29.2d, v1.2d, v10.2d[1] | |||
| OP_ir v29.2d, v1.2d, v10.d[1] | |||
| fmul v30.2d, v2.2d, v10.2d[1] | |||
| OP_ii v30.2d, v3.2d, v11.2d[1] | |||
| fmul v30.2d, v2.2d, v10.d[1] | |||
| OP_ii v30.2d, v3.2d, v11.d[1] | |||
| #if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ | |||
| defined(RR) || defined(RC) || defined(CR) || defined(CC) | |||
| eor v31.16b, v31.16b, v31.16b | |||
| fmls v31.2d, v2.2d, v11.2d[1] | |||
| fmls v31.2d, v2.2d, v11.d[1] | |||
| #else | |||
| fmul v31.2d, v2.2d, v11.2d[1] | |||
| fmul v31.2d, v2.2d, v11.d[1] | |||
| #endif | |||
| OP_ir v31.2d, v3.2d, v10.2d[1] | |||
| OP_ir v31.2d, v3.2d, v10.d[1] | |||
| ld2 {v12.2d, v13.2d}, [pB] | |||
| add pB, pB, #32 | |||
| @@ -284,161 +284,161 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .endm | |||
| .macro KERNEL4x4_M1 | |||
// KERNEL4x4_M1: one accumulation step into v16-v31, using v0-v3 and lanes [0]/[1]
// of v8-v11 as multiplicands (neither group is loaded inside this macro).
// Interleaved with the arithmetic it loads v12-v15 from pB and v4-v7 from pA
// ("For next round"), advancing the pointer by 32 bytes after each ld2, and issues
// PLDL1KEEP prefetches at pA+512 and pB+512.
// OP_rr/OP_ii/OP_ri/OP_ir are defined outside this view -- presumably fused
// multiply-add/subtract variants chosen by conjugation mode; confirm at their definition.
// NOTE(review): every four-op group appears twice, first with `.2d[n]` lane syntax and
// then with `.d[n]`. This looks like the removed and added sides of a diff rendered
// together; presumably only the `.d[n]` form exists in the real source -- confirm.
| OP_rr v16.2d, v0.2d, v8.2d[0] | |||
| OP_ii v16.2d, v1.2d, v9.2d[0] | |||
| OP_ri v17.2d, v0.2d, v9.2d[0] | |||
| OP_ir v17.2d, v1.2d, v8.2d[0] | |||
| OP_rr v16.2d, v0.2d, v8.d[0] | |||
| OP_ii v16.2d, v1.2d, v9.d[0] | |||
| OP_ri v17.2d, v0.2d, v9.d[0] | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| ld2 {v12.2d, v13.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| OP_rr v18.2d, v2.2d, v8.2d[0] | |||
| OP_ii v18.2d, v3.2d, v9.2d[0] | |||
| OP_ri v19.2d, v2.2d, v9.2d[0] | |||
| OP_ir v19.2d, v3.2d, v8.2d[0] | |||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| ld2 {v14.2d, v15.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| OP_rr v20.2d, v0.2d, v8.2d[1] | |||
| OP_ii v20.2d, v1.2d, v9.2d[1] | |||
| OP_ri v21.2d, v0.2d, v9.2d[1] | |||
| OP_ir v21.2d, v1.2d, v8.2d[1] | |||
| OP_rr v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| OP_ri v21.2d, v0.2d, v9.d[1] | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| ld2 {v4.2d, v5.2d} , [pA] // For next round | |||
| add pA, pA, #32 | |||
| OP_rr v22.2d, v2.2d, v8.2d[1] | |||
| OP_ii v22.2d, v3.2d, v9.2d[1] | |||
| OP_ri v23.2d, v2.2d, v9.2d[1] | |||
| OP_ir v23.2d, v3.2d, v8.2d[1] | |||
| OP_rr v22.2d, v2.2d, v8.d[1] | |||
| OP_ii v22.2d, v3.2d, v9.d[1] | |||
| OP_ri v23.2d, v2.2d, v9.d[1] | |||
| OP_ir v23.2d, v3.2d, v8.d[1] | |||
| ld2 {v6.2d, v7.2d} , [pA] // For next round | |||
| add pA, pA, #32 | |||
| OP_rr v24.2d, v0.2d, v10.2d[0] | |||
| OP_ii v24.2d, v1.2d, v11.2d[0] | |||
| OP_ri v25.2d, v0.2d, v11.2d[0] | |||
| OP_ir v25.2d, v1.2d, v10.2d[0] | |||
| OP_rr v24.2d, v0.2d, v10.d[0] | |||
| OP_ii v24.2d, v1.2d, v11.d[0] | |||
| OP_ri v25.2d, v0.2d, v11.d[0] | |||
| OP_ir v25.2d, v1.2d, v10.d[0] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| OP_rr v26.2d, v2.2d, v10.2d[0] | |||
| OP_ii v26.2d, v3.2d, v11.2d[0] | |||
| OP_ri v27.2d, v2.2d, v11.2d[0] | |||
| OP_ir v27.2d, v3.2d, v10.2d[0] | |||
| OP_rr v26.2d, v2.2d, v10.d[0] | |||
| OP_ii v26.2d, v3.2d, v11.d[0] | |||
| OP_ri v27.2d, v2.2d, v11.d[0] | |||
| OP_ir v27.2d, v3.2d, v10.d[0] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| OP_rr v28.2d, v0.2d, v10.2d[1] | |||
| OP_ii v28.2d, v1.2d, v11.2d[1] | |||
| OP_ri v29.2d, v0.2d, v11.2d[1] | |||
| OP_ir v29.2d, v1.2d, v10.2d[1] | |||
| OP_rr v28.2d, v0.2d, v10.d[1] | |||
| OP_ii v28.2d, v1.2d, v11.d[1] | |||
| OP_ri v29.2d, v0.2d, v11.d[1] | |||
| OP_ir v29.2d, v1.2d, v10.d[1] | |||
| OP_rr v30.2d, v2.2d, v10.2d[1] | |||
| OP_ii v30.2d, v3.2d, v11.2d[1] | |||
| OP_ri v31.2d, v2.2d, v11.2d[1] | |||
| OP_ir v31.2d, v3.2d, v10.2d[1] | |||
| OP_rr v30.2d, v2.2d, v10.d[1] | |||
| OP_ii v30.2d, v3.2d, v11.d[1] | |||
| OP_ri v31.2d, v2.2d, v11.d[1] | |||
| OP_ir v31.2d, v3.2d, v10.d[1] | |||
| .endm | |||
| .macro KERNEL4x4_M2 | |||
// KERNEL4x4_M2: one accumulation step into v16-v31, using v4-v7 and lanes [0]/[1]
// of v12-v15 as multiplicands (neither group is loaded inside this macro).
// Interleaved with the arithmetic it loads v8-v11 from pB and v0-v3 from pA
// ("For next round"), advancing the pointer by 32 bytes after each ld2, and issues
// PLDL1KEEP prefetches at pA+512 and pB+512.
// OP_rr/OP_ii/OP_ri/OP_ir are defined outside this view -- presumably fused
// multiply-add/subtract variants chosen by conjugation mode; confirm at their definition.
// NOTE(review): every four-op group appears twice, first with `.2d[n]` lane syntax and
// then with `.d[n]`. This looks like the removed and added sides of a diff rendered
// together; presumably only the `.d[n]` form exists in the real source -- confirm.
| OP_rr v16.2d, v4.2d, v12.2d[0] | |||
| OP_ii v16.2d, v5.2d, v13.2d[0] | |||
| OP_ri v17.2d, v4.2d, v13.2d[0] | |||
| OP_ir v17.2d, v5.2d, v12.2d[0] | |||
| OP_rr v16.2d, v4.2d, v12.d[0] | |||
| OP_ii v16.2d, v5.2d, v13.d[0] | |||
| OP_ri v17.2d, v4.2d, v13.d[0] | |||
| OP_ir v17.2d, v5.2d, v12.d[0] | |||
| ld2 {v8.2d, v9.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| OP_rr v18.2d, v6.2d, v12.2d[0] | |||
| OP_ii v18.2d, v7.2d, v13.2d[0] | |||
| OP_ri v19.2d, v6.2d, v13.2d[0] | |||
| OP_ir v19.2d, v7.2d, v12.2d[0] | |||
| OP_rr v18.2d, v6.2d, v12.d[0] | |||
| OP_ii v18.2d, v7.2d, v13.d[0] | |||
| OP_ri v19.2d, v6.2d, v13.d[0] | |||
| OP_ir v19.2d, v7.2d, v12.d[0] | |||
| ld2 {v10.2d, v11.2d}, [pB] // For next round | |||
| add pB, pB, #32 | |||
| OP_rr v20.2d, v4.2d, v12.2d[1] | |||
| OP_ii v20.2d, v5.2d, v13.2d[1] | |||
| OP_ri v21.2d, v4.2d, v13.2d[1] | |||
| OP_ir v21.2d, v5.2d, v12.2d[1] | |||
| OP_rr v20.2d, v4.2d, v12.d[1] | |||
| OP_ii v20.2d, v5.2d, v13.d[1] | |||
| OP_ri v21.2d, v4.2d, v13.d[1] | |||
| OP_ir v21.2d, v5.2d, v12.d[1] | |||
| ld2 {v0.2d, v1.2d}, [pA] // For next round | |||
| add pA, pA, #32 | |||
| OP_rr v22.2d, v6.2d, v12.2d[1] | |||
| OP_ii v22.2d, v7.2d, v13.2d[1] | |||
| OP_ri v23.2d, v6.2d, v13.2d[1] | |||
| OP_ir v23.2d, v7.2d, v12.2d[1] | |||
| OP_rr v22.2d, v6.2d, v12.d[1] | |||
| OP_ii v22.2d, v7.2d, v13.d[1] | |||
| OP_ri v23.2d, v6.2d, v13.d[1] | |||
| OP_ir v23.2d, v7.2d, v12.d[1] | |||
| ld2 {v2.2d, v3.2d}, [pA] // For next round | |||
| add pA, pA, #32 | |||
| OP_rr v24.2d, v4.2d, v14.2d[0] | |||
| OP_ii v24.2d, v5.2d, v15.2d[0] | |||
| OP_ri v25.2d, v4.2d, v15.2d[0] | |||
| OP_ir v25.2d, v5.2d, v14.2d[0] | |||
| OP_rr v24.2d, v4.2d, v14.d[0] | |||
| OP_ii v24.2d, v5.2d, v15.d[0] | |||
| OP_ri v25.2d, v4.2d, v15.d[0] | |||
| OP_ir v25.2d, v5.2d, v14.d[0] | |||
| prfm PLDL1KEEP, [pA, #512] | |||
| OP_rr v26.2d, v6.2d, v14.2d[0] | |||
| OP_ii v26.2d, v7.2d, v15.2d[0] | |||
| OP_ri v27.2d, v6.2d, v15.2d[0] | |||
| OP_ir v27.2d, v7.2d, v14.2d[0] | |||
| OP_rr v26.2d, v6.2d, v14.d[0] | |||
| OP_ii v26.2d, v7.2d, v15.d[0] | |||
| OP_ri v27.2d, v6.2d, v15.d[0] | |||
| OP_ir v27.2d, v7.2d, v14.d[0] | |||
| prfm PLDL1KEEP, [pB, #512] | |||
| OP_rr v28.2d, v4.2d, v14.2d[1] | |||
| OP_ii v28.2d, v5.2d, v15.2d[1] | |||
| OP_ri v29.2d, v4.2d, v15.2d[1] | |||
| OP_ir v29.2d, v5.2d, v14.2d[1] | |||
| OP_rr v28.2d, v4.2d, v14.d[1] | |||
| OP_ii v28.2d, v5.2d, v15.d[1] | |||
| OP_ri v29.2d, v4.2d, v15.d[1] | |||
| OP_ir v29.2d, v5.2d, v14.d[1] | |||
| OP_rr v30.2d, v6.2d, v14.2d[1] | |||
| OP_ii v30.2d, v7.2d, v15.2d[1] | |||
| OP_ri v31.2d, v6.2d, v15.2d[1] | |||
| OP_ir v31.2d, v7.2d, v14.2d[1] | |||
| OP_rr v30.2d, v6.2d, v14.d[1] | |||
| OP_ii v30.2d, v7.2d, v15.d[1] | |||
| OP_ri v31.2d, v6.2d, v15.d[1] | |||
| OP_ir v31.2d, v7.2d, v14.d[1] | |||
| .endm | |||
| .macro KERNEL4x4_E | |||
// KERNEL4x4_E: accumulation step into v16-v31 using v4-v7 and lanes [0]/[1] of
// v12-v15, with no loads, pointer updates or prefetches in the macro body.
// OP_rr/OP_ii/OP_ri/OP_ir are defined outside this view -- presumably fused
// multiply-add/subtract variants chosen by conjugation mode; confirm at their definition.
// NOTE(review): the 32-op sequence appears twice, first with `.2d[n]` lane syntax and
// then with `.d[n]`. This looks like the removed and added sides of a diff rendered
// together; presumably only the `.d[n]` form exists in the real source -- confirm.
| OP_rr v16.2d, v4.2d, v12.2d[0] | |||
| OP_ii v16.2d, v5.2d, v13.2d[0] | |||
| OP_ri v17.2d, v4.2d, v13.2d[0] | |||
| OP_ir v17.2d, v5.2d, v12.2d[0] | |||
| OP_rr v18.2d, v6.2d, v12.2d[0] | |||
| OP_ii v18.2d, v7.2d, v13.2d[0] | |||
| OP_ri v19.2d, v6.2d, v13.2d[0] | |||
| OP_ir v19.2d, v7.2d, v12.2d[0] | |||
| OP_rr v20.2d, v4.2d, v12.2d[1] | |||
| OP_ii v20.2d, v5.2d, v13.2d[1] | |||
| OP_ri v21.2d, v4.2d, v13.2d[1] | |||
| OP_ir v21.2d, v5.2d, v12.2d[1] | |||
| OP_rr v22.2d, v6.2d, v12.2d[1] | |||
| OP_ii v22.2d, v7.2d, v13.2d[1] | |||
| OP_ri v23.2d, v6.2d, v13.2d[1] | |||
| OP_ir v23.2d, v7.2d, v12.2d[1] | |||
| OP_rr v24.2d, v4.2d, v14.2d[0] | |||
| OP_ii v24.2d, v5.2d, v15.2d[0] | |||
| OP_ri v25.2d, v4.2d, v15.2d[0] | |||
| OP_ir v25.2d, v5.2d, v14.2d[0] | |||
| OP_rr v26.2d, v6.2d, v14.2d[0] | |||
| OP_ii v26.2d, v7.2d, v15.2d[0] | |||
| OP_ri v27.2d, v6.2d, v15.2d[0] | |||
| OP_ir v27.2d, v7.2d, v14.2d[0] | |||
| OP_rr v28.2d, v4.2d, v14.2d[1] | |||
| OP_ii v28.2d, v5.2d, v15.2d[1] | |||
| OP_ri v29.2d, v4.2d, v15.2d[1] | |||
| OP_ir v29.2d, v5.2d, v14.2d[1] | |||
| OP_rr v30.2d, v6.2d, v14.2d[1] | |||
| OP_ii v30.2d, v7.2d, v15.2d[1] | |||
| OP_ri v31.2d, v6.2d, v15.2d[1] | |||
| OP_ir v31.2d, v7.2d, v14.2d[1] | |||
| OP_rr v16.2d, v4.2d, v12.d[0] | |||
| OP_ii v16.2d, v5.2d, v13.d[0] | |||
| OP_ri v17.2d, v4.2d, v13.d[0] | |||
| OP_ir v17.2d, v5.2d, v12.d[0] | |||
| OP_rr v18.2d, v6.2d, v12.d[0] | |||
| OP_ii v18.2d, v7.2d, v13.d[0] | |||
| OP_ri v19.2d, v6.2d, v13.d[0] | |||
| OP_ir v19.2d, v7.2d, v12.d[0] | |||
| OP_rr v20.2d, v4.2d, v12.d[1] | |||
| OP_ii v20.2d, v5.2d, v13.d[1] | |||
| OP_ri v21.2d, v4.2d, v13.d[1] | |||
| OP_ir v21.2d, v5.2d, v12.d[1] | |||
| OP_rr v22.2d, v6.2d, v12.d[1] | |||
| OP_ii v22.2d, v7.2d, v13.d[1] | |||
| OP_ri v23.2d, v6.2d, v13.d[1] | |||
| OP_ir v23.2d, v7.2d, v12.d[1] | |||
| OP_rr v24.2d, v4.2d, v14.d[0] | |||
| OP_ii v24.2d, v5.2d, v15.d[0] | |||
| OP_ri v25.2d, v4.2d, v15.d[0] | |||
| OP_ir v25.2d, v5.2d, v14.d[0] | |||
| OP_rr v26.2d, v6.2d, v14.d[0] | |||
| OP_ii v26.2d, v7.2d, v15.d[0] | |||
| OP_ri v27.2d, v6.2d, v15.d[0] | |||
| OP_ir v27.2d, v7.2d, v14.d[0] | |||
| OP_rr v28.2d, v4.2d, v14.d[1] | |||
| OP_ii v28.2d, v5.2d, v15.d[1] | |||
| OP_ri v29.2d, v4.2d, v15.d[1] | |||
| OP_ir v29.2d, v5.2d, v14.d[1] | |||
| OP_rr v30.2d, v6.2d, v14.d[1] | |||
| OP_ii v30.2d, v7.2d, v15.d[1] | |||
| OP_ri v31.2d, v6.2d, v15.d[1] | |||
| OP_ir v31.2d, v7.2d, v14.d[1] | |||
| .endm | |||
| .macro KERNEL4x4_SUB | |||
| @@ -451,45 +451,45 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.2d, v0.2d, v8.2d[0] | |||
| OP_ii v16.2d, v1.2d, v9.2d[0] | |||
| OP_ri v17.2d, v0.2d, v9.2d[0] | |||
| OP_ir v17.2d, v1.2d, v8.2d[0] | |||
| OP_rr v18.2d, v2.2d, v8.2d[0] | |||
| OP_ii v18.2d, v3.2d, v9.2d[0] | |||
| OP_ri v19.2d, v2.2d, v9.2d[0] | |||
| OP_ir v19.2d, v3.2d, v8.2d[0] | |||
| OP_rr v20.2d, v0.2d, v8.2d[1] | |||
| OP_ii v20.2d, v1.2d, v9.2d[1] | |||
| OP_ri v21.2d, v0.2d, v9.2d[1] | |||
| OP_ir v21.2d, v1.2d, v8.2d[1] | |||
| OP_rr v22.2d, v2.2d, v8.2d[1] | |||
| OP_ii v22.2d, v3.2d, v9.2d[1] | |||
| OP_ri v23.2d, v2.2d, v9.2d[1] | |||
| OP_ir v23.2d, v3.2d, v8.2d[1] | |||
| OP_rr v24.2d, v0.2d, v10.2d[0] | |||
| OP_ii v24.2d, v1.2d, v11.2d[0] | |||
| OP_ri v25.2d, v0.2d, v11.2d[0] | |||
| OP_ir v25.2d, v1.2d, v10.2d[0] | |||
| OP_rr v26.2d, v2.2d, v10.2d[0] | |||
| OP_ii v26.2d, v3.2d, v11.2d[0] | |||
| OP_ri v27.2d, v2.2d, v11.2d[0] | |||
| OP_ir v27.2d, v3.2d, v10.2d[0] | |||
| OP_rr v28.2d, v0.2d, v10.2d[1] | |||
| OP_ii v28.2d, v1.2d, v11.2d[1] | |||
| OP_ri v29.2d, v0.2d, v11.2d[1] | |||
| OP_ir v29.2d, v1.2d, v10.2d[1] | |||
| OP_rr v30.2d, v2.2d, v10.2d[1] | |||
| OP_ii v30.2d, v3.2d, v11.2d[1] | |||
| OP_ri v31.2d, v2.2d, v11.2d[1] | |||
| OP_ir v31.2d, v3.2d, v10.2d[1] | |||
| OP_rr v16.2d, v0.2d, v8.d[0] | |||
| OP_ii v16.2d, v1.2d, v9.d[0] | |||
| OP_ri v17.2d, v0.2d, v9.d[0] | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| OP_rr v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| OP_ri v21.2d, v0.2d, v9.d[1] | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| OP_rr v22.2d, v2.2d, v8.d[1] | |||
| OP_ii v22.2d, v3.2d, v9.d[1] | |||
| OP_ri v23.2d, v2.2d, v9.d[1] | |||
| OP_ir v23.2d, v3.2d, v8.d[1] | |||
| OP_rr v24.2d, v0.2d, v10.d[0] | |||
| OP_ii v24.2d, v1.2d, v11.d[0] | |||
| OP_ri v25.2d, v0.2d, v11.d[0] | |||
| OP_ir v25.2d, v1.2d, v10.d[0] | |||
| OP_rr v26.2d, v2.2d, v10.d[0] | |||
| OP_ii v26.2d, v3.2d, v11.d[0] | |||
| OP_ri v27.2d, v2.2d, v11.d[0] | |||
| OP_ir v27.2d, v3.2d, v10.d[0] | |||
| OP_rr v28.2d, v0.2d, v10.d[1] | |||
| OP_ii v28.2d, v1.2d, v11.d[1] | |||
| OP_ri v29.2d, v0.2d, v11.d[1] | |||
| OP_ir v29.2d, v1.2d, v10.d[1] | |||
| OP_rr v30.2d, v2.2d, v10.d[1] | |||
| OP_ii v30.2d, v3.2d, v11.d[1] | |||
| OP_ri v31.2d, v2.2d, v11.d[1] | |||
| OP_ir v31.2d, v3.2d, v10.d[1] | |||
| .endm | |||
| .macro SAVE4x4 | |||
| @@ -577,25 +577,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.2d, v0.2d, v8.2d[0] | |||
| OP_ii v16.2d, v1.2d, v9.2d[0] | |||
| OP_ri v17.2d, v0.2d, v9.2d[0] | |||
| OP_ir v17.2d, v1.2d, v8.2d[0] | |||
| OP_rr v20.2d, v0.2d, v8.2d[1] | |||
| OP_ii v20.2d, v1.2d, v9.2d[1] | |||
| OP_ri v21.2d, v0.2d, v9.2d[1] | |||
| OP_ir v21.2d, v1.2d, v8.2d[1] | |||
| OP_rr v24.2d, v0.2d, v10.2d[0] | |||
| OP_ii v24.2d, v1.2d, v11.2d[0] | |||
| OP_ri v25.2d, v0.2d, v11.2d[0] | |||
| OP_ir v25.2d, v1.2d, v10.2d[0] | |||
| OP_rr v28.2d, v0.2d, v10.2d[1] | |||
| OP_ii v28.2d, v1.2d, v11.2d[1] | |||
| OP_ri v29.2d, v0.2d, v11.2d[1] | |||
| OP_ir v29.2d, v1.2d, v10.2d[1] | |||
| OP_rr v16.2d, v0.2d, v8.d[0] | |||
| OP_ii v16.2d, v1.2d, v9.d[0] | |||
| OP_ri v17.2d, v0.2d, v9.d[0] | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| OP_rr v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| OP_ri v21.2d, v0.2d, v9.d[1] | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| OP_rr v24.2d, v0.2d, v10.d[0] | |||
| OP_ii v24.2d, v1.2d, v11.d[0] | |||
| OP_ri v25.2d, v0.2d, v11.d[0] | |||
| OP_ir v25.2d, v1.2d, v10.d[0] | |||
| OP_rr v28.2d, v0.2d, v10.d[1] | |||
| OP_ii v28.2d, v1.2d, v11.d[1] | |||
| OP_ri v29.2d, v0.2d, v11.d[1] | |||
| OP_ir v29.2d, v1.2d, v10.d[1] | |||
| .endm | |||
| .macro SAVE2x4 | |||
| @@ -660,25 +660,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.d, v1.d}[0], [pA] | |||
| add pA, pA, #16 | |||
| OP_rr d16, d0, v8.2d[0] | |||
| OP_ii d16, d1, v9.2d[0] | |||
| OP_ri d17, d0, v9.2d[0] | |||
| OP_ir d17, d1, v8.2d[0] | |||
| OP_rr d20, d0, v8.2d[1] | |||
| OP_ii d20, d1, v9.2d[1] | |||
| OP_ri d21, d0, v9.2d[1] | |||
| OP_ir d21, d1, v8.2d[1] | |||
| OP_rr d24, d0, v10.2d[0] | |||
| OP_ii d24, d1, v11.2d[0] | |||
| OP_ri d25, d0, v11.2d[0] | |||
| OP_ir d25, d1, v10.2d[0] | |||
| OP_rr d28, d0, v10.2d[1] | |||
| OP_ii d28, d1, v11.2d[1] | |||
| OP_ri d29, d0, v11.2d[1] | |||
| OP_ir d29, d1, v10.2d[1] | |||
| OP_rr d16, d0, v8.d[0] | |||
| OP_ii d16, d1, v9.d[0] | |||
| OP_ri d17, d0, v9.d[0] | |||
| OP_ir d17, d1, v8.d[0] | |||
| OP_rr d20, d0, v8.d[1] | |||
| OP_ii d20, d1, v9.d[1] | |||
| OP_ri d21, d0, v9.d[1] | |||
| OP_ir d21, d1, v8.d[1] | |||
| OP_rr d24, d0, v10.d[0] | |||
| OP_ii d24, d1, v11.d[0] | |||
| OP_ri d25, d0, v11.d[0] | |||
| OP_ir d25, d1, v10.d[0] | |||
| OP_rr d28, d0, v10.d[1] | |||
| OP_ii d28, d1, v11.d[1] | |||
| OP_ri d29, d0, v11.d[1] | |||
| OP_ir d29, d1, v10.d[1] | |||
| .endm | |||
| .macro SAVE1x4 | |||
| @@ -743,25 +743,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v2.2d, v3.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.2d, v0.2d, v8.2d[0] | |||
| OP_ii v16.2d, v1.2d, v9.2d[0] | |||
| OP_ri v17.2d, v0.2d, v9.2d[0] | |||
| OP_ir v17.2d, v1.2d, v8.2d[0] | |||
| OP_rr v18.2d, v2.2d, v8.2d[0] | |||
| OP_ii v18.2d, v3.2d, v9.2d[0] | |||
| OP_ri v19.2d, v2.2d, v9.2d[0] | |||
| OP_ir v19.2d, v3.2d, v8.2d[0] | |||
| OP_rr v20.2d, v0.2d, v8.2d[1] | |||
| OP_ii v20.2d, v1.2d, v9.2d[1] | |||
| OP_ri v21.2d, v0.2d, v9.2d[1] | |||
| OP_ir v21.2d, v1.2d, v8.2d[1] | |||
| OP_rr v22.2d, v2.2d, v8.2d[1] | |||
| OP_ii v22.2d, v3.2d, v9.2d[1] | |||
| OP_ri v23.2d, v2.2d, v9.2d[1] | |||
| OP_ir v23.2d, v3.2d, v8.2d[1] | |||
| OP_rr v16.2d, v0.2d, v8.d[0] | |||
| OP_ii v16.2d, v1.2d, v9.d[0] | |||
| OP_ri v17.2d, v0.2d, v9.d[0] | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| OP_rr v18.2d, v2.2d, v8.d[0] | |||
| OP_ii v18.2d, v3.2d, v9.d[0] | |||
| OP_ri v19.2d, v2.2d, v9.d[0] | |||
| OP_ir v19.2d, v3.2d, v8.d[0] | |||
| OP_rr v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| OP_ri v21.2d, v0.2d, v9.d[1] | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| OP_rr v22.2d, v2.2d, v8.d[1] | |||
| OP_ii v22.2d, v3.2d, v9.d[1] | |||
| OP_ri v23.2d, v2.2d, v9.d[1] | |||
| OP_ir v23.2d, v3.2d, v8.d[1] | |||
| .endm | |||
| .macro SAVE4x2 | |||
| @@ -816,15 +816,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.2d, v1.2d}, [pA] | |||
| add pA, pA, #32 | |||
| OP_rr v16.2d, v0.2d, v8.2d[0] | |||
| OP_ii v16.2d, v1.2d, v9.2d[0] | |||
| OP_ri v17.2d, v0.2d, v9.2d[0] | |||
| OP_ir v17.2d, v1.2d, v8.2d[0] | |||
| OP_rr v16.2d, v0.2d, v8.d[0] | |||
| OP_ii v16.2d, v1.2d, v9.d[0] | |||
| OP_ri v17.2d, v0.2d, v9.d[0] | |||
| OP_ir v17.2d, v1.2d, v8.d[0] | |||
| OP_rr v20.2d, v0.2d, v8.2d[1] | |||
| OP_ii v20.2d, v1.2d, v9.2d[1] | |||
| OP_ri v21.2d, v0.2d, v9.2d[1] | |||
| OP_ir v21.2d, v1.2d, v8.2d[1] | |||
| OP_rr v20.2d, v0.2d, v8.d[1] | |||
| OP_ii v20.2d, v1.2d, v9.d[1] | |||
| OP_ri v21.2d, v0.2d, v9.d[1] | |||
| OP_ir v21.2d, v1.2d, v8.d[1] | |||
| .endm | |||
| .macro SAVE2x2 | |||
| @@ -867,15 +867,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld2 {v0.d, v1.d}[0], [pA] | |||
| add pA, pA, #16 | |||
| OP_rr d16, d0, v8.2d[0] | |||
| OP_ii d16, d1, v9.2d[0] | |||
| OP_ri d17, d0, v9.2d[0] | |||
| OP_ir d17, d1, v8.2d[0] | |||
| OP_rr d16, d0, v8.d[0] | |||
| OP_ii d16, d1, v9.d[0] | |||
| OP_ri d17, d0, v9.d[0] | |||
| OP_ir d17, d1, v8.d[0] | |||
| OP_rr d20, d0, v8.2d[1] | |||
| OP_ii d20, d1, v9.2d[1] | |||
| OP_ri d21, d0, v9.2d[1] | |||
| OP_ir d21, d1, v8.2d[1] | |||
| OP_rr d20, d0, v8.d[1] | |||
| OP_ii d20, d1, v9.d[1] | |||
| OP_ri d21, d0, v9.d[1] | |||
| OP_ir d21, d1, v8.d[1] | |||
| .endm | |||
| .macro SAVE1x2 | |||
| @@ -3,14 +3,18 @@ | |||
| #CGEMM_BETA = ../generic/zgemm_beta.c | |||
| #ZGEMM_BETA = ../generic/zgemm_beta.c | |||
| STRMMKERNEL = gemm_kernel_power6.S | |||
| STRMMKERNEL = strmm_kernel_16x8_power8.S | |||
| DTRMMKERNEL = dtrmm_kernel_16x4_power8.S | |||
| CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
| CTRMMKERNEL = ctrmm_kernel_8x4_power8.S | |||
| ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S | |||
| SGEMMKERNEL = gemm_kernel_power6.S | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_4.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| SGEMMKERNEL = sgemm_kernel_16x8_power8.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_16.c | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_8.c | |||
| SGEMMINCOPYOBJ = sgemm_incopy.o | |||
| SGEMMITCOPYOBJ = sgemm_itcopy.o | |||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| @@ -24,11 +28,15 @@ DGEMMITCOPYOBJ = dgemm_itcopy.o | |||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| CGEMMKERNEL = ../generic/zgemmkernel_2x2.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| CGEMMKERNEL = cgemm_kernel_8x4_power8.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_8.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||
| CGEMMINCOPYOBJ = cgemm_incopy.o | |||
| CGEMMITCOPYOBJ = cgemm_itcopy.o | |||
| ZGEMMKERNEL = zgemm_kernel_8x2_power8.S | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| @@ -97,56 +105,56 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| #ISMINKERNEL = ../arm/imin.c | |||
| #IDMINKERNEL = ../arm/imin.c | |||
| # | |||
| #SASUMKERNEL = ../arm/asum.c | |||
| #DASUMKERNEL = ../arm/asum.c | |||
| #CASUMKERNEL = ../arm/zasum.c | |||
| #ZASUMKERNEL = ../arm/zasum.c | |||
| SASUMKERNEL = sasum.c | |||
| DASUMKERNEL = dasum.c | |||
| CASUMKERNEL = casum.c | |||
| ZASUMKERNEL = zasum.c | |||
| # | |||
| #SAXPYKERNEL = ../arm/axpy.c | |||
| #DAXPYKERNEL = ../arm/axpy.c | |||
| DAXPYKERNEL = daxpy.c | |||
| #CAXPYKERNEL = ../arm/zaxpy.c | |||
| #ZAXPYKERNEL = ../arm/zaxpy.c | |||
| ZAXPYKERNEL = zaxpy.c | |||
| # | |||
| #SCOPYKERNEL = ../arm/copy.c | |||
| #DCOPYKERNEL = ../arm/copy.c | |||
| #CCOPYKERNEL = ../arm/zcopy.c | |||
| #ZCOPYKERNEL = ../arm/zcopy.c | |||
| SCOPYKERNEL = scopy.c | |||
| DCOPYKERNEL = dcopy.c | |||
| CCOPYKERNEL = ccopy.c | |||
| ZCOPYKERNEL = zcopy.c | |||
| # | |||
| #SDOTKERNEL = ../arm/dot.c | |||
| #DDOTKERNEL = ../arm/dot.c | |||
| SDOTKERNEL = sdot.c | |||
| DDOTKERNEL = ddot.c | |||
| #CDOTKERNEL = ../arm/zdot.c | |||
| #ZDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = zdot.c | |||
| # | |||
| #SNRM2KERNEL = ../arm/nrm2.c | |||
| #DNRM2KERNEL = ../arm/nrm2.c | |||
| #CNRM2KERNEL = ../arm/znrm2.c | |||
| #ZNRM2KERNEL = ../arm/znrm2.c | |||
| # | |||
| #SROTKERNEL = ../arm/rot.c | |||
| #DROTKERNEL = ../arm/rot.c | |||
| SROTKERNEL = srot.c | |||
| DROTKERNEL = drot.c | |||
| #CROTKERNEL = ../arm/zrot.c | |||
| #ZROTKERNEL = ../arm/zrot.c | |||
| # | |||
| #SSCALKERNEL = ../arm/scal.c | |||
| #DSCALKERNEL = ../arm/scal.c | |||
| SSCALKERNEL = sscal.c | |||
| DSCALKERNEL = dscal.c | |||
| #CSCALKERNEL = ../arm/zscal.c | |||
| #ZSCALKERNEL = ../arm/zscal.c | |||
| ZSCALKERNEL = zscal.c | |||
| # | |||
| #SSWAPKERNEL = ../arm/swap.c | |||
| #DSWAPKERNEL = ../arm/swap.c | |||
| #CSWAPKERNEL = ../arm/zswap.c | |||
| #ZSWAPKERNEL = ../arm/zswap.c | |||
| SSWAPKERNEL = sswap.c | |||
| DSWAPKERNEL = dswap.c | |||
| CSWAPKERNEL = cswap.c | |||
| ZSWAPKERNEL = zswap.c | |||
| # | |||
| #SGEMVNKERNEL = ../arm/gemv_n.c | |||
| #DGEMVNKERNEL = ../arm/gemv_n.c | |||
| DGEMVNKERNEL = dgemv_n.c | |||
| #CGEMVNKERNEL = ../arm/zgemv_n.c | |||
| #ZGEMVNKERNEL = ../arm/zgemv_n.c | |||
| # | |||
| #SGEMVTKERNEL = ../arm/gemv_t.c | |||
| #DGEMVTKERNEL = ../arm/gemv_t.c | |||
| #CGEMVTKERNEL = ../arm/zgemv_t.c | |||
| #ZGEMVTKERNEL = ../arm/zgemv_t.c | |||
| #ZGEMVTKERNEL = zgemv_t_4.c | |||
| #SSYMV_U_KERNEL = ../generic/symv_k.c | |||
| @@ -0,0 +1,151 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/28 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| #if defined(POWER8) | |||
| #include "casum_microk_power8.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_16 | |||
| static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec) | |||
| { | |||
| BLASLONG i=0; | |||
| FLOAT *x = x1; | |||
| FLOAT temp0, temp1, temp2, temp3; | |||
| FLOAT temp4, temp5, temp6, temp7; | |||
| FLOAT sum0 = 0.0; | |||
| FLOAT sum1 = 0.0; | |||
| FLOAT sum2 = 0.0; | |||
| FLOAT sum3 = 0.0; | |||
| while ( i< n ) | |||
| { | |||
| temp0 = ABS(x[0]); | |||
| temp1 = ABS(x[1]); | |||
| temp2 = ABS(x[2]); | |||
| temp3 = ABS(x[3]); | |||
| temp4 = ABS(x[4]); | |||
| temp5 = ABS(x[5]); | |||
| temp6 = ABS(x[6]); | |||
| temp7 = ABS(x[7]); | |||
| sum0 += temp0; | |||
| sum1 += temp1; | |||
| sum2 += temp2; | |||
| sum3 += temp3; | |||
| sum0 += temp4; | |||
| sum1 += temp5; | |||
| sum2 += temp6; | |||
| sum3 += temp7; | |||
| x+=8; | |||
| i+=4; | |||
| } | |||
| svec[0] = sum0+sum1+sum2+sum3; | |||
| svec[1] = 0.0; | |||
| svec[2] = 0.0; | |||
| svec[3] = 0.0; | |||
| } | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ip=0; | |||
| FLOAT sumf = 0.0; | |||
| FLOAT svec[4] __attribute__ ((aligned (16)));; | |||
| BLASLONG n1; | |||
| BLASLONG inc_x2; | |||
| if (n <= 0 || inc_x <= 0) return(sumf); | |||
| if ( inc_x == 1 ) | |||
| { | |||
| n1 = n & -16; | |||
| if ( n1 > 0 ) | |||
| { | |||
| casum_kernel_16(n1, x, svec); | |||
| sumf = svec[0] + svec[1]+svec[2]+svec[3]; | |||
| i=n1; | |||
| ip = 2 * n1; | |||
| } | |||
| while(i < n) | |||
| { | |||
| sumf += ABS(x[ip]) + ABS(x[ip+1]); | |||
| ip += 2; | |||
| i++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| inc_x2 = 2 * inc_x; | |||
| while(i < n) | |||
| { | |||
| sumf += ABS(x[ip]) + ABS(x[ip+1]); | |||
| ip += inc_x2; | |||
| i++; | |||
| } | |||
| } | |||
| return(sumf); | |||
| } | |||
| @@ -0,0 +1,177 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/28 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| /* Tells the including file that an optimized casum_kernel_16 is provided, | |||
| so its generic C fallback (guarded by #ifndef HAVE_KERNEL_16) is skipped. */ | |||
| #define HAVE_KERNEL_16 1 | |||
| /* | |||
| * VSX inline-assembly kernel: sums the absolute values of the data at x in | |||
| * blocks of 128 bytes (eight 16-byte vector loads), decrementing the counter | |||
| * by 16 per block, and stores one 16-byte vector of partial sums to svec. | |||
| * | |||
| * The single-precision instructions (xvabssp / xvaddsp) imply that FLOAT is | |||
| * a 4-byte float here, i.e. 128 bytes = 32 floats = 16 two-float elements | |||
| * -- TODO confirm against the build flags. | |||
| * | |||
| * n must be positive: the first 128-byte block is loaded before the counter | |||
| * is tested. | |||
| * | |||
| * NOTE(review): operands %0 (i) and %2 (x1) are listed as inputs but are | |||
| * modified by the template (addic. / addi). The clobber list names "%0" and | |||
| * "%2", which are not operand references in a clobber list -- confirm how | |||
| * the compiler interprets them. VSX registers 32-55 are written but do not | |||
| * appear in the clobber list. Operand %1 (n) is never referenced. | |||
| */ | |||
| static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline)); | |||
| static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) | |||
| { | |||
| BLASLONG i = n; | |||
| /* byte offsets of the eight 16-byte vectors inside one 128-byte block */ | |||
| BLASLONG o16 = 16; | |||
| BLASLONG o32 = 32; | |||
| BLASLONG o48 = 48; | |||
| BLASLONG o64 = 64; | |||
| BLASLONG o80 = 80; | |||
| BLASLONG o96 = 96; | |||
| BLASLONG o112 = 112; | |||
| FLOAT *x1=x; | |||
| /* byte distance ahead of x1 used by the dcbt cache-touch hints */ | |||
| BLASLONG pre = 384; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "dcbt %2 , %4 \n\t" | |||
| /* zero the eight accumulators vs32..vs39 */ | |||
| "xxlxor 32,32,32 \n\t" | |||
| "xxlxor 33,33,33 \n\t" | |||
| "xxlxor 34,34,34 \n\t" | |||
| "xxlxor 35,35,35 \n\t" | |||
| "xxlxor 36,36,36 \n\t" | |||
| "xxlxor 37,37,37 \n\t" | |||
| "xxlxor 38,38,38 \n\t" | |||
| "xxlxor 39,39,39 \n\t" | |||
| /* preload the first 128-byte block into vs40..vs47 */ | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 41, %5, %2 \n\t" | |||
| "lxvw4x 42, %6, %2 \n\t" | |||
| "lxvw4x 43, %7, %2 \n\t" | |||
| "lxvw4x 44, %8, %2 \n\t" | |||
| "lxvw4x 45, %9, %2 \n\t" | |||
| "lxvw4x 46, %10, %2 \n\t" | |||
| "lxvw4x 47, %11, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| /* counter -= 16; if nothing is left, only the preloaded block remains */ | |||
| "addic. %0 , %0 , -16 \n\t" | |||
| "ble 2f \n\t" | |||
| ".align 5 \n\t" | |||
| /* main loop: take |.| of the block loaded previously (vs40..47 -> vs48..55) | |||
| and accumulate it, interleaved with the loads of the next block */ | |||
| "1: \n\t" | |||
| "dcbt %2 , %4 \n\t" | |||
| "xvabssp 48, 40 \n\t" | |||
| "xvabssp 49, 41 \n\t" | |||
| "xvabssp 50, 42 \n\t" | |||
| "xvabssp 51, 43 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 41, %5, %2 \n\t" | |||
| "xvabssp 52, 44 \n\t" | |||
| "xvabssp 53, 45 \n\t" | |||
| "lxvw4x 42, %6, %2 \n\t" | |||
| "lxvw4x 43, %7, %2 \n\t" | |||
| "xvabssp 54, 46 \n\t" | |||
| "xvabssp 55, 47 \n\t" | |||
| "lxvw4x 44, %8, %2 \n\t" | |||
| "lxvw4x 45, %9, %2 \n\t" | |||
| "xvaddsp 32, 32, 48 \n\t" | |||
| "xvaddsp 33, 33, 49 \n\t" | |||
| "lxvw4x 46, %10, %2 \n\t" | |||
| "lxvw4x 47, %11, %2 \n\t" | |||
| "xvaddsp 34, 34, 50 \n\t" | |||
| "xvaddsp 35, 35, 51 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "xvaddsp 36, 36, 52 \n\t" | |||
| "xvaddsp 37, 37, 53 \n\t" | |||
| "addic. %0 , %0 , -16 \n\t" | |||
| "xvaddsp 38, 38, 54 \n\t" | |||
| "xvaddsp 39, 39, 55 \n\t" | |||
| "bgt 1b \n\t" | |||
| /* drain: process the last block that was loaded but not yet accumulated */ | |||
| "2: \n\t" | |||
| "xvabssp 48, 40 \n\t" | |||
| "xvabssp 49, 41 \n\t" | |||
| "xvabssp 50, 42 \n\t" | |||
| "xvabssp 51, 43 \n\t" | |||
| "xvabssp 52, 44 \n\t" | |||
| "xvabssp 53, 45 \n\t" | |||
| "xvabssp 54, 46 \n\t" | |||
| "xvabssp 55, 47 \n\t" | |||
| "xvaddsp 32, 32, 48 \n\t" | |||
| "xvaddsp 33, 33, 49 \n\t" | |||
| "xvaddsp 34, 34, 50 \n\t" | |||
| "xvaddsp 35, 35, 51 \n\t" | |||
| "xvaddsp 36, 36, 52 \n\t" | |||
| "xvaddsp 37, 37, 53 \n\t" | |||
| "xvaddsp 38, 38, 54 \n\t" | |||
| "xvaddsp 39, 39, 55 \n\t" | |||
| /* pairwise reduction of the eight accumulators into vs32 */ | |||
| "xvaddsp 32, 32, 33 \n\t" | |||
| "xvaddsp 34, 34, 35 \n\t" | |||
| "xvaddsp 36, 36, 37 \n\t" | |||
| "xvaddsp 38, 38, 39 \n\t" | |||
| "xvaddsp 32, 32, 34 \n\t" | |||
| "xvaddsp 36, 36, 38 \n\t" | |||
| "xvaddsp 32, 32, 36 \n\t" | |||
| /* write the four partial sums to svec; the caller adds them together */ | |||
| "stxvw4x 32, 0, %3 \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x1), // 2 | |||
| "r" (svec), // 3 | |||
| "r" (pre), // 4 | |||
| "r" (o16), // 5 | |||
| "r" (o32), // 6 | |||
| "r" (o48), // 7 | |||
| "r" (o64), // 8 | |||
| "r" (o80), // 9 | |||
| "r" (o96), // 10 | |||
| "r" (o112) // 11 | |||
| : "cr0", "%0", "%2", "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,140 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/25 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(POWER8) | |||
| #include "ccopy_microk_power8.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_32 | |||
| /* | |||
| * Generic C fallback, compiled only when no optimized ccopy_kernel_32 has | |||
| * defined HAVE_KERNEL_32. | |||
| * | |||
| * Copies FLOAT values from x to y in groups of eight while the pass counter | |||
| * advances by 4 up to n; n is therefore expected to be a multiple of 4. | |||
| * Each group is read completely before any of it is written. | |||
| */ | |||
| static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) | |||
| { | |||
|     FLOAT *src = x; | |||
|     FLOAT *dst = y; | |||
|     FLOAT staged[8]; | |||
|     BLASLONG done; | |||
|     BLASLONG k; | |||
|  | |||
|     for ( done = 0; done < n; done += 4 ) | |||
|     { | |||
|         /* load the whole group first ... */ | |||
|         for ( k = 0; k < 8; k++ ) | |||
|             staged[k] = src[k]; | |||
|  | |||
|         /* ... then store it */ | |||
|         for ( k = 0; k < 8; k++ ) | |||
|             dst[k] = staged[k]; | |||
|  | |||
|         src += 8; | |||
|         dst += 8; | |||
|     } | |||
| } | |||
| #endif | |||
| /* | |||
| * Copies n elements from x to y, where each element occupies two | |||
| * consecutive FLOATs. Successive elements are 2*inc_x FLOATs apart in x and | |||
| * 2*inc_y FLOATs apart in y. | |||
| * | |||
| * Does nothing when n <= 0. Always returns 0. | |||
| * | |||
| * When both strides are 1, the largest multiple of 32 elements is handed to | |||
| * ccopy_kernel_32 and the remainder is copied by the scalar loop. | |||
| */ | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
|     BLASLONG i=0;          /* elements copied so far */ | |||
|     BLASLONG ix=0,iy=0;    /* FLOAT indices into x and y */ | |||
|  | |||
|     if ( n <= 0 ) return(0); | |||
|  | |||
|     if ( (inc_x == 1) && (inc_y == 1 )) | |||
|     { | |||
|         /* n & -32 rounds n down to a multiple of 32 */ | |||
|         BLASLONG n1 = n & -32; | |||
|         if ( n1 > 0 ) | |||
|         { | |||
|             ccopy_kernel_32(n1, x, y); | |||
|             i=n1; | |||
|             ix=n1*2; | |||
|             iy=n1*2; | |||
|         } | |||
|  | |||
|         /* scalar tail: elements the kernel did not cover */ | |||
|         while(i < n) | |||
|         { | |||
|             /* Source is indexed with ix (was x[iy]; same value on this | |||
|                path because ix == iy, but now matches the strided loop). */ | |||
|             y[iy] = x[ix] ; | |||
|             y[iy+1] = x[ix+1] ; | |||
|             ix+=2; | |||
|             iy+=2; | |||
|             i++ ; | |||
|         } | |||
|     } | |||
|     else | |||
|     { | |||
|         /* strided access: two FLOATs per element */ | |||
|         BLASLONG inc_x2 = 2 * inc_x; | |||
|         BLASLONG inc_y2 = 2 * inc_y; | |||
|         while(i < n) | |||
|         { | |||
|             y[iy] = x[ix] ; | |||
|             y[iy+1] = x[ix+1] ; | |||
|             ix += inc_x2 ; | |||
|             iy += inc_y2 ; | |||
|             i++ ; | |||
|         } | |||
|     } | |||
|  | |||
|     return(0); | |||
| } | |||
| @@ -0,0 +1,174 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/25 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| /* Tells the including file that an optimized ccopy_kernel_32 is provided, | |||
| so its generic C fallback (guarded by #ifndef HAVE_KERNEL_32) is skipped. */ | |||
| #define HAVE_KERNEL_32 1 | |||
| /* | |||
| * VSX inline-assembly kernel: copies data from x to y in blocks of 256 bytes | |||
| * (two groups of eight 16-byte vectors), decrementing the counter by 32 per | |||
| * block. | |||
| * | |||
| * With 4-byte FLOATs, 256 bytes = 64 floats = 32 two-float elements | |||
| * -- TODO confirm FLOAT is single precision in this build. | |||
| * | |||
| * n must be positive: the first 256 bytes are loaded before the counter is | |||
| * tested. | |||
| * | |||
| * NOTE(review): operands %0 (i), %1 (y1) and %2 (x1) are listed as inputs | |||
| * but are modified by the template. The clobber list names "%0", "%2" and | |||
| * "%1", which are not operand references in a clobber list -- confirm how | |||
| * the compiler interprets them. VSX registers 40-47 and 50-57 are written | |||
| * but do not appear in the clobber list. Operands %3 (alpha) and %4 (pre) | |||
| * are never referenced by the template. | |||
| */ | |||
| static void ccopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void ccopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG i = n; | |||
| /* byte offsets of the eight 16-byte vectors inside one 128-byte group */ | |||
| BLASLONG o16 = 16; | |||
| BLASLONG o32 = 32; | |||
| BLASLONG o48 = 48; | |||
| BLASLONG o64 = 64; | |||
| BLASLONG o80 = 80; | |||
| BLASLONG o96 = 96; | |||
| BLASLONG o112 = 112; | |||
| FLOAT *x1=x; | |||
| FLOAT *y1=y; | |||
| /* pre and alpha are passed to the asm below but not used by it */ | |||
| BLASLONG pre = 384; | |||
| BLASLONG alpha=0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| /* preload the first 128 bytes of x into vs40..vs47 */ | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 41, %5, %2 \n\t" | |||
| "lxvw4x 42, %6, %2 \n\t" | |||
| "lxvw4x 43, %7, %2 \n\t" | |||
| "lxvw4x 44, %8, %2 \n\t" | |||
| "lxvw4x 45, %9, %2 \n\t" | |||
| "lxvw4x 46, %10, %2 \n\t" | |||
| "lxvw4x 47, %11, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| /* preload the second 128 bytes of x into vs50..vs57 */ | |||
| "lxvw4x 50, 0, %2 \n\t" | |||
| "lxvw4x 51, %5, %2 \n\t" | |||
| "lxvw4x 52, %6, %2 \n\t" | |||
| "lxvw4x 53, %7, %2 \n\t" | |||
| "lxvw4x 54, %8, %2 \n\t" | |||
| "lxvw4x 55, %9, %2 \n\t" | |||
| "lxvw4x 56, %10, %2 \n\t" | |||
| "lxvw4x 57, %11, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| /* counter -= 32; if nothing is left, only the preloaded data remains */ | |||
| "addic. %0 , %0 , -32 \n\t" | |||
| "ble 2f \n\t" | |||
| ".align 5 \n\t" | |||
| /* main loop: store the vectors loaded previously to y while reloading the | |||
| same registers with the next 256 bytes of x */ | |||
| "1: \n\t" | |||
| "stxvw4x 40, 0, %1 \n\t" | |||
| "stxvw4x 41, %5, %1 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 41, %5, %2 \n\t" | |||
| "stxvw4x 42, %6, %1 \n\t" | |||
| "stxvw4x 43, %7, %1 \n\t" | |||
| "lxvw4x 42, %6, %2 \n\t" | |||
| "lxvw4x 43, %7, %2 \n\t" | |||
| "stxvw4x 44, %8, %1 \n\t" | |||
| "stxvw4x 45, %9, %1 \n\t" | |||
| "lxvw4x 44, %8, %2 \n\t" | |||
| "lxvw4x 45, %9, %2 \n\t" | |||
| "stxvw4x 46, %10, %1 \n\t" | |||
| "stxvw4x 47, %11, %1 \n\t" | |||
| "lxvw4x 46, %10, %2 \n\t" | |||
| "lxvw4x 47, %11, %2 \n\t" | |||
| "addi %1, %1, 128 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "stxvw4x 50, 0, %1 \n\t" | |||
| "stxvw4x 51, %5, %1 \n\t" | |||
| "lxvw4x 50, 0, %2 \n\t" | |||
| "lxvw4x 51, %5, %2 \n\t" | |||
| "stxvw4x 52, %6, %1 \n\t" | |||
| "stxvw4x 53, %7, %1 \n\t" | |||
| "lxvw4x 52, %6, %2 \n\t" | |||
| "lxvw4x 53, %7, %2 \n\t" | |||
| "stxvw4x 54, %8, %1 \n\t" | |||
| "stxvw4x 55, %9, %1 \n\t" | |||
| "lxvw4x 54, %8, %2 \n\t" | |||
| "lxvw4x 55, %9, %2 \n\t" | |||
| "stxvw4x 56, %10, %1 \n\t" | |||
| "stxvw4x 57, %11, %1 \n\t" | |||
| "lxvw4x 56, %10, %2 \n\t" | |||
| "lxvw4x 57, %11, %2 \n\t" | |||
| "addi %1, %1, 128 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addic. %0 , %0 , -32 \n\t" | |||
| "bgt 1b \n\t" | |||
| /* drain: store the last 256 bytes that were loaded but not yet written */ | |||
| "2: \n\t" | |||
| "stxvw4x 40, 0, %1 \n\t" | |||
| "stxvw4x 41, %5, %1 \n\t" | |||
| "stxvw4x 42, %6, %1 \n\t" | |||
| "stxvw4x 43, %7, %1 \n\t" | |||
| "stxvw4x 44, %8, %1 \n\t" | |||
| "stxvw4x 45, %9, %1 \n\t" | |||
| "stxvw4x 46, %10, %1 \n\t" | |||
| "stxvw4x 47, %11, %1 \n\t" | |||
| "addi %1, %1, 128 \n\t" | |||
| "stxvw4x 50, 0, %1 \n\t" | |||
| "stxvw4x 51, %5, %1 \n\t" | |||
| "stxvw4x 52, %6, %1 \n\t" | |||
| "stxvw4x 53, %7, %1 \n\t" | |||
| "stxvw4x 54, %8, %1 \n\t" | |||
| "stxvw4x 55, %9, %1 \n\t" | |||
| "stxvw4x 56, %10, %1 \n\t" | |||
| "stxvw4x 57, %11, %1 \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (y1), // 1 | |||
| "r" (x1), // 2 | |||
| "r" (alpha), // 3 | |||
| "r" (pre), // 4 | |||
| "r" (o16), // 5 | |||
| "r" (o32), // 6 | |||
| "r" (o48), // 7 | |||
| "r" (o64), // 8 | |||
| "r" (o80), // 9 | |||
| "r" (o96), // 10 | |||
| "r" (o112) // 11 | |||
| : "cr0", "%0", "%2" , "%1", "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,407 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/04/04 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| /* Preprocessor setup for the kernel below: stack layout constants and | |||
| symbolic names for the registers it uses. */ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| /* LOAD is the word-sized load mnemonic for the current pointer width */ | |||
| #ifndef __64BIT__ | |||
| #define LOAD lwz | |||
| #else | |||
| #define LOAD ld | |||
| #endif | |||
| /* STACKSIZE is one quarter of the frame: the prologue lowers SP by it four | |||
| times. ALPHA_R_SP / ALPHA_I_SP / FZERO are scratch slots in that frame, | |||
| placed just above the register save area written by the prologue. */ | |||
| #ifdef __64BIT__ | |||
| #define STACKSIZE 32000 | |||
| #define ALPHA_R_SP 296(SP) | |||
| #define ALPHA_I_SP 304(SP) | |||
| #define FZERO 312(SP) | |||
| #else | |||
| #define STACKSIZE 256 | |||
| #define ALPHA_R_SP 224(SP) | |||
| #define ALPHA_I_SP 232(SP) | |||
| #define FZERO 240(SP) | |||
| #endif | |||
| /* Incoming arguments. Presumably M, N, K are the matrix dimensions and | |||
| A, B, C, LDC, OFFSET the remaining kernel arguments -- confirm against | |||
| the caller. Which register holds which argument depends on the OS and | |||
| pointer width, hence the variants below. */ | |||
| #define M r3 | |||
| #define N r4 | |||
| #define K r5 | |||
| #ifdef linux | |||
| #ifndef __64BIT__ | |||
| #define A r6 | |||
| #define B r7 | |||
| #define C r8 | |||
| #define LDC r9 | |||
| #define OFFSET r10 | |||
| #else | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r6 | |||
| #define OFFSET r7 | |||
| #endif | |||
| #endif | |||
| #if defined(_AIX) || defined(__APPLE__) | |||
| #if !defined(__64BIT__) && defined(DOUBLE) | |||
| #define A r10 | |||
| #define B r6 | |||
| #define C r7 | |||
| #define LDC r8 | |||
| #define OFFSET r9 | |||
| #else | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r6 | |||
| #define OFFSET r7 | |||
| #endif | |||
| #endif | |||
| /* o0 is the literal 0, used as the index operand of indexed loads/stores */ | |||
| #define o0 0 | |||
| /* VSX registers that hold alpha: alpha_dr / alpha_di receive the scalar | |||
| real / imaginary parts, alpha_sr / alpha_si the 16-byte vectors built | |||
| from them in the kernel setup code. */ | |||
| #define alpha_dr vs28 | |||
| #define alpha_di vs29 | |||
| #define alpha_sr vs30 | |||
| #define alpha_si vs31 | |||
| /* FRAMEPOINTER keeps the caller's SP so stack-passed arguments can be read | |||
| after SP has been lowered */ | |||
| #define FRAMEPOINTER r12 | |||
| /* General-purpose register aliases. o4..o48 are loaded with the byte | |||
| offsets 4..48 and PRE with 384 by the kernel setup code; the remaining | |||
| names are presumably used by the included macro/logic files. */ | |||
| #define BBUFFER r14 | |||
| #define L r15 | |||
| #define o12 r16 | |||
| #define o4 r17 | |||
| #define T2 r19 | |||
| #define BBO r20 | |||
| #define o8 r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define o16 r27 | |||
| #define o32 r28 | |||
| #define o48 r29 | |||
| #define PRE r30 | |||
| #define T1 r31 | |||
| /* Kernel entry point. The body is skipped entirely when NEEDPARAM is | |||
| defined. Layout: save registers, fetch stack-passed arguments, set up | |||
| constants and alpha vectors, run the included compute logic, then restore | |||
| registers and return 0 at L999. */ | |||
| #ifndef NEEDPARAM | |||
| PROLOGUE | |||
| PROFCODE | |||
| /* Keep the caller's SP in FRAMEPOINTER, then lower SP by 4*STACKSIZE. | |||
| Done in four steps, presumably because 4*STACKSIZE does not fit the | |||
| 16-bit signed immediate of addi in the 64-bit configuration. */ | |||
| mr FRAMEPOINTER, SP | |||
| addi SP, SP, -STACKSIZE | |||
| addi SP, SP, -STACKSIZE | |||
| addi SP, SP, -STACKSIZE | |||
| addi SP, SP, -STACKSIZE | |||
| /* r0 = 0; its only visible use is the commented-out FZERO store below */ | |||
| li r0, 0 | |||
| /* save f14..f31 at 0..136(SP); restored at L999 */ | |||
| stfd f14, 0(SP) | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| /* save r14..r31 starting at 144(SP); restored at L999 */ | |||
| #ifdef __64BIT__ | |||
| std r31, 144(SP) | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| #else | |||
| stw r31, 144(SP) | |||
| stw r30, 148(SP) | |||
| stw r29, 152(SP) | |||
| stw r28, 156(SP) | |||
| stw r27, 160(SP) | |||
| stw r26, 164(SP) | |||
| stw r25, 168(SP) | |||
| stw r24, 172(SP) | |||
| stw r23, 176(SP) | |||
| stw r22, 180(SP) | |||
| stw r21, 184(SP) | |||
| stw r20, 188(SP) | |||
| stw r19, 192(SP) | |||
| stw r18, 196(SP) | |||
| stw r17, 200(SP) | |||
| stw r16, 204(SP) | |||
| stw r15, 208(SP) | |||
| stw r14, 212(SP) | |||
| #endif | |||
| /* store f1 / f2 as single precision into the alpha scratch slots; | |||
| presumably these carry the real and imaginary parts of alpha */ | |||
| stfs f1, ALPHA_R_SP | |||
| stfs f2, ALPHA_I_SP | |||
| // stw r0, FZERO | |||
| /* fetch the arguments that arrive on the caller's stack, addressed | |||
| through FRAMEPOINTER (the SP value on entry) */ | |||
| #ifdef linux | |||
| #ifdef __64BIT__ | |||
| ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) | |||
| #endif | |||
| #endif | |||
| #if defined(_AIX) || defined(__APPLE__) | |||
| #ifdef __64BIT__ | |||
| ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) | |||
| #else | |||
| #ifdef DOUBLE | |||
| lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER) | |||
| lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER) | |||
| lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER) | |||
| #else | |||
| lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) | |||
| #endif | |||
| #endif | |||
| #endif | |||
| /* TRMM builds additionally fetch OFFSET */ | |||
| #ifdef TRMMKERNEL | |||
| #if defined(linux) && defined(__64BIT__) | |||
| ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) | |||
| #endif | |||
| #if defined(_AIX) || defined(__APPLE__) | |||
| #ifdef __64BIT__ | |||
| ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) | |||
| #else | |||
| #ifdef DOUBLE | |||
| lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER) | |||
| #else | |||
| lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) | |||
| #endif | |||
| #endif | |||
| #endif | |||
| /* NOTE(review): KK is not defined in this file's visible register aliases; | |||
| confirm it is provided elsewhere before building with TRMMKERNEL */ | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| neg KK, OFFSET | |||
| #endif | |||
| #endif | |||
| #include "cgemm_macros_8x4_power8.S" | |||
| /* nothing to do when M, N or K is <= 0. L999_H1 is not defined in this | |||
| file; presumably it comes from the included logic file -- confirm */ | |||
| cmpwi cr0, M, 0 | |||
| ble L999_H1 | |||
| cmpwi cr0, N, 0 | |||
| ble L999_H1 | |||
| cmpwi cr0, K, 0 | |||
| ble L999_H1 | |||
| /* LDC <<= ZBASE_SHIFT; presumably converts LDC from elements to bytes */ | |||
| slwi LDC, LDC, ZBASE_SHIFT | |||
| /* constant byte offsets used as index registers */ | |||
| li PRE, 384 | |||
| li o4 , 4 | |||
| li o8 , 8 | |||
| li o12 , 12 | |||
| li o16 , 16 | |||
| li o32 , 32 | |||
| li o48 , 48 | |||
| /* BBUFFER = (SP + 512 + 4096) & -4096, i.e. a 4096-byte aligned address | |||
| inside the area above SP. | |||
| NOTE(review): with the 32-bit STACKSIZE of 256 the frame is only 1024 | |||
| bytes, so this address can lie beyond the frame -- confirm the 32-bit | |||
| configuration is never built. */ | |||
| addi BBUFFER, SP, 512+4096 | |||
| li T1, -4096 | |||
| and BBUFFER, BBUFFER, T1 | |||
| /* T1 = address of the ALPHA_R_SP slot */ | |||
| #ifdef __64BIT__ | |||
| addi T1 , SP, 296 | |||
| #else | |||
| addi T1 , SP, 224 | |||
| #endif | |||
| /* pass vs1 / vs2 through memory as single precision to obtain alpha_dr | |||
| and alpha_di (slots at T1 and T1+8) */ | |||
| stxsspx vs1, 0, T1 | |||
| lxsspx alpha_dr, 0, T1 | |||
| stxsspx vs2, o8 , T1 | |||
| lxsspx alpha_di, o8, T1 | |||
| /* build alpha_sr: 16 bytes at SP+360 = three zero words followed by | |||
| alpha_dr stored as single precision at offset 12, loaded as a vector */ | |||
| addi T1, SP, 360 | |||
| li T2, 0 | |||
| stw T2, 0(T1) | |||
| stw T2, 4(T1) | |||
| stw T2, 8(T1) | |||
| stxsspx alpha_dr, o12, T1 | |||
| lxvw4x alpha_sr, o0 , T1 | |||
| /* build alpha_si the same way from alpha_di, in the next 16 bytes */ | |||
| addi T1, T1, 16 | |||
| stw T2, 0(T1) | |||
| stw T2, 4(T1) | |||
| stw T2, 8(T1) | |||
| stxsspx alpha_di, o12, T1 | |||
| lxvw4x alpha_si, o0 , T1 | |||
| .align 5 | |||
| /* the compute loops live in the included file */ | |||
| #include "cgemm_logic_8x4_power8.S" | |||
| /* common exit: return value r3 = 0, restore saved registers, pop the frame */ | |||
| L999: | |||
| addi r3, 0, 0 | |||
| lfd f14, 0(SP) | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| #ifdef __64BIT__ | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| #else | |||
| lwz r31, 144(SP) | |||
| lwz r30, 148(SP) | |||
| lwz r29, 152(SP) | |||
| lwz r28, 156(SP) | |||
| lwz r27, 160(SP) | |||
| lwz r26, 164(SP) | |||
| lwz r25, 168(SP) | |||
| lwz r24, 172(SP) | |||
| lwz r23, 176(SP) | |||
| lwz r22, 180(SP) | |||
| lwz r21, 184(SP) | |||
| lwz r20, 188(SP) | |||
| lwz r19, 192(SP) | |||
| lwz r18, 196(SP) | |||
| lwz r17, 200(SP) | |||
| lwz r16, 204(SP) | |||
| lwz r15, 208(SP) | |||
| lwz r14, 212(SP) | |||
| #endif | |||
| /* undo the four-step stack adjustment made in the prologue */ | |||
| addi SP, SP, STACKSIZE | |||
| addi SP, SP, STACKSIZE | |||
| addi SP, SP, STACKSIZE | |||
| addi SP, SP, STACKSIZE | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| @@ -0,0 +1,175 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/27 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(POWER8) | |||
| #include "cswap_microk_power8.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_32 | |||
| /* | |||
| * Generic C fallback, compiled only when no optimized cswap_kernel_32 has | |||
| * defined HAVE_KERNEL_32. | |||
| * | |||
| * Exchanges the contents of x and y in groups of eight FLOATs while the | |||
| * pass counter advances by 4 up to n; n is therefore expected to be a | |||
| * multiple of 4. Within a group, all of x is read, then all of y, before y | |||
| * and then x are written. | |||
| */ | |||
| static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) | |||
| { | |||
|     FLOAT from_x[8]; | |||
|     FLOAT from_y[8]; | |||
|     FLOAT *px = x; | |||
|     FLOAT *py = y; | |||
|     BLASLONG done; | |||
|     BLASLONG k; | |||
|  | |||
|     for ( done = 0; done < n; done += 4 ) | |||
|     { | |||
|         /* read both groups completely ... */ | |||
|         for ( k = 0; k < 8; k++ ) | |||
|             from_x[k] = px[k]; | |||
|         for ( k = 0; k < 8; k++ ) | |||
|             from_y[k] = py[k]; | |||
|  | |||
|         /* ... then write them back crosswise, y first */ | |||
|         for ( k = 0; k < 8; k++ ) | |||
|             py[k] = from_x[k]; | |||
|         for ( k = 0; k < 8; k++ ) | |||
|             px[k] = from_y[k]; | |||
|  | |||
|         px += 8; | |||
|         py += 8; | |||
|     } | |||
| } | |||
| #endif | |||
| /* | |||
| * Exchanges n elements between x and y, where each element occupies two | |||
| * consecutive FLOATs. Successive elements are 2*inc_x FLOATs apart in x and | |||
| * 2*inc_y FLOATs apart in y. The dummy* parameters are not used. | |||
| * | |||
| * Does nothing when n <= 0. Always returns 0. | |||
| * | |||
| * When both strides are 1, the largest multiple of 32 elements is handed to | |||
| * cswap_kernel_32 and the remainder is swapped by the scalar loop. | |||
| */ | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
|     BLASLONG swapped = 0;        /* elements exchanged so far */ | |||
|     BLASLONG px = 0, py = 0;     /* FLOAT indices into x and y */ | |||
|     BLASLONG step_x, step_y;     /* FLOAT distance between elements */ | |||
|     FLOAT keep_re, keep_im; | |||
|  | |||
|     if ( n <= 0 ) | |||
|         return(0); | |||
|  | |||
|     /* two FLOATs per element, so the FLOAT stride is twice the element stride */ | |||
|     step_x = 2 * inc_x; | |||
|     step_y = 2 * inc_y; | |||
|  | |||
|     if ( (inc_x == 1) && (inc_y == 1) ) | |||
|     { | |||
|         /* n & -32 rounds n down to a multiple of 32 */ | |||
|         BLASLONG bulk = n & -32; | |||
|         if ( bulk > 0 ) | |||
|         { | |||
|             cswap_kernel_32(bulk, x, y); | |||
|             swapped = bulk; | |||
|             px = 2 * bulk; | |||
|             py = 2 * bulk; | |||
|         } | |||
|     } | |||
|  | |||
|     /* scalar loop: the whole vector when strided, the leftover otherwise */ | |||
|     for ( ; swapped < n; swapped++ ) | |||
|     { | |||
|         keep_re = x[px] ; | |||
|         keep_im = x[px+1] ; | |||
|         x[px] = y[py] ; | |||
|         x[px+1] = y[py+1] ; | |||
|         y[py] = keep_re ; | |||
|         y[py+1] = keep_im ; | |||
|         px += step_x ; | |||
|         py += step_y ; | |||
|     } | |||
|  | |||
|     return(0); | |||
| } | |||
| @@ -0,0 +1,180 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/27 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define HAVE_KERNEL_32 1 | |||
| static void cswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void cswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG i = n; | |||
| BLASLONG o16 = 16; | |||
| BLASLONG o32 = 32; | |||
| BLASLONG o48 = 48; | |||
| BLASLONG o64 = 64; | |||
| BLASLONG o80 = 80; | |||
| BLASLONG o96 = 96; | |||
| BLASLONG o112 = 112; | |||
| FLOAT *x1=x; | |||
| FLOAT *y1=y; | |||
| FLOAT *x2=x+1; | |||
| FLOAT *y2=y+1; | |||
| BLASLONG pre = 384; | |||
| BLASLONG alpha=0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "addi %3, %3, -4 \n\t" | |||
| "addi %4, %4, -4 \n\t" | |||
| ".align 5 \n\t" | |||
| "1: \n\t" | |||
| "lxvw4x 32, 0, %2 \n\t" | |||
| "lxvw4x 33, %5, %2 \n\t" | |||
| "lxvw4x 34, %6, %2 \n\t" | |||
| "lxvw4x 35, %7, %2 \n\t" | |||
| "lxvw4x 36, %8, %2 \n\t" | |||
| "lxvw4x 37, %9, %2 \n\t" | |||
| "lxvw4x 38, %10, %2 \n\t" | |||
| "lxvw4x 39, %11, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 41, %5, %2 \n\t" | |||
| "lxvw4x 42, %6, %2 \n\t" | |||
| "lxvw4x 43, %7, %2 \n\t" | |||
| "lxvw4x 44, %8, %2 \n\t" | |||
| "lxvw4x 45, %9, %2 \n\t" | |||
| "lxvw4x 46, %10, %2 \n\t" | |||
| "lxvw4x 47, %11, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "lxvw4x 48, 0, %1 \n\t" | |||
| "lxvw4x 49, %5, %1 \n\t" | |||
| "lxvw4x 50, %6, %1 \n\t" | |||
| "lxvw4x 51, %7, %1 \n\t" | |||
| "lxvw4x 52, %8, %1 \n\t" | |||
| "lxvw4x 53, %9, %1 \n\t" | |||
| "lxvw4x 54, %10, %1 \n\t" | |||
| "lxvw4x 55, %11, %1 \n\t" | |||
| "addi %1, %1, 128 \n\t" | |||
| "lxvw4x 56, 0, %1 \n\t" | |||
| "lxvw4x 57, %5, %1 \n\t" | |||
| "lxvw4x 58, %6, %1 \n\t" | |||
| "lxvw4x 59, %7, %1 \n\t" | |||
| "lxvw4x 60, %8, %1 \n\t" | |||
| "lxvw4x 61, %9, %1 \n\t" | |||
| "lxvw4x 62, %10, %1 \n\t" | |||
| "lxvw4x 63, %11, %1 \n\t" | |||
| "addi %1, %1, 128 \n\t" | |||
| "stxvw4x 32, 0, %3 \n\t" | |||
| "stxvw4x 33, %5, %3 \n\t" | |||
| "stxvw4x 34, %6, %3 \n\t" | |||
| "stxvw4x 35, %7, %3 \n\t" | |||
| "stxvw4x 36, %8, %3 \n\t" | |||
| "stxvw4x 37, %9, %3 \n\t" | |||
| "stxvw4x 38, %10, %3 \n\t" | |||
| "stxvw4x 39, %11, %3 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "stxvw4x 40, 0, %3 \n\t" | |||
| "stxvw4x 41, %5, %3 \n\t" | |||
| "stxvw4x 42, %6, %3 \n\t" | |||
| "stxvw4x 43, %7, %3 \n\t" | |||
| "stxvw4x 44, %8, %3 \n\t" | |||
| "stxvw4x 45, %9, %3 \n\t" | |||
| "stxvw4x 46, %10, %3 \n\t" | |||
| "stxvw4x 47, %11, %3 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "stxvw4x 48, 0, %4 \n\t" | |||
| "stxvw4x 49, %5, %4 \n\t" | |||
| "stxvw4x 50, %6, %4 \n\t" | |||
| "stxvw4x 51, %7, %4 \n\t" | |||
| "stxvw4x 52, %8, %4 \n\t" | |||
| "stxvw4x 53, %9, %4 \n\t" | |||
| "stxvw4x 54, %10, %4 \n\t" | |||
| "stxvw4x 55, %11, %4 \n\t" | |||
| "addi %4, %4, 128 \n\t" | |||
| "stxvw4x 56, 0, %4 \n\t" | |||
| "stxvw4x 57, %5, %4 \n\t" | |||
| "stxvw4x 58, %6, %4 \n\t" | |||
| "stxvw4x 59, %7, %4 \n\t" | |||
| "stxvw4x 60, %8, %4 \n\t" | |||
| "stxvw4x 61, %9, %4 \n\t" | |||
| "stxvw4x 62, %10, %4 \n\t" | |||
| "stxvw4x 63, %11, %4 \n\t" | |||
| "addi %4, %4, 128 \n\t" | |||
| "addic. %0 , %0 , -32 \n\t" | |||
| "bgt 1b \n\t" | |||
| "2: \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (y1), // 1 | |||
| "r" (x1), // 2 | |||
| "r" (y2), // 3 | |||
| "r" (x2), // 4 | |||
| "r" (o16), // 5 | |||
| "r" (o32), // 6 | |||
| "r" (o48), // 7 | |||
| "r" (o64), // 8 | |||
| "r" (o80), // 9 | |||
| "r" (o96), // 10 | |||
| "r" (o112) // 11 | |||
| : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,399 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/04/04 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
/* Register and stack-frame names used by the kernel entry code further down in this file. */
/* ASSEMBLER is defined before common.h is pulled in; presumably it selects the assembler-side definitions there - TODO confirm. */
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
/* LOAD: word load (lwz) in 32-bit builds, doubleword load (ld) in 64-bit builds. Not referenced in the visible part of this file. */
| #ifndef __64BIT__ | |||
| #define LOAD lwz | |||
| #else | |||
| #define LOAD ld | |||
| #endif | |||
/* STACKSIZE: bytes subtracted from SP on entry and added back before blr. */
/* ALPHA_R_SP / ALPHA_I_SP: frame slots where the entry code stores f1 and f2 with stfs. */
| #ifdef __64BIT__ | |||
| #define STACKSIZE 400 | |||
| #define ALPHA_R_SP 304(SP) | |||
| #define ALPHA_I_SP 312(SP) | |||
| #else | |||
| #define STACKSIZE 256 | |||
| #define ALPHA_R_SP 224(SP) | |||
| #define ALPHA_I_SP 232(SP) | |||
/* FZERO is only referenced by a commented-out store in the entry code. */
| #define FZERO 240(SP) | |||
| #endif | |||
/* Dimension arguments; the entry code returns early when any of them is <= 0. */
| #define M r3 | |||
| #define N r4 | |||
| #define K r5 | |||
/* Registers for A, B, C, LDC and OFFSET depend on platform and word size. */
/* In several configurations the entry code reloads some of them (LDC, OFFSET, B, C) from above its own frame instead of taking them from a register. */
| #ifdef linux | |||
| #ifndef __64BIT__ | |||
| #define A r6 | |||
| #define B r7 | |||
| #define C r8 | |||
| #define LDC r9 | |||
| #define OFFSET r10 | |||
| #else | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r6 | |||
| #define OFFSET r7 | |||
| #endif | |||
| #endif | |||
| #if defined(_AIX) || defined(__APPLE__) | |||
| #if !defined(__64BIT__) && defined(DOUBLE) | |||
| #define A r10 | |||
| #define B r6 | |||
| #define C r7 | |||
| #define LDC r8 | |||
| #define OFFSET r9 | |||
| #else | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r6 | |||
| #define OFFSET r7 | |||
| #endif | |||
| #endif | |||
/* o0 is the literal 0 (not a register); used as the index operand of indexed vector loads. */
| #define o0 0 | |||
/* alpha_dr / alpha_di: alpha real / imaginary part loaded with lxsspx. */
/* alpha_sr / alpha_si: the same values loaded as 4-word vectors with lxvw4x. */
| #define alpha_dr vs28 | |||
| #define alpha_di vs29 | |||
| #define alpha_sr vs30 | |||
| #define alpha_si vs31 | |||
/* Working registers. o4, o8, o12, o16, o32 and o48 are loaded with the constants 4, 8, 12, 16, 32 and 48 by the entry code; PRE is loaded with 384. */
/* T1 and T2 are scratch registers in the entry code; KK receives -OFFSET for TRMM builds without LEFT. */
/* KKK, K1, L, I, J, AO, BO and CO are not referenced in the visible part of this file; presumably they are used by the included ctrmm_macros/ctrmm_logic files - TODO confirm. */
| #define o12 r12 | |||
| #define KKK r13 | |||
| #define K1 r14 | |||
| #define L r15 | |||
| #define o16 r16 | |||
| #define NOTUSED r17 | |||
| #define T2 r19 | |||
| #define KK r20 | |||
| #define o8 r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define o4 r27 | |||
| #define o32 r28 | |||
| #define o48 r29 | |||
| #define PRE r30 | |||
| #define T1 r31 | |||
/* Kernel entry/exit code: sets up the frame, saves registers, prepares alpha, includes the macro and loop files, and restores everything at L999. */
/* The whole body is omitted when NEEDPARAM is defined. */
| #ifndef NEEDPARAM | |||
| PROLOGUE | |||
| PROFCODE | |||
/* Reserve STACKSIZE bytes below the incoming stack pointer. */
| addi SP, SP, -STACKSIZE | |||
/* r0 = 0; its only visible consumer is the commented-out FZERO store below. */
| li r0, 0 | |||
/* Save f14-f31 at SP+0 .. SP+136 so they can be restored at L999. */
| stfd f14, 0(SP) | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
/* Save the general purpose registers: r31..r12 as doublewords (64-bit) or r31..r13 as words (32-bit), starting at SP+144. */
| #ifdef __64BIT__ | |||
| std r31, 144(SP) | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| std r13, 288(SP) | |||
| std r12, 296(SP) | |||
| #else | |||
| stw r31, 144(SP) | |||
| stw r30, 148(SP) | |||
| stw r29, 152(SP) | |||
| stw r28, 156(SP) | |||
| stw r27, 160(SP) | |||
| stw r26, 164(SP) | |||
| stw r25, 168(SP) | |||
| stw r24, 172(SP) | |||
| stw r23, 176(SP) | |||
| stw r22, 180(SP) | |||
| stw r21, 184(SP) | |||
| stw r20, 188(SP) | |||
| stw r19, 192(SP) | |||
| stw r18, 196(SP) | |||
| stw r17, 200(SP) | |||
| stw r16, 204(SP) | |||
| stw r15, 208(SP) | |||
| stw r14, 212(SP) | |||
| stw r13, 216(SP) | |||
| #endif | |||
/* Spill alpha (f1 = real part, f2 = imaginary part) to the frame as single-precision values. */
| stfs f1, ALPHA_R_SP | |||
| stfs f2, ALPHA_I_SP | |||
| // stw r0, FZERO | |||
/* Load the arguments that are read from above this frame (offset FRAMESLOT(n) + STACKSIZE from the new SP). */
/* FRAMESLOT is defined outside this file; presumably it maps to the caller's stack-passed argument n - TODO confirm. */
| #ifdef linux | |||
| #ifdef __64BIT__ | |||
| ld LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #endif | |||
| #endif | |||
| #if defined(_AIX) || defined(__APPLE__) | |||
| #ifdef __64BIT__ | |||
| ld LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #else | |||
| #ifdef DOUBLE | |||
| lwz B, FRAMESLOT(0) + STACKSIZE(SP) | |||
| lwz C, FRAMESLOT(1) + STACKSIZE(SP) | |||
| lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) | |||
| #else | |||
| lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #endif | |||
| #endif | |||
| #endif | |||
/* TRMM builds additionally fetch OFFSET; without LEFT, KK starts as -OFFSET. */
| #ifdef TRMMKERNEL | |||
| #if defined(linux) && defined(__64BIT__) | |||
| ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||
| #endif | |||
| #if defined(_AIX) || defined(__APPLE__) | |||
| #ifdef __64BIT__ | |||
| ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||
| #else | |||
| #ifdef DOUBLE | |||
| lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) | |||
| #else | |||
| lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||
| #endif | |||
| #endif | |||
| #endif | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| neg KK, OFFSET | |||
| #endif | |||
| #endif | |||
| #include "ctrmm_macros_8x4_power8.S" | |||
/* Nothing to do when M, N or K is <= 0 (cmpwi compares the low 32 bits). */
/* L999_H1 is not defined in the visible part of this file; presumably it comes from ctrmm_logic_8x4_power8.S - TODO confirm. */
| cmpwi cr0, M, 0 | |||
| ble L999_H1 | |||
| cmpwi cr0, N, 0 | |||
| ble L999_H1 | |||
| cmpwi cr0, K, 0 | |||
| ble L999_H1 | |||
/* Scale LDC by 2^ZBASE_SHIFT; presumably this converts it from elements to bytes - TODO confirm against common.h. */
| slwi LDC, LDC, ZBASE_SHIFT | |||
/* Constant registers: PRE = 384 and the byte offsets 4, 8, 12, 16, 32, 48. */
| li PRE, 384 | |||
| li o4 , 4 | |||
| li o8 , 8 | |||
| li o12 , 12 | |||
| li o16 , 16 | |||
| li o32 , 32 | |||
| li o48 , 48 | |||
/* T1 = address of the alpha real slot (same offsets as ALPHA_R_SP); the imaginary part sits 8 bytes above it. */
| #ifdef __64BIT__ | |||
| addi T1, SP, 304 | |||
| #else | |||
| addi T1, SP, 224 | |||
| #endif | |||
| lxsspx alpha_dr, 0, T1 | |||
| lxsspx alpha_di, o8, T1 | |||
/* Build two 16-byte vectors in scratch space at SP+360 and SP+376: three zero words followed by alpha (real, then imaginary) in the fourth word, and load them into alpha_sr / alpha_si. */
/* NOTE(review): SP+360..SP+391 is inside the 400-byte 64-bit frame but beyond the 256-byte 32-bit frame - confirm that 32-bit builds are not expected to use this kernel. */
| addi T1, SP, 360 | |||
| li T2, 0 | |||
| stw T2, 0(T1) | |||
| stw T2, 4(T1) | |||
| stw T2, 8(T1) | |||
| stxsspx alpha_dr, o12, T1 | |||
| lxvw4x alpha_sr, o0 , T1 | |||
| addi T1, T1, 16 | |||
| stw T2, 0(T1) | |||
| stw T2, 4(T1) | |||
| stw T2, 8(T1) | |||
| stxsspx alpha_di, o12, T1 | |||
| lxvw4x alpha_si, o0 , T1 | |||
| .align 5 | |||
| #include "ctrmm_logic_8x4_power8.S" | |||
/* Common exit: r3 = 0, then restore every register saved on entry, release the frame and return. */
| L999: | |||
| addi r3, 0, 0 | |||
| lfd f14, 0(SP) | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| #ifdef __64BIT__ | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| ld r13, 288(SP) | |||
| ld r12, 296(SP) | |||
| #else | |||
| lwz r31, 144(SP) | |||
| lwz r30, 148(SP) | |||
| lwz r29, 152(SP) | |||
| lwz r28, 156(SP) | |||
| lwz r27, 160(SP) | |||
| lwz r26, 164(SP) | |||
| lwz r25, 168(SP) | |||
| lwz r24, 172(SP) | |||
| lwz r23, 176(SP) | |||
| lwz r22, 180(SP) | |||
| lwz r21, 184(SP) | |||
| lwz r20, 188(SP) | |||
| lwz r19, 192(SP) | |||
| lwz r18, 196(SP) | |||
| lwz r17, 200(SP) | |||
| lwz r16, 204(SP) | |||
| lwz r15, 208(SP) | |||
| lwz r14, 212(SP) | |||
| lwz r13, 216(SP) | |||
| #endif | |||
| addi SP, SP, STACKSIZE | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| @@ -0,0 +1,144 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/28 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| #if defined(POWER8) | |||
| #include "dasum_microk_power8.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_16 | |||
| static void dasum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec) | |||
| { | |||
| BLASLONG i=0; | |||
| FLOAT *x = x1; | |||
| FLOAT temp0, temp1, temp2, temp3; | |||
| FLOAT temp4, temp5, temp6, temp7; | |||
| FLOAT sum0 = 0.0; | |||
| FLOAT sum1 = 0.0; | |||
| FLOAT sum2 = 0.0; | |||
| FLOAT sum3 = 0.0; | |||
| while ( i< n ) | |||
| { | |||
| temp0 = ABS(x[0]); | |||
| temp1 = ABS(x[1]); | |||
| temp2 = ABS(x[2]); | |||
| temp3 = ABS(x[3]); | |||
| temp4 = ABS(x[4]); | |||
| temp5 = ABS(x[5]); | |||
| temp6 = ABS(x[6]); | |||
| temp7 = ABS(x[7]); | |||
| sum0 += temp0; | |||
| sum1 += temp1; | |||
| sum2 += temp2; | |||
| sum3 += temp3; | |||
| sum0 += temp4; | |||
| sum1 += temp5; | |||
| sum2 += temp6; | |||
| sum3 += temp7; | |||
| x+=8; | |||
| i+=8; | |||
| } | |||
| svec[0] = sum0+sum1+sum2+sum3; | |||
| svec[1] = 0.0; | |||
| } | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0; | |||
| FLOAT sumf = 0.0; | |||
| FLOAT svec[2] __attribute__ ((aligned (16)));; | |||
| BLASLONG n1; | |||
| if (n <= 0 || inc_x <= 0) return(sumf); | |||
| if ( inc_x == 1 ) | |||
| { | |||
| n1 = n & -16; | |||
| if ( n1 > 0 ) | |||
| { | |||
| dasum_kernel_16(n1, x, svec); | |||
| sumf = svec[0] + svec[1]; | |||
| i=n1; | |||
| } | |||
| while(i < n) | |||
| { | |||
| sumf += ABS(x[i]); | |||
| i++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| n *= inc_x; | |||
| while(i < n) | |||
| { | |||
| sumf += ABS(x[i]); | |||
| i += inc_x; | |||
| } | |||
| } | |||
| return(sumf); | |||
| } | |||
| @@ -0,0 +1,177 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/28 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define HAVE_KERNEL_16 1 | |||
| static void dasum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline)); | |||
| static void dasum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) | |||
| { | |||
| BLASLONG i = n; | |||
| BLASLONG o16 = 16; | |||
| BLASLONG o32 = 32; | |||
| BLASLONG o48 = 48; | |||
| BLASLONG o64 = 64; | |||
| BLASLONG o80 = 80; | |||
| BLASLONG o96 = 96; | |||
| BLASLONG o112 = 112; | |||
| FLOAT *x1=x; | |||
| BLASLONG pre = 384; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "dcbt %2 , %4 \n\t" | |||
| "xxlxor 32,32,32 \n\t" | |||
| "xxlxor 33,33,33 \n\t" | |||
| "xxlxor 34,34,34 \n\t" | |||
| "xxlxor 35,35,35 \n\t" | |||
| "xxlxor 36,36,36 \n\t" | |||
| "xxlxor 37,37,37 \n\t" | |||
| "xxlxor 38,38,38 \n\t" | |||
| "xxlxor 39,39,39 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "lxvd2x 44, %8, %2 \n\t" | |||
| "lxvd2x 45, %9, %2 \n\t" | |||
| "lxvd2x 46, %10, %2 \n\t" | |||
| "lxvd2x 47, %11, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addic. %0 , %0 , -16 \n\t" | |||
| "ble 2f \n\t" | |||
| ".align 5 \n\t" | |||
| "1: \n\t" | |||
| "dcbt %2 , %4 \n\t" | |||
| "xvabsdp 48, 40 \n\t" | |||
| "xvabsdp 49, 41 \n\t" | |||
| "xvabsdp 50, 42 \n\t" | |||
| "xvabsdp 51, 43 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "xvabsdp 52, 44 \n\t" | |||
| "xvabsdp 53, 45 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "xvabsdp 54, 46 \n\t" | |||
| "xvabsdp 55, 47 \n\t" | |||
| "lxvd2x 44, %8, %2 \n\t" | |||
| "lxvd2x 45, %9, %2 \n\t" | |||
| "xvadddp 32, 32, 48 \n\t" | |||
| "xvadddp 33, 33, 49 \n\t" | |||
| "lxvd2x 46, %10, %2 \n\t" | |||
| "lxvd2x 47, %11, %2 \n\t" | |||
| "xvadddp 34, 34, 50 \n\t" | |||
| "xvadddp 35, 35, 51 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "xvadddp 36, 36, 52 \n\t" | |||
| "xvadddp 37, 37, 53 \n\t" | |||
| "addic. %0 , %0 , -16 \n\t" | |||
| "xvadddp 38, 38, 54 \n\t" | |||
| "xvadddp 39, 39, 55 \n\t" | |||
| "bgt 1b \n\t" | |||
| "2: \n\t" | |||
| "xvabsdp 48, 40 \n\t" | |||
| "xvabsdp 49, 41 \n\t" | |||
| "xvabsdp 50, 42 \n\t" | |||
| "xvabsdp 51, 43 \n\t" | |||
| "xvabsdp 52, 44 \n\t" | |||
| "xvabsdp 53, 45 \n\t" | |||
| "xvabsdp 54, 46 \n\t" | |||
| "xvabsdp 55, 47 \n\t" | |||
| "xvadddp 32, 32, 48 \n\t" | |||
| "xvadddp 33, 33, 49 \n\t" | |||
| "xvadddp 34, 34, 50 \n\t" | |||
| "xvadddp 35, 35, 51 \n\t" | |||
| "xvadddp 36, 36, 52 \n\t" | |||
| "xvadddp 37, 37, 53 \n\t" | |||
| "xvadddp 38, 38, 54 \n\t" | |||
| "xvadddp 39, 39, 55 \n\t" | |||
| "xvadddp 32, 32, 33 \n\t" | |||
| "xvadddp 34, 34, 35 \n\t" | |||
| "xvadddp 36, 36, 37 \n\t" | |||
| "xvadddp 38, 38, 39 \n\t" | |||
| "xvadddp 32, 32, 34 \n\t" | |||
| "xvadddp 36, 36, 38 \n\t" | |||
| "xvadddp 32, 32, 36 \n\t" | |||
| "stxvd2x 32, 0, %3 \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x1), // 2 | |||
| "r" (svec), // 3 | |||
| "r" (pre), // 4 | |||
| "r" (o16), // 5 | |||
| "r" (o32), // 6 | |||
| "r" (o48), // 7 | |||
| "r" (o64), // 8 | |||
| "r" (o80), // 9 | |||
| "r" (o96), // 10 | |||
| "r" (o112) // 11 | |||
| : "cr0", "%0", "%2", "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,136 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/22 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(POWER8) | |||
| #include "daxpy_microk_power8.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_8 | |||
| static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| FLOAT a = *alpha; | |||
| while(i < n) | |||
| { | |||
| y[i] += a * x[i]; | |||
| y[i+1] += a * x[i+1]; | |||
| y[i+2] += a * x[i+2]; | |||
| y[i+3] += a * x[i+3]; | |||
| y[i+4] += a * x[i+4]; | |||
| y[i+5] += a * x[i+5]; | |||
| y[i+6] += a * x[i+6]; | |||
| y[i+7] += a * x[i+7]; | |||
| i+=8 ; | |||
| } | |||
| } | |||
| #endif | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| FLOAT a2[4]; | |||
| a2[0]=da; | |||
| a2[1]=da; | |||
| a2[2]=da; | |||
| a2[3]=da; | |||
| if ( n <= 0 ) return(0); | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| BLASLONG n1 = n & -16; | |||
| if ( n1 ) | |||
| daxpy_kernel_8(n1, x, y , a2 ); | |||
| i = n1; | |||
| while(i < n) | |||
| { | |||
| y[i] += da * x[i] ; | |||
| i++ ; | |||
| } | |||
| return(0); | |||
| } | |||
| BLASLONG n1 = n & -4; | |||
| while(i < n1) | |||
| { | |||
| FLOAT m1 = da * x[ix] ; | |||
| FLOAT m2 = da * x[ix+inc_x] ; | |||
| FLOAT m3 = da * x[ix+2*inc_x] ; | |||
| FLOAT m4 = da * x[ix+3*inc_x] ; | |||
| y[iy] += m1 ; | |||
| y[iy+inc_y] += m2 ; | |||
| y[iy+2*inc_y] += m3 ; | |||
| y[iy+3*inc_y] += m4 ; | |||
| ix += inc_x*4 ; | |||
| iy += inc_y*4 ; | |||
| i+=4 ; | |||
| } | |||
| while(i < n) | |||
| { | |||
| y[iy] += da * x[ix] ; | |||
| ix += inc_x ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,201 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/22 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define HAVE_KERNEL_8 1 | |||
| static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG i = n; | |||
| BLASLONG o16 = 16; | |||
| BLASLONG o32 = 32; | |||
| BLASLONG o48 = 48; | |||
| FLOAT *x1=x; | |||
| FLOAT *y1=y; | |||
| FLOAT *y2=y+1; | |||
| BLASLONG pre = 384; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "lxsdx 33, %5, %4 \n\t" | |||
| "xxspltd 32, 33, 0 \n\t" | |||
| "addi %8, %8, -8 \n\t" | |||
| "dcbt %2, %9 \n\t" | |||
| "dcbt %3, %9 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "lxvd2x 48, 0, %3 \n\t" | |||
| "lxvd2x 49, %5, %3 \n\t" | |||
| "lxvd2x 50, %6, %3 \n\t" | |||
| "lxvd2x 51, %7, %3 \n\t" | |||
| "addi %2, %2, 64 \n\t" | |||
| "addi %3, %3, 64 \n\t" | |||
| "lxvd2x 44, 0, %2 \n\t" | |||
| "lxvd2x 45, %5, %2 \n\t" | |||
| "lxvd2x 46, %6, %2 \n\t" | |||
| "lxvd2x 47, %7, %2 \n\t" | |||
| "lxvd2x 52, 0, %3 \n\t" | |||
| "lxvd2x 53, %5, %3 \n\t" | |||
| "lxvd2x 54, %6, %3 \n\t" | |||
| "lxvd2x 55, %7, %3 \n\t" | |||
| "addi %2, %2, 64 \n\t" | |||
| "addi %3, %3, 64 \n\t" | |||
| "addic. %0 , %0 , -16 \n\t" | |||
| "ble 2f \n\t" | |||
| ".align 5 \n\t" | |||
| "1: \n\t" | |||
| "dcbt %2, %9 \n\t" | |||
| "dcbt %3, %9 \n\t" | |||
| "xvmaddadp 48, 40, 32 \n\t" | |||
| "xvmaddadp 49, 41, 32 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "stxvd2x 48, 0, %8 \n\t" | |||
| "stxvd2x 49, %5, %8 \n\t" | |||
| "xvmaddadp 50, 42, 32 \n\t" | |||
| "xvmaddadp 51, 43, 32 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "stxvd2x 50, %6, %8 \n\t" | |||
| "stxvd2x 51, %7, %8 \n\t" | |||
| "lxvd2x 48, 0, %3 \n\t" | |||
| "lxvd2x 49, %5, %3 \n\t" | |||
| "lxvd2x 50, %6, %3 \n\t" | |||
| "lxvd2x 51, %7, %3 \n\t" | |||
| "addi %2, %2, 64 \n\t" | |||
| "addi %8, %8, 64 \n\t" | |||
| "xvmaddadp 52, 44, 32 \n\t" | |||
| "addi %3, %3, 64 \n\t" | |||
| "xvmaddadp 53, 45, 32 \n\t" | |||
| "lxvd2x 44, 0, %2 \n\t" | |||
| "lxvd2x 45, %5, %2 \n\t" | |||
| "stxvd2x 52, 0, %8 \n\t" | |||
| "stxvd2x 53, %5, %8 \n\t" | |||
| "xvmaddadp 54, 46, 32 \n\t" | |||
| "xvmaddadp 55, 47, 32 \n\t" | |||
| "lxvd2x 46, %6, %2 \n\t" | |||
| "lxvd2x 47, %7, %2 \n\t" | |||
| "stxvd2x 54, %6, %8 \n\t" | |||
| "stxvd2x 55, %7, %8 \n\t" | |||
| "addi %2, %2, 64 \n\t" | |||
| "addi %8, %8, 64 \n\t" | |||
| "lxvd2x 52, 0, %3 \n\t" | |||
| "lxvd2x 53, %5, %3 \n\t" | |||
| "lxvd2x 54, %6, %3 \n\t" | |||
| "lxvd2x 55, %7, %3 \n\t" | |||
| "addi %3, %3, 64 \n\t" | |||
| "addic. %0 , %0 , -16 \n\t" | |||
| "bgt 1b \n\t" | |||
| "2: \n\t" | |||
| "xvmaddadp 48, 40, 32 \n\t" | |||
| "xvmaddadp 49, 41, 32 \n\t" | |||
| "xvmaddadp 50, 42, 32 \n\t" | |||
| "xvmaddadp 51, 43, 32 \n\t" | |||
| "xvmaddadp 52, 44, 32 \n\t" | |||
| "xvmaddadp 53, 45, 32 \n\t" | |||
| "xvmaddadp 54, 46, 32 \n\t" | |||
| "xvmaddadp 55, 47, 32 \n\t" | |||
| "stxvd2x 48, 0, %8 \n\t" | |||
| "stxvd2x 49, %5, %8 \n\t" | |||
| "stxvd2x 50, %6, %8 \n\t" | |||
| "stxvd2x 51, %7, %8 \n\t" | |||
| "addi %8, %8, 64 \n\t" | |||
| "stxvd2x 52, 0, %8 \n\t" | |||
| "stxvd2x 53, %5, %8 \n\t" | |||
| "stxvd2x 54, %6, %8 \n\t" | |||
| "stxvd2x 55, %7, %8 \n\t" | |||
| "addi %8, %8, 64 \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x1), // 2 | |||
| "r" (y1), // 3 | |||
| "r" (alpha), // 4 | |||
| "r" (o16), // 5 | |||
| "r" (o32), // 6 | |||
| "r" (o48), // 7 | |||
| "r" (y2), // 8 | |||
| "r" (pre) // 9 | |||
| : "cr0", "%0", "%2" , "%3", "%8", "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,131 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/25 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(POWER8) | |||
| #include "dcopy_microk_power8.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_32 | |||
| static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG i=0; | |||
| FLOAT f0, f1, f2, f3, f4, f5, f6, f7; | |||
| FLOAT *x1=x; | |||
| FLOAT *y1=y; | |||
| while ( i<n ) | |||
| { | |||
| f0 = x1[0]; | |||
| f1 = x1[1]; | |||
| f2 = x1[2]; | |||
| f3 = x1[3]; | |||
| f4 = x1[4]; | |||
| f5 = x1[5]; | |||
| f6 = x1[6]; | |||
| f7 = x1[7]; | |||
| y1[0] = f0; | |||
| y1[1] = f1; | |||
| y1[2] = f2; | |||
| y1[3] = f3; | |||
| y1[4] = f4; | |||
| y1[5] = f5; | |||
| y1[6] = f6; | |||
| y1[7] = f7; | |||
| x1 += 8; | |||
| y1 += 8; | |||
| i+=8; | |||
| } | |||
| return; | |||
| } | |||
| #endif | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| if ( n <= 0 ) return(0); | |||
| if ( (inc_x == 1) && (inc_y == 1 )) | |||
| { | |||
| BLASLONG n1 = n & -32; | |||
| if ( n1 > 0 ) | |||
| { | |||
| dcopy_kernel_32(n1, x, y); | |||
| i=n1; | |||
| } | |||
| while(i < n) | |||
| { | |||
| y[i] = x[i] ; | |||
| i++ ; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| while(i < n) | |||
| { | |||
| y[iy] = x[ix] ; | |||
| ix += inc_x ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,174 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/25 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define HAVE_KERNEL_32 1 | |||
| static void dcopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void dcopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG i = n; | |||
| BLASLONG o16 = 16; | |||
| BLASLONG o32 = 32; | |||
| BLASLONG o48 = 48; | |||
| BLASLONG o64 = 64; | |||
| BLASLONG o80 = 80; | |||
| BLASLONG o96 = 96; | |||
| BLASLONG o112 = 112; | |||
| FLOAT *x1=x; | |||
| FLOAT *y1=y; | |||
| BLASLONG pre = 384; | |||
| BLASLONG alpha=0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "lxvd2x 44, %8, %2 \n\t" | |||
| "lxvd2x 45, %9, %2 \n\t" | |||
| "lxvd2x 46, %10, %2 \n\t" | |||
| "lxvd2x 47, %11, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "lxvd2x 50, 0, %2 \n\t" | |||
| "lxvd2x 51, %5, %2 \n\t" | |||
| "lxvd2x 52, %6, %2 \n\t" | |||
| "lxvd2x 53, %7, %2 \n\t" | |||
| "lxvd2x 54, %8, %2 \n\t" | |||
| "lxvd2x 55, %9, %2 \n\t" | |||
| "lxvd2x 56, %10, %2 \n\t" | |||
| "lxvd2x 57, %11, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addic. %0 , %0 , -32 \n\t" | |||
| "ble 2f \n\t" | |||
| ".align 5 \n\t" | |||
| "1: \n\t" | |||
| "stxvd2x 40, 0, %1 \n\t" | |||
| "stxvd2x 41, %5, %1 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "stxvd2x 42, %6, %1 \n\t" | |||
| "stxvd2x 43, %7, %1 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "stxvd2x 44, %8, %1 \n\t" | |||
| "stxvd2x 45, %9, %1 \n\t" | |||
| "lxvd2x 44, %8, %2 \n\t" | |||
| "lxvd2x 45, %9, %2 \n\t" | |||
| "stxvd2x 46, %10, %1 \n\t" | |||
| "stxvd2x 47, %11, %1 \n\t" | |||
| "lxvd2x 46, %10, %2 \n\t" | |||
| "lxvd2x 47, %11, %2 \n\t" | |||
| "addi %1, %1, 128 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "stxvd2x 50, 0, %1 \n\t" | |||
| "stxvd2x 51, %5, %1 \n\t" | |||
| "lxvd2x 50, 0, %2 \n\t" | |||
| "lxvd2x 51, %5, %2 \n\t" | |||
| "stxvd2x 52, %6, %1 \n\t" | |||
| "stxvd2x 53, %7, %1 \n\t" | |||
| "lxvd2x 52, %6, %2 \n\t" | |||
| "lxvd2x 53, %7, %2 \n\t" | |||
| "stxvd2x 54, %8, %1 \n\t" | |||
| "stxvd2x 55, %9, %1 \n\t" | |||
| "lxvd2x 54, %8, %2 \n\t" | |||
| "lxvd2x 55, %9, %2 \n\t" | |||
| "stxvd2x 56, %10, %1 \n\t" | |||
| "stxvd2x 57, %11, %1 \n\t" | |||
| "lxvd2x 56, %10, %2 \n\t" | |||
| "lxvd2x 57, %11, %2 \n\t" | |||
| "addi %1, %1, 128 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addic. %0 , %0 , -32 \n\t" | |||
| "bgt 1b \n\t" | |||
| "2: \n\t" | |||
| "stxvd2x 40, 0, %1 \n\t" | |||
| "stxvd2x 41, %5, %1 \n\t" | |||
| "stxvd2x 42, %6, %1 \n\t" | |||
| "stxvd2x 43, %7, %1 \n\t" | |||
| "stxvd2x 44, %8, %1 \n\t" | |||
| "stxvd2x 45, %9, %1 \n\t" | |||
| "stxvd2x 46, %10, %1 \n\t" | |||
| "stxvd2x 47, %11, %1 \n\t" | |||
| "addi %1, %1, 128 \n\t" | |||
| "stxvd2x 50, 0, %1 \n\t" | |||
| "stxvd2x 51, %5, %1 \n\t" | |||
| "stxvd2x 52, %6, %1 \n\t" | |||
| "stxvd2x 53, %7, %1 \n\t" | |||
| "stxvd2x 54, %8, %1 \n\t" | |||
| "stxvd2x 55, %9, %1 \n\t" | |||
| "stxvd2x 56, %10, %1 \n\t" | |||
| "stxvd2x 57, %11, %1 \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (y1), // 1 | |||
| "r" (x1), // 2 | |||
| "r" (alpha), // 3 | |||
| "r" (pre), // 4 | |||
| "r" (o16), // 5 | |||
| "r" (o32), // 6 | |||
| "r" (o48), // 7 | |||
| "r" (o64), // 8 | |||
| "r" (o80), // 9 | |||
| "r" (o96), // 10 | |||
| "r" (o112) // 11 | |||
| : "cr0", "%0", "%2" , "%1", "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,139 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/20 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(POWER8) | |||
| #include "ddot_microk_power8.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_8 | |||
| static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) | |||
| { | |||
| BLASLONG register i = 0; | |||
| FLOAT dot = 0.0; | |||
| while(i < n) | |||
| { | |||
| dot += y[i] * x[i] | |||
| + y[i+1] * x[i+1] | |||
| + y[i+2] * x[i+2] | |||
| + y[i+3] * x[i+3] | |||
| + y[i+4] * x[i+4] | |||
| + y[i+5] * x[i+5] | |||
| + y[i+6] * x[i+6] | |||
| + y[i+7] * x[i+7] ; | |||
| i+=8 ; | |||
| } | |||
| *d += dot; | |||
| } | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| FLOAT dot = 0.0 ; | |||
| if ( n <= 0 ) return(dot); | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| BLASLONG n1 = n & -16; | |||
| if ( n1 ) | |||
| ddot_kernel_8(n1, x, y , &dot ); | |||
| i = n1; | |||
| while(i < n) | |||
| { | |||
| dot += y[i] * x[i] ; | |||
| i++ ; | |||
| } | |||
| return(dot); | |||
| } | |||
| FLOAT temp1 = 0.0; | |||
| FLOAT temp2 = 0.0; | |||
| BLASLONG n1 = n & -4; | |||
| while(i < n1) | |||
| { | |||
| FLOAT m1 = y[iy] * x[ix] ; | |||
| FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ; | |||
| FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ; | |||
| FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ; | |||
| ix += inc_x*4 ; | |||
| iy += inc_y*4 ; | |||
| temp1 += m1+m3; | |||
| temp2 += m2+m4; | |||
| i+=4 ; | |||
| } | |||
| while(i < n) | |||
| { | |||
| temp1 += y[iy] * x[ix] ; | |||
| ix += inc_x ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| dot = temp1 + temp2; | |||
| return(dot); | |||
| } | |||
| @@ -0,0 +1,178 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/20 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define HAVE_KERNEL_8 1 | |||
| static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); | |||
| static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| { | |||
| BLASLONG i = n; | |||
| BLASLONG o16 = 16; | |||
| BLASLONG o32 = 32; | |||
| BLASLONG o48 = 48; | |||
| BLASLONG o64 = 64; | |||
| BLASLONG o80 = 80; | |||
| BLASLONG o96 = 96; | |||
| BLASLONG o112 = 112; | |||
| FLOAT *x1=x; | |||
| FLOAT *y1=y; | |||
| BLASLONG pre = 384; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "xxlxor 32,32,32 \n\t" | |||
| "xxlxor 33,33,33 \n\t" | |||
| "xxlxor 34,34,34 \n\t" | |||
| "xxlxor 35,35,35 \n\t" | |||
| "xxlxor 36,36,36 \n\t" | |||
| "xxlxor 37,37,37 \n\t" | |||
| "xxlxor 38,38,38 \n\t" | |||
| "xxlxor 39,39,39 \n\t" | |||
| "dcbt %2, %12 \n\t" | |||
| "dcbt %3, %12 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 48, 0, %3 \n\t" | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "lxvd2x 49, %5, %3 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 50, %6, %3 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "lxvd2x 51, %7, %3 \n\t" | |||
| "lxvd2x 44, %8, %2 \n\t" | |||
| "lxvd2x 52, %8, %3 \n\t" | |||
| "lxvd2x 45, %9, %2 \n\t" | |||
| "lxvd2x 53, %9, %3 \n\t" | |||
| "lxvd2x 46, %10, %2 \n\t" | |||
| "lxvd2x 54, %10, %3 \n\t" | |||
| "lxvd2x 47, %11, %2 \n\t" | |||
| "lxvd2x 55, %11, %3 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "addic. %0 , %0 , -16 \n\t" | |||
| "ble 2f \n\t" | |||
| ".align 5 \n\t" | |||
| "1: \n\t" | |||
| "dcbt %2, %12 \n\t" | |||
| "dcbt %3, %12 \n\t" | |||
| "xvmaddadp 32, 40, 48 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 48, 0, %3 \n\t" | |||
| "xvmaddadp 33, 41, 49 \n\t" | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "lxvd2x 49, %5, %3 \n\t" | |||
| "xvmaddadp 34, 42, 50 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 50, %6, %3 \n\t" | |||
| "xvmaddadp 35, 43, 51 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "lxvd2x 51, %7, %3 \n\t" | |||
| "xvmaddadp 36, 44, 52 \n\t" | |||
| "lxvd2x 44, %8, %2 \n\t" | |||
| "lxvd2x 52, %8, %3 \n\t" | |||
| "xvmaddadp 37, 45, 53 \n\t" | |||
| "lxvd2x 45, %9, %2 \n\t" | |||
| "lxvd2x 53, %9, %3 \n\t" | |||
| "xvmaddadp 38, 46, 54 \n\t" | |||
| "lxvd2x 46, %10, %2 \n\t" | |||
| "lxvd2x 54, %10, %3 \n\t" | |||
| "xvmaddadp 39, 47, 55 \n\t" | |||
| "lxvd2x 47, %11, %2 \n\t" | |||
| "lxvd2x 55, %11, %3 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "addic. %0 , %0 , -16 \n\t" | |||
| "bgt 1b \n\t" | |||
| "2: \n\t" | |||
| "xvmaddadp 32, 40, 48 \n\t" | |||
| "xvmaddadp 33, 41, 49 \n\t" | |||
| "xvmaddadp 34, 42, 50 \n\t" | |||
| "xvmaddadp 35, 43, 51 \n\t" | |||
| "xvmaddadp 36, 44, 52 \n\t" | |||
| "xvmaddadp 37, 45, 53 \n\t" | |||
| "xvmaddadp 38, 46, 54 \n\t" | |||
| "xvmaddadp 39, 47, 55 \n\t" | |||
| "xvadddp 32, 32, 33 \n\t" | |||
| "xvadddp 34, 34, 35 \n\t" | |||
| "xvadddp 36, 36, 37 \n\t" | |||
| "xvadddp 38, 38, 39 \n\t" | |||
| "xvadddp 32, 32, 34 \n\t" | |||
| "xvadddp 36, 36, 38 \n\t" | |||
| "xvadddp 32, 32, 36 \n\t" | |||
| "xxswapd 33, 32 \n\t" | |||
| "xsadddp 32, 32, 33 \n\t" | |||
| "stxsdx 32, 0, %4 \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x1), // 2 | |||
| "r" (y1), // 3 | |||
| "r" (dot), // 4 | |||
| "r" (o16), // 5 | |||
| "r" (o32), // 6 | |||
| "r" (o48), // 7 | |||
| "r" (o64), // 8 | |||
| "r" (o80), // 9 | |||
| "r" (o96), // 10 | |||
| "r" (o112), // 11 | |||
| "r" (pre) // 12 | |||
| : "cr0", "%0", "%2" , "%3", "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,426 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/30 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(POWER8) | |||
| #include "dgemv_n_microk_power8.c" | |||
| #endif | |||
| #define NBMAX 4096 | |||
| #ifndef HAVE_KERNEL_4x4 | |||
| static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a0,*a1,*a2,*a3; | |||
| FLOAT x[4] __attribute__ ((aligned (16)));; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| a2 = ap[2]; | |||
| a3 = ap[3]; | |||
| for ( i=0; i<4; i++) | |||
| x[i] = xo[i] * *alpha; | |||
| for ( i=0; i< n; i+=4 ) | |||
| { | |||
| y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; | |||
| y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; | |||
| y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; | |||
| y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; | |||
| } | |||
| } | |||
| #endif | |||
| #ifndef HAVE_KERNEL_4x2 | |||
| static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a0,*a1; | |||
| FLOAT x[4] __attribute__ ((aligned (16)));; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| for ( i=0; i<2; i++) | |||
| x[i] = xo[i] * *alpha; | |||
| for ( i=0; i< n; i+=4 ) | |||
| { | |||
| y[i] += a0[i]*x[0] + a1[i]*x[1]; | |||
| y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1]; | |||
| y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1]; | |||
| y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1]; | |||
| } | |||
| } | |||
| #endif | |||
| #ifndef HAVE_KERNEL_4x1 | |||
| static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a0; | |||
| FLOAT x[4] __attribute__ ((aligned (16)));; | |||
| a0 = ap; | |||
| for ( i=0; i<1; i++) | |||
| x[i] = xo[i] * *alpha; | |||
| for ( i=0; i< n; i+=4 ) | |||
| { | |||
| y[i] += a0[i]*x[0]; | |||
| y[i+1] += a0[i+1]*x[0]; | |||
| y[i+2] += a0[i+2]*x[0]; | |||
| y[i+3] += a0[i+3]*x[0]; | |||
| } | |||
| } | |||
| #endif | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) | |||
| { | |||
| BLASLONG i; | |||
| if ( inc_dest != 1 ) | |||
| { | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| *dest += *src; | |||
| src++; | |||
| dest += inc_dest; | |||
| } | |||
| return; | |||
| } | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| BLASLONG lda4 = lda << 2; | |||
| FLOAT *ap[4] __attribute__ ((aligned (16)));; | |||
| FLOAT xbuffer[8] __attribute__ ((aligned (16)));; | |||
| FLOAT alpha_r[4] __attribute__ ((aligned (16)));; | |||
| FLOAT *ybuffer; | |||
| alpha_r[0] = alpha; | |||
| if ( m < 1 ) return(0); | |||
| if ( n < 1 ) return(0); | |||
| ybuffer = buffer; | |||
| n1 = n >> 2 ; | |||
| n2 = n & 3 ; | |||
| m3 = m & 3 ; | |||
| m1 = m & -4 ; | |||
| m2 = (m & (NBMAX-1)) - m3 ; | |||
| y_ptr = y; | |||
| BLASLONG NB = NBMAX; | |||
| while ( NB == NBMAX ) | |||
| { | |||
| m1 -= NB; | |||
| if ( m1 < 0) | |||
| { | |||
| if ( m2 == 0 ) break; | |||
| NB = m2; | |||
| } | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| ap[0] = a_ptr; | |||
| ap[1] = a_ptr + lda; | |||
| ap[2] = ap[1] + lda; | |||
| ap[3] = ap[2] + lda; | |||
| if ( inc_y != 1 ) | |||
| memset(ybuffer,0,NB*8); | |||
| else | |||
| ybuffer = y_ptr; | |||
| if ( inc_x == 1 ) | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,alpha_r); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| x_ptr += 4; | |||
| } | |||
| if ( n2 & 2 ) | |||
| { | |||
| dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,alpha_r); | |||
| a_ptr += lda*2; | |||
| x_ptr += 2; | |||
| } | |||
| if ( n2 & 1 ) | |||
| { | |||
| dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,alpha_r); | |||
| a_ptr += lda; | |||
| x_ptr += 1; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[1] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[2] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[3] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha_r); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| } | |||
| for( i = 0; i < n2 ; i++) | |||
| { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha_r); | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| a += NB; | |||
| if ( inc_y != 1 ) | |||
| { | |||
| add_y(NB,ybuffer,y_ptr,inc_y); | |||
| y_ptr += NB * inc_y; | |||
| } | |||
| else | |||
| y_ptr += NB ; | |||
| } | |||
| if ( m3 == 0 ) return(0); | |||
| if ( m3 == 3 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| FLOAT temp2 = 0.0; | |||
| if ( lda == 3 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < ( n & -4 ); i+=4 ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; | |||
| temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||
| temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; | |||
| temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; | |||
| temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; | |||
| a_ptr += 12; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += 3; | |||
| x_ptr ++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp2; | |||
| return(0); | |||
| } | |||
| if ( m3 == 2 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| if ( lda == 2 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < (n & -4) ; i+=4 ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; | |||
| temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; | |||
| a_ptr += 8; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += 2; | |||
| x_ptr ++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| return(0); | |||
| } | |||
| if ( m3 == 1 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp = 0.0; | |||
| if ( lda == 1 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < (n & -4); i+=4 ) | |||
| { | |||
| temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp += a_ptr[i] * x_ptr[i]; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp += a_ptr[0] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp; | |||
| return(0); | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,301 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/30 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); /* forward declaration that attaches the noinline attribute */ | |||
| static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) /* VSX inline-asm kernel: y[j] += x[0]*a0[j] + x[1]*a1[j] + x[2]*a2[j] + x[3]*a3[j], where x[k] = xo[k] * *alpha and a0..a3 = ap[0..3]; four consecutive j are handled per step, and the step at label 2 runs unconditionally, so at least four elements are always touched */ | |||
| { | |||
| BLASLONG i=n; /* element countdown; the asm subtracts 4 per step */ | |||
| BLASLONG o8 = 8; /* byte offset of x[1] */ | |||
| BLASLONG o16 = 16; /* byte offset of x[2]; also the offset of the second 2-double vector within a 4-element group */ | |||
| BLASLONG o24 = 24; /* byte offset of x[3] */ | |||
| BLASLONG pre = 384; /* offset register paired with each base pointer in the dcbt prefetch hints */ | |||
| FLOAT *a0,*a1,*a2,*a3; /* working pointers into the four vectors ap[0..3] */ | |||
| FLOAT *y1=y+1; /* one element past y; the asm's "addi %2, %2, -8" steps it back to y */ | |||
| FLOAT x[4] __attribute__ ((aligned (16)));; /* receives xo[0..3] pre-multiplied by *alpha */ | |||
| a0 = ap[0]+1; /* the +1 on a0..a3 is undone by the "addi ..., -8" instructions at the top of the asm */ | |||
| a1 = ap[1]+1; | |||
| a2 = ap[2]+1; | |||
| a3 = ap[3]+1; | |||
| x[0]=xo[0] * *alpha; | |||
| x[1]=xo[1] * *alpha; | |||
| x[2]=xo[2] * *alpha; | |||
| x[3]=xo[3] * *alpha; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "lxvdsx 32, 0 , %1 \n\t" // x0 | |||
| "lxvdsx 33,%3 , %1 \n\t" // x1 | |||
| "lxvdsx 34,%4 , %1 \n\t" // x2 | |||
| "lxvdsx 35,%5 , %1 \n\t" // x3 | |||
| "addi %2 , %2 , -8 \n\t" /* y1 back to y */ | |||
| "addi %6 , %6 , -8 \n\t" /* a0 back to ap[0] */ | |||
| "addi %7 , %7 , -8 \n\t" /* a1 back to ap[1] */ | |||
| "addi %8 , %8 , -8 \n\t" /* a2 back to ap[2] */ | |||
| "addi %9 , %9 , -8 \n\t" /* a3 back to ap[3] */ | |||
| "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1] | |||
| "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3] | |||
| "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1] | |||
| "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3] | |||
| "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1] | |||
| "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3] | |||
| "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1] | |||
| "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3] | |||
| "addi %6, %6, 32 \n\t" /* each a pointer advances 32 bytes (4 doubles) past the group just loaded */ | |||
| "addi %7, %7, 32 \n\t" | |||
| "addi %8, %8, 32 \n\t" | |||
| "addi %9, %9, 32 \n\t" | |||
| "addic. %0 , %0 , -4 \n\t" /* i -= 4 and record the result's sign in cr0 */ | |||
| "ble 2f \n\t" /* only the preloaded group remains: finish it in the tail */ | |||
| ".align 5 \n\t" | |||
| "1: \n\t" /* main loop: four copies of the same 4-element step; only the first copy issues dcbt */ | |||
| "dcbt %2, %10 \n\t" /* prefetch hint for y + pre */ | |||
| "lxvd2x 40, 0, %2 \n\t" // y0, y1 | |||
| "lxvd2x 41,%4, %2 \n\t" // y2, y3 | |||
| "dcbt %6, %10 \n\t" /* prefetch hints for a0..a3 + pre */ | |||
| "dcbt %7, %10 \n\t" | |||
| "dcbt %8, %10 \n\t" | |||
| "dcbt %9, %10 \n\t" | |||
| "xvmaddadp 40, 48, 32 \n\t" /* y += a0 * x0; each a register is reloaded for the next group right after its last use */ | |||
| "xvmaddadp 41, 49, 32 \n\t" | |||
| "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1] | |||
| "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3] | |||
| "xvmaddadp 40, 50, 33 \n\t" /* y += a1 * x1 */ | |||
| "addi %6, %6, 32 \n\t" | |||
| "xvmaddadp 41, 51, 33 \n\t" | |||
| "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1] | |||
| "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3] | |||
| "xvmaddadp 40, 52, 34 \n\t" /* y += a2 * x2 */ | |||
| "addi %7, %7, 32 \n\t" | |||
| "xvmaddadp 41, 53, 34 \n\t" | |||
| "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1] | |||
| "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3] | |||
| "xvmaddadp 40, 54, 35 \n\t" /* y += a3 * x3 */ | |||
| "addi %8, %8, 32 \n\t" | |||
| "xvmaddadp 41, 55, 35 \n\t" | |||
| "stxvd2x 40, 0, %2 \n\t" // y0, y1 | |||
| "stxvd2x 41,%4, %2 \n\t" // y2, y3 | |||
| "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1] | |||
| "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3] | |||
| "addi %9, %9, 32 \n\t" | |||
| "addi %2, %2, 32 \n\t" /* y pointer moves on to the next 4 elements */ | |||
| "addic. %0 , %0 , -4 \n\t" | |||
| "ble 2f \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" // y0, y1 (second copy of the step) | |||
| "lxvd2x 41,%4, %2 \n\t" // y2, y3 | |||
| "xvmaddadp 40, 48, 32 \n\t" | |||
| "xvmaddadp 41, 49, 32 \n\t" | |||
| "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1] | |||
| "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3] | |||
| "xvmaddadp 40, 50, 33 \n\t" | |||
| "addi %6, %6, 32 \n\t" | |||
| "xvmaddadp 41, 51, 33 \n\t" | |||
| "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1] | |||
| "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3] | |||
| "xvmaddadp 40, 52, 34 \n\t" | |||
| "addi %7, %7, 32 \n\t" | |||
| "xvmaddadp 41, 53, 34 \n\t" | |||
| "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1] | |||
| "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3] | |||
| "xvmaddadp 40, 54, 35 \n\t" | |||
| "addi %8, %8, 32 \n\t" | |||
| "xvmaddadp 41, 55, 35 \n\t" | |||
| "stxvd2x 40, 0, %2 \n\t" // y0, y1 | |||
| "stxvd2x 41,%4, %2 \n\t" // y2, y3 | |||
| "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1] | |||
| "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3] | |||
| "addi %9, %9, 32 \n\t" | |||
| "addi %2, %2, 32 \n\t" | |||
| "addic. %0 , %0 , -4 \n\t" | |||
| "ble 2f \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" // y0, y1 (third copy of the step) | |||
| "lxvd2x 41,%4, %2 \n\t" // y2, y3 | |||
| "xvmaddadp 40, 48, 32 \n\t" | |||
| "xvmaddadp 41, 49, 32 \n\t" | |||
| "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1] | |||
| "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3] | |||
| "xvmaddadp 40, 50, 33 \n\t" | |||
| "addi %6, %6, 32 \n\t" | |||
| "xvmaddadp 41, 51, 33 \n\t" | |||
| "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1] | |||
| "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3] | |||
| "xvmaddadp 40, 52, 34 \n\t" | |||
| "addi %7, %7, 32 \n\t" | |||
| "xvmaddadp 41, 53, 34 \n\t" | |||
| "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1] | |||
| "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3] | |||
| "xvmaddadp 40, 54, 35 \n\t" | |||
| "addi %8, %8, 32 \n\t" | |||
| "xvmaddadp 41, 55, 35 \n\t" | |||
| "stxvd2x 40, 0, %2 \n\t" // y0, y1 | |||
| "stxvd2x 41,%4, %2 \n\t" // y2, y3 | |||
| "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1] | |||
| "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3] | |||
| "addi %9, %9, 32 \n\t" | |||
| "addi %2, %2, 32 \n\t" | |||
| "addic. %0 , %0 , -4 \n\t" | |||
| "ble 2f \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" // y0, y1 (fourth copy of the step) | |||
| "lxvd2x 41,%4, %2 \n\t" // y2, y3 | |||
| "xvmaddadp 40, 48, 32 \n\t" | |||
| "xvmaddadp 41, 49, 32 \n\t" | |||
| "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1] | |||
| "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3] | |||
| "xvmaddadp 40, 50, 33 \n\t" | |||
| "addi %6, %6, 32 \n\t" | |||
| "xvmaddadp 41, 51, 33 \n\t" | |||
| "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1] | |||
| "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3] | |||
| "xvmaddadp 40, 52, 34 \n\t" | |||
| "addi %7, %7, 32 \n\t" | |||
| "xvmaddadp 41, 53, 34 \n\t" | |||
| "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1] | |||
| "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3] | |||
| "xvmaddadp 40, 54, 35 \n\t" | |||
| "addi %8, %8, 32 \n\t" | |||
| "xvmaddadp 41, 55, 35 \n\t" | |||
| "stxvd2x 40, 0, %2 \n\t" // y0, y1 | |||
| "stxvd2x 41,%4, %2 \n\t" // y2, y3 | |||
| "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1] | |||
| "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3] | |||
| "addi %9, %9, 32 \n\t" | |||
| "addi %2, %2, 32 \n\t" | |||
| "addic. %0 , %0 , -4 \n\t" | |||
| "bgt 1b \n\t" /* more than one group still pending: run the unrolled loop again */ | |||
| "2: \n\t" /* tail: consume the group of a values already sitting in vs48..vs55, with no further loads of a */ | |||
| "lxvd2x 40, 0, %2 \n\t" // y0, y1 | |||
| "lxvd2x 41,%4, %2 \n\t" // y2, y3 | |||
| "xvmaddadp 40, 48, 32 \n\t" | |||
| "xvmaddadp 41, 49, 32 \n\t" | |||
| "xvmaddadp 40, 50, 33 \n\t" | |||
| "xvmaddadp 41, 51, 33 \n\t" | |||
| "xvmaddadp 40, 52, 34 \n\t" | |||
| "xvmaddadp 41, 53, 34 \n\t" | |||
| "xvmaddadp 40, 54, 35 \n\t" | |||
| "xvmaddadp 41, 55, 35 \n\t" | |||
| "stxvd2x 40, 0, %2 \n\t" // y0, y1 | |||
| "stxvd2x 41,%4, %2 \n\t" // y2, y3 | |||
| : /* no output operands are declared */ | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (x), // 1 | |||
| "r" (y1), // 2 | |||
| "r" (o8), // 3 | |||
| "r" (o16), // 4 | |||
| "r" (o24), // 5 | |||
| "r" (a0), // 6 | |||
| "r" (a1), // 7 | |||
| "r" (a2), // 8 | |||
| "r" (a3), // 9 | |||
| "r" (pre) // 10 | |||
| : "cr0", "%0", "%2" , "%6", "%7", "%8", "%9", "memory" /* NOTE(review): operands 0, 2 and 6-9 are modified by the asm but declared as plain inputs and then named as "%N" strings here; GCC documents read-write ("+") output operands for this, and "r" may select r0, which addi and the RA slot of the indexed loads treat as literal 0 ("b" excludes it) -- confirm against the compiler documentation */ | |||
| ); | |||
| } | |||
| @@ -0,0 +1,167 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/27 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #pragma GCC optimize "O1" | |||
| #if defined(POWER8) | |||
| #include "drot_microk_power8.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_16 | |||
| static void drot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) | |||
| { | |||
|     /* Portable C version of the rotation kernel, compiled only when no HAVE_KERNEL_16 micro-kernel was included. | |||
|      * For each element: x' = c*x + s*y and y' = c*y - s*x, using *c and *s. | |||
|      * Elements are handled in groups of four; a group is read completely before any of it is written back. | |||
|      * The loop steps by four while the position is below n, so the last group is always a full one. */ | |||
|     const FLOAT cosv = *c; | |||
|     const FLOAT sinv = *s; | |||
|     FLOAT xin[4], yin[4]; | |||
|     FLOAT xout[4], yout[4]; | |||
|     BLASLONG done, lane; | |||
|     for ( done = 0; done < n; done += 4 ) | |||
|     { | |||
|         FLOAT *xp = x + done; | |||
|         FLOAT *yp = y + done; | |||
|         /* gather the four input pairs */ | |||
|         for ( lane = 0; lane < 4; lane++ ) | |||
|         { | |||
|             xin[lane] = xp[lane]; | |||
|             yin[lane] = yp[lane]; | |||
|         } | |||
|         /* rotate each pair */ | |||
|         for ( lane = 0; lane < 4; lane++ ) | |||
|         { | |||
|             xout[lane] = cosv*xin[lane] + sinv*yin[lane]; | |||
|             yout[lane] = cosv*yin[lane] - sinv*xin[lane]; | |||
|         } | |||
|         /* write back, x then y for each lane */ | |||
|         for ( lane = 0; lane < 4; lane++ ) | |||
|         { | |||
|             xp[lane] = xout[lane]; | |||
|             yp[lane] = yout[lane]; | |||
|         } | |||
|     } | |||
| } | |||
| #endif | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||
| { | |||
|     /* Plane rotation driver: every pair becomes (c*x + s*y, c*y - s*x). | |||
|      * n is the element count, inc_x / inc_y the strides of x and y. | |||
|      * Always returns 0; does nothing when n <= 0. */ | |||
|     BLASLONG k = 0; | |||
|     FLOAT rotated; | |||
|     if ( n <= 0 ) return(0); | |||
|     if ( (inc_x != 1) || (inc_y != 1) ) | |||
|     { | |||
|         /* general strides: plain scalar loop */ | |||
|         BLASLONG px = 0; | |||
|         BLASLONG py = 0; | |||
|         for ( ; k < n; k++ ) | |||
|         { | |||
|             rotated = c*x[px] + s*y[py] ; | |||
|             y[py] = c*y[py] - s*x[px] ; | |||
|             x[px] = rotated ; | |||
|             px += inc_x ; | |||
|             py += inc_y ; | |||
|         } | |||
|         return(0); | |||
|     } | |||
|     { | |||
|         /* unit strides: the largest multiple of 16 goes to the kernel, the rest is done below */ | |||
|         BLASLONG blocked = n & -16; | |||
|         if ( blocked > 0 ) | |||
|         { | |||
|             /* the kernel takes c and s through pointers to 16-byte aligned 4-element arrays */ | |||
|             FLOAT cvec[4] __attribute__ ((aligned (16))); | |||
|             FLOAT svec[4] __attribute__ ((aligned (16))); | |||
|             BLASLONG lane; | |||
|             for ( lane = 0; lane < 4; lane++ ) | |||
|             { | |||
|                 cvec[lane] = c; | |||
|                 svec[lane] = s; | |||
|             } | |||
|             drot_kernel_16(blocked, x, y, cvec, svec); | |||
|             k = blocked; | |||
|         } | |||
|         for ( ; k < n; k++ ) | |||
|         { | |||
|             rotated = c*x[k] + s*y[k] ; | |||
|             y[k] = c*y[k] - s*x[k] ; | |||
|             x[k] = rotated ; | |||
|         } | |||
|     } | |||
|     return(0); | |||
| } | |||
| @@ -0,0 +1,211 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/27 Werner Saar (wernsaar@googlemail.com) | |||
| * | |||
| * I don't use fused multiply-add ( precision problems with lapack ) | |||
| * | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define HAVE_KERNEL_16 1 | |||
| static void drot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) __attribute__ ((noinline)); /* forward declaration that attaches the noinline attribute */ | |||
| static void drot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) /* VSX inline-asm rotation: x' = c*x + s*y, y' = c*y - s*x, computed with separate multiply and add/subtract instructions; 8 elements of x and y are handled per step and the step at label 2 runs unconditionally */ | |||
| { | |||
| BLASLONG i = n; /* element countdown; the asm subtracts 8 per step */ | |||
| BLASLONG o16 = 16; /* byte offsets of the 2nd, 3rd and 4th 2-double vectors in a 64-byte group */ | |||
| BLASLONG o32 = 32; | |||
| BLASLONG o48 = 48; | |||
| FLOAT *x1=x; /* load pointers (operands 1 and 2) */ | |||
| FLOAT *y1=y; | |||
| FLOAT *x2=x+1; /* store pointers (operands 8 and 9); the asm's "addi ..., -8" steps them back to x and y */ | |||
| FLOAT *y2=y+1; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "lxsdx 36 , %5, %3 \n\t" // load c (index register is o16, so this reads the double at byte offset 16 from c) | |||
| "lxsdx 37 , %5, %4 \n\t" // load s (likewise at byte offset 16 from s) | |||
| "addi %8 , %8, -8 \n\t" /* x2 back to x */ | |||
| "addi %9 , %9, -8 \n\t" /* y2 back to y */ | |||
| "xxspltd 36 , 36, 0 \n\t" /* replicate c across both lanes of vs36 */ | |||
| "xxspltd 37 , 37, 0 \n\t" /* replicate s across both lanes of vs37 */ | |||
| "lxvd2x 32, 0, %1 \n\t" // load x | |||
| "lxvd2x 33, %5, %1 \n\t" | |||
| "lxvd2x 34, %6, %1 \n\t" | |||
| "lxvd2x 35, %7, %1 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" // load y | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "addi %1, %1, 64 \n\t" /* load pointers advance past the 8 doubles just read */ | |||
| "addi %2, %2, 64 \n\t" | |||
| "addic. %0 , %0 , -8 \n\t" /* i -= 8 and record the result's sign in cr0 */ | |||
| "ble 2f \n\t" /* only the preloaded group remains: finish it in the tail */ | |||
| ".align 5 \n\t" | |||
| "1: \n\t" /* main loop: multiplies for the current group are interleaved with loads of the next group */ | |||
| "xvmuldp 48, 32, 36 \n\t" // c * x | |||
| "xvmuldp 49, 33, 36 \n\t" | |||
| "xvmuldp 50, 34, 36 \n\t" | |||
| "xvmuldp 51, 35, 36 \n\t" | |||
| "xvmuldp 56, 40, 36 \n\t" // c * y | |||
| "xvmuldp 57, 41, 36 \n\t" | |||
| "xvmuldp 58, 42, 36 \n\t" | |||
| "xvmuldp 59, 43, 36 \n\t" | |||
| "xvmuldp 52, 32, 37 \n\t" // s * x | |||
| "xvmuldp 53, 33, 37 \n\t" | |||
| "lxvd2x 32, 0, %1 \n\t" // load x | |||
| "lxvd2x 33, %5, %1 \n\t" | |||
| "xvmuldp 54, 34, 37 \n\t" | |||
| "xvmuldp 55, 35, 37 \n\t" | |||
| "lxvd2x 34, %6, %1 \n\t" | |||
| "lxvd2x 35, %7, %1 \n\t" | |||
| "xvmuldp 60, 40, 37 \n\t" // s * y | |||
| "xvmuldp 61, 41, 37 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" // load y | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "xvmuldp 62, 42, 37 \n\t" | |||
| "xvmuldp 63, 43, 37 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "xvadddp 48, 48 , 60 \n\t" // c * x + s * y | |||
| "xvadddp 49, 49 , 61 \n\t" // c * x + s * y | |||
| "addi %1, %1, 64 \n\t" | |||
| "addi %2, %2, 64 \n\t" | |||
| "xvadddp 50, 50 , 62 \n\t" // c * x + s * y | |||
| "xvadddp 51, 51 , 63 \n\t" // c * x + s * y | |||
| "xvsubdp 56, 56 , 52 \n\t" // c * y - s * x | |||
| "xvsubdp 57, 57 , 53 \n\t" // c * y - s * x | |||
| "xvsubdp 58, 58 , 54 \n\t" // c * y - s * x | |||
| "xvsubdp 59, 59 , 55 \n\t" // c * y - s * x | |||
| "stxvd2x 48, 0, %8 \n\t" // store x | |||
| "stxvd2x 49, %5, %8 \n\t" | |||
| "stxvd2x 50, %6, %8 \n\t" | |||
| "stxvd2x 51, %7, %8 \n\t" | |||
| "stxvd2x 56, 0, %9 \n\t" // store y | |||
| "stxvd2x 57, %5, %9 \n\t" | |||
| "stxvd2x 58, %6, %9 \n\t" | |||
| "stxvd2x 59, %7, %9 \n\t" | |||
| "addi %8, %8, 64 \n\t" /* store pointers advance past the 8 doubles just written */ | |||
| "addi %9, %9, 64 \n\t" | |||
| "addic. %0 , %0 , -8 \n\t" | |||
| "bgt 1b \n\t" | |||
| "2: \n\t" /* tail: rotate and store the group already held in vs32..vs35 / vs40..vs43, with no further loads */ | |||
| "xvmuldp 48, 32, 36 \n\t" // c * x | |||
| "xvmuldp 49, 33, 36 \n\t" | |||
| "xvmuldp 50, 34, 36 \n\t" | |||
| "xvmuldp 51, 35, 36 \n\t" | |||
| "xvmuldp 56, 40, 36 \n\t" // c * y | |||
| "xvmuldp 57, 41, 36 \n\t" | |||
| "xvmuldp 58, 42, 36 \n\t" | |||
| "xvmuldp 59, 43, 36 \n\t" | |||
| "xvmuldp 52, 32, 37 \n\t" // s * x | |||
| "xvmuldp 53, 33, 37 \n\t" | |||
| "xvmuldp 54, 34, 37 \n\t" | |||
| "xvmuldp 55, 35, 37 \n\t" | |||
| "xvmuldp 60, 40, 37 \n\t" // s * y | |||
| "xvmuldp 61, 41, 37 \n\t" | |||
| "xvmuldp 62, 42, 37 \n\t" | |||
| "xvmuldp 63, 43, 37 \n\t" | |||
| "xvadddp 48, 48 , 60 \n\t" // c * x + s * y | |||
| "xvadddp 49, 49 , 61 \n\t" // c * x + s * y | |||
| "xvadddp 50, 50 , 62 \n\t" // c * x + s * y | |||
| "xvadddp 51, 51 , 63 \n\t" // c * x + s * y | |||
| "xvsubdp 56, 56 , 52 \n\t" // c * y - s * x | |||
| "xvsubdp 57, 57 , 53 \n\t" // c * y - s * x | |||
| "xvsubdp 58, 58 , 54 \n\t" // c * y - s * x | |||
| "xvsubdp 59, 59 , 55 \n\t" // c * y - s * x | |||
| "stxvd2x 48, 0, %8 \n\t" // store x | |||
| "stxvd2x 49, %5, %8 \n\t" | |||
| "stxvd2x 50, %6, %8 \n\t" | |||
| "stxvd2x 51, %7, %8 \n\t" | |||
| "stxvd2x 56, 0, %9 \n\t" // store y | |||
| "stxvd2x 57, %5, %9 \n\t" | |||
| "stxvd2x 58, %6, %9 \n\t" | |||
| "stxvd2x 59, %7, %9 \n\t" | |||
| : /* no output operands are declared */ | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (x1), // 1 | |||
| "r" (y1), // 2 | |||
| "r" (c), // 3 | |||
| "r" (s), // 4 | |||
| "r" (o16), // 5 | |||
| "r" (o32), // 6 | |||
| "r" (o48), // 7 | |||
| "r" (x2), // 8 | |||
| "r" (y2) // 9 | |||
| : "cr0", "%0", "%1" , "%2", "%8", "%9", "memory" /* NOTE(review): operands 0, 1, 2, 8 and 9 are modified by the asm but declared as plain inputs and then named as "%N" strings here; GCC documents read-write ("+") output operands for this, and "r" may select r0, which addi and the RA slot of the indexed loads/stores treat as literal 0 ("b" excludes it) -- confirm against the compiler documentation */ | |||
| ); | |||
| } | |||
| @@ -0,0 +1,174 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/25 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(POWER8) | |||
| #include "dscal_microk_power8.c" | |||
| #endif | |||
| #if !defined(HAVE_KERNEL_8) | |||
| static void dscal_kernel_8( BLASLONG n, FLOAT *da , FLOAT *x ) | |||
| { | |||
|     /* Portable scaling kernel: multiplies x in place by *da, eight elements per outer step. | |||
|      * The outer loop advances by 8 while the position is below n, so the last chunk is always a full one. */ | |||
|     const FLOAT factor = *da; | |||
|     BLASLONG base, lane; | |||
|     for ( base = 0; base < n; base += 8 ) | |||
|     { | |||
|         for ( lane = 0; lane < 8; lane++ ) | |||
|         { | |||
|             x[base + lane] *= factor; | |||
|         } | |||
|     } | |||
| } | |||
| static void dscal_kernel_8_zero( BLASLONG n, FLOAT *da , FLOAT *x ) | |||
| { | |||
|     /* Portable zero-fill kernel: stores 0.0 into x, eight elements per outer step. | |||
|      * da is not read; the parameter only keeps the signature parallel to dscal_kernel_8. | |||
|      * The outer loop advances by 8 while the position is below n, so the last chunk is always a full one. */ | |||
|     const FLOAT zero = 0.0; | |||
|     BLASLONG base, lane; | |||
|     for ( base = 0; base < n; base += 8 ) | |||
|     { | |||
|         for ( lane = 0; lane < 8; lane++ ) | |||
|         { | |||
|             x[base + lane] = zero; | |||
|         } | |||
|     } | |||
| } | |||
| #endif | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
|     /* Scaling driver: x[k] = da * x[k] for n elements spaced inc_x apart; when da compares equal | |||
|      * to 0.0 the elements are overwritten with 0.0 instead of being multiplied. | |||
|      * Returns 0 in every case and does nothing when n <= 0 or inc_x <= 0. | |||
|      * dummy0, dummy1, y, inc_y, dummy and dummy2 are not used. */ | |||
|     BLASLONG pos = 0; | |||
|     BLASLONG count = 0; | |||
|     if ( !( n > 0 && inc_x > 0 ) ) | |||
|         return(0); | |||
|     if ( inc_x != 1 ) | |||
|     { | |||
|         /* strided vector: scalar loops only */ | |||
|         if ( da == 0.0 ) | |||
|         { | |||
|             for ( ; count < n; count++ ) | |||
|             { | |||
|                 x[pos]=0.0; | |||
|                 pos += inc_x ; | |||
|             } | |||
|         } | |||
|         else | |||
|         { | |||
|             for ( ; count < n; count++ ) | |||
|             { | |||
|                 x[pos] = da * x[pos] ; | |||
|                 pos += inc_x ; | |||
|             } | |||
|         } | |||
|         return 0; | |||
|     } | |||
|     { | |||
|         /* contiguous vector: the largest multiple of 16 goes to a kernel, the rest is done below */ | |||
|         BLASLONG blocked = n & -16; | |||
|         FLOAT alpha[2]; | |||
|         if ( da == 0.0 ) | |||
|         { | |||
|             if ( blocked > 0 ) | |||
|             { | |||
|                 alpha[0]=da; | |||
|                 alpha[1]=da; | |||
|                 dscal_kernel_8_zero(blocked , alpha , x); | |||
|                 count = blocked; | |||
|             } | |||
|             for ( ; count < n; count++ ) | |||
|             { | |||
|                 x[count]=0.0; | |||
|             } | |||
|         } | |||
|         else | |||
|         { | |||
|             if ( blocked > 0 ) | |||
|             { | |||
|                 alpha[0]=da; | |||
|                 alpha[1]=da; | |||
|                 dscal_kernel_8(blocked , alpha , x); | |||
|                 count = blocked; | |||
|             } | |||
|             for ( ; count < n; count++ ) | |||
|             { | |||
|                 x[count] = da * x[count] ; | |||
|             } | |||
|         } | |||
|     } | |||
|     return 0; | |||
| } | |||
| @@ -0,0 +1,219 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/25 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define HAVE_KERNEL_8 1 | |||
| static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); /* forward declaration that attaches the noinline attribute */ | |||
| static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) /* VSX inline-asm scaling: multiplies x in place by alpha[0]; 16 elements (eight 2-double vectors) are handled per step and the step at label 2 runs unconditionally */ | |||
| { | |||
| BLASLONG i = n; /* element countdown; the asm subtracts 16 per step */ | |||
| BLASLONG o16 = 16; /* o16..o112: byte offsets of vectors 2..8 within a 128-byte group */ | |||
| BLASLONG o32 = 32; | |||
| BLASLONG o48 = 48; | |||
| BLASLONG o64 = 64; | |||
| BLASLONG o80 = 80; | |||
| BLASLONG o96 = 96; | |||
| BLASLONG o112 = 112; | |||
| FLOAT *x1=x; /* load pointer (operand 2) */ | |||
| FLOAT *x2=x+1; /* store pointer (operand 1); the asm's "addi %1, %1, -8" steps it back to x */ | |||
| BLASLONG pre = 384; /* offset register paired with the load pointer in the dcbt prefetch hints */ | |||
| __asm__ __volatile__ | |||
| ( | |||
| "lxsdx 33, 0, %3 \n\t" /* load alpha[0] */ | |||
| "xxspltd 32, 33, 0 \n\t" /* replicate it across both lanes of vs32 */ | |||
| "addi %1, %1, -8 \n\t" /* x2 back to x */ | |||
| "dcbt %2, %4 \n\t" /* prefetch hint for the load pointer + pre */ | |||
| "lxvd2x 40, 0, %2 \n\t" /* preload the first 16 elements into vs40..vs47 */ | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "lxvd2x 44, %8, %2 \n\t" | |||
| "lxvd2x 45, %9, %2 \n\t" | |||
| "lxvd2x 46, %10, %2 \n\t" | |||
| "lxvd2x 47, %11, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" /* load pointer advances past the 16 doubles just read */ | |||
| "addic. %0 , %0 , -16 \n\t" /* i -= 16 and record the result's sign in cr0 */ | |||
| "ble 2f \n\t" /* only the preloaded group remains: finish it in the tail */ | |||
| ".align 5 \n\t" | |||
| "1: \n\t" /* main loop: products go to vs48..vs55 while vs40..vs47 are refilled with the next group */ | |||
| "dcbt %2, %4 \n\t" | |||
| "xvmuldp 48, 40, 32 \n\t" | |||
| "xvmuldp 49, 41, 32 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "xvmuldp 50, 42, 32 \n\t" | |||
| "xvmuldp 51, 43, 32 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "xvmuldp 52, 44, 32 \n\t" | |||
| "xvmuldp 53, 45, 32 \n\t" | |||
| "lxvd2x 44, %8, %2 \n\t" | |||
| "lxvd2x 45, %9, %2 \n\t" | |||
| "xvmuldp 54, 46, 32 \n\t" | |||
| "xvmuldp 55, 47, 32 \n\t" | |||
| "lxvd2x 46, %10, %2 \n\t" | |||
| "lxvd2x 47, %11, %2 \n\t" | |||
| "stxvd2x 48, 0, %1 \n\t" /* write the scaled group back through the store pointer */ | |||
| "stxvd2x 49, %5, %1 \n\t" | |||
| "stxvd2x 50, %6, %1 \n\t" | |||
| "stxvd2x 51, %7, %1 \n\t" | |||
| "stxvd2x 52, %8, %1 \n\t" | |||
| "stxvd2x 53, %9, %1 \n\t" | |||
| "stxvd2x 54, %10, %1 \n\t" | |||
| "stxvd2x 55, %11, %1 \n\t" | |||
| "addi %1, %1, 128 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addic. %0 , %0 , -16 \n\t" | |||
| "bgt 1b \n\t" | |||
| "2: \n\t" /* tail: scale and store the group already held in vs40..vs47, with no further loads */ | |||
| "xvmuldp 48, 40, 32 \n\t" | |||
| "xvmuldp 49, 41, 32 \n\t" | |||
| "xvmuldp 50, 42, 32 \n\t" | |||
| "xvmuldp 51, 43, 32 \n\t" | |||
| "xvmuldp 52, 44, 32 \n\t" | |||
| "xvmuldp 53, 45, 32 \n\t" | |||
| "xvmuldp 54, 46, 32 \n\t" | |||
| "xvmuldp 55, 47, 32 \n\t" | |||
| "stxvd2x 48, 0, %1 \n\t" | |||
| "stxvd2x 49, %5, %1 \n\t" | |||
| "stxvd2x 50, %6, %1 \n\t" | |||
| "stxvd2x 51, %7, %1 \n\t" | |||
| "stxvd2x 52, %8, %1 \n\t" | |||
| "stxvd2x 53, %9, %1 \n\t" | |||
| "stxvd2x 54, %10, %1 \n\t" | |||
| "stxvd2x 55, %11, %1 \n\t" | |||
| : /* no output operands are declared */ | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (x2), // 1 | |||
| "r" (x1), // 2 | |||
| "r" (alpha), // 3 | |||
| "r" (pre), // 4 | |||
| "r" (o16), // 5 | |||
| "r" (o32), // 6 | |||
| "r" (o48), // 7 | |||
| "r" (o64), // 8 | |||
| "r" (o80), // 9 | |||
| "r" (o96), // 10 | |||
| "r" (o112) // 11 | |||
| : "cr0", "%0", "%2" , "%1", "memory" /* NOTE(review): operands 0, 1 and 2 are modified by the asm but declared as plain inputs and then named as "%N" strings here; GCC documents read-write ("+") output operands for this, and "r" may select r0, which addi and the RA slot of the indexed loads/stores treat as literal 0 ("b" excludes it) -- confirm against the compiler documentation */ | |||
| ); | |||
| } | |||
| static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); /* forward declaration that attaches the noinline attribute */ | |||
| static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) /* VSX inline-asm zero fill: stores zeros over x, 16 elements (eight 2-double vectors) per pass; the loop body runs before the counter is tested, so one pass always executes */ | |||
| { | |||
| BLASLONG i = n; /* element countdown; the asm subtracts 16 per pass */ | |||
| BLASLONG o16 = 16; /* o16..o112: byte offsets of vectors 2..8 within a 128-byte group */ | |||
| BLASLONG o32 = 32; | |||
| BLASLONG o48 = 48; | |||
| BLASLONG o64 = 64; | |||
| BLASLONG o80 = 80; | |||
| BLASLONG o96 = 96; | |||
| BLASLONG o112 = 112; | |||
| FLOAT *x1=x; /* passed as operand 2, which the asm text below never references */ | |||
| FLOAT *x2=x+1; /* store pointer (operand 1); the asm's "addi %1, %1, -8" steps it back to x */ | |||
| BLASLONG pre = 384; /* passed as operand 4, which the asm text below never references */ | |||
| __asm__ __volatile__ | |||
| ( | |||
| "xxlxor 32 , 32 , 32 \n\t" /* vs32 = 0 */ | |||
| "addi %1, %1, -8 \n\t" /* x2 back to x */ | |||
| ".align 5 \n\t" | |||
| "1: \n\t" | |||
| "stxvd2x 32, 0, %1 \n\t" /* eight 16-byte stores of zero = 16 doubles */ | |||
| "stxvd2x 32, %5, %1 \n\t" | |||
| "stxvd2x 32, %6, %1 \n\t" | |||
| "stxvd2x 32, %7, %1 \n\t" | |||
| "stxvd2x 32, %8, %1 \n\t" | |||
| "stxvd2x 32, %9, %1 \n\t" | |||
| "stxvd2x 32, %10, %1 \n\t" | |||
| "stxvd2x 32, %11, %1 \n\t" | |||
| "addi %1, %1, 128 \n\t" | |||
| "addic. %0 , %0 , -16 \n\t" /* i -= 16 and record the result's sign in cr0 */ | |||
| "bgt 1b \n\t" | |||
| "2: \n\t" /* no branch in this asm targets this label */ | |||
| : /* no output operands are declared */ | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (x2), // 1 | |||
| "r" (x1), // 2 | |||
| "r" (alpha), // 3 (not referenced by the asm text) | |||
| "r" (pre), // 4 | |||
| "r" (o16), // 5 | |||
| "r" (o32), // 6 | |||
| "r" (o48), // 7 | |||
| "r" (o64), // 8 | |||
| "r" (o80), // 9 | |||
| "r" (o96), // 10 | |||
| "r" (o112) // 11 | |||
| : "cr0", "%0", "%2" , "%1", "memory" /* NOTE(review): operands 0 and 1 are modified by the asm but declared as plain inputs and then named as "%N" strings here; GCC documents read-write ("+") output operands for this, and "r" may select r0, which addi and the RA slot of the indexed stores treat as literal 0 ("b" excludes it) -- confirm against the compiler documentation */ | |||
| ); | |||
| } | |||
| @@ -0,0 +1,154 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/25 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(POWER8) | |||
| #include "dswap_microk_power8.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_32 | |||
| /* | |||
|  * Portable fallback for the swap kernel. | |||
|  * | |||
|  * Walks both vectors in groups of eight elements. For each group all eight | |||
|  * x values and all eight y values are read first, and only then written | |||
|  * back crosswise (y first, then x). The loop continues while the processed | |||
|  * count is below n, so n is expected to be a multiple of 8. | |||
|  */ | |||
| static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) | |||
| { | |||
|     FLOAT from_x[8]; | |||
|     FLOAT from_y[8]; | |||
|     BLASLONG done; | |||
|     BLASLONG k; | |||
|  | |||
|     for ( done = 0; done < n; done += 8, x += 8, y += 8 ) | |||
|     { | |||
|         /* read phase: capture both groups before anything is stored */ | |||
|         for ( k = 0; k < 8; k++ ) from_x[k] = x[k]; | |||
|         for ( k = 0; k < 8; k++ ) from_y[k] = y[k]; | |||
|  | |||
|         /* write phase: y receives the x group, then x receives the y group */ | |||
|         for ( k = 0; k < 8; k++ ) y[k] = from_x[k]; | |||
|         for ( k = 0; k < 8; k++ ) x[k] = from_y[k]; | |||
|     } | |||
| } | |||
| #endif | |||
| /* | |||
|  * Swap driver: exchanges n elements between x and y. | |||
|  * | |||
|  * With both increments equal to 1, the largest multiple of 32 not exceeding | |||
|  * n is handed to dswap_kernel_32 and the remaining elements are swapped one | |||
|  * by one. For any other increments every element is swapped individually, | |||
|  * stepping x by inc_x and y by inc_y. The dummy parameters are not used. | |||
|  * Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
|     BLASLONG k; | |||
|     BLASLONG bulk; | |||
|     BLASLONG xpos; | |||
|     BLASLONG ypos; | |||
|     FLOAT held; | |||
|  | |||
|     if ( n <= 0 ) return(0); | |||
|  | |||
|     if ( (inc_x != 1) || (inc_y != 1) ) | |||
|     { | |||
|         /* strided path */ | |||
|         xpos = 0; | |||
|         ypos = 0; | |||
|         for ( k = 0; k < n; k++ ) | |||
|         { | |||
|             held = y[ypos]; | |||
|             y[ypos] = x[xpos]; | |||
|             x[xpos] = held; | |||
|             xpos += inc_x; | |||
|             ypos += inc_y; | |||
|         } | |||
|         return(0); | |||
|     } | |||
|  | |||
|     /* contiguous path: kernel handles the multiple-of-32 prefix */ | |||
|     bulk = n & -32; | |||
|     if ( bulk > 0 ) | |||
|         dswap_kernel_32(bulk, x, y); | |||
|  | |||
|     /* leftover elements after the prefix */ | |||
|     for ( k = bulk; k < n; k++ ) | |||
|     { | |||
|         held = y[k]; | |||
|         y[k] = x[k]; | |||
|         x[k] = held; | |||
|     } | |||
|  | |||
|     return(0); | |||
| } | |||
| @@ -0,0 +1,180 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/25 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define HAVE_KERNEL_32 1 | |||
| static void dswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| /* | |||
|  * POWER8 swap kernel. | |||
|  * | |||
|  * Each loop pass loads 256 bytes from x into vs32..vs47 and 256 bytes from | |||
|  * y into vs48..vs63, then stores the x data over y and the y data over x. | |||
|  * The counter drops by 32 per pass and the loop body always runs at least | |||
|  * once, so callers are expected to pass a positive multiple of 32. | |||
|  * | |||
|  * Constraint notes: | |||
|  *   - The counter and all four pointers are modified by the asm, so they | |||
|  *     are read-write outputs instead of plain inputs. | |||
|  *   - Clobber strings name hard registers, not operands, so the former | |||
|  *     "%0" .. "%4" entries did not describe the modified operands. | |||
|  *   - addi and the RA slot of indexed loads/stores treat register 0 as the | |||
|  *     literal value zero, hence the "b" (base register) constraint on the | |||
|  *     pointers and on every offset operand. | |||
|  *   - vs32..vs63 are all overwritten and are listed as clobbered; they | |||
|  *     overlay the vector registers, some of which the ABI requires a | |||
|  *     callee to preserve. | |||
|  */ | |||
| static void dswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) | |||
| { | |||
|     BLASLONG i = n; | |||
|     BLASLONG o16 = 16; | |||
|     BLASLONG o32 = 32; | |||
|     BLASLONG o48 = 48; | |||
|     BLASLONG o64 = 64; | |||
|     BLASLONG o80 = 80; | |||
|     BLASLONG o96 = 96; | |||
|     BLASLONG o112 = 112; | |||
|     FLOAT *x1=x; | |||
|     FLOAT *y1=y; | |||
|     /* The store pointers start one element ahead and are moved back by | |||
|        8 bytes inside the asm; with an 8-byte FLOAT (presumably double for | |||
|        this kernel) they end up equal to x and y. */ | |||
|     FLOAT *x2=x+1; | |||
|     FLOAT *y2=y+1; | |||
|     __asm__ __volatile__ | |||
|     ( | |||
|     "addi %3, %3, -8 \n\t" | |||
|     "addi %4, %4, -8 \n\t" | |||
|     ".align 5 \n\t" | |||
|     "1: \n\t" | |||
|     /* load 32 elements of x */ | |||
|     "lxvd2x 32, 0, %2 \n\t" | |||
|     "lxvd2x 33, %5, %2 \n\t" | |||
|     "lxvd2x 34, %6, %2 \n\t" | |||
|     "lxvd2x 35, %7, %2 \n\t" | |||
|     "lxvd2x 36, %8, %2 \n\t" | |||
|     "lxvd2x 37, %9, %2 \n\t" | |||
|     "lxvd2x 38, %10, %2 \n\t" | |||
|     "lxvd2x 39, %11, %2 \n\t" | |||
|     "addi %2, %2, 128 \n\t" | |||
|     "lxvd2x 40, 0, %2 \n\t" | |||
|     "lxvd2x 41, %5, %2 \n\t" | |||
|     "lxvd2x 42, %6, %2 \n\t" | |||
|     "lxvd2x 43, %7, %2 \n\t" | |||
|     "lxvd2x 44, %8, %2 \n\t" | |||
|     "lxvd2x 45, %9, %2 \n\t" | |||
|     "lxvd2x 46, %10, %2 \n\t" | |||
|     "lxvd2x 47, %11, %2 \n\t" | |||
|     "addi %2, %2, 128 \n\t" | |||
|     /* load 32 elements of y */ | |||
|     "lxvd2x 48, 0, %1 \n\t" | |||
|     "lxvd2x 49, %5, %1 \n\t" | |||
|     "lxvd2x 50, %6, %1 \n\t" | |||
|     "lxvd2x 51, %7, %1 \n\t" | |||
|     "lxvd2x 52, %8, %1 \n\t" | |||
|     "lxvd2x 53, %9, %1 \n\t" | |||
|     "lxvd2x 54, %10, %1 \n\t" | |||
|     "lxvd2x 55, %11, %1 \n\t" | |||
|     "addi %1, %1, 128 \n\t" | |||
|     "lxvd2x 56, 0, %1 \n\t" | |||
|     "lxvd2x 57, %5, %1 \n\t" | |||
|     "lxvd2x 58, %6, %1 \n\t" | |||
|     "lxvd2x 59, %7, %1 \n\t" | |||
|     "lxvd2x 60, %8, %1 \n\t" | |||
|     "lxvd2x 61, %9, %1 \n\t" | |||
|     "lxvd2x 62, %10, %1 \n\t" | |||
|     "lxvd2x 63, %11, %1 \n\t" | |||
|     "addi %1, %1, 128 \n\t" | |||
|     /* store the x data through the y-side store pointer */ | |||
|     "stxvd2x 32, 0, %3 \n\t" | |||
|     "stxvd2x 33, %5, %3 \n\t" | |||
|     "stxvd2x 34, %6, %3 \n\t" | |||
|     "stxvd2x 35, %7, %3 \n\t" | |||
|     "stxvd2x 36, %8, %3 \n\t" | |||
|     "stxvd2x 37, %9, %3 \n\t" | |||
|     "stxvd2x 38, %10, %3 \n\t" | |||
|     "stxvd2x 39, %11, %3 \n\t" | |||
|     "addi %3, %3, 128 \n\t" | |||
|     "stxvd2x 40, 0, %3 \n\t" | |||
|     "stxvd2x 41, %5, %3 \n\t" | |||
|     "stxvd2x 42, %6, %3 \n\t" | |||
|     "stxvd2x 43, %7, %3 \n\t" | |||
|     "stxvd2x 44, %8, %3 \n\t" | |||
|     "stxvd2x 45, %9, %3 \n\t" | |||
|     "stxvd2x 46, %10, %3 \n\t" | |||
|     "stxvd2x 47, %11, %3 \n\t" | |||
|     "addi %3, %3, 128 \n\t" | |||
|     /* store the y data through the x-side store pointer */ | |||
|     "stxvd2x 48, 0, %4 \n\t" | |||
|     "stxvd2x 49, %5, %4 \n\t" | |||
|     "stxvd2x 50, %6, %4 \n\t" | |||
|     "stxvd2x 51, %7, %4 \n\t" | |||
|     "stxvd2x 52, %8, %4 \n\t" | |||
|     "stxvd2x 53, %9, %4 \n\t" | |||
|     "stxvd2x 54, %10, %4 \n\t" | |||
|     "stxvd2x 55, %11, %4 \n\t" | |||
|     "addi %4, %4, 128 \n\t" | |||
|     "stxvd2x 56, 0, %4 \n\t" | |||
|     "stxvd2x 57, %5, %4 \n\t" | |||
|     "stxvd2x 58, %6, %4 \n\t" | |||
|     "stxvd2x 59, %7, %4 \n\t" | |||
|     "stxvd2x 60, %8, %4 \n\t" | |||
|     "stxvd2x 61, %9, %4 \n\t" | |||
|     "stxvd2x 62, %10, %4 \n\t" | |||
|     "stxvd2x 63, %11, %4 \n\t" | |||
|     "addi %4, %4, 128 \n\t" | |||
|     "addic. %0 , %0 , -32 \n\t" | |||
|     "bgt 1b \n\t" | |||
|     "2: \n\t" | |||
|     : | |||
|     "+r" (i), // 0 (loop counter, modified) | |||
|     "+b" (y1), // 1 (y load pointer, modified) | |||
|     "+b" (x1), // 2 (x load pointer, modified) | |||
|     "+b" (y2), // 3 (y store pointer, modified) | |||
|     "+b" (x2) // 4 (x store pointer, modified) | |||
|     : | |||
|     "b" (o16), // 5 | |||
|     "b" (o32), // 6 | |||
|     "b" (o48), // 7 | |||
|     "b" (o64), // 8 | |||
|     "b" (o80), // 9 | |||
|     "b" (o96), // 10 | |||
|     "b" (o112) // 11 | |||
|     : "cr0", | |||
|       "vs32", "vs33", "vs34", "vs35", "vs36", "vs37", "vs38", "vs39", | |||
|       "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", | |||
|       "vs48", "vs49", "vs50", "vs51", "vs52", "vs53", "vs54", "vs55", | |||
|       "vs56", "vs57", "vs58", "vs59", "vs60", "vs61", "vs62", "vs63", | |||
|       "memory" | |||
|     ); | |||
| } | |||
| @@ -0,0 +1,146 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/28 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| #if defined(POWER8) | |||
| #include "sasum_microk_power8.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_32 | |||
| /* | |||
|  * Portable fallback for the absolute-sum kernel. | |||
|  * | |||
|  * Consumes the input eight elements at a time, spreading the absolute | |||
|  * values over four running sums (elements 0 and 4 go to the first sum, | |||
|  * 1 and 5 to the second, and so on). The combined total is written to | |||
|  * svec[0]; svec[1..3] are set to zero. The loop continues while the | |||
|  * processed count is below n, so n is expected to be a multiple of 8. | |||
|  */ | |||
| static void sasum_kernel_32(BLASLONG n, FLOAT *x1, FLOAT *svec) | |||
| { | |||
|     FLOAT *cursor = x1; | |||
|     BLASLONG remaining = n; | |||
|     FLOAT acc0 = 0.0; | |||
|     FLOAT acc1 = 0.0; | |||
|     FLOAT acc2 = 0.0; | |||
|     FLOAT acc3 = 0.0; | |||
|  | |||
|     while ( remaining > 0 ) | |||
|     { | |||
|         /* first half of the group */ | |||
|         acc0 += ABS(cursor[0]); | |||
|         acc1 += ABS(cursor[1]); | |||
|         acc2 += ABS(cursor[2]); | |||
|         acc3 += ABS(cursor[3]); | |||
|  | |||
|         /* second half of the group, same accumulator order */ | |||
|         acc0 += ABS(cursor[4]); | |||
|         acc1 += ABS(cursor[5]); | |||
|         acc2 += ABS(cursor[6]); | |||
|         acc3 += ABS(cursor[7]); | |||
|  | |||
|         cursor += 8; | |||
|         remaining -= 8; | |||
|     } | |||
|  | |||
|     svec[0] = acc0+acc1+acc2+acc3; | |||
|     svec[1] = 0.0; | |||
|     svec[2] = 0.0; | |||
|     svec[3] = 0.0; | |||
| } | |||
| #endif | |||
| /* | |||
|  * Absolute-sum driver: returns the sum of ABS(x[k]) over n elements spaced | |||
|  * inc_x apart. | |||
|  * | |||
|  * Returns 0 when n or inc_x is not positive. With inc_x equal to 1, the | |||
|  * largest multiple of 32 not exceeding n is handed to sasum_kernel_32, | |||
|  * whose four partial results are added together, and the remaining | |||
|  * elements are accumulated one at a time. Otherwise the vector is walked | |||
|  * with the given stride. | |||
|  */ | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
|     FLOAT partial[4] __attribute__ ((aligned (16))); | |||
|     FLOAT total = 0.0; | |||
|     BLASLONG pos = 0; | |||
|     BLASLONG bulk; | |||
|     BLASLONG limit; | |||
|  | |||
|     if (n <= 0 || inc_x <= 0) return(total); | |||
|  | |||
|     if ( inc_x != 1 ) | |||
|     { | |||
|         /* strided path: index runs over 0, inc_x, 2*inc_x, ... below n*inc_x */ | |||
|         limit = n * inc_x; | |||
|         for ( pos = 0; pos < limit; pos += inc_x ) | |||
|             total += ABS(x[pos]); | |||
|         return(total); | |||
|     } | |||
|  | |||
|     /* contiguous path: kernel handles the multiple-of-32 prefix */ | |||
|     bulk = n & -32; | |||
|     if ( bulk > 0 ) | |||
|     { | |||
|         sasum_kernel_32(bulk, x, partial); | |||
|         total = partial[0] + partial[1]+partial[2]+partial[3]; | |||
|         pos = bulk; | |||
|     } | |||
|  | |||
|     /* leftover elements after the prefix */ | |||
|     for ( ; pos < n; pos++ ) | |||
|         total += ABS(x[pos]); | |||
|  | |||
|     return(total); | |||
| } | |||
| @@ -0,0 +1,177 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/28 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define HAVE_KERNEL_32 1 | |||
| static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline)); | |||
| /* | |||
|  * POWER8 single-precision absolute-sum kernel. | |||
|  * | |||
|  * Software-pipelined: 128 bytes are loaded before the loop; each loop pass | |||
|  * takes absolute values of the previously loaded data and adds them into | |||
|  * eight vector accumulators (vs32..vs39) while loading the next 128 bytes; | |||
|  * the tail after the loop processes the final batch. The accumulators are | |||
|  * then added together and the resulting four-lane vector is stored to | |||
|  * svec, so the caller has to add svec[0..3] to obtain the scalar sum. | |||
|  * The counter drops by 32 per batch; callers are expected to pass a | |||
|  * positive multiple of 32. | |||
|  * | |||
|  * Constraint notes: | |||
|  *   - The counter and the load pointer are modified by the asm, so they | |||
|  *     are read-write outputs instead of plain inputs. | |||
|  *   - Clobber strings name hard registers, not operands, so the former | |||
|  *     "%0" and "%2" entries did not describe the modified operands. | |||
|  *   - addi and the RA slot of indexed loads (and of dcbt) treat register 0 | |||
|  *     as the literal value zero, hence the "b" (base register) constraint | |||
|  *     on the pointer and on every offset operand. | |||
|  *   - vs32..vs55 are overwritten and are listed as clobbered. | |||
|  */ | |||
| static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec) | |||
| { | |||
|     BLASLONG i = n; | |||
|     BLASLONG o16 = 16; | |||
|     BLASLONG o32 = 32; | |||
|     BLASLONG o48 = 48; | |||
|     BLASLONG o64 = 64; | |||
|     BLASLONG o80 = 80; | |||
|     BLASLONG o96 = 96; | |||
|     BLASLONG o112 = 112; | |||
|     FLOAT *x1=x; | |||
|     BLASLONG pre = 384; | |||
|     __asm__ __volatile__ | |||
|     ( | |||
|     "dcbt %2 , %4 \n\t" | |||
|     /* clear the eight accumulators */ | |||
|     "xxlxor 32,32,32 \n\t" | |||
|     "xxlxor 33,33,33 \n\t" | |||
|     "xxlxor 34,34,34 \n\t" | |||
|     "xxlxor 35,35,35 \n\t" | |||
|     "xxlxor 36,36,36 \n\t" | |||
|     "xxlxor 37,37,37 \n\t" | |||
|     "xxlxor 38,38,38 \n\t" | |||
|     "xxlxor 39,39,39 \n\t" | |||
|     /* preload the first batch */ | |||
|     "lxvw4x 40, 0, %2 \n\t" | |||
|     "lxvw4x 41, %5, %2 \n\t" | |||
|     "lxvw4x 42, %6, %2 \n\t" | |||
|     "lxvw4x 43, %7, %2 \n\t" | |||
|     "lxvw4x 44, %8, %2 \n\t" | |||
|     "lxvw4x 45, %9, %2 \n\t" | |||
|     "lxvw4x 46, %10, %2 \n\t" | |||
|     "lxvw4x 47, %11, %2 \n\t" | |||
|     "addi %2, %2, 128 \n\t" | |||
|     "addic. %0 , %0 , -32 \n\t" | |||
|     "ble 2f \n\t" | |||
|     ".align 5 \n\t" | |||
|     "1: \n\t" | |||
|     "dcbt %2 , %4 \n\t" | |||
|     "xvabssp 48, 40 \n\t" | |||
|     "xvabssp 49, 41 \n\t" | |||
|     "xvabssp 50, 42 \n\t" | |||
|     "xvabssp 51, 43 \n\t" | |||
|     "lxvw4x 40, 0, %2 \n\t" | |||
|     "lxvw4x 41, %5, %2 \n\t" | |||
|     "xvabssp 52, 44 \n\t" | |||
|     "xvabssp 53, 45 \n\t" | |||
|     "lxvw4x 42, %6, %2 \n\t" | |||
|     "lxvw4x 43, %7, %2 \n\t" | |||
|     "xvabssp 54, 46 \n\t" | |||
|     "xvabssp 55, 47 \n\t" | |||
|     "lxvw4x 44, %8, %2 \n\t" | |||
|     "lxvw4x 45, %9, %2 \n\t" | |||
|     "xvaddsp 32, 32, 48 \n\t" | |||
|     "xvaddsp 33, 33, 49 \n\t" | |||
|     "lxvw4x 46, %10, %2 \n\t" | |||
|     "lxvw4x 47, %11, %2 \n\t" | |||
|     "xvaddsp 34, 34, 50 \n\t" | |||
|     "xvaddsp 35, 35, 51 \n\t" | |||
|     "addi %2, %2, 128 \n\t" | |||
|     "xvaddsp 36, 36, 52 \n\t" | |||
|     "xvaddsp 37, 37, 53 \n\t" | |||
|     "addic. %0 , %0 , -32 \n\t" | |||
|     "xvaddsp 38, 38, 54 \n\t" | |||
|     "xvaddsp 39, 39, 55 \n\t" | |||
|     "bgt 1b \n\t" | |||
|     "2: \n\t" | |||
|     /* tail: fold in the last loaded batch */ | |||
|     "xvabssp 48, 40 \n\t" | |||
|     "xvabssp 49, 41 \n\t" | |||
|     "xvabssp 50, 42 \n\t" | |||
|     "xvabssp 51, 43 \n\t" | |||
|     "xvabssp 52, 44 \n\t" | |||
|     "xvabssp 53, 45 \n\t" | |||
|     "xvabssp 54, 46 \n\t" | |||
|     "xvabssp 55, 47 \n\t" | |||
|     "xvaddsp 32, 32, 48 \n\t" | |||
|     "xvaddsp 33, 33, 49 \n\t" | |||
|     "xvaddsp 34, 34, 50 \n\t" | |||
|     "xvaddsp 35, 35, 51 \n\t" | |||
|     "xvaddsp 36, 36, 52 \n\t" | |||
|     "xvaddsp 37, 37, 53 \n\t" | |||
|     "xvaddsp 38, 38, 54 \n\t" | |||
|     "xvaddsp 39, 39, 55 \n\t" | |||
|     /* reduce the eight accumulators into vs32 and store it */ | |||
|     "xvaddsp 32, 32, 33 \n\t" | |||
|     "xvaddsp 34, 34, 35 \n\t" | |||
|     "xvaddsp 36, 36, 37 \n\t" | |||
|     "xvaddsp 38, 38, 39 \n\t" | |||
|     "xvaddsp 32, 32, 34 \n\t" | |||
|     "xvaddsp 36, 36, 38 \n\t" | |||
|     "xvaddsp 32, 32, 36 \n\t" | |||
|     "stxvw4x 32, 0, %3 \n\t" | |||
|     : | |||
|     "+r" (i), // 0 (loop counter, modified) | |||
|     "+r" (n), // 1 (not referenced by the asm; kept here so operand numbers stay unchanged) | |||
|     "+b" (x1) // 2 (load pointer, modified) | |||
|     : | |||
|     "r" (svec), // 3 | |||
|     "r" (pre), // 4 | |||
|     "b" (o16), // 5 | |||
|     "b" (o32), // 6 | |||
|     "b" (o48), // 7 | |||
|     "b" (o64), // 8 | |||
|     "b" (o80), // 9 | |||
|     "b" (o96), // 10 | |||
|     "b" (o112) // 11 | |||
|     : "cr0", | |||
|       "vs32", "vs33", "vs34", "vs35", "vs36", "vs37", "vs38", "vs39", | |||
|       "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", | |||
|       "vs48", "vs49", "vs50", "vs51", "vs52", "vs53", "vs54", "vs55", | |||
|       "memory" | |||
|     ); | |||
| } | |||
| @@ -0,0 +1,131 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/25 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(POWER8) | |||
| #include "scopy_microk_power8.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_32 | |||
| /* | |||
|  * Portable fallback for the copy kernel. | |||
|  * | |||
|  * Copies x to y in groups of eight elements; every group is read in full | |||
|  * before any of it is written. The loop continues while the processed | |||
|  * count is below n, so n is expected to be a multiple of 8. | |||
|  */ | |||
| static void scopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) | |||
| { | |||
|     FLOAT staged[8]; | |||
|     BLASLONG done; | |||
|     BLASLONG k; | |||
|  | |||
|     for ( done = 0; done < n; done += 8, x += 8, y += 8 ) | |||
|     { | |||
|         /* read the whole group first ... */ | |||
|         for ( k = 0; k < 8; k++ ) staged[k] = x[k]; | |||
|  | |||
|         /* ... then write it out */ | |||
|         for ( k = 0; k < 8; k++ ) y[k] = staged[k]; | |||
|     } | |||
| } | |||
| #endif | |||
| /* | |||
|  * Copy driver: copies n elements from x to y. | |||
|  * | |||
|  * With both increments equal to 1, the largest multiple of 32 not exceeding | |||
|  * n is handed to scopy_kernel_32 and the remaining elements are copied one | |||
|  * by one. For any other increments every element is copied individually, | |||
|  * stepping x by inc_x and y by inc_y. Always returns 0. | |||
|  */ | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
|     BLASLONG k; | |||
|     BLASLONG bulk; | |||
|     BLASLONG src; | |||
|     BLASLONG dst; | |||
|  | |||
|     if ( n <= 0 ) return(0); | |||
|  | |||
|     if ( (inc_x != 1) || (inc_y != 1) ) | |||
|     { | |||
|         /* strided path */ | |||
|         src = 0; | |||
|         dst = 0; | |||
|         for ( k = 0; k < n; k++ ) | |||
|         { | |||
|             y[dst] = x[src]; | |||
|             src += inc_x; | |||
|             dst += inc_y; | |||
|         } | |||
|         return(0); | |||
|     } | |||
|  | |||
|     /* contiguous path: kernel handles the multiple-of-32 prefix */ | |||
|     bulk = n & -32; | |||
|     if ( bulk > 0 ) | |||
|         scopy_kernel_32(bulk, x, y); | |||
|  | |||
|     /* leftover elements after the prefix */ | |||
|     for ( k = bulk; k < n; k++ ) | |||
|         y[k] = x[k]; | |||
|  | |||
|     return(0); | |||
| } | |||
| @@ -0,0 +1,131 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/25 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define HAVE_KERNEL_32 1 | |||
| static void scopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| /* | |||
|  * POWER8 single-precision copy kernel. | |||
|  * | |||
|  * Software-pipelined: 128 bytes of x are loaded before the loop; each loop | |||
|  * pass stores the previously loaded data to y while loading the next | |||
|  * 128 bytes; the tail after the loop stores the final batch. The counter | |||
|  * drops by 32 per batch; callers are expected to pass a positive multiple | |||
|  * of 32. | |||
|  * | |||
|  * Constraint notes: | |||
|  *   - The counter and both pointers are modified by the asm, so they are | |||
|  *     read-write outputs instead of plain inputs. | |||
|  *   - Clobber strings name hard registers, not operands, so the former | |||
|  *     "%0", "%1", "%2" entries did not describe the modified operands. | |||
|  *   - addi and the RA slot of indexed loads/stores treat register 0 as the | |||
|  *     literal value zero, hence the "b" (base register) constraint on the | |||
|  *     pointers and on every offset operand. | |||
|  *   - vs40..vs47 are overwritten and are listed as clobbered. | |||
|  */ | |||
| static void scopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) | |||
| { | |||
|     BLASLONG i = n; | |||
|     BLASLONG o16 = 16; | |||
|     BLASLONG o32 = 32; | |||
|     BLASLONG o48 = 48; | |||
|     BLASLONG o64 = 64; | |||
|     BLASLONG o80 = 80; | |||
|     BLASLONG o96 = 96; | |||
|     BLASLONG o112 = 112; | |||
|     FLOAT *x1=x; | |||
|     FLOAT *y1=y; | |||
|     BLASLONG pre = 384; | |||
|     BLASLONG alpha=0; | |||
|     __asm__ __volatile__ | |||
|     ( | |||
|     /* preload the first batch */ | |||
|     "lxvw4x 40, 0, %2 \n\t" | |||
|     "lxvw4x 41, %5, %2 \n\t" | |||
|     "lxvw4x 42, %6, %2 \n\t" | |||
|     "lxvw4x 43, %7, %2 \n\t" | |||
|     "lxvw4x 44, %8, %2 \n\t" | |||
|     "lxvw4x 45, %9, %2 \n\t" | |||
|     "lxvw4x 46, %10, %2 \n\t" | |||
|     "lxvw4x 47, %11, %2 \n\t" | |||
|     "addi %2, %2, 128 \n\t" | |||
|     "addic. %0 , %0 , -32 \n\t" | |||
|     "ble 2f \n\t" | |||
|     ".align 5 \n\t" | |||
|     "1: \n\t" | |||
|     "stxvw4x 40, 0, %1 \n\t" | |||
|     "stxvw4x 41, %5, %1 \n\t" | |||
|     "lxvw4x 40, 0, %2 \n\t" | |||
|     "lxvw4x 41, %5, %2 \n\t" | |||
|     "stxvw4x 42, %6, %1 \n\t" | |||
|     "stxvw4x 43, %7, %1 \n\t" | |||
|     "lxvw4x 42, %6, %2 \n\t" | |||
|     "lxvw4x 43, %7, %2 \n\t" | |||
|     "stxvw4x 44, %8, %1 \n\t" | |||
|     "stxvw4x 45, %9, %1 \n\t" | |||
|     "lxvw4x 44, %8, %2 \n\t" | |||
|     "lxvw4x 45, %9, %2 \n\t" | |||
|     "stxvw4x 46, %10, %1 \n\t" | |||
|     "stxvw4x 47, %11, %1 \n\t" | |||
|     "lxvw4x 46, %10, %2 \n\t" | |||
|     "lxvw4x 47, %11, %2 \n\t" | |||
|     "addi %1, %1, 128 \n\t" | |||
|     "addi %2, %2, 128 \n\t" | |||
|     "addic. %0 , %0 , -32 \n\t" | |||
|     "bgt 1b \n\t" | |||
|     "2: \n\t" | |||
|     /* tail: store the last loaded batch */ | |||
|     "stxvw4x 40, 0, %1 \n\t" | |||
|     "stxvw4x 41, %5, %1 \n\t" | |||
|     "stxvw4x 42, %6, %1 \n\t" | |||
|     "stxvw4x 43, %7, %1 \n\t" | |||
|     "stxvw4x 44, %8, %1 \n\t" | |||
|     "stxvw4x 45, %9, %1 \n\t" | |||
|     "stxvw4x 46, %10, %1 \n\t" | |||
|     "stxvw4x 47, %11, %1 \n\t" | |||
|     : | |||
|     "+r" (i), // 0 (loop counter, modified) | |||
|     "+b" (y1), // 1 (store pointer, modified) | |||
|     "+b" (x1) // 2 (load pointer, modified) | |||
|     : | |||
|     "r" (alpha), // 3 (unused, keeps operand numbering) | |||
|     "r" (pre), // 4 (unused, keeps operand numbering) | |||
|     "b" (o16), // 5 | |||
|     "b" (o32), // 6 | |||
|     "b" (o48), // 7 | |||
|     "b" (o64), // 8 | |||
|     "b" (o80), // 9 | |||
|     "b" (o96), // 10 | |||
|     "b" (o112) // 11 | |||
|     : "cr0", | |||
|       "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", | |||
|       "memory" | |||
|     ); | |||
| } | |||
| @@ -0,0 +1,126 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/21 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(POWER8) | |||
| #include "sdot_microk_power8.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_16 | |||
| /* | |||
|  * Portable fallback for the dot-product kernel. | |||
|  * | |||
|  * Adds the products y[k]*x[k] to a local sum eight elements at a time and | |||
|  * finally adds that sum to *d (the previous value of *d is kept). The loop | |||
|  * continues while the processed count is below n, so n is expected to be a | |||
|  * multiple of 8. | |||
|  */ | |||
| static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) | |||
| { | |||
|     FLOAT acc = 0.0; | |||
|     BLASLONG base; | |||
|  | |||
|     for ( base = 0; base < n; base += 8 ) | |||
|     { | |||
|         FLOAT *xp = x + base; | |||
|         FLOAT *yp = y + base; | |||
|  | |||
|         /* the eight products are summed left to right, then added to acc */ | |||
|         acc += yp[0] * xp[0] | |||
|              + yp[1] * xp[1] | |||
|              + yp[2] * xp[2] | |||
|              + yp[3] * xp[3] | |||
|              + yp[4] * xp[4] | |||
|              + yp[5] * xp[5] | |||
|              + yp[6] * xp[6] | |||
|              + yp[7] * xp[7] ; | |||
|     } | |||
|  | |||
|     *d += acc; | |||
| } | |||
| #endif | |||
| /* | |||
|  * Dot-product driver: returns the sum of y[k]*x[k] over n elements. | |||
|  * | |||
|  * Returns 0 when n is not positive. With both increments equal to 1, the | |||
|  * largest multiple of 32 not exceeding n is handed to sdot_kernel_16 and | |||
|  * the remaining elements are accumulated one at a time. For any other | |||
|  * increments the elements are processed in pairs, followed by a single | |||
|  * trailing element when n is odd. | |||
|  */ | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
|     FLOAT acc = 0.0 ; | |||
|     BLASLONG k; | |||
|     BLASLONG head; | |||
|     BLASLONG xpos = 0; | |||
|     BLASLONG ypos = 0; | |||
|  | |||
|     if ( n <= 0 ) return(acc); | |||
|  | |||
|     if ( (inc_x == 1) && (inc_y == 1) ) | |||
|     { | |||
|         /* contiguous path: kernel handles the multiple-of-32 prefix */ | |||
|         head = n & -32; | |||
|         if ( head ) | |||
|             sdot_kernel_16(head, x, y , &acc ); | |||
|  | |||
|         /* leftover elements after the prefix */ | |||
|         for ( k = head; k < n; k++ ) | |||
|             acc += y[k] * x[k] ; | |||
|  | |||
|         return(acc); | |||
|     } | |||
|  | |||
|     /* strided path: two elements per pass over the even-sized prefix */ | |||
|     head = n & -2; | |||
|     for ( k = 0; k < head; k += 2 ) | |||
|     { | |||
|         acc += y[ypos] * x[xpos] + y[ypos+inc_y] * x[xpos+inc_x]; | |||
|         xpos += inc_x*2 ; | |||
|         ypos += inc_y*2 ; | |||
|     } | |||
|  | |||
|     /* at most one element remains when n is odd */ | |||
|     for ( ; k < n; k++ ) | |||
|     { | |||
|         acc += y[ypos] * x[xpos] ; | |||
|         xpos += inc_x ; | |||
|         ypos += inc_y ; | |||
|     } | |||
|  | |||
|     return(acc); | |||
| } | |||
| @@ -0,0 +1,179 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/21 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define HAVE_KERNEL_16 1 | |||
| static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); | |||
| static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| { | |||
| BLASLONG i = n; | |||
| BLASLONG o16 = 16; | |||
| BLASLONG o32 = 32; | |||
| BLASLONG o48 = 48; | |||
| BLASLONG o64 = 64; | |||
| BLASLONG o80 = 80; | |||
| BLASLONG o96 = 96; | |||
| BLASLONG o112 = 112; | |||
| FLOAT *x1=x; | |||
| FLOAT *y1=y; | |||
| BLASLONG pre = 384; | |||
| FLOAT tempdot[4]; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "xxlxor 32,32,32 \n\t" | |||
| "xxlxor 33,33,33 \n\t" | |||
| "xxlxor 34,34,34 \n\t" | |||
| "xxlxor 35,35,35 \n\t" | |||
| "xxlxor 36,36,36 \n\t" | |||
| "xxlxor 37,37,37 \n\t" | |||
| "xxlxor 38,38,38 \n\t" | |||
| "xxlxor 39,39,39 \n\t" | |||
| "dcbt %2, %12 \n\t" | |||
| "dcbt %3, %12 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 48, 0, %3 \n\t" | |||
| "lxvw4x 41, %5, %2 \n\t" | |||
| "lxvw4x 49, %5, %3 \n\t" | |||
| "lxvw4x 42, %6, %2 \n\t" | |||
| "lxvw4x 50, %6, %3 \n\t" | |||
| "lxvw4x 43, %7, %2 \n\t" | |||
| "lxvw4x 51, %7, %3 \n\t" | |||
| "lxvw4x 44, %8, %2 \n\t" | |||
| "lxvw4x 52, %8, %3 \n\t" | |||
| "lxvw4x 45, %9, %2 \n\t" | |||
| "lxvw4x 53, %9, %3 \n\t" | |||
| "lxvw4x 46, %10, %2 \n\t" | |||
| "lxvw4x 54, %10, %3 \n\t" | |||
| "lxvw4x 47, %11, %2 \n\t" | |||
| "lxvw4x 55, %11, %3 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "addic. %0 , %0 , -32 \n\t" | |||
| "ble 2f \n\t" | |||
| ".align 5 \n\t" | |||
| "1: \n\t" | |||
| "dcbt %2, %12 \n\t" | |||
| "dcbt %3, %12 \n\t" | |||
| "xvmaddasp 32, 40, 48 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 48, 0, %3 \n\t" | |||
| "xvmaddasp 33, 41, 49 \n\t" | |||
| "lxvw4x 41, %5, %2 \n\t" | |||
| "lxvw4x 49, %5, %3 \n\t" | |||
| "xvmaddasp 34, 42, 50 \n\t" | |||
| "lxvw4x 42, %6, %2 \n\t" | |||
| "lxvw4x 50, %6, %3 \n\t" | |||
| "xvmaddasp 35, 43, 51 \n\t" | |||
| "lxvw4x 43, %7, %2 \n\t" | |||
| "lxvw4x 51, %7, %3 \n\t" | |||
| "xvmaddasp 36, 44, 52 \n\t" | |||
| "lxvw4x 44, %8, %2 \n\t" | |||
| "lxvw4x 52, %8, %3 \n\t" | |||
| "xvmaddasp 37, 45, 53 \n\t" | |||
| "lxvw4x 45, %9, %2 \n\t" | |||
| "lxvw4x 53, %9, %3 \n\t" | |||
| "xvmaddasp 38, 46, 54 \n\t" | |||
| "lxvw4x 46, %10, %2 \n\t" | |||
| "lxvw4x 54, %10, %3 \n\t" | |||
| "xvmaddasp 39, 47, 55 \n\t" | |||
| "lxvw4x 47, %11, %2 \n\t" | |||
| "lxvw4x 55, %11, %3 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "addic. %0 , %0 , -32 \n\t" | |||
| "bgt 1b \n\t" | |||
| "2: \n\t" | |||
| "xvmaddasp 32, 40, 48 \n\t" | |||
| "xvmaddasp 33, 41, 49 \n\t" | |||
| "xvmaddasp 34, 42, 50 \n\t" | |||
| "xvmaddasp 35, 43, 51 \n\t" | |||
| "xvmaddasp 36, 44, 52 \n\t" | |||
| "xvmaddasp 37, 45, 53 \n\t" | |||
| "xvmaddasp 38, 46, 54 \n\t" | |||
| "xvmaddasp 39, 47, 55 \n\t" | |||
| "xvaddsp 32, 32 , 33 \n\t" | |||
| "xvaddsp 34, 34 , 35 \n\t" | |||
| "xvaddsp 36, 36 , 37 \n\t" | |||
| "xvaddsp 38, 38 , 39 \n\t" | |||
| "xvaddsp 32, 32 , 34 \n\t" | |||
| "xvaddsp 36, 36 , 38 \n\t" | |||
| "xvaddsp 32, 32 , 36 \n\t" | |||
| "stxvw4x 32, 0 , %4 \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x1), // 2 | |||
| "r" (y1), // 3 | |||
| "r" (tempdot), // 4 | |||
| "r" (o16), // 5 | |||
| "r" (o32), // 6 | |||
| "r" (o48), // 7 | |||
| "r" (o64), // 8 | |||
| "r" (o80), // 9 | |||
| "r" (o96), // 10 | |||
| "r" (o112), // 11 | |||
| "r" (pre) // 12 | |||
| : "cr0", "%0", "%2" , "%3", "memory" | |||
| ); | |||
| *dot = tempdot[0] + tempdot[1] + tempdot[2] + tempdot[3]; | |||
| } | |||
| @@ -0,0 +1,371 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/04/02 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
/*
 * Configuration for the POWER8 SGEMM 16x8 kernel: stack-frame size and
 * slot offsets, plus symbolic names for the general-purpose and VSX
 * registers used by this file and by the macro/logic files it includes.
 */
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
/* LOAD is the pointer-sized load mnemonic: lwz when __64BIT__ is not defined, ld otherwise. */
| #ifndef __64BIT__ | |||
| #define LOAD lwz | |||
| #else | |||
| #define LOAD ld | |||
| #endif | |||
/*
 * STACKSIZE is the unit by which SP is moved; the prologue below applies it
 * four times. ALPHA_SP and FZERO name stack slots; in the code visible here
 * they are referenced only from commented-out instructions.
 */
| #ifdef __64BIT__ | |||
| #define STACKSIZE 32752 | |||
| #define ALPHA_SP 296(SP) | |||
| #define FZERO 304(SP) | |||
| #else | |||
| #define STACKSIZE 240 | |||
| #define ALPHA_SP 224(SP) | |||
| #define FZERO 232(SP) | |||
| #endif | |||
/*
 * Names for the registers holding the kernel arguments.
 * NOTE(review): presumably these follow the platform calling convention for
 * (M, N, K, alpha, A, B, C, LDC[, OFFSET]) - confirm against the caller.
 */
| #define M r3 | |||
| #define N r4 | |||
| #define K r5 | |||
| #ifdef linux | |||
| #ifndef __64BIT__ | |||
| #define A r6 | |||
| #define B r7 | |||
| #define C r8 | |||
| #define LDC r9 | |||
| #define OFFSET r10 | |||
| #else | |||
| #define A r7 | |||
| #define B r8 | |||
| #define C r9 | |||
| #define LDC r10 | |||
| #define OFFSET r6 | |||
| #endif | |||
| #endif | |||
| #if defined(_AIX) || defined(__APPLE__) | |||
| #if !defined(__64BIT__) && defined(DOUBLE) | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r7 | |||
| #define OFFSET r6 | |||
| #else | |||
| #define A r7 | |||
| #define B r8 | |||
| #define C r9 | |||
| #define LDC r10 | |||
| #define OFFSET r6 | |||
| #endif | |||
| #endif | |||
/* alpha as a scalar (vs30) and replicated across a 4-lane vector (vs31); both are loaded in the prologue. */
| #define alpha_r vs30 | |||
| #define alpha_vr vs31 | |||
/* o0 is the literal 0 used as the "no index register" operand of indexed loads/stores. */
| #define o0 0 | |||
/*
 * Working registers. FRAMEPOINTER (r12) receives the incoming SP in the
 * prologue; o4/o8/o12/o16/o32/o48 are loaded with the byte offsets their
 * names suggest; PRE is loaded with 256. r14..r31 are saved and restored
 * around the kernel body.
 */
| #define FRAMEPOINTER r12 | |||
| #define BBUFFER r14 | |||
| #define o4 r15 | |||
| #define o12 r16 | |||
| #define o8 r17 | |||
| #define L r18 | |||
| #define T1 r19 | |||
| #define KK r20 | |||
| #define BBO r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define o16 r27 | |||
| #define o32 r28 | |||
| #define o48 r29 | |||
| #define PRE r30 | |||
| #define T2 r31 | |||
/* Macro definitions used by the kernel logic (file not visible in this chunk). */
| #include "sgemm_macros_16x8_power8.S" | |||
/*
 * SGEMM 16x8 kernel entry point for POWER8.
 *
 * This file supplies only the frame set-up, argument fix-ups, constant
 * initialisation and the matching tear-down; the computation itself comes
 * from the included sgemm_logic_16x8_power8.S. r3 is set to 0 before the
 * return.
 */
| #ifndef NEEDPARAM | |||
| PROLOGUE | |||
| PROFCODE | |||
/* Keep the incoming SP in r12, then lower SP by 4 * STACKSIZE. */
| mr FRAMEPOINTER, SP | |||
| addi SP, SP, -STACKSIZE | |||
| addi SP, SP, -STACKSIZE | |||
| addi SP, SP, -STACKSIZE | |||
| addi SP, SP, -STACKSIZE | |||
/* NOTE(review): r0 = 0 is only consumed by a commented-out store below; the included files may rely on it - confirm. */
| li r0, 0 | |||
/* Save f14..f31 at the bottom of the new frame. */
| stfd f14, 0(SP) | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
/* Save r14..r31 (8-byte slots when __64BIT__ is defined, 4-byte slots otherwise). */
| #ifdef __64BIT__ | |||
| std r31, 144(SP) | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| #else | |||
| stw r31, 144(SP) | |||
| stw r30, 148(SP) | |||
| stw r29, 152(SP) | |||
| stw r28, 156(SP) | |||
| stw r27, 160(SP) | |||
| stw r26, 164(SP) | |||
| stw r25, 168(SP) | |||
| stw r24, 172(SP) | |||
| stw r23, 176(SP) | |||
| stw r22, 180(SP) | |||
| stw r21, 184(SP) | |||
| stw r20, 188(SP) | |||
| stw r19, 192(SP) | |||
| stw r18, 196(SP) | |||
| stw r17, 200(SP) | |||
| stw r16, 204(SP) | |||
| stw r15, 208(SP) | |||
| stw r14, 212(SP) | |||
| #endif | |||
| // stfd f1, ALPHA_SP | |||
| // stw r0, FZERO | |||
/* On AIX/Apple 32-bit DOUBLE builds LDC is fetched from the caller's frame via the saved SP (FRAMESLOT is defined elsewhere). */
| #if defined(_AIX) || defined(__APPLE__) | |||
| #if !defined(__64BIT__) && defined(DOUBLE) | |||
| lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) | |||
| #endif | |||
| #endif | |||
/* LDC *= 4 (shift left by 2); presumably converts a stride in single-precision elements to bytes - confirm. */
| slwi LDC, LDC, 2 | |||
/* TRMM builds only: fetch OFFSET from the caller's frame where applicable. */
| #if defined(TRMMKERNEL) | |||
| #if defined(linux) && defined(__64BIT__) | |||
| ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER) | |||
| #endif | |||
| #if defined(_AIX) || defined(__APPLE__) | |||
| #ifdef __64BIT__ | |||
| ld OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER) | |||
| #else | |||
| #ifdef DOUBLE | |||
| lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) | |||
| #else | |||
| lwz OFFSET, FRAMESLOT(0) + 0(FRAMEPOINTER) | |||
| #endif | |||
| #endif | |||
| #endif | |||
| #endif | |||
/*
 * Early exit when M, N or K is <= 0 (cmpwi compares them as signed 32-bit
 * words). L999_H1 is not defined in this file; presumably it is provided by
 * the included logic file - confirm.
 */
| cmpwi cr0, M, 0 | |||
| ble L999_H1 | |||
| cmpwi cr0, N, 0 | |||
| ble L999_H1 | |||
| cmpwi cr0, K, 0 | |||
| ble L999_H1 | |||
/* Load the constant registers: PRE = 256 and the byte offsets 4, 8, 12, 16, 32, 48. */
| li PRE, 256 | |||
| li o4 , 4 | |||
| li o8 , 8 | |||
| li o12, 12 | |||
| li o16, 16 | |||
| li o32, 32 | |||
| li o48, 48 | |||
/* BBUFFER = (SP + 512 + 4096) & -4096, i.e. an address inside the frame rounded down to a 4096-byte boundary. */
| addi BBUFFER, SP, 512+4096 | |||
| li T1, -4096 | |||
| and BBUFFER, BBUFFER, T1 | |||
/*
 * Write f1 as a single-precision value into four consecutive words at
 * SP+300, then read it back as a scalar (alpha_r) and as a 4-lane vector
 * (alpha_vr). NOTE(review): f1 presumably carries alpha on entry - confirm.
 */
| addi T1, SP, 300 | |||
| stxsspx f1, o0 , T1 | |||
| stxsspx f1, o4 , T1 | |||
| stxsspx f1, o8 , T1 | |||
| stxsspx f1, o12 , T1 | |||
| lxsspx alpha_r, o0, T1 | |||
| lxvw4x alpha_vr, o0, T1 | |||
/* Kernel body (file not visible in this chunk). */
| #include "sgemm_logic_16x8_power8.S" | |||
/* Common exit: r3 = 0, restore f14..f31 and r14..r31, undo the SP adjustment, return. */
| L999: | |||
| addi r3, 0, 0 | |||
| lfd f14, 0(SP) | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| #ifdef __64BIT__ | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| #else | |||
| lwz r31, 144(SP) | |||
| lwz r30, 148(SP) | |||
| lwz r29, 152(SP) | |||
| lwz r28, 156(SP) | |||
| lwz r27, 160(SP) | |||
| lwz r26, 164(SP) | |||
| lwz r25, 168(SP) | |||
| lwz r24, 172(SP) | |||
| lwz r23, 176(SP) | |||
| lwz r22, 180(SP) | |||
| lwz r21, 184(SP) | |||
| lwz r20, 188(SP) | |||
| lwz r19, 192(SP) | |||
| lwz r18, 196(SP) | |||
| lwz r17, 200(SP) | |||
| lwz r16, 204(SP) | |||
| lwz r15, 208(SP) | |||
| lwz r14, 212(SP) | |||
| #endif | |||
/* Raise SP back by 4 * STACKSIZE, mirroring the prologue. */
| addi SP, SP, STACKSIZE | |||
| addi SP, SP, STACKSIZE | |||
| addi SP, SP, STACKSIZE | |||
| addi SP, SP, STACKSIZE | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| @@ -0,0 +1,167 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/26 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #pragma GCC optimize "O1" | |||
| #if defined(POWER8) | |||
| #include "srot_microk_power8.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_16 | |||
| static void srot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) | |||
| { | |||
| BLASLONG i=0; | |||
| FLOAT f0, f1, f2, f3; | |||
| FLOAT x00, x01, x02, x03; | |||
| FLOAT g0, g1, g2, g3; | |||
| FLOAT y00, y01, y02, y03; | |||
| FLOAT *x1=x; | |||
| FLOAT *y1=y; | |||
| FLOAT c1=*c; | |||
| FLOAT s1=*s; | |||
| while ( i<n ) | |||
| { | |||
| x00 = x1[0]; | |||
| y00 = y1[0]; | |||
| x01 = x1[1]; | |||
| y01 = y1[1]; | |||
| x02 = x1[2]; | |||
| y02 = y1[2]; | |||
| x03 = x1[3]; | |||
| y03 = y1[3]; | |||
| f0 = c1*x00 + s1*y00; | |||
| g0 = c1*y00 - s1*x00; | |||
| f1 = c1*x01 + s1*y01; | |||
| g1 = c1*y01 - s1*x01; | |||
| f2 = c1*x02 + s1*y02; | |||
| g2 = c1*y02 - s1*x02; | |||
| f3 = c1*x03 + s1*y03; | |||
| g3 = c1*y03 - s1*x03; | |||
| x1[0] = f0; | |||
| y1[0] = g0; | |||
| x1[1] = f1; | |||
| y1[1] = g1; | |||
| x1[2] = f2; | |||
| y1[2] = g2; | |||
| x1[3] = f3; | |||
| y1[3] = g3; | |||
| x1 += 4; | |||
| y1 += 4; | |||
| i+=4; | |||
| } | |||
| return; | |||
| } | |||
| #endif | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| FLOAT c1[4] __attribute__ ((aligned (16)));; | |||
| FLOAT s1[4] __attribute__ ((aligned (16)));; | |||
| FLOAT *x1=x; | |||
| FLOAT *y1=y; | |||
| FLOAT temp; | |||
| if ( n <= 0 ) return(0); | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| BLASLONG n1 = n & -16; | |||
| if ( n1 > 0 ) | |||
| { | |||
| c1[0]=c; | |||
| c1[1]=c; | |||
| c1[2]=c; | |||
| c1[3]=c; | |||
| s1[0]=s; | |||
| s1[1]=s; | |||
| s1[2]=s; | |||
| s1[3]=s; | |||
| srot_kernel_16(n1, x1, y1, c1, s1); | |||
| i=n1; | |||
| } | |||
| while(i < n) | |||
| { | |||
| temp = c*x[i] + s*y[i] ; | |||
| y[i] = c*y[i] - s*x[i] ; | |||
| x[i] = temp ; | |||
| i++ ; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| while(i < n) | |||
| { | |||
| temp = c*x[ix] + s*y[iy] ; | |||
| y[iy] = c*y[iy] - s*x[ix] ; | |||
| x[ix] = temp ; | |||
| ix += inc_x ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,208 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/27 Werner Saar (wernsaar@googlemail.com) | |||
| * | |||
| * I don't use fused multiply-add ( precision problems with lapack ) | |||
| * | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define HAVE_KERNEL_16 1 | |||
| static void srot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) __attribute__ ((noinline)); | |||
| static void srot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) | |||
| { | |||
| BLASLONG i = n; | |||
| BLASLONG o16 = 16; | |||
| BLASLONG o32 = 32; | |||
| BLASLONG o48 = 48; | |||
| FLOAT *x1=x; | |||
| FLOAT *y1=y; | |||
| FLOAT *x2=x+1; | |||
| FLOAT *y2=y+1; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "lxvw4x 36 , 0, %3 \n\t" // load c | |||
| "lxvw4x 37 , 0, %4 \n\t" // load s | |||
| "addi %8 , %8, -4 \n\t" | |||
| "addi %9 , %9, -4 \n\t" | |||
| "lxvw4x 32, 0, %1 \n\t" // load x | |||
| "lxvw4x 33, %5, %1 \n\t" | |||
| "lxvw4x 34, %6, %1 \n\t" | |||
| "lxvw4x 35, %7, %1 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" // load y | |||
| "lxvw4x 41, %5, %2 \n\t" | |||
| "lxvw4x 42, %6, %2 \n\t" | |||
| "lxvw4x 43, %7, %2 \n\t" | |||
| "addi %1, %1, 64 \n\t" | |||
| "addi %2, %2, 64 \n\t" | |||
| "addic. %0 , %0 , -16 \n\t" | |||
| "ble 2f \n\t" | |||
| ".align 5 \n\t" | |||
| "1: \n\t" | |||
| "xvmulsp 48, 32, 36 \n\t" // c * x | |||
| "xvmulsp 49, 33, 36 \n\t" | |||
| "xvmulsp 50, 34, 36 \n\t" | |||
| "xvmulsp 51, 35, 36 \n\t" | |||
| "xvmulsp 56, 40, 36 \n\t" // c * y | |||
| "xvmulsp 57, 41, 36 \n\t" | |||
| "xvmulsp 58, 42, 36 \n\t" | |||
| "xvmulsp 59, 43, 36 \n\t" | |||
| "xvmulsp 52, 32, 37 \n\t" // s * x | |||
| "xvmulsp 53, 33, 37 \n\t" | |||
| "lxvw4x 32, 0, %1 \n\t" // load x | |||
| "lxvw4x 33, %5, %1 \n\t" | |||
| "xvmulsp 54, 34, 37 \n\t" | |||
| "xvmulsp 55, 35, 37 \n\t" | |||
| "lxvw4x 34, %6, %1 \n\t" | |||
| "lxvw4x 35, %7, %1 \n\t" | |||
| "xvmulsp 60, 40, 37 \n\t" // s * y | |||
| "xvmulsp 61, 41, 37 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" // load y | |||
| "lxvw4x 41, %5, %2 \n\t" | |||
| "xvmulsp 62, 42, 37 \n\t" | |||
| "xvmulsp 63, 43, 37 \n\t" | |||
| "lxvw4x 42, %6, %2 \n\t" | |||
| "lxvw4x 43, %7, %2 \n\t" | |||
| "xvaddsp 48, 48 , 60 \n\t" // c * x + s * y | |||
| "xvaddsp 49, 49 , 61 \n\t" // c * x + s * y | |||
| "addi %1, %1, 64 \n\t" | |||
| "addi %2, %2, 64 \n\t" | |||
| "xvaddsp 50, 50 , 62 \n\t" // c * x + s * y | |||
| "xvaddsp 51, 51 , 63 \n\t" // c * x + s * y | |||
| "xvsubsp 56, 56 , 52 \n\t" // c * y - s * x | |||
| "xvsubsp 57, 57 , 53 \n\t" // c * y - s * x | |||
| "xvsubsp 58, 58 , 54 \n\t" // c * y - s * x | |||
| "xvsubsp 59, 59 , 55 \n\t" // c * y - s * x | |||
| "stxvw4x 48, 0, %8 \n\t" // store x | |||
| "stxvw4x 49, %5, %8 \n\t" | |||
| "stxvw4x 50, %6, %8 \n\t" | |||
| "stxvw4x 51, %7, %8 \n\t" | |||
| "stxvw4x 56, 0, %9 \n\t" // store y | |||
| "stxvw4x 57, %5, %9 \n\t" | |||
| "stxvw4x 58, %6, %9 \n\t" | |||
| "stxvw4x 59, %7, %9 \n\t" | |||
| "addi %8, %8, 64 \n\t" | |||
| "addi %9, %9, 64 \n\t" | |||
| "addic. %0 , %0 , -16 \n\t" | |||
| "bgt 1b \n\t" | |||
| "2: \n\t" | |||
| "xvmulsp 48, 32, 36 \n\t" // c * x | |||
| "xvmulsp 49, 33, 36 \n\t" | |||
| "xvmulsp 50, 34, 36 \n\t" | |||
| "xvmulsp 51, 35, 36 \n\t" | |||
| "xvmulsp 56, 40, 36 \n\t" // c * y | |||
| "xvmulsp 57, 41, 36 \n\t" | |||
| "xvmulsp 58, 42, 36 \n\t" | |||
| "xvmulsp 59, 43, 36 \n\t" | |||
| "xvmulsp 52, 32, 37 \n\t" // s * x | |||
| "xvmulsp 53, 33, 37 \n\t" | |||
| "xvmulsp 54, 34, 37 \n\t" | |||
| "xvmulsp 55, 35, 37 \n\t" | |||
| "xvmulsp 60, 40, 37 \n\t" // s * y | |||
| "xvmulsp 61, 41, 37 \n\t" | |||
| "xvmulsp 62, 42, 37 \n\t" | |||
| "xvmulsp 63, 43, 37 \n\t" | |||
| "xvaddsp 48, 48 , 60 \n\t" // c * x + s * y | |||
| "xvaddsp 49, 49 , 61 \n\t" // c * x + s * y | |||
| "xvaddsp 50, 50 , 62 \n\t" // c * x + s * y | |||
| "xvaddsp 51, 51 , 63 \n\t" // c * x + s * y | |||
| "xvsubsp 56, 56 , 52 \n\t" // c * y - s * x | |||
| "xvsubsp 57, 57 , 53 \n\t" // c * y - s * x | |||
| "xvsubsp 58, 58 , 54 \n\t" // c * y - s * x | |||
| "xvsubsp 59, 59 , 55 \n\t" // c * y - s * x | |||
| "stxvw4x 48, 0, %8 \n\t" // store x | |||
| "stxvw4x 49, %5, %8 \n\t" | |||
| "stxvw4x 50, %6, %8 \n\t" | |||
| "stxvw4x 51, %7, %8 \n\t" | |||
| "stxvw4x 56, 0, %9 \n\t" // store y | |||
| "stxvw4x 57, %5, %9 \n\t" | |||
| "stxvw4x 58, %6, %9 \n\t" | |||
| "stxvw4x 59, %7, %9 \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (x1), // 1 | |||
| "r" (y1), // 2 | |||
| "r" (c), // 3 | |||
| "r" (s), // 4 | |||
| "r" (o16), // 5 | |||
| "r" (o32), // 6 | |||
| "r" (o48), // 7 | |||
| "r" (x2), // 8 | |||
| "r" (y2) // 9 | |||
| : "cr0", "%0", "%1" , "%2", "%8", "%9", "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,179 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/27 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(POWER8) | |||
| #include "sscal_microk_power8.c" | |||
| #endif | |||
| #if !defined(HAVE_KERNEL_16) | |||
| static void sscal_kernel_16( BLASLONG n, FLOAT *da , FLOAT *x ) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT alpha = *da; | |||
| for( i=0; i<n; i+=8 ) | |||
| { | |||
| x[0] *= alpha; | |||
| x[1] *= alpha; | |||
| x[2] *= alpha; | |||
| x[3] *= alpha; | |||
| x[4] *= alpha; | |||
| x[5] *= alpha; | |||
| x[6] *= alpha; | |||
| x[7] *= alpha; | |||
| x+=8; | |||
| } | |||
| } | |||
| static void sscal_kernel_16_zero( BLASLONG n, FLOAT *da , FLOAT *x ) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT alpha=0.0; | |||
| for( i=0; i<n; i+=8 ) | |||
| { | |||
| x[0] = alpha; | |||
| x[1] = alpha; | |||
| x[2] = alpha; | |||
| x[3] = alpha; | |||
| x[4] = alpha; | |||
| x[5] = alpha; | |||
| x[6] = alpha; | |||
| x[7] = alpha; | |||
| x+=8; | |||
| } | |||
| } | |||
| #endif | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| BLASLONG i=0,j=0; | |||
| FLOAT alpha[4] __attribute__ ((aligned (16)));; | |||
| if ( n <= 0 || inc_x <=0 ) | |||
| return(0); | |||
| if ( inc_x == 1 ) | |||
| { | |||
| if ( da == 0.0 ) | |||
| { | |||
| BLASLONG n1 = n & -32; | |||
| if ( n1 > 0 ) | |||
| { | |||
| alpha[0]=da; | |||
| alpha[1]=da; | |||
| alpha[2]=da; | |||
| alpha[3]=da; | |||
| sscal_kernel_16_zero(n1 , alpha , x); | |||
| j=n1; | |||
| } | |||
| while(j < n) | |||
| { | |||
| x[j]=0.0; | |||
| j++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| BLASLONG n1 = n & -32; | |||
| if ( n1 > 0 ) | |||
| { | |||
| alpha[0]=da; | |||
| alpha[1]=da; | |||
| alpha[2]=da; | |||
| alpha[3]=da; | |||
| sscal_kernel_16(n1 , alpha , x); | |||
| j=n1; | |||
| } | |||
| while(j < n) | |||
| { | |||
| x[j] = da * x[j] ; | |||
| j++; | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if ( da == 0.0 ) | |||
| { | |||
| while(j < n) | |||
| { | |||
| x[i]=0.0; | |||
| i += inc_x ; | |||
| j++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| while(j < n) | |||
| { | |||
| x[i] = da * x[i] ; | |||
| i += inc_x ; | |||
| j++; | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -0,0 +1,218 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/27 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define HAVE_KERNEL_16 1 | |||
| static void sscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); | |||
| static void sscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) | |||
| { | |||
| BLASLONG i = n; | |||
| BLASLONG o16 = 16; | |||
| BLASLONG o32 = 32; | |||
| BLASLONG o48 = 48; | |||
| BLASLONG o64 = 64; | |||
| BLASLONG o80 = 80; | |||
| BLASLONG o96 = 96; | |||
| BLASLONG o112 = 112; | |||
| FLOAT *x1=x; | |||
| FLOAT *x2=x+1; | |||
| BLASLONG pre = 384; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "lxvw4x 32, 0, %3 \n\t" | |||
| "addi %1, %1, -4 \n\t" | |||
| "dcbt %2, %4 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 41, %5, %2 \n\t" | |||
| "lxvw4x 42, %6, %2 \n\t" | |||
| "lxvw4x 43, %7, %2 \n\t" | |||
| "lxvw4x 44, %8, %2 \n\t" | |||
| "lxvw4x 45, %9, %2 \n\t" | |||
| "lxvw4x 46, %10, %2 \n\t" | |||
| "lxvw4x 47, %11, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addic. %0 , %0 , -32 \n\t" | |||
| "ble 2f \n\t" | |||
| ".align 5 \n\t" | |||
| "1: \n\t" | |||
| "dcbt %2, %4 \n\t" | |||
| "xvmulsp 48, 40, 32 \n\t" | |||
| "xvmulsp 49, 41, 32 \n\t" | |||
| "lxvw4x 40, 0, %2 \n\t" | |||
| "lxvw4x 41, %5, %2 \n\t" | |||
| "xvmulsp 50, 42, 32 \n\t" | |||
| "xvmulsp 51, 43, 32 \n\t" | |||
| "lxvw4x 42, %6, %2 \n\t" | |||
| "lxvw4x 43, %7, %2 \n\t" | |||
| "xvmulsp 52, 44, 32 \n\t" | |||
| "xvmulsp 53, 45, 32 \n\t" | |||
| "lxvw4x 44, %8, %2 \n\t" | |||
| "lxvw4x 45, %9, %2 \n\t" | |||
| "xvmulsp 54, 46, 32 \n\t" | |||
| "xvmulsp 55, 47, 32 \n\t" | |||
| "lxvw4x 46, %10, %2 \n\t" | |||
| "lxvw4x 47, %11, %2 \n\t" | |||
| "stxvw4x 48, 0, %1 \n\t" | |||
| "stxvw4x 49, %5, %1 \n\t" | |||
| "stxvw4x 50, %6, %1 \n\t" | |||
| "stxvw4x 51, %7, %1 \n\t" | |||
| "stxvw4x 52, %8, %1 \n\t" | |||
| "stxvw4x 53, %9, %1 \n\t" | |||
| "stxvw4x 54, %10, %1 \n\t" | |||
| "stxvw4x 55, %11, %1 \n\t" | |||
| "addi %1, %1, 128 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addic. %0 , %0 , -32 \n\t" | |||
| "bgt 1b \n\t" | |||
| "2: \n\t" | |||
| "xvmulsp 48, 40, 32 \n\t" | |||
| "xvmulsp 49, 41, 32 \n\t" | |||
| "xvmulsp 50, 42, 32 \n\t" | |||
| "xvmulsp 51, 43, 32 \n\t" | |||
| "xvmulsp 52, 44, 32 \n\t" | |||
| "xvmulsp 53, 45, 32 \n\t" | |||
| "xvmulsp 54, 46, 32 \n\t" | |||
| "xvmulsp 55, 47, 32 \n\t" | |||
| "stxvw4x 48, 0, %1 \n\t" | |||
| "stxvw4x 49, %5, %1 \n\t" | |||
| "stxvw4x 50, %6, %1 \n\t" | |||
| "stxvw4x 51, %7, %1 \n\t" | |||
| "stxvw4x 52, %8, %1 \n\t" | |||
| "stxvw4x 53, %9, %1 \n\t" | |||
| "stxvw4x 54, %10, %1 \n\t" | |||
| "stxvw4x 55, %11, %1 \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (x2), // 1 | |||
| "r" (x1), // 2 | |||
| "r" (alpha), // 3 | |||
| "r" (pre), // 4 | |||
| "r" (o16), // 5 | |||
| "r" (o32), // 6 | |||
| "r" (o48), // 7 | |||
| "r" (o64), // 8 | |||
| "r" (o80), // 9 | |||
| "r" (o96), // 10 | |||
| "r" (o112) // 11 | |||
| : "cr0", "%0", "%2" , "%1", "memory" | |||
| ); | |||
| } | |||
| static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); | |||
| static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) | |||
| { | |||
| BLASLONG i = n; | |||
| BLASLONG o16 = 16; | |||
| BLASLONG o32 = 32; | |||
| BLASLONG o48 = 48; | |||
| BLASLONG o64 = 64; | |||
| BLASLONG o80 = 80; | |||
| BLASLONG o96 = 96; | |||
| BLASLONG o112 = 112; | |||
| FLOAT *x1=x; | |||
| FLOAT *x2=x+1; | |||
| BLASLONG pre = 384; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "xxlxor 32 , 32 , 32 \n\t" | |||
| "addi %1, %1, -4 \n\t" | |||
| ".align 5 \n\t" | |||
| "1: \n\t" | |||
| "stxvw4x 32, 0, %1 \n\t" | |||
| "stxvw4x 32, %5, %1 \n\t" | |||
| "stxvw4x 32, %6, %1 \n\t" | |||
| "stxvw4x 32, %7, %1 \n\t" | |||
| "stxvw4x 32, %8, %1 \n\t" | |||
| "stxvw4x 32, %9, %1 \n\t" | |||
| "stxvw4x 32, %10, %1 \n\t" | |||
| "stxvw4x 32, %11, %1 \n\t" | |||
| "addi %1, %1, 128 \n\t" | |||
| "addic. %0 , %0 , -32 \n\t" | |||
| "bgt 1b \n\t" | |||
| "2: \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (x2), // 1 | |||
| "r" (x1), // 2 | |||
| "r" (alpha), // 3 | |||
| "r" (pre), // 4 | |||
| "r" (o16), // 5 | |||
| "r" (o32), // 6 | |||
| "r" (o48), // 7 | |||
| "r" (o64), // 8 | |||
| "r" (o80), // 9 | |||
| "r" (o96), // 10 | |||
| "r" (o112) // 11 | |||
| : "cr0", "%0", "%2" , "%1", "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,154 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/25 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(POWER8) | |||
| #include "sswap_microk_power8.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_32 | |||
| static void sswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG i=0; | |||
| FLOAT f0, f1, f2, f3, f4, f5, f6, f7; | |||
| FLOAT g0, g1, g2, g3, g4, g5, g6, g7; | |||
| FLOAT *x1=x; | |||
| FLOAT *y1=y; | |||
| while ( i<n ) | |||
| { | |||
| f0 = x1[0]; | |||
| f1 = x1[1]; | |||
| f2 = x1[2]; | |||
| f3 = x1[3]; | |||
| f4 = x1[4]; | |||
| f5 = x1[5]; | |||
| f6 = x1[6]; | |||
| f7 = x1[7]; | |||
| g0 = y1[0]; | |||
| g1 = y1[1]; | |||
| g2 = y1[2]; | |||
| g3 = y1[3]; | |||
| g4 = y1[4]; | |||
| g5 = y1[5]; | |||
| g6 = y1[6]; | |||
| g7 = y1[7]; | |||
| y1[0] = f0; | |||
| y1[1] = f1; | |||
| y1[2] = f2; | |||
| y1[3] = f3; | |||
| y1[4] = f4; | |||
| y1[5] = f5; | |||
| y1[6] = f6; | |||
| y1[7] = f7; | |||
| x1[0] = g0; | |||
| x1[1] = g1; | |||
| x1[2] = g2; | |||
| x1[3] = g3; | |||
| x1[4] = g4; | |||
| x1[5] = g5; | |||
| x1[6] = g6; | |||
| x1[7] = g7; | |||
| x1 += 8; | |||
| y1 += 8; | |||
| i+=8; | |||
| } | |||
| return; | |||
| } | |||
| #endif | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| FLOAT temp; | |||
| if ( n <= 0 ) return(0); | |||
| if ( (inc_x == 1) && (inc_y == 1 )) | |||
| { | |||
| BLASLONG n1 = n & -32; | |||
| if ( n1 > 0 ) | |||
| { | |||
| sswap_kernel_32(n1, x, y); | |||
| i=n1; | |||
| } | |||
| while(i < n) | |||
| { | |||
| temp = y[i]; | |||
| y[i] = x[i] ; | |||
| x[i] = temp; | |||
| i++ ; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| while(i < n) | |||
| { | |||
| temp = y[iy]; | |||
| y[iy] = x[ix] ; | |||
| x[ix] = temp; | |||
| ix += inc_x ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,136 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/25 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define HAVE_KERNEL_32 1 | |||
| static void sswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void sswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG i = n; | |||
| BLASLONG o16 = 16; | |||
| BLASLONG o32 = 32; | |||
| BLASLONG o48 = 48; | |||
| BLASLONG o64 = 64; | |||
| BLASLONG o80 = 80; | |||
| BLASLONG o96 = 96; | |||
| BLASLONG o112 = 112; | |||
| FLOAT *x1=x; | |||
| FLOAT *y1=y; | |||
| FLOAT *x2=x+1; | |||
| FLOAT *y2=y+1; | |||
| BLASLONG pre = 384; | |||
| BLASLONG alpha=0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "addi %3, %3, -4 \n\t" | |||
| "addi %4, %4, -4 \n\t" | |||
| ".align 5 \n\t" | |||
| "1: \n\t" | |||
| "lxvw4x 32, 0, %2 \n\t" | |||
| "lxvw4x 33, %5, %2 \n\t" | |||
| "lxvw4x 34, %6, %2 \n\t" | |||
| "lxvw4x 35, %7, %2 \n\t" | |||
| "lxvw4x 36, %8, %2 \n\t" | |||
| "lxvw4x 37, %9, %2 \n\t" | |||
| "lxvw4x 38, %10, %2 \n\t" | |||
| "lxvw4x 39, %11, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "lxvw4x 48, 0, %1 \n\t" | |||
| "lxvw4x 49, %5, %1 \n\t" | |||
| "lxvw4x 50, %6, %1 \n\t" | |||
| "lxvw4x 51, %7, %1 \n\t" | |||
| "lxvw4x 52, %8, %1 \n\t" | |||
| "lxvw4x 53, %9, %1 \n\t" | |||
| "lxvw4x 54, %10, %1 \n\t" | |||
| "lxvw4x 55, %11, %1 \n\t" | |||
| "addi %1, %1, 128 \n\t" | |||
| "stxvw4x 32, 0, %3 \n\t" | |||
| "stxvw4x 33, %5, %3 \n\t" | |||
| "stxvw4x 34, %6, %3 \n\t" | |||
| "stxvw4x 35, %7, %3 \n\t" | |||
| "stxvw4x 36, %8, %3 \n\t" | |||
| "stxvw4x 37, %9, %3 \n\t" | |||
| "stxvw4x 38, %10, %3 \n\t" | |||
| "stxvw4x 39, %11, %3 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "stxvw4x 48, 0, %4 \n\t" | |||
| "stxvw4x 49, %5, %4 \n\t" | |||
| "stxvw4x 50, %6, %4 \n\t" | |||
| "stxvw4x 51, %7, %4 \n\t" | |||
| "stxvw4x 52, %8, %4 \n\t" | |||
| "stxvw4x 53, %9, %4 \n\t" | |||
| "stxvw4x 54, %10, %4 \n\t" | |||
| "stxvw4x 55, %11, %4 \n\t" | |||
| "addi %4, %4, 128 \n\t" | |||
| "addic. %0 , %0 , -32 \n\t" | |||
| "bgt 1b \n\t" | |||
| "2: \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (y1), // 1 | |||
| "r" (x1), // 2 | |||
| "r" (y2), // 3 | |||
| "r" (x2), // 4 | |||
| "r" (o16), // 5 | |||
| "r" (o32), // 6 | |||
| "r" (o48), // 7 | |||
| "r" (o64), // 8 | |||
| "r" (o80), // 9 | |||
| "r" (o96), // 10 | |||
| "r" (o112) // 11 | |||
| : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,369 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/04/02 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
/* ASSEMBLER is defined before common.h is pulled in; presumably it selects */
/* the assembler-side definitions of that header -- confirm in common.h.    */
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
/* LOAD: word load on 32-bit builds, doubleword load on 64-bit builds. */
| #ifndef __64BIT__ | |||
| #define LOAD lwz | |||
| #else | |||
| #define LOAD ld | |||
| #endif | |||
/* Frame size and two named frame offsets.  Within this file ALPHA_SP and */
/* FZERO are referenced only by commented-out stores.                     */
| #ifdef __64BIT__ | |||
| #define STACKSIZE 340 | |||
| #define ALPHA_SP 296(SP) | |||
| #define FZERO 304(SP) | |||
| #else | |||
| #define STACKSIZE 240 | |||
| #define ALPHA_SP 224(SP) | |||
| #define FZERO 232(SP) | |||
| #endif | |||
/* M, N and K are read from r3, r4 and r5 on entry. */
| #define M r3 | |||
| #define N r4 | |||
| #define K r5 | |||
/* Registers for A, B, C, LDC and OFFSET depend on the OS and word size. */
/* Presumably A/B are the packed inputs, C the output and LDC its leading */
/* dimension -- confirm against the caller.                               */
| #ifdef linux | |||
| #ifndef __64BIT__ | |||
| #define A r6 | |||
| #define B r7 | |||
| #define C r8 | |||
| #define LDC r9 | |||
| #define OFFSET r10 | |||
| #else | |||
| #define A r7 | |||
| #define B r8 | |||
| #define C r9 | |||
| #define LDC r10 | |||
| #define OFFSET r6 | |||
| #endif | |||
| #endif | |||
| #if defined(_AIX) || defined(__APPLE__) | |||
| #if !defined(__64BIT__) && defined(DOUBLE) | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r7 | |||
| #define OFFSET r6 | |||
| #else | |||
| #define A r7 | |||
| #define B r8 | |||
| #define C r9 | |||
| #define LDC r10 | |||
| #define OFFSET r6 | |||
| #endif | |||
| #endif | |||
/* VSX registers that receive alpha in the function prologue: alpha_r via a */
/* scalar single-precision load, alpha_vr via a 4-word vector load.         */
| #define alpha_r vs30 | |||
| #define alpha_vr vs31 | |||
/* o0 is the literal 0 used as the RA operand of indexed loads and stores. */
| #define o0 0 | |||
/* Working-register aliases.  o4, o8, o12, o16, o32 and o48 are loaded with */
/* the constants 4, 8, 12, 16, 32 and 48 in the prologue; PRE is loaded     */
/* with 256 and TBUFFER with the address SP + 320.                          */
/* NOTE(review): r13 (TBUFFER) is the thread pointer in the 64-bit PowerPC  */
/* ELF ABIs; it is saved and restored here, but confirm that overwriting it */
/* while the kernel runs is acceptable.                                     */
| #define TBUFFER r13 | |||
| #define o12 r14 | |||
| #define o4 r15 | |||
| #define K1 r16 | |||
| #define o8 r17 | |||
| #define L r18 | |||
| #define T1 r19 | |||
| #define KK r20 | |||
| #define KKK r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define o16 r27 | |||
| #define o32 r28 | |||
| #define o48 r29 | |||
| #define PRE r30 | |||
| #define T2 r31 | |||
/* Included after the aliases above are defined; presumably the tile macros */
/* are written in terms of them -- confirm in the included file.            */
| #include "strmm_macros_16x8_power8.S" | |||
/* Kernel entry point.  This file supplies the prologue (frame setup,       */
/* register save, argument fix-ups, alpha broadcast) and the epilogue at    */
/* L999; the computation itself comes from strmm_logic_16x8_power8.S.       */
/* The function returns 0 in r3.                                            */
| #ifndef NEEDPARAM | |||
| PROLOGUE | |||
| PROFCODE | |||
/* Allocate STACKSIZE bytes.  r0 is zeroed; its only visible use is the */
/* commented-out FZERO store below.                                     */
| addi SP, SP, -STACKSIZE | |||
| li r0, 0 | |||
/* Save f14-f31 at frame offsets 0..136. */
| stfd f14, 0(SP) | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
/* Save r13-r31 starting at offset 144: 8-byte slots on 64-bit builds, */
/* 4-byte slots otherwise.                                             */
| #ifdef __64BIT__ | |||
| std r31, 144(SP) | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| std r13, 288(SP) | |||
| #else | |||
| stw r31, 144(SP) | |||
| stw r30, 148(SP) | |||
| stw r29, 152(SP) | |||
| stw r28, 156(SP) | |||
| stw r27, 160(SP) | |||
| stw r26, 164(SP) | |||
| stw r25, 168(SP) | |||
| stw r24, 172(SP) | |||
| stw r23, 176(SP) | |||
| stw r22, 180(SP) | |||
| stw r21, 184(SP) | |||
| stw r20, 188(SP) | |||
| stw r19, 192(SP) | |||
| stw r18, 196(SP) | |||
| stw r17, 200(SP) | |||
| stw r16, 204(SP) | |||
| stw r15, 208(SP) | |||
| stw r14, 212(SP) | |||
| stw r13, 216(SP) | |||
| #endif | |||
| // stfd f1, ALPHA_SP | |||
| // stw r0, FZERO | |||
/* On 32-bit AIX/Apple DOUBLE builds LDC is fetched from the caller's */
/* frame (FRAMESLOT(0) above this frame) instead of a register.       */
| #if defined(_AIX) || defined(__APPLE__) | |||
| #if !defined(__64BIT__) && defined(DOUBLE) | |||
| lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #endif | |||
| #endif | |||
/* Shift LDC left by BASE_SHIFT; presumably this converts an element */
/* count into a byte stride -- confirm BASE_SHIFT in common.h.       */
| slwi LDC, LDC, BASE_SHIFT | |||
/* TRMM builds: fetch OFFSET from the caller's frame where the platform */
/* conditions below select it.                                          */
| #if defined(TRMMKERNEL) | |||
| #if defined(linux) && defined(__64BIT__) | |||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #endif | |||
| #if defined(_AIX) || defined(__APPLE__) | |||
| #ifdef __64BIT__ | |||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #else | |||
| #ifdef DOUBLE | |||
| lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||
| #else | |||
| lwz OFFSET, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #endif | |||
| #endif | |||
| #endif | |||
| #endif | |||
/* KK starts as OFFSET and is negated for TRMM builds without LEFT. */
| mr KK, OFFSET | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| neg KK, KK | |||
| #endif | |||
/* Leave early when M, N or K is not positive.  L999_H1 is not defined in */
/* this file; presumably the included logic file provides it.             */
| cmpwi cr0, M, 0 | |||
| ble L999_H1 | |||
| cmpwi cr0, N, 0 | |||
| ble L999_H1 | |||
| cmpwi cr0, K, 0 | |||
| ble L999_H1 | |||
/* Constants used as index registers, plus PRE = 256. */
| li PRE, 256 | |||
| li o4 , 4 | |||
| li o8 , 8 | |||
| li o12, 12 | |||
| li o16, 16 | |||
| li o32, 32 | |||
| li o48, 48 | |||
/* TBUFFER = SP + 320.  Alpha (register f1) is stored as a single-precision */
/* value into four consecutive words at SP + 300, then read back as a       */
/* scalar (alpha_r) and as a 4-word vector (alpha_vr).                      */
/* NOTE(review): offsets 300 and 320 lie beyond the 32-bit STACKSIZE of     */
/* 240, so on a 32-bit build these accesses fall outside the frame          */
/* allocated above -- confirm whether 32-bit builds are supported.          */
| addi TBUFFER, SP, 320 | |||
| addi T1, SP, 300 | |||
| stxsspx f1, o0 , T1 | |||
| stxsspx f1, o4 , T1 | |||
| stxsspx f1, o8 , T1 | |||
| stxsspx f1, o12 , T1 | |||
| lxsspx alpha_r, o0, T1 | |||
| lxvw4x alpha_vr, o0, T1 | |||
| #include "strmm_logic_16x8_power8.S" | |||
/* Epilogue: return value 0 in r3, restore everything saved in the */
/* prologue, release the frame and return.                         */
| L999: | |||
| addi r3, 0, 0 | |||
| lfd f14, 0(SP) | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| #ifdef __64BIT__ | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| ld r13, 288(SP) | |||
| #else | |||
| lwz r31, 144(SP) | |||
| lwz r30, 148(SP) | |||
| lwz r29, 152(SP) | |||
| lwz r28, 156(SP) | |||
| lwz r27, 160(SP) | |||
| lwz r26, 164(SP) | |||
| lwz r25, 168(SP) | |||
| lwz r24, 172(SP) | |||
| lwz r23, 176(SP) | |||
| lwz r22, 180(SP) | |||
| lwz r21, 184(SP) | |||
| lwz r20, 188(SP) | |||
| lwz r19, 192(SP) | |||
| lwz r18, 196(SP) | |||
| lwz r17, 200(SP) | |||
| lwz r16, 204(SP) | |||
| lwz r15, 208(SP) | |||
| lwz r14, 212(SP) | |||
| lwz r13, 216(SP) | |||
| #endif | |||
| addi SP, SP, STACKSIZE | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| @@ -0,0 +1,149 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/28 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| #if defined(DOUBLE) | |||
| #define ABS fabs | |||
| #else | |||
| #define ABS fabsf | |||
| #endif | |||
| #if defined(POWER8) | |||
| #include "zasum_microk_power8.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_8 | |||
| static void zasum_kernel_8(BLASLONG n, FLOAT *x1, FLOAT *svec) | |||
| { | |||
| BLASLONG i=0; | |||
| FLOAT *x = x1; | |||
| FLOAT temp0, temp1, temp2, temp3; | |||
| FLOAT temp4, temp5, temp6, temp7; | |||
| FLOAT sum0 = 0.0; | |||
| FLOAT sum1 = 0.0; | |||
| FLOAT sum2 = 0.0; | |||
| FLOAT sum3 = 0.0; | |||
| while ( i< n ) | |||
| { | |||
| temp0 = ABS(x[0]); | |||
| temp1 = ABS(x[1]); | |||
| temp2 = ABS(x[2]); | |||
| temp3 = ABS(x[3]); | |||
| temp4 = ABS(x[4]); | |||
| temp5 = ABS(x[5]); | |||
| temp6 = ABS(x[6]); | |||
| temp7 = ABS(x[7]); | |||
| sum0 += temp0; | |||
| sum1 += temp1; | |||
| sum2 += temp2; | |||
| sum3 += temp3; | |||
| sum0 += temp4; | |||
| sum1 += temp5; | |||
| sum2 += temp6; | |||
| sum3 += temp7; | |||
| x+=8; | |||
| i+=4; | |||
| } | |||
| svec[0] = sum0+sum1+sum2+sum3; | |||
| svec[1] = 0.0; | |||
| } | |||
| #endif | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ip=0; | |||
| FLOAT sumf = 0.0; | |||
| FLOAT svec[2] __attribute__ ((aligned (16)));; | |||
| BLASLONG n1; | |||
| BLASLONG inc_x2; | |||
| if (n <= 0 || inc_x <= 0) return(sumf); | |||
| if ( inc_x == 1 ) | |||
| { | |||
| n1 = n & -8; | |||
| if ( n1 > 0 ) | |||
| { | |||
| zasum_kernel_8(n1, x, svec); | |||
| sumf = svec[0] + svec[1]; | |||
| i=n1; | |||
| ip=2*n1; | |||
| } | |||
| while(i < n) | |||
| { | |||
| sumf += ABS(x[ip]) + ABS(x[ip+1]); | |||
| i++; | |||
| ip+=2; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| inc_x2 = 2* inc_x; | |||
| while(i < n) | |||
| { | |||
| sumf += ABS(x[ip]) + ABS(x[ip+1]); | |||
| ip+=inc_x2; | |||
| i++; | |||
| } | |||
| } | |||
| return(sumf); | |||
| } | |||
| @@ -0,0 +1,177 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/28 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define HAVE_KERNEL_8 1 | |||
| /* | |||
|  * zasum_kernel_8 - POWER8 VSX inline-assembly kernel that sums absolute values. | |||
|  * | |||
|  * Reads consecutive 16-byte vectors starting at x, takes the element-wise | |||
|  * double-precision absolute value (xvabsdp) and accumulates the results in | |||
|  * eight vector accumulators (vs32..vs39). The accumulators are reduced to | |||
|  * a single vector that is stored to svec, so svec receives two partial sums | |||
|  * (16 bytes); they are not added together here. | |||
|  * | |||
|  * n    : work counter. It is decremented by 8 for every 128 bytes consumed. | |||
|  *        The first 128 bytes are loaded before the counter is tested, so at | |||
|  *        least one full block is always read. | |||
|  *        (presumably the caller passes a positive multiple of 8 - TODO confirm) | |||
|  * x    : input data, read only. | |||
|  * svec : output, 16 bytes written with stxvd2x. | |||
|  * | |||
|  * NOTE(review): operands %0 and %2 are modified by the template (addic./addi) | |||
|  * although they are declared as plain "r" inputs and then named in the clobber | |||
|  * list; VSX registers 32..55 are written but not listed as clobbers; operand | |||
|  * %1 (n) is never referenced in the template. Worth checking against the GCC | |||
|  * extended-asm rules. | |||
|  */ | |||
| static void zasum_kernel_8( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline)); | |||
| static void zasum_kernel_8( BLASLONG n, FLOAT *x, FLOAT *svec) | |||
| { | |||
| // i is the loop counter consumed by the asm; o16..o112 are byte offsets used as index registers | |||
| BLASLONG i = n; | |||
| BLASLONG o16 = 16; | |||
| BLASLONG o32 = 32; | |||
| BLASLONG o48 = 48; | |||
| BLASLONG o64 = 64; | |||
| BLASLONG o80 = 80; | |||
| BLASLONG o96 = 96; | |||
| BLASLONG o112 = 112; | |||
| FLOAT *x1=x; | |||
| // prefetch distance in bytes used by dcbt | |||
| BLASLONG pre = 384; | |||
| __asm__ __volatile__ | |||
| ( | |||
| // prefetch hint for x1 + pre, then zero the eight accumulators vs32..vs39 | |||
| "dcbt %2 , %4 \n\t" | |||
| "xxlxor 32,32,32 \n\t" | |||
| "xxlxor 33,33,33 \n\t" | |||
| "xxlxor 34,34,34 \n\t" | |||
| "xxlxor 35,35,35 \n\t" | |||
| "xxlxor 36,36,36 \n\t" | |||
| "xxlxor 37,37,37 \n\t" | |||
| "xxlxor 38,38,38 \n\t" | |||
| "xxlxor 39,39,39 \n\t" | |||
| // preload the first 128-byte block into vs40..vs47 | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "lxvd2x 44, %8, %2 \n\t" | |||
| "lxvd2x 45, %9, %2 \n\t" | |||
| "lxvd2x 46, %10, %2 \n\t" | |||
| "lxvd2x 47, %11, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| // counter -= 8; skip the main loop when nothing is left after the preloaded block | |||
| "addic. %0 , %0 , -8 \n\t" | |||
| "ble 2f \n\t" | |||
| ".align 5 \n\t" | |||
| // main loop: abs + accumulate the block already in registers while loading the next one | |||
| "1: \n\t" | |||
| "dcbt %2 , %4 \n\t" | |||
| "xvabsdp 48, 40 \n\t" | |||
| "xvabsdp 49, 41 \n\t" | |||
| "xvabsdp 50, 42 \n\t" | |||
| "xvabsdp 51, 43 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "xvabsdp 52, 44 \n\t" | |||
| "xvabsdp 53, 45 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "xvabsdp 54, 46 \n\t" | |||
| "xvabsdp 55, 47 \n\t" | |||
| "lxvd2x 44, %8, %2 \n\t" | |||
| "lxvd2x 45, %9, %2 \n\t" | |||
| "xvadddp 32, 32, 48 \n\t" | |||
| "xvadddp 33, 33, 49 \n\t" | |||
| "lxvd2x 46, %10, %2 \n\t" | |||
| "lxvd2x 47, %11, %2 \n\t" | |||
| "xvadddp 34, 34, 50 \n\t" | |||
| "xvadddp 35, 35, 51 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "xvadddp 36, 36, 52 \n\t" | |||
| "xvadddp 37, 37, 53 \n\t" | |||
| "addic. %0 , %0 , -8 \n\t" | |||
| "xvadddp 38, 38, 54 \n\t" | |||
| "xvadddp 39, 39, 55 \n\t" | |||
| "bgt 1b \n\t" | |||
| // epilogue: process the last block that is still held in vs40..vs47 | |||
| "2: \n\t" | |||
| "xvabsdp 48, 40 \n\t" | |||
| "xvabsdp 49, 41 \n\t" | |||
| "xvabsdp 50, 42 \n\t" | |||
| "xvabsdp 51, 43 \n\t" | |||
| "xvabsdp 52, 44 \n\t" | |||
| "xvabsdp 53, 45 \n\t" | |||
| "xvabsdp 54, 46 \n\t" | |||
| "xvabsdp 55, 47 \n\t" | |||
| "xvadddp 32, 32, 48 \n\t" | |||
| "xvadddp 33, 33, 49 \n\t" | |||
| "xvadddp 34, 34, 50 \n\t" | |||
| "xvadddp 35, 35, 51 \n\t" | |||
| "xvadddp 36, 36, 52 \n\t" | |||
| "xvadddp 37, 37, 53 \n\t" | |||
| "xvadddp 38, 38, 54 \n\t" | |||
| "xvadddp 39, 39, 55 \n\t" | |||
| // pairwise reduction of the eight accumulators into vs32 | |||
| "xvadddp 32, 32, 33 \n\t" | |||
| "xvadddp 34, 34, 35 \n\t" | |||
| "xvadddp 36, 36, 37 \n\t" | |||
| "xvadddp 38, 38, 39 \n\t" | |||
| "xvadddp 32, 32, 34 \n\t" | |||
| "xvadddp 36, 36, 38 \n\t" | |||
| "xvadddp 32, 32, 36 \n\t" | |||
| // write the two-lane result to svec | |||
| "stxvd2x 32, 0, %3 \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x1), // 2 | |||
| "r" (svec), // 3 | |||
| "r" (pre), // 4 | |||
| "r" (o16), // 5 | |||
| "r" (o32), // 6 | |||
| "r" (o48), // 7 | |||
| "r" (o64), // 8 | |||
| "r" (o80), // 9 | |||
| "r" (o96), // 10 | |||
| "r" (o112) // 11 | |||
| : "cr0", "%0", "%2", "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,140 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/23 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(POWER8) | |||
| #include "zaxpy_microk_power8.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_4 | |||
| /* | |||
|  * Portable C fallback for zaxpy_kernel_4, used when no optimized micro-kernel | |||
|  * has defined HAVE_KERNEL_4. | |||
|  * | |||
|  * Updates n complex elements of y in place, two elements per iteration: | |||
|  *   y += alpha * x          (CONJ not defined) | |||
|  *   y += alpha * conj(x)    (CONJ defined) | |||
|  * | |||
|  * n     : number of complex elements; consumed two at a time. | |||
|  * x, y  : interleaved (real, imag) arrays with unit stride. | |||
|  * alpha : four-entry array laid out as { da_r, da_r, da_i, da_i }, which is | |||
|  *         how CNAME fills it before calling this kernel. The real part is | |||
|  *         therefore alpha[0] and the imaginary part is alpha[2]. | |||
|  */ | |||
| static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
|     BLASLONG register i = 0; | |||
|     BLASLONG register ix = 0; | |||
|     FLOAT da_r = alpha[0]; | |||
|     /* alpha[1] is a duplicate of da_r; the imaginary part lives at index 2 */ | |||
|     FLOAT da_i = alpha[2]; | |||
|     while(i < n) | |||
|     { | |||
| #if !defined(CONJ) | |||
|         y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; | |||
|         y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; | |||
|         y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ; | |||
|         y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ; | |||
| #else | |||
|         y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; | |||
|         y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; | |||
|         y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ; | |||
|         y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ; | |||
| #endif | |||
|         /* two complex elements = four FLOATs per iteration */ | |||
|         ix+=4 ; | |||
|         i+=2 ; | |||
|     } | |||
| } | |||
| #endif | |||
| /* | |||
|  * Complex AXPY entry point: y += alpha * x, or y += alpha * conj(x) when CONJ | |||
|  * is defined, with alpha = da_r + i*da_i. | |||
|  * | |||
|  * x and y hold interleaved (real, imag) pairs; inc_x / inc_y are strides in | |||
|  * complex elements. The dummy arguments are not used. Always returns 0. | |||
|  * | |||
|  * With unit strides the largest multiple of 16 elements is handed to | |||
|  * zaxpy_kernel_4 and the remainder is handled by a scalar loop; any other | |||
|  * stride combination uses the scalar strided loop for all n elements. | |||
|  */ | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
|     BLASLONG k; | |||
|     BLASLONG xpos = 0; | |||
|     BLASLONG ypos = 0; | |||
|     BLASLONG bulk; | |||
|     FLOAT da[4]; | |||
|     if ( n <= 0 ) return(0); | |||
|     if ( (inc_x != 1) || (inc_y != 1) ) | |||
|     { | |||
|         /* general strides: convert element strides to FLOAT strides */ | |||
|         BLASLONG xstep = inc_x * 2; | |||
|         BLASLONG ystep = inc_y * 2; | |||
|         for ( k = 0; k < n; k++ ) | |||
|         { | |||
| #if !defined(CONJ) | |||
|             y[ypos] += ( da_r * x[xpos] - da_i * x[xpos+1] ) ; | |||
|             y[ypos+1] += ( da_r * x[xpos+1] + da_i * x[xpos] ) ; | |||
| #else | |||
|             y[ypos] += ( da_r * x[xpos] + da_i * x[xpos+1] ) ; | |||
|             y[ypos+1] -= ( da_r * x[xpos+1] - da_i * x[xpos] ) ; | |||
| #endif | |||
|             xpos += xstep ; | |||
|             ypos += ystep ; | |||
|         } | |||
|         return(0); | |||
|     } | |||
|     /* unit strides: bulk part through the kernel, remainder element by element */ | |||
|     bulk = n & -16; | |||
|     if ( bulk ) | |||
|     { | |||
|         /* layout expected by the kernel: real part twice, then imaginary part twice */ | |||
|         da[0] = da_r; | |||
|         da[1] = da_r; | |||
|         da[2] = da_i; | |||
|         da[3] = da_i; | |||
|         zaxpy_kernel_4(bulk, x, y , da ); | |||
|     } | |||
|     for ( k = bulk, xpos = 2 * bulk; k < n; k++, xpos += 2 ) | |||
|     { | |||
| #if !defined(CONJ) | |||
|         y[xpos] += ( da_r * x[xpos] - da_i * x[xpos+1] ) ; | |||
|         y[xpos+1] += ( da_r * x[xpos+1] + da_i * x[xpos] ) ; | |||
| #else | |||
|         y[xpos] += ( da_r * x[xpos] + da_i * x[xpos+1] ) ; | |||
|         y[xpos+1] -= ( da_r * x[xpos+1] - da_i * x[xpos] ) ; | |||
| #endif | |||
|     } | |||
|     return(0); | |||
| } | |||
| @@ -0,0 +1,250 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/23 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define HAVE_KERNEL_4 1 | |||
| /* | |||
|  * zaxpy_kernel_4 - POWER8 VSX inline-assembly kernel for complex AXPY. | |||
|  * | |||
|  * Each 16-byte vector holds one (real, imag) pair. For every vector the | |||
|  * kernel computes y += x * vs32 + swap(x) * vs33, where swap() exchanges the | |||
|  * two lanes, vs32 is alpha_r in both lanes and vs33 is alpha_i in both lanes. | |||
|  * One of the two is multiplied lane-wise by mvec first, which supplies the | |||
|  * sign pattern for the plain (CONJ undefined) or conjugated (CONJ defined) | |||
|  * variant. | |||
|  * | |||
|  * n     : work counter, decremented by 8 per 128 bytes of x/y processed. | |||
|  *         Eight vectors of x and of y are loaded before the counter is | |||
|  *         tested, so at least one full block is always processed. | |||
|  * x     : input vectors, read only. | |||
|  * y     : updated in place. | |||
|  * alpha : alpha_r is read at byte offset 0 and alpha_i at byte offset 16 | |||
|  *         (alpha[0] and alpha[2] if FLOAT is an 8-byte double - TODO confirm). | |||
|  * | |||
|  * NOTE(review): operands %0, %2, %3 and %8 are modified by the template | |||
|  * although they are declared as plain "r" inputs and then named in the clobber | |||
|  * list; the VSX registers written here are not listed as clobbers; operand %1 | |||
|  * (n) is never referenced in the template. Worth checking against the GCC | |||
|  * extended-asm rules. | |||
|  */ | |||
| static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| // i is the loop counter consumed by the asm; o16..o48 are byte offsets used as index registers | |||
| BLASLONG i = n; | |||
| BLASLONG o16 = 16; | |||
| BLASLONG o32 = 32; | |||
| BLASLONG o48 = 48; | |||
| FLOAT *x1=x; | |||
| // y1 is the pointer used for loading y; y2 is the pointer used for storing y. | |||
| // The asm subtracts 8 bytes from y2 before first use. | |||
| FLOAT *y1=y; | |||
| FLOAT *y2=y+1; | |||
| // prefetch distance in bytes used by dcbt | |||
| BLASLONG pre = 384; | |||
| // lane-wise sign pattern applied to alpha_i (plain) or alpha_r (CONJ) | |||
| #if !defined(CONJ) | |||
| FLOAT mvec[2] = { -1.0, 1.0 }; | |||
| #else | |||
| FLOAT mvec[2] = { 1.0, -1.0 }; | |||
| #endif | |||
| __asm__ __volatile__ | |||
| ( | |||
| // load alpha_r / alpha_i, splat each across both lanes, apply the sign pattern | |||
| "lxsdx 34, 0 , %4 \n\t" // alpha_r | |||
| "lxsdx 35, %5, %4 \n\t" // alpha_i | |||
| "xxspltd 32, 34, 0 \n\t" | |||
| "xxspltd 33, 35, 0 \n\t" | |||
| "lxvd2x 36, 0, %9 \n\t" // mvec | |||
| #if !defined(CONJ) | |||
| "xvmuldp 33, 33 , 36 \n\t" // alpha_i * mvec | |||
| #else | |||
| "xvmuldp 32, 32 , 36 \n\t" // alpha_r * mvec | |||
| #endif | |||
| // move the store pointer back by 8 bytes (it was initialised to y+1) | |||
| "addi %8, %8, -8 \n\t" | |||
| "dcbt %2, %10 \n\t" | |||
| "dcbt %3, %10 \n\t" | |||
| // preload the first eight x vectors (vs40..vs47), eight y vectors (vs48..vs55) | |||
| // and the lane-swapped copies of x (vs56..vs63) | |||
| "lxvd2x 40, 0, %2 \n\t" // x0 | |||
| "lxvd2x 41, %5, %2 \n\t" // x1 | |||
| "lxvd2x 42, %6, %2 \n\t" // x2 | |||
| "lxvd2x 43, %7, %2 \n\t" // x3 | |||
| "lxvd2x 48, 0, %3 \n\t" // y0 | |||
| "lxvd2x 49, %5, %3 \n\t" // y1 | |||
| "lxvd2x 50, %6, %3 \n\t" // y2 | |||
| "lxvd2x 51, %7, %3 \n\t" // y3 | |||
| "xxswapd 56, 40 \n\t" // exchange real and imag part | |||
| "xxswapd 57, 41 \n\t" // exchange real and imag part | |||
| "xxswapd 58, 42 \n\t" // exchange real and imag part | |||
| "xxswapd 59, 43 \n\t" // exchange real and imag part | |||
| "addi %2, %2, 64 \n\t" | |||
| "addi %3, %3, 64 \n\t" | |||
| "lxvd2x 44, 0, %2 \n\t" // x4 | |||
| "lxvd2x 45, %5, %2 \n\t" // x5 | |||
| "lxvd2x 46, %6, %2 \n\t" // x6 | |||
| "lxvd2x 47, %7, %2 \n\t" // x7 | |||
| "lxvd2x 52, 0, %3 \n\t" // y4 | |||
| "lxvd2x 53, %5, %3 \n\t" // y5 | |||
| "lxvd2x 54, %6, %3 \n\t" // y6 | |||
| "lxvd2x 55, %7, %3 \n\t" // y7 | |||
| "xxswapd 60, 44 \n\t" // exchange real and imag part | |||
| "xxswapd 61, 45 \n\t" // exchange real and imag part | |||
| "xxswapd 62, 46 \n\t" // exchange real and imag part | |||
| "xxswapd 63, 47 \n\t" // exchange real and imag part | |||
| "addi %2, %2, 64 \n\t" | |||
| "addi %3, %3, 64 \n\t" | |||
| // counter -= 8; skip the main loop when nothing is left after the preloaded block | |||
| "addic. %0 , %0 , -8 \n\t" | |||
| "ble 2f \n\t" | |||
| ".align 5 \n\t" | |||
| // main loop: finish the block held in registers, store it, and load the next block | |||
| "1: \n\t" | |||
| "dcbt %2, %10 \n\t" | |||
| "dcbt %3, %10 \n\t" | |||
| "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i | |||
| "xvmaddadp 49, 41, 32 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" // x0 | |||
| "lxvd2x 41, %5, %2 \n\t" // x1 | |||
| "xvmaddadp 50, 42, 32 \n\t" | |||
| "xvmaddadp 51, 43, 32 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" // x2 | |||
| "lxvd2x 43, %7, %2 \n\t" // x3 | |||
| "xvmaddadp 52, 44, 32 \n\t" | |||
| "addi %2, %2, 64 \n\t" | |||
| "xvmaddadp 53, 45, 32 \n\t" | |||
| "lxvd2x 44, 0, %2 \n\t" // x4 | |||
| "lxvd2x 45, %5, %2 \n\t" // x5 | |||
| "xvmaddadp 54, 46, 32 \n\t" | |||
| "xvmaddadp 55, 47, 32 \n\t" | |||
| "lxvd2x 46, %6, %2 \n\t" // x6 | |||
| "lxvd2x 47, %7, %2 \n\t" // x7 | |||
| "xvmaddadp 48, 56, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r | |||
| "addi %2, %2, 64 \n\t" | |||
| "xvmaddadp 49, 57, 33 \n\t" | |||
| "xvmaddadp 50, 58, 33 \n\t" | |||
| "xvmaddadp 51, 59, 33 \n\t" | |||
| "xvmaddadp 52, 60, 33 \n\t" | |||
| "xvmaddadp 53, 61, 33 \n\t" | |||
| "xvmaddadp 54, 62, 33 \n\t" | |||
| "xvmaddadp 55, 63, 33 \n\t" | |||
| // store the eight updated y vectors through the store pointer | |||
| "stxvd2x 48, 0, %8 \n\t" | |||
| "stxvd2x 49, %5, %8 \n\t" | |||
| "stxvd2x 50, %6, %8 \n\t" | |||
| "stxvd2x 51, %7, %8 \n\t" | |||
| "addi %8, %8, 64 \n\t" | |||
| "stxvd2x 52, 0, %8 \n\t" | |||
| "stxvd2x 53, %5, %8 \n\t" | |||
| "stxvd2x 54, %6, %8 \n\t" | |||
| "stxvd2x 55, %7, %8 \n\t" | |||
| "addi %8, %8, 64 \n\t" | |||
| // prepare swapped x and fresh y for the next block | |||
| "xxswapd 56, 40 \n\t" // exchange real and imag part | |||
| "xxswapd 57, 41 \n\t" // exchange real and imag part | |||
| "lxvd2x 48, 0, %3 \n\t" // y0 | |||
| "lxvd2x 49, %5, %3 \n\t" // y1 | |||
| "xxswapd 58, 42 \n\t" // exchange real and imag part | |||
| "xxswapd 59, 43 \n\t" // exchange real and imag part | |||
| "lxvd2x 50, %6, %3 \n\t" // y2 | |||
| "lxvd2x 51, %7, %3 \n\t" // y3 | |||
| "xxswapd 60, 44 \n\t" // exchange real and imag part | |||
| "addi %3, %3, 64 \n\t" | |||
| "xxswapd 61, 45 \n\t" // exchange real and imag part | |||
| "lxvd2x 52, 0, %3 \n\t" // y4 | |||
| "lxvd2x 53, %5, %3 \n\t" // y5 | |||
| "xxswapd 62, 46 \n\t" // exchange real and imag part | |||
| "xxswapd 63, 47 \n\t" // exchange real and imag part | |||
| "lxvd2x 54, %6, %3 \n\t" // y6 | |||
| "lxvd2x 55, %7, %3 \n\t" // y7 | |||
| "addi %3, %3, 64 \n\t" | |||
| "addic. %0 , %0 , -8 \n\t" | |||
| "bgt 1b \n\t" | |||
| // epilogue: compute and store the last block that is still held in registers | |||
| "2: \n\t" | |||
| "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i | |||
| "xvmaddadp 49, 41, 32 \n\t" | |||
| "xvmaddadp 50, 42, 32 \n\t" | |||
| "xvmaddadp 51, 43, 32 \n\t" | |||
| "xvmaddadp 52, 44, 32 \n\t" | |||
| "xvmaddadp 53, 45, 32 \n\t" | |||
| "xvmaddadp 54, 46, 32 \n\t" | |||
| "xvmaddadp 55, 47, 32 \n\t" | |||
| "xvmaddadp 48, 56, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r | |||
| "xvmaddadp 49, 57, 33 \n\t" | |||
| "xvmaddadp 50, 58, 33 \n\t" | |||
| "xvmaddadp 51, 59, 33 \n\t" | |||
| "xvmaddadp 52, 60, 33 \n\t" | |||
| "xvmaddadp 53, 61, 33 \n\t" | |||
| "xvmaddadp 54, 62, 33 \n\t" | |||
| "xvmaddadp 55, 63, 33 \n\t" | |||
| "stxvd2x 48, 0, %8 \n\t" | |||
| "stxvd2x 49, %5, %8 \n\t" | |||
| "stxvd2x 50, %6, %8 \n\t" | |||
| "stxvd2x 51, %7, %8 \n\t" | |||
| "addi %8, %8, 64 \n\t" | |||
| "stxvd2x 52, 0, %8 \n\t" | |||
| "stxvd2x 53, %5, %8 \n\t" | |||
| "stxvd2x 54, %6, %8 \n\t" | |||
| "stxvd2x 55, %7, %8 \n\t" | |||
| "addi %8, %8, 64 \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x1), // 2 | |||
| "r" (y1), // 3 | |||
| "r" (alpha), // 4 | |||
| "r" (o16), // 5 | |||
| "r" (o32), // 6 | |||
| "r" (o48), // 7 | |||
| "r" (y2), // 8 | |||
| "r" (mvec), // 9 | |||
| "r" (pre) // 10 | |||
| : "cr0", "%0", "%2" , "%3", "%8", "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,140 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/25 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(POWER8) | |||
| #include "zcopy_microk_power8.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_16 | |||
| /* | |||
|  * Portable C fallback for zcopy_kernel_16, used when no optimized | |||
|  * micro-kernel has defined HAVE_KERNEL_16. | |||
|  * | |||
|  * Copies complex elements from x to y in groups of four elements (eight | |||
|  * FLOATs). Each group is read completely into a staging buffer before any of | |||
|  * it is written. The loop runs while the element counter is below n and | |||
|  * advances the counter by 4 per group. | |||
|  */ | |||
| static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) | |||
| { | |||
|     FLOAT stage[8]; | |||
|     FLOAT *src = x; | |||
|     FLOAT *dst = y; | |||
|     BLASLONG done; | |||
|     int k; | |||
|     for ( done = 0; done < n; done += 4 ) | |||
|     { | |||
|         /* read the whole group first ... */ | |||
|         for ( k = 0; k < 8; k++ ) | |||
|             stage[k] = src[k]; | |||
|         /* ... then write it out */ | |||
|         for ( k = 0; k < 8; k++ ) | |||
|             dst[k] = stage[k]; | |||
|         src += 8; | |||
|         dst += 8; | |||
|     } | |||
| } | |||
| #endif | |||
| /* | |||
|  * Complex COPY entry point: copies n complex elements from x to y. | |||
|  * | |||
|  * x and y hold interleaved (real, imag) pairs; inc_x / inc_y are strides in | |||
|  * complex elements. Returns 0; nothing is done when n <= 0. | |||
|  * | |||
|  * With unit strides the largest multiple of 16 elements is handed to | |||
|  * zcopy_kernel_16 and the remainder is copied element by element; any other | |||
|  * stride combination uses the strided loop for all n elements. | |||
|  */ | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
|     BLASLONG i=0; | |||
|     BLASLONG ix=0,iy=0; | |||
|     if ( n <= 0 ) return(0); | |||
|     if ( (inc_x == 1) && (inc_y == 1 )) | |||
|     { | |||
|         BLASLONG n1 = n & -16; | |||
|         if ( n1 > 0 ) | |||
|         { | |||
|             zcopy_kernel_16(n1, x, y); | |||
|             /* continue right after the part the kernel has copied */ | |||
|             i=n1; | |||
|             ix=n1*2; | |||
|             iy=n1*2; | |||
|         } | |||
|         while(i < n) | |||
|         { | |||
|             /* source is indexed with ix, destination with iy */ | |||
|             y[iy] = x[ix] ; | |||
|             y[iy+1] = x[ix+1] ; | |||
|             ix+=2; | |||
|             iy+=2; | |||
|             i++ ; | |||
|         } | |||
|     } | |||
|     else | |||
|     { | |||
|         /* convert element strides to FLOAT strides */ | |||
|         BLASLONG inc_x2 = 2 * inc_x; | |||
|         BLASLONG inc_y2 = 2 * inc_y; | |||
|         while(i < n) | |||
|         { | |||
|             y[iy] = x[ix] ; | |||
|             y[iy+1] = x[ix+1] ; | |||
|             ix += inc_x2 ; | |||
|             iy += inc_y2 ; | |||
|             i++ ; | |||
|         } | |||
|     } | |||
|     return(0); | |||
| } | |||
| @@ -0,0 +1,174 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/25 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define HAVE_KERNEL_16 1 | |||
| /* | |||
|  * zcopy_kernel_16 - POWER8 VSX inline-assembly copy kernel. | |||
|  * | |||
|  * Copies data from x to y in blocks of sixteen 16-byte vectors (256 bytes), | |||
|  * loading the next block while the previous one is being stored. | |||
|  * | |||
|  * n : work counter, decremented by 16 per 256-byte block. The first block is | |||
|  *     loaded before the counter is tested, so at least 256 bytes are always | |||
|  *     read and written. | |||
|  * x : source, read only. | |||
|  * y : destination. | |||
|  * | |||
|  * NOTE(review): operands %0, %1 and %2 are modified by the template although | |||
|  * they are declared as plain "r" inputs and then named in the clobber list; | |||
|  * the VSX registers written here are not listed as clobbers; operands %3 | |||
|  * (alpha) and %4 (pre) are passed in but never referenced in the template. | |||
|  * Worth checking against the GCC extended-asm rules. | |||
|  */ | |||
| static void zcopy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void zcopy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) | |||
| { | |||
| // i is the loop counter consumed by the asm; o16..o112 are byte offsets used as index registers | |||
| BLASLONG i = n; | |||
| BLASLONG o16 = 16; | |||
| BLASLONG o32 = 32; | |||
| BLASLONG o48 = 48; | |||
| BLASLONG o64 = 64; | |||
| BLASLONG o80 = 80; | |||
| BLASLONG o96 = 96; | |||
| BLASLONG o112 = 112; | |||
| FLOAT *x1=x; | |||
| FLOAT *y1=y; | |||
| // pre and alpha are handed to the asm as operands but the template does not use them | |||
| BLASLONG pre = 384; | |||
| BLASLONG alpha=0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| // preload the first 256 bytes of x into vs40..vs47 and vs50..vs57 | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "lxvd2x 44, %8, %2 \n\t" | |||
| "lxvd2x 45, %9, %2 \n\t" | |||
| "lxvd2x 46, %10, %2 \n\t" | |||
| "lxvd2x 47, %11, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "lxvd2x 50, 0, %2 \n\t" | |||
| "lxvd2x 51, %5, %2 \n\t" | |||
| "lxvd2x 52, %6, %2 \n\t" | |||
| "lxvd2x 53, %7, %2 \n\t" | |||
| "lxvd2x 54, %8, %2 \n\t" | |||
| "lxvd2x 55, %9, %2 \n\t" | |||
| "lxvd2x 56, %10, %2 \n\t" | |||
| "lxvd2x 57, %11, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| // counter -= 16; skip the main loop when nothing is left after the preloaded block | |||
| "addic. %0 , %0 , -16 \n\t" | |||
| "ble 2f \n\t" | |||
| ".align 5 \n\t" | |||
| // main loop: store the block held in registers and reload each register from x | |||
| "1: \n\t" | |||
| "stxvd2x 40, 0, %1 \n\t" | |||
| "stxvd2x 41, %5, %1 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "stxvd2x 42, %6, %1 \n\t" | |||
| "stxvd2x 43, %7, %1 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "stxvd2x 44, %8, %1 \n\t" | |||
| "stxvd2x 45, %9, %1 \n\t" | |||
| "lxvd2x 44, %8, %2 \n\t" | |||
| "lxvd2x 45, %9, %2 \n\t" | |||
| "stxvd2x 46, %10, %1 \n\t" | |||
| "stxvd2x 47, %11, %1 \n\t" | |||
| "lxvd2x 46, %10, %2 \n\t" | |||
| "lxvd2x 47, %11, %2 \n\t" | |||
| "addi %1, %1, 128 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "stxvd2x 50, 0, %1 \n\t" | |||
| "stxvd2x 51, %5, %1 \n\t" | |||
| "lxvd2x 50, 0, %2 \n\t" | |||
| "lxvd2x 51, %5, %2 \n\t" | |||
| "stxvd2x 52, %6, %1 \n\t" | |||
| "stxvd2x 53, %7, %1 \n\t" | |||
| "lxvd2x 52, %6, %2 \n\t" | |||
| "lxvd2x 53, %7, %2 \n\t" | |||
| "stxvd2x 54, %8, %1 \n\t" | |||
| "stxvd2x 55, %9, %1 \n\t" | |||
| "lxvd2x 54, %8, %2 \n\t" | |||
| "lxvd2x 55, %9, %2 \n\t" | |||
| "stxvd2x 56, %10, %1 \n\t" | |||
| "stxvd2x 57, %11, %1 \n\t" | |||
| "lxvd2x 56, %10, %2 \n\t" | |||
| "lxvd2x 57, %11, %2 \n\t" | |||
| "addi %1, %1, 128 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addic. %0 , %0 , -16 \n\t" | |||
| "bgt 1b \n\t" | |||
| // epilogue: store the last block that is still held in registers | |||
| "2: \n\t" | |||
| "stxvd2x 40, 0, %1 \n\t" | |||
| "stxvd2x 41, %5, %1 \n\t" | |||
| "stxvd2x 42, %6, %1 \n\t" | |||
| "stxvd2x 43, %7, %1 \n\t" | |||
| "stxvd2x 44, %8, %1 \n\t" | |||
| "stxvd2x 45, %9, %1 \n\t" | |||
| "stxvd2x 46, %10, %1 \n\t" | |||
| "stxvd2x 47, %11, %1 \n\t" | |||
| "addi %1, %1, 128 \n\t" | |||
| "stxvd2x 50, 0, %1 \n\t" | |||
| "stxvd2x 51, %5, %1 \n\t" | |||
| "stxvd2x 52, %6, %1 \n\t" | |||
| "stxvd2x 53, %7, %1 \n\t" | |||
| "stxvd2x 54, %8, %1 \n\t" | |||
| "stxvd2x 55, %9, %1 \n\t" | |||
| "stxvd2x 56, %10, %1 \n\t" | |||
| "stxvd2x 57, %11, %1 \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (y1), // 1 | |||
| "r" (x1), // 2 | |||
| "r" (alpha), // 3 | |||
| "r" (pre), // 4 | |||
| "r" (o16), // 5 | |||
| "r" (o32), // 6 | |||
| "r" (o48), // 7 | |||
| "r" (o64), // 8 | |||
| "r" (o80), // 9 | |||
| "r" (o96), // 10 | |||
| "r" (o112) // 11 | |||
| : "cr0", "%0", "%2" , "%1", "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,167 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/21 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #include <complex.h> | |||
| #if defined(POWER8) | |||
| #include "zdot_microk_power8.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_8 | |||
| /* | |||
|  * Portable C fallback for zdot_kernel_8, used when no optimized micro-kernel | |||
|  * has defined HAVE_KERNEL_8. | |||
|  * | |||
|  * Walks x and y (interleaved real/imag, unit stride) four complex elements | |||
|  * per outer iteration and accumulates four partial sums: | |||
|  *   d[0] = sum x_r * y_r      d[1] = sum x_i * y_i | |||
|  *   d[2] = sum x_r * y_i      d[3] = sum x_i * y_r | |||
|  * The sums start from zero and overwrite d[0..3] on return. | |||
|  */ | |||
| static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) __attribute__ ((noinline)); | |||
| static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) | |||
| { | |||
|     FLOAT acc[4] = { 0.0, 0.0, 0.0, 0.0 }; | |||
|     BLASLONG elem; | |||
|     BLASLONG base; | |||
|     BLASLONG p; | |||
|     int k; | |||
|     for ( elem = 0, base = 0; elem < n; elem += 4, base += 8 ) | |||
|     { | |||
|         /* four complex elements, i.e. eight FLOATs, per group */ | |||
|         for ( p = base; p < base + 8; p += 2 ) | |||
|         { | |||
|             acc[0] += x[p] * y[p] ; | |||
|             acc[1] += x[p+1] * y[p+1] ; | |||
|             acc[2] += x[p] * y[p+1] ; | |||
|             acc[3] += x[p+1] * y[p] ; | |||
|         } | |||
|     } | |||
|     for ( k = 0; k < 4; k++ ) | |||
|         d[k] = acc[k]; | |||
| } | |||
| #endif | |||
| /* | |||
|  * Complex DOT entry point: returns sum x[k] * y[k], or sum conj(x[k]) * y[k] | |||
|  * when CONJ is defined, over n complex elements. | |||
|  * | |||
|  * x and y hold interleaved (real, imag) pairs; inc_x / inc_y are strides in | |||
|  * complex elements. Returns 0 + 0i when n <= 0. | |||
|  * | |||
|  * Four real partial sums are accumulated: | |||
|  *   dot[0] = sum x_r * y_r      dot[1] = sum x_i * y_i | |||
|  *   dot[2] = sum x_r * y_i      dot[3] = sum x_i * y_r | |||
|  * and combined into the complex result at the end. With unit strides the | |||
|  * largest multiple of 8 elements is handed to zdot_kernel_8. | |||
|  */ | |||
| FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
|     BLASLONG i; | |||
|     BLASLONG ix,iy; | |||
|     FLOAT _Complex result; | |||
|     FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; | |||
|     if ( n <= 0 ) | |||
|     { | |||
|         __real__ result = 0.0 ; | |||
|         __imag__ result = 0.0 ; | |||
|         return(result); | |||
|     } | |||
|     if ( (inc_x == 1) && (inc_y == 1) ) | |||
|     { | |||
|         BLASLONG n1 = n & -8; | |||
|         if ( n1 ) | |||
|             zdot_kernel_8(n1, x, y , dot ); | |||
|         /* scalar loop for the elements the kernel did not cover */ | |||
|         i = n1; | |||
|         BLASLONG j = i * 2; | |||
|         while( i < n ) | |||
|         { | |||
|             dot[0] += x[j] * y[j] ; | |||
|             dot[1] += x[j+1] * y[j+1] ; | |||
|             dot[2] += x[j] * y[j+1] ; | |||
|             dot[3] += x[j+1] * y[j] ; | |||
|             j+=2; | |||
|             i++ ; | |||
|         } | |||
|     } | |||
|     else | |||
|     { | |||
|         i=0; | |||
|         ix=0; | |||
|         iy=0; | |||
|         /* convert element strides to FLOAT strides; multiply instead of | |||
|            shifting because the increments are signed and may be negative */ | |||
|         inc_x *= 2; | |||
|         inc_y *= 2; | |||
|         while(i < n) | |||
|         { | |||
|             dot[0] += x[ix] * y[iy] ; | |||
|             dot[1] += x[ix+1] * y[iy+1] ; | |||
|             dot[2] += x[ix] * y[iy+1] ; | |||
|             dot[3] += x[ix+1] * y[iy] ; | |||
|             ix += inc_x ; | |||
|             iy += inc_y ; | |||
|             i++ ; | |||
|         } | |||
|     } | |||
| #if !defined(CONJ) | |||
|     __real__ result = dot[0] - dot[1]; | |||
|     __imag__ result = dot[2] + dot[3]; | |||
| #else | |||
|     __real__ result = dot[0] + dot[1]; | |||
|     __imag__ result = dot[2] - dot[3]; | |||
| #endif | |||
|     return(result); | |||
| } | |||
| @@ -0,0 +1,219 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/21 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define HAVE_KERNEL_8 1 | |||
| /* | |||
| * POWER8 VSX micro-kernel for the double-precision complex dot product. | |||
| * | |||
| * n - number of complex elements to process. The first eight elements are | |||
| * loaded before the counter is tested and the counter drops by 8 per | |||
| * pass, so the kernel works in groups of eight. | |||
| * NOTE(review): presumably the caller only passes a positive multiple | |||
| * of 8 -- confirm against the driver. | |||
| * x - first vector, interleaved (real, imag) pairs, unit stride. | |||
| * y - second vector, same layout. | |||
| * dot - receives four partial sums; previous contents are overwritten: | |||
| * dot[0] = sum x_r*y_r dot[1] = sum x_i*y_i | |||
| * dot[2] = sum x_r*y_i dot[3] = sum x_i*y_r | |||
| * The caller combines them into the real and imaginary result. | |||
| * | |||
| * Operand contract (the instruction stream itself is unchanged): | |||
| * - The counter and both data pointers are rewritten by the asm, so they are | |||
| * read-write operands instead of inputs named in the clobber list. | |||
| * - Registers used in an RA slot (addi, the index of lxvd2x/stxvd2x, the | |||
| * base of dcbt) use the "b" constraint, because r0 in that slot is read as | |||
| * the literal value zero. | |||
| * - vs32-vs63 are listed as clobbers; vs52-vs63 overlay v20-v31, which the | |||
| * ELFv2 ABI treats as callee-saved. | |||
| */ | |||
| static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); | |||
| static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| { | |||
| BLASLONG i = n; | |||
| BLASLONG o16 = 16; | |||
| BLASLONG o32 = 32; | |||
| BLASLONG o48 = 48; | |||
| FLOAT *x1=x; | |||
| FLOAT *y1=y; | |||
| BLASLONG pre = 384; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "xxlxor 32,32,32 \n\t" | |||
| "xxlxor 33,33,33 \n\t" | |||
| "xxlxor 34,34,34 \n\t" | |||
| "xxlxor 35,35,35 \n\t" | |||
| "xxlxor 36,36,36 \n\t" | |||
| "xxlxor 37,37,37 \n\t" | |||
| "xxlxor 38,38,38 \n\t" | |||
| "xxlxor 39,39,39 \n\t" | |||
| "dcbt %2, %8 \n\t" | |||
| "dcbt %3, %8 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i | |||
| "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i | |||
| "lxvd2x 41, %5, %2 \n\t" // x1_r, x1_i | |||
| "lxvd2x 49, %5, %3 \n\t" // y1_r, y1_i | |||
| "lxvd2x 42, %6, %2 \n\t" // x2_r, x2_i | |||
| "lxvd2x 50, %6, %3 \n\t" // y2_r, y2_i | |||
| "lxvd2x 43, %7, %2 \n\t" // x3_r, x3_i | |||
| "lxvd2x 51, %7, %3 \n\t" // y3_r, y3_i | |||
| "xxswapd 52,48 \n\t" // y0_i, y0_r | |||
| "xxswapd 53,49 \n\t" // y1_i, y1_r | |||
| "xxswapd 54,50 \n\t" // y2_i, y2_r | |||
| "xxswapd 55,51 \n\t" // y3_i, y3_r | |||
| "addi %2, %2, 64 \n\t" | |||
| "addi %3, %3, 64 \n\t" | |||
| "lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i | |||
| "lxvd2x 56, 0, %3 \n\t" // y0_r, y0_i | |||
| "lxvd2x 45, %5, %2 \n\t" // x1_r, x1_i | |||
| "lxvd2x 57, %5, %3 \n\t" // y1_r, y1_i | |||
| "lxvd2x 46, %6, %2 \n\t" // x2_r, x2_i | |||
| "lxvd2x 58, %6, %3 \n\t" // y2_r, y2_i | |||
| "lxvd2x 47, %7, %2 \n\t" // x3_r, x3_i | |||
| "lxvd2x 59, %7, %3 \n\t" // y3_r, y3_i | |||
| "xxswapd 60,56 \n\t" // y0_i, y0_r | |||
| "xxswapd 61,57 \n\t" // y1_i, y1_r | |||
| "xxswapd 62,58 \n\t" // y2_i, y2_r | |||
| "xxswapd 63,59 \n\t" // y3_i, y3_r | |||
| "addi %2, %2, 64 \n\t" | |||
| "addi %3, %3, 64 \n\t" | |||
| "addic. %0 , %0 , -8 \n\t" | |||
| "ble 2f \n\t" | |||
| ".align 5 \n\t" | |||
| "1: \n\t" | |||
| "dcbt %2, %8 \n\t" | |||
| "dcbt %3, %8 \n\t" | |||
| "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i | |||
| "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i | |||
| "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i | |||
| "lxvd2x 49, %5, %3 \n\t" // y1_r, y1_i | |||
| "xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i | |||
| "lxvd2x 50, %6, %3 \n\t" // y2_r, y2_i | |||
| "xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i | |||
| "lxvd2x 51, %7, %3 \n\t" // y3_r, y3_i | |||
| "xvmaddadp 33, 40, 52 \n\t" // x0_r * y0_i , x0_i * y0_r | |||
| "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i | |||
| "xvmaddadp 35, 41, 53 \n\t" // x1_r * y1_i , x1_i * y1_r | |||
| "lxvd2x 41, %5, %2 \n\t" // x1_r, x1_i | |||
| "xvmaddadp 37, 42, 54 \n\t" // x2_r * y2_i , x2_i * y2_r | |||
| "lxvd2x 42, %6, %2 \n\t" // x2_r, x2_i | |||
| "xvmaddadp 39, 43, 55 \n\t" // x3_r * y3_i , x3_i * y3_r | |||
| "lxvd2x 43, %7, %2 \n\t" // x3_r, x3_i | |||
| "xxswapd 52,48 \n\t" // y0_i, y0_r | |||
| "xxswapd 53,49 \n\t" // y1_i, y1_r | |||
| "addi %2, %2, 64 \n\t" | |||
| "addi %3, %3, 64 \n\t" | |||
| "xxswapd 54,50 \n\t" // y2_i, y2_r | |||
| "xxswapd 55,51 \n\t" // y3_i, y3_r | |||
| "xvmaddadp 32, 44, 56 \n\t" // x0_r * y0_r , x0_i * y0_i | |||
| "lxvd2x 56, 0, %3 \n\t" // y0_r, y0_i | |||
| "xvmaddadp 34, 45, 57 \n\t" // x1_r * y1_r , x1_i * y1_i | |||
| "lxvd2x 57, %5, %3 \n\t" // y1_r, y1_i | |||
| "xvmaddadp 36, 46, 58 \n\t" // x2_r * y2_r , x2_i * y2_i | |||
| "lxvd2x 58, %6, %3 \n\t" // y2_r, y2_i | |||
| "xvmaddadp 38, 47, 59 \n\t" // x3_r * y3_r , x3_i * y3_i | |||
| "lxvd2x 59, %7, %3 \n\t" // y3_r, y3_i | |||
| "xvmaddadp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r | |||
| "lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i | |||
| "xvmaddadp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r | |||
| "lxvd2x 45, %5, %2 \n\t" // x1_r, x1_i | |||
| "xvmaddadp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r | |||
| "lxvd2x 46, %6, %2 \n\t" // x2_r, x2_i | |||
| "xvmaddadp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r | |||
| "lxvd2x 47, %7, %2 \n\t" // x3_r, x3_i | |||
| "xxswapd 60,56 \n\t" // y0_i, y0_r | |||
| "xxswapd 61,57 \n\t" // y1_i, y1_r | |||
| "addi %2, %2, 64 \n\t" | |||
| "addi %3, %3, 64 \n\t" | |||
| "xxswapd 62,58 \n\t" // y2_i, y2_r | |||
| "xxswapd 63,59 \n\t" // y3_i, y3_r | |||
| "addic. %0 , %0 , -8 \n\t" | |||
| "bgt 1b \n\t" | |||
| "2: \n\t" | |||
| "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i | |||
| "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i | |||
| "xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i | |||
| "xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i | |||
| "xvmaddadp 33, 40, 52 \n\t" // x0_r * y0_i , x0_i * y0_r | |||
| "xvmaddadp 35, 41, 53 \n\t" // x1_r * y1_i , x1_i * y1_r | |||
| "xvmaddadp 37, 42, 54 \n\t" // x2_r * y2_i , x2_i * y2_r | |||
| "xvmaddadp 39, 43, 55 \n\t" // x3_r * y3_i , x3_i * y3_r | |||
| "xvmaddadp 32, 44, 56 \n\t" // x0_r * y0_r , x0_i * y0_i | |||
| "xvmaddadp 34, 45, 57 \n\t" // x1_r * y1_r , x1_i * y1_i | |||
| "xvmaddadp 36, 46, 58 \n\t" // x2_r * y2_r , x2_i * y2_i | |||
| "xvmaddadp 38, 47, 59 \n\t" // x3_r * y3_r , x3_i * y3_i | |||
| "xvmaddadp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r | |||
| "xvmaddadp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r | |||
| "xvmaddadp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r | |||
| "xvmaddadp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r | |||
| "xvadddp 32, 32, 34 \n\t" | |||
| "xvadddp 36, 36, 38 \n\t" | |||
| "xvadddp 33, 33, 35 \n\t" | |||
| "xvadddp 37, 37, 39 \n\t" | |||
| "xvadddp 32, 32, 36 \n\t" | |||
| "xvadddp 33, 33, 37 \n\t" | |||
| "stxvd2x 32, 0, %4 \n\t" | |||
| "stxvd2x 33, %5, %4 \n\t" | |||
| : | |||
| "+r" (i), // 0 loop counter, decremented by addic. | |||
| "=m" (*(FLOAT (*)[4]) dot), // 1 the four results stored through %4 (not referenced in the template) | |||
| "+b" (x1), // 2 advanced by addi | |||
| "+b" (y1) // 3 advanced by addi | |||
| : | |||
| "b" (dot), // 4 | |||
| "b" (o16), // 5 | |||
| "b" (o32), // 6 | |||
| "b" (o48), // 7 | |||
| "r" (pre) // 8 | |||
| : "cr0", "memory", | |||
| "vs32", "vs33", "vs34", "vs35", "vs36", "vs37", "vs38", "vs39", | |||
| "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", | |||
| "vs48", "vs49", "vs50", "vs51", "vs52", "vs53", "vs54", "vs55", | |||
| "vs56", "vs57", "vs58", "vs59", "vs60", "vs61", "vs62", "vs63" | |||
| ); | |||
| } | |||
| @@ -1,38 +1,3 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/05 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| @@ -82,7 +47,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #ifdef __64BIT__ | |||
| #define STACKSIZE 320 | |||
| #define STACKSIZE 32000 | |||
| #define ALPHA_R_SP 296(SP) | |||
| #define ALPHA_I_SP 304(SP) | |||
| #define FZERO 312(SP) | |||
| @@ -133,11 +98,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define alpha_r vs30 | |||
| #define alpha_i vs31 | |||
| #define FRAMEPOINTER r12 | |||
| #define BBUFFER r14 | |||
| #define L r15 | |||
| #define ALPHA r16 | |||
| #define o24 r17 | |||
| #define T2 r19 | |||
| #define KK r20 | |||
| #define BBO r20 | |||
| #define o8 r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| @@ -156,8 +126,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| PROLOGUE | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE | |||
| li r0, 0 | |||
| mr FRAMEPOINTER, SP | |||
| addi SP, SP, -STACKSIZE | |||
| addi SP, SP, -STACKSIZE | |||
| addi SP, SP, -STACKSIZE | |||
| addi SP, SP, -STACKSIZE | |||
| li r0, 0 | |||
| stfd f14, 0(SP) | |||
| stfd f15, 8(SP) | |||
| @@ -200,6 +174,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| #else | |||
| stw r31, 144(SP) | |||
| stw r30, 148(SP) | |||
| @@ -226,37 +201,37 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifdef linux | |||
| #ifdef __64BIT__ | |||
| ld LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||
| ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) | |||
| #endif | |||
| #endif | |||
| #if defined(_AIX) || defined(__APPLE__) | |||
| #ifdef __64BIT__ | |||
| ld LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||
| ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) | |||
| #else | |||
| #ifdef DOUBLE | |||
| lwz B, FRAMESLOT(0) + STACKSIZE(SP) | |||
| lwz C, FRAMESLOT(1) + STACKSIZE(SP) | |||
| lwz LDC, FRAMESLOT(2) + STACKSIZE(SP) | |||
| lwz B, FRAMESLOT(0) + 0(FRAMEPOINTER) | |||
| lwz C, FRAMESLOT(1) + 0(FRAMEPOINTER) | |||
| lwz LDC, FRAMESLOT(2) + 0(FRAMEPOINTER) | |||
| #else | |||
| lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||
| lwz LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) | |||
| #endif | |||
| #endif | |||
| #endif | |||
| #ifdef TRMMKERNEL | |||
| #if defined(linux) && defined(__64BIT__) | |||
| ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||
| ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) | |||
| #endif | |||
| #if defined(_AIX) || defined(__APPLE__) | |||
| #ifdef __64BIT__ | |||
| ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||
| ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) | |||
| #else | |||
| #ifdef DOUBLE | |||
| lwz OFFSET, FRAMESLOT(3) + STACKSIZE(SP) | |||
| lwz OFFSET, FRAMESLOT(3) + 0(FRAMEPOINTER) | |||
| #else | |||
| lwz OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||
| lwz OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) | |||
| #endif | |||
| #endif | |||
| #endif | |||
| @@ -268,34 +243,38 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "zgemm_macros_8x2_power8.S" | |||
| cmpwi cr0, M, 0 | |||
| ble .L999 | |||
| ble L999 | |||
| cmpwi cr0, N, 0 | |||
| ble .L999 | |||
| ble L999 | |||
| cmpwi cr0, K, 0 | |||
| ble .L999 | |||
| ble L999 | |||
| slwi LDC, LDC, ZBASE_SHIFT | |||
| li PRE, 256 | |||
| li PRE, 384 | |||
| li o8 , 8 | |||
| li o16 , 16 | |||
| li o24 , 24 | |||
| li o32 , 32 | |||
| li o48 , 48 | |||
| addi BBUFFER, SP, 512+4096 | |||
| li T1, -4096 | |||
| and BBUFFER, BBUFFER, T1 | |||
| #ifdef __64BIT__ | |||
| addi ALPHA, SP, 296 | |||
| #else | |||
| addi ALPHA, SP, 224 | |||
| #endif | |||
| lxvdsx alpha_r, 0, ALPHA | |||
| lxvdsx alpha_i, o8, ALPHA | |||
| lxsdx alpha_r, 0, ALPHA | |||
| lxsdx alpha_i, o8, ALPHA | |||
| .align 5 | |||
| .align 4 | |||
| #include "zgemm_logic_8x2_power8.S" | |||
| .L999: | |||
| L999: | |||
| addi r3, 0, 0 | |||
| lfd f14, 0(SP) | |||
| @@ -339,6 +318,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| #else | |||
| lwz r31, 144(SP) | |||
| lwz r30, 148(SP) | |||
| @@ -360,6 +340,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| addi SP, SP, STACKSIZE | |||
| addi SP, SP, STACKSIZE | |||
| addi SP, SP, STACKSIZE | |||
| addi SP, SP, STACKSIZE | |||
| blr | |||
| @@ -1,83 +1,111 @@ | |||
| srawi. J, N, 1 | |||
| ble .LZGEMM_L2_END | |||
| ble ZGEMM_L2_END | |||
| // Start of one pass over a pair of columns; also the target of the | |||
| // "bgt ZGEMM_L2_BEGIN" branch that closes the J loop. | |||
| // Before any tile is computed, the B values for this pass are expanded into | |||
| // BBUFFER: every 8-byte value read from B is written as a 16-byte vector | |||
| // holding that value twice. | |||
| ZGEMM_L2_BEGIN: | |||
| mr BO, B // BO walks the source B data | |||
| mr BBO, BBUFFER // BBO walks the expanded copy | |||
| slwi T1, K, 1 // T1 = 2 * K, the copy-loop counter | |||
| ZGEMM_L2_COPYB: | |||
| // lxvdsx loads one doubleword and replicates it into both halves of the | |||
| // vector. o0 is presumably the zero-offset register -- defined outside this | |||
| // hunk, confirm. | |||
| lxvdsx vs4, o0, BO // b0_r | |||
| lxvdsx vs5, o8, BO // b0_i | |||
| addi BO, BO, 16 // consumed 16 bytes of B | |||
| stxvd2x vs4, o0, BBO | |||
| stxvd2x vs5, o16, BBO | |||
| addic. T1, T1, -1 // sets CR0 for the bge below; the following addi leaves CR0 alone | |||
| addi BBO, BBO, 32 // produced 32 bytes of buffer | |||
| // NOTE(review): bge repeats while the decremented T1 is still >= 0, so the | |||
| // body runs T1 + 1 times (one 16-byte read and one 32-byte write more than | |||
| // 2 * K) -- confirm this is intended. | |||
| bge ZGEMM_L2_COPYB | |||
| .LZGEMM_L2_BEGIN: | |||
| mr CO, C | |||
| mr AO, A | |||
| slwi T1, LDC , 1 | |||
| add C, C, T1 | |||
| srawi. I, M, 3 | |||
| ble .LZGEMM_L2x8_END | |||
| ble ZGEMM_L2x8_END | |||
| .LZGEMM_L2x8_BEGIN: | |||
| ZGEMM_L2x8_BEGIN: | |||
| mr BO, B | |||
| mr BO, BBUFFER | |||
| srawi. L, K, 3 | |||
| ble .LZGEMM_L2x8_SUB0 | |||
| ble ZGEMM_L2x8_SUB0 | |||
| cmpwi cr0, L, 1 | |||
| ble .LZGEMM_L2x8_SUB4 | |||
| ble ZGEMM_L2x8_SUB4 | |||
| .LZGEMM_L2x8_LOOP_START: | |||
| ZGEMM_L2x8_LOOP_START: | |||
| dcbt AO, PRE | |||
| dcbt BO, PRE | |||
| LOAD2x8_1 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_I1 | |||
| dcbt AO, PRE | |||
| dcbt BO, PRE | |||
| KERNEL2x8_2 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_1 | |||
| dcbt AO, PRE | |||
| dcbt BO, PRE | |||
| KERNEL2x8_2 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_1 | |||
| dcbt AO, PRE | |||
| dcbt BO, PRE | |||
| KERNEL2x8_2 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_1 | |||
| dcbt AO, PRE | |||
| dcbt BO, PRE | |||
| KERNEL2x8_2 | |||
| addic. L, L, -2 | |||
| ble .LZGEMM_L2x8_LOOP_END | |||
| ble ZGEMM_L2x8_LOOP_END | |||
| .align 5 | |||
| .LZGEMM_L2x8_LOOP: | |||
| ZGEMM_L2x8_LOOP: | |||
| dcbt AO, PRE | |||
| KERNEL2x8_1 | |||
| dcbt AO, PRE | |||
| dcbt BO, PRE | |||
| KERNEL2x8_2 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_1 | |||
| dcbt AO, PRE | |||
| dcbt BO, PRE | |||
| KERNEL2x8_2 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_1 | |||
| dcbt AO, PRE | |||
| dcbt BO, PRE | |||
| KERNEL2x8_2 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_1 | |||
| dcbt AO, PRE | |||
| dcbt BO, PRE | |||
| KERNEL2x8_2 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L2x8_LOOP | |||
| bgt ZGEMM_L2x8_LOOP | |||
| .LZGEMM_L2x8_LOOP_END: | |||
| ZGEMM_L2x8_LOOP_END: | |||
| dcbt AO, PRE | |||
| KERNEL2x8_1 | |||
| dcbt AO, PRE | |||
| dcbt BO, PRE | |||
| KERNEL2x8_2 | |||
| dcbt AO, PRE | |||
| KERNEL2x8_1 | |||
| dcbt AO, PRE | |||
| dcbt BO, PRE | |||
| KERNEL2x8_2 | |||
| dcbt AO, PRE | |||
| @@ -88,9 +116,9 @@ | |||
| KERNEL2x8_1 | |||
| KERNEL2x8_E2 | |||
| b .LZGEMM_L2x8_SUB1 | |||
| b ZGEMM_L2x8_SUB1 | |||
| .LZGEMM_L2x8_SUB4: | |||
| ZGEMM_L2x8_SUB4: | |||
| dcbt AO, PRE | |||
| KERNEL2x8_SUBI1 | |||
| @@ -106,53 +134,53 @@ | |||
| KERNEL2x8_SUB1 | |||
| KERNEL2x8_SUB1 | |||
| b .LZGEMM_L2x8_SUB1 | |||
| b ZGEMM_L2x8_SUB1 | |||
| .LZGEMM_L2x8_SUB0: | |||
| ZGEMM_L2x8_SUB0: | |||
| andi. L, K, 7 | |||
| KERNEL2x8_SUBI1 | |||
| addic. L, L, -1 | |||
| ble .LZGEMM_L2x8_SAVE | |||
| b .LZGEMM_L2x8_SUB2 | |||
| ble ZGEMM_L2x8_SAVE | |||
| b ZGEMM_L2x8_SUB2 | |||
| .LZGEMM_L2x8_SUB1: | |||
| ZGEMM_L2x8_SUB1: | |||
| andi. L, K, 7 | |||
| ble .LZGEMM_L2x8_SAVE | |||
| ble ZGEMM_L2x8_SAVE | |||
| .LZGEMM_L2x8_SUB2: | |||
| ZGEMM_L2x8_SUB2: | |||
| KERNEL2x8_SUB1 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L2x8_SUB2 | |||
| bgt ZGEMM_L2x8_SUB2 | |||
| .LZGEMM_L2x8_SAVE: | |||
| ZGEMM_L2x8_SAVE: | |||
| SAVE2x8 | |||
| addic. I, I, -1 | |||
| bgt .LZGEMM_L2x8_BEGIN | |||
| bgt ZGEMM_L2x8_BEGIN | |||
| .LZGEMM_L2x8_END: | |||
| ZGEMM_L2x8_END: | |||
| .LZGEMM_L2x4_BEGIN: | |||
| ZGEMM_L2x4_BEGIN: | |||
| andi. T2, M, 7 | |||
| ble .LZGEMM_L2x1_END | |||
| ble ZGEMM_L2x1_END | |||
| andi. T1, M, 4 | |||
| ble .LZGEMM_L2x4_END | |||
| mr BO, B | |||
| ble ZGEMM_L2x4_END | |||
| mr BO, BBUFFER | |||
| srawi. L, K, 3 | |||
| ble .LZGEMM_L2x4_SUB0 | |||
| ble ZGEMM_L2x4_SUB0 | |||
| cmpwi cr0, L, 1 | |||
| ble .LZGEMM_L2x4_SUB4 | |||
| ble ZGEMM_L2x4_SUB4 | |||
| .LZGEMM_L2x4_LOOP_START: | |||
| ZGEMM_L2x4_LOOP_START: | |||
| LOAD2x4_1 | |||
| KERNEL2x4_I1 | |||
| @@ -166,11 +194,11 @@ | |||
| KERNEL2x4_2 | |||
| addic. L, L, -2 | |||
| ble .LZGEMM_L2x4_LOOP_END | |||
| ble ZGEMM_L2x4_LOOP_END | |||
| .align 5 | |||
| .LZGEMM_L2x4_LOOP: | |||
| ZGEMM_L2x4_LOOP: | |||
| KERNEL2x4_1 | |||
| KERNEL2x4_2 | |||
| @@ -183,9 +211,9 @@ | |||
| KERNEL2x4_2 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L2x4_LOOP | |||
| bgt ZGEMM_L2x4_LOOP | |||
| .LZGEMM_L2x4_LOOP_END: | |||
| ZGEMM_L2x4_LOOP_END: | |||
| KERNEL2x4_1 | |||
| KERNEL2x4_2 | |||
| @@ -197,9 +225,9 @@ | |||
| KERNEL2x4_1 | |||
| KERNEL2x4_E2 | |||
| b .LZGEMM_L2x4_SUB1 | |||
| b ZGEMM_L2x4_SUB1 | |||
| .LZGEMM_L2x4_SUB4: | |||
| ZGEMM_L2x4_SUB4: | |||
| KERNEL2x4_SUBI1 | |||
| KERNEL2x4_SUB1 | |||
| @@ -211,48 +239,48 @@ | |||
| KERNEL2x4_SUB1 | |||
| KERNEL2x4_SUB1 | |||
| b .LZGEMM_L2x4_SUB1 | |||
| b ZGEMM_L2x4_SUB1 | |||
| .LZGEMM_L2x4_SUB0: | |||
| ZGEMM_L2x4_SUB0: | |||
| andi. L, K, 7 | |||
| KERNEL2x4_SUBI1 | |||
| addic. L, L, -1 | |||
| ble .LZGEMM_L2x4_SAVE | |||
| b .LZGEMM_L2x4_SUB2 | |||
| ble ZGEMM_L2x4_SAVE | |||
| b ZGEMM_L2x4_SUB2 | |||
| .LZGEMM_L2x4_SUB1: | |||
| ZGEMM_L2x4_SUB1: | |||
| andi. L, K, 7 | |||
| ble .LZGEMM_L2x4_SAVE | |||
| ble ZGEMM_L2x4_SAVE | |||
| .LZGEMM_L2x4_SUB2: | |||
| ZGEMM_L2x4_SUB2: | |||
| KERNEL2x4_SUB1 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L2x4_SUB2 | |||
| bgt ZGEMM_L2x4_SUB2 | |||
| .LZGEMM_L2x4_SAVE: | |||
| ZGEMM_L2x4_SAVE: | |||
| SAVE2x4 | |||
| .LZGEMM_L2x4_END: | |||
| ZGEMM_L2x4_END: | |||
| .LZGEMM_L2x2_BEGIN: | |||
| ZGEMM_L2x2_BEGIN: | |||
| andi. T1, M, 2 | |||
| ble .LZGEMM_L2x2_END | |||
| mr BO, B | |||
| ble ZGEMM_L2x2_END | |||
| mr BO, BBUFFER | |||
| srawi. L, K, 3 | |||
| ble .LZGEMM_L2x2_SUB0 | |||
| ble ZGEMM_L2x2_SUB0 | |||
| cmpwi cr0, L, 1 | |||
| ble .LZGEMM_L2x2_SUB4 | |||
| ble ZGEMM_L2x2_SUB4 | |||
| .LZGEMM_L2x2_LOOP_START: | |||
| ZGEMM_L2x2_LOOP_START: | |||
| LOAD2x2_1 | |||
| KERNEL2x2_I1 | |||
| @@ -266,11 +294,11 @@ | |||
| KERNEL2x2_2 | |||
| addic. L, L, -2 | |||
| ble .LZGEMM_L2x2_LOOP_END | |||
| ble ZGEMM_L2x2_LOOP_END | |||
| .align 5 | |||
| .LZGEMM_L2x2_LOOP: | |||
| ZGEMM_L2x2_LOOP: | |||
| KERNEL2x2_1 | |||
| KERNEL2x2_2 | |||
| @@ -283,9 +311,9 @@ | |||
| KERNEL2x2_2 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L2x2_LOOP | |||
| bgt ZGEMM_L2x2_LOOP | |||
| .LZGEMM_L2x2_LOOP_END: | |||
| ZGEMM_L2x2_LOOP_END: | |||
| KERNEL2x2_1 | |||
| KERNEL2x2_2 | |||
| @@ -297,9 +325,9 @@ | |||
| KERNEL2x2_1 | |||
| KERNEL2x2_E2 | |||
| b .LZGEMM_L2x2_SUB1 | |||
| b ZGEMM_L2x2_SUB1 | |||
| .LZGEMM_L2x2_SUB4: | |||
| ZGEMM_L2x2_SUB4: | |||
| KERNEL2x2_SUBI1 | |||
| KERNEL2x2_SUB1 | |||
| @@ -311,48 +339,48 @@ | |||
| KERNEL2x2_SUB1 | |||
| KERNEL2x2_SUB1 | |||
| b .LZGEMM_L2x2_SUB1 | |||
| b ZGEMM_L2x2_SUB1 | |||
| .LZGEMM_L2x2_SUB0: | |||
| ZGEMM_L2x2_SUB0: | |||
| andi. L, K, 7 | |||
| KERNEL2x2_SUBI1 | |||
| addic. L, L, -1 | |||
| ble .LZGEMM_L2x2_SAVE | |||
| b .LZGEMM_L2x2_SUB2 | |||
| ble ZGEMM_L2x2_SAVE | |||
| b ZGEMM_L2x2_SUB2 | |||
| .LZGEMM_L2x2_SUB1: | |||
| ZGEMM_L2x2_SUB1: | |||
| andi. L, K, 7 | |||
| ble .LZGEMM_L2x2_SAVE | |||
| ble ZGEMM_L2x2_SAVE | |||
| .LZGEMM_L2x2_SUB2: | |||
| ZGEMM_L2x2_SUB2: | |||
| KERNEL2x2_SUB1 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L2x2_SUB2 | |||
| bgt ZGEMM_L2x2_SUB2 | |||
| .LZGEMM_L2x2_SAVE: | |||
| ZGEMM_L2x2_SAVE: | |||
| SAVE2x2 | |||
| .LZGEMM_L2x2_END: | |||
| ZGEMM_L2x2_END: | |||
| .LZGEMM_L2x1_BEGIN: | |||
| ZGEMM_L2x1_BEGIN: | |||
| andi. T1, M, 1 | |||
| ble .LZGEMM_L2x1_END | |||
| mr BO, B | |||
| ble ZGEMM_L2x1_END | |||
| mr BO, BBUFFER | |||
| srawi. L, K, 3 | |||
| ble .LZGEMM_L2x1_SUB0 | |||
| ble ZGEMM_L2x1_SUB0 | |||
| cmpwi cr0, L, 1 | |||
| ble .LZGEMM_L2x1_SUB4 | |||
| ble ZGEMM_L2x1_SUB4 | |||
| .LZGEMM_L2x1_LOOP_START: | |||
| ZGEMM_L2x1_LOOP_START: | |||
| LOAD2x1_1 | |||
| KERNEL2x1_I1 | |||
| @@ -366,11 +394,11 @@ | |||
| KERNEL2x1_2 | |||
| addic. L, L, -2 | |||
| ble .LZGEMM_L2x1_LOOP_END | |||
| ble ZGEMM_L2x1_LOOP_END | |||
| .align 5 | |||
| .LZGEMM_L2x1_LOOP: | |||
| ZGEMM_L2x1_LOOP: | |||
| KERNEL2x1_1 | |||
| KERNEL2x1_2 | |||
| @@ -383,9 +411,9 @@ | |||
| KERNEL2x1_2 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L2x1_LOOP | |||
| bgt ZGEMM_L2x1_LOOP | |||
| .LZGEMM_L2x1_LOOP_END: | |||
| ZGEMM_L2x1_LOOP_END: | |||
| KERNEL2x1_1 | |||
| KERNEL2x1_2 | |||
| @@ -397,9 +425,9 @@ | |||
| KERNEL2x1_1 | |||
| KERNEL2x1_E2 | |||
| b .LZGEMM_L2x1_SUB1 | |||
| b ZGEMM_L2x1_SUB1 | |||
| .LZGEMM_L2x1_SUB4: | |||
| ZGEMM_L2x1_SUB4: | |||
| KERNEL2x1_SUBI1 | |||
| KERNEL2x1_SUB1 | |||
| @@ -411,72 +439,89 @@ | |||
| KERNEL2x1_SUB1 | |||
| KERNEL2x1_SUB1 | |||
| b .LZGEMM_L2x1_SUB1 | |||
| b ZGEMM_L2x1_SUB1 | |||
| .LZGEMM_L2x1_SUB0: | |||
| ZGEMM_L2x1_SUB0: | |||
| andi. L, K, 7 | |||
| KERNEL2x1_SUBI1 | |||
| addic. L, L, -1 | |||
| ble .LZGEMM_L2x1_SAVE | |||
| b .LZGEMM_L2x1_SUB2 | |||
| ble ZGEMM_L2x1_SAVE | |||
| b ZGEMM_L2x1_SUB2 | |||
| .LZGEMM_L2x1_SUB1: | |||
| ZGEMM_L2x1_SUB1: | |||
| andi. L, K, 7 | |||
| ble .LZGEMM_L2x1_SAVE | |||
| ble ZGEMM_L2x1_SAVE | |||
| .LZGEMM_L2x1_SUB2: | |||
| ZGEMM_L2x1_SUB2: | |||
| KERNEL2x1_SUB1 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L2x1_SUB2 | |||
| bgt ZGEMM_L2x1_SUB2 | |||
| .LZGEMM_L2x1_SAVE: | |||
| ZGEMM_L2x1_SAVE: | |||
| SAVE2x1 | |||
| .LZGEMM_L2x1_END: | |||
| ZGEMM_L2x1_END: | |||
| slwi T1, K, 5 | |||
| add B, B, T1 | |||
| addic. J, J, -1 | |||
| bgt .LZGEMM_L2_BEGIN | |||
| bgt ZGEMM_L2_BEGIN | |||
| andi. T2, N, 1 | |||
| ble .L999 | |||
| ble L999 | |||
| .LZGEMM_L2_END: | |||
| ZGEMM_L2_END: | |||
| b .LZGEMM_L1_BEGIN | |||
| b ZGEMM_L1_BEGIN | |||
| .L999_H1: | |||
| L999_H1: | |||
| b .L999 | |||
| b L999 | |||
| // Entry for the single remaining column (reached via "b ZGEMM_L1_BEGIN"). | |||
| // The B values are first expanded into BBUFFER: every 8-byte value read from | |||
| // B is written as a 16-byte vector holding that value twice. | |||
| ZGEMM_L1_BEGIN: | |||
| mr BO, B // BO walks the source B data | |||
| mr BBO, BBUFFER // BBO walks the expanded copy | |||
| slwi T1, K, 0 // shift by zero: T1 = K, the copy-loop counter | |||
| ZGEMM_L1_COPYB: | |||
| // lxvdsx loads one doubleword and replicates it into both halves of the | |||
| // vector. o0 is presumably the zero-offset register -- defined outside this | |||
| // hunk, confirm. | |||
| lxvdsx vs4, o0, BO // b0_r | |||
| lxvdsx vs5, o8, BO // b0_i | |||
| addi BO, BO, 16 // consumed 16 bytes of B | |||
| stxvd2x vs4, o0, BBO | |||
| stxvd2x vs5, o16, BBO | |||
| addic. T1, T1, -1 // sets CR0 for the bge below; the following addi leaves CR0 alone | |||
| addi BBO, BBO, 32 // produced 32 bytes of buffer | |||
| // NOTE(review): bge repeats while the decremented T1 is still >= 0, so the | |||
| // body runs T1 + 1 times (one 16-byte read and one 32-byte write more than | |||
| // K) -- confirm this is intended. | |||
| bge ZGEMM_L1_COPYB | |||
| .LZGEMM_L1_BEGIN: | |||
| andi. T1, N, 1 | |||
| ble .LZGEMM_L1_END | |||
| ble ZGEMM_L1_END | |||
| mr CO, C | |||
| mr AO, A | |||
| srawi. I, M, 3 | |||
| ble .LZGEMM_L1x8_END | |||
| ble ZGEMM_L1x8_END | |||
| .LZGEMM_L1x8_BEGIN: | |||
| ZGEMM_L1x8_BEGIN: | |||
| mr BO, B | |||
| mr BO, BBUFFER | |||
| srawi. L, K, 3 | |||
| ble .LZGEMM_L1x8_SUB0 | |||
| ble ZGEMM_L1x8_SUB0 | |||
| cmpwi cr0, L, 1 | |||
| ble .LZGEMM_L1x8_SUB4 | |||
| ble ZGEMM_L1x8_SUB4 | |||
| .LZGEMM_L1x8_LOOP_START: | |||
| ZGEMM_L1x8_LOOP_START: | |||
| dcbt AO, PRE | |||
| LOAD1x8_1 | |||
| @@ -499,11 +544,11 @@ | |||
| KERNEL1x8_2 | |||
| addic. L, L, -2 | |||
| ble .LZGEMM_L1x8_LOOP_END | |||
| ble ZGEMM_L1x8_LOOP_END | |||
| .align 5 | |||
| .LZGEMM_L1x8_LOOP: | |||
| ZGEMM_L1x8_LOOP: | |||
| dcbt AO, PRE | |||
| KERNEL1x8_1 | |||
| @@ -524,9 +569,9 @@ | |||
| KERNEL1x8_2 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L1x8_LOOP | |||
| bgt ZGEMM_L1x8_LOOP | |||
| .LZGEMM_L1x8_LOOP_END: | |||
| ZGEMM_L1x8_LOOP_END: | |||
| dcbt AO, PRE | |||
| KERNEL1x8_1 | |||
| @@ -545,9 +590,9 @@ | |||
| KERNEL1x8_1 | |||
| KERNEL1x8_E2 | |||
| b .LZGEMM_L1x8_SUB1 | |||
| b ZGEMM_L1x8_SUB1 | |||
| .LZGEMM_L1x8_SUB4: | |||
| ZGEMM_L1x8_SUB4: | |||
| dcbt AO, PRE | |||
| KERNEL1x8_SUBI1 | |||
| @@ -563,53 +608,53 @@ | |||
| KERNEL1x8_SUB1 | |||
| KERNEL1x8_SUB1 | |||
| b .LZGEMM_L1x8_SUB1 | |||
| b ZGEMM_L1x8_SUB1 | |||
| .LZGEMM_L1x8_SUB0: | |||
| ZGEMM_L1x8_SUB0: | |||
| andi. L, K, 7 | |||
| KERNEL1x8_SUBI1 | |||
| addic. L, L, -1 | |||
| ble .LZGEMM_L1x8_SAVE | |||
| b .LZGEMM_L1x8_SUB2 | |||
| ble ZGEMM_L1x8_SAVE | |||
| b ZGEMM_L1x8_SUB2 | |||
| .LZGEMM_L1x8_SUB1: | |||
| ZGEMM_L1x8_SUB1: | |||
| andi. L, K, 7 | |||
| ble .LZGEMM_L1x8_SAVE | |||
| ble ZGEMM_L1x8_SAVE | |||
| .LZGEMM_L1x8_SUB2: | |||
| ZGEMM_L1x8_SUB2: | |||
| KERNEL1x8_SUB1 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L1x8_SUB2 | |||
| bgt ZGEMM_L1x8_SUB2 | |||
| .LZGEMM_L1x8_SAVE: | |||
| ZGEMM_L1x8_SAVE: | |||
| SAVE1x8 | |||
| addic. I, I, -1 | |||
| bgt .LZGEMM_L1x8_BEGIN | |||
| bgt ZGEMM_L1x8_BEGIN | |||
| .LZGEMM_L1x8_END: | |||
| ZGEMM_L1x8_END: | |||
| .LZGEMM_L1x4_BEGIN: | |||
| ZGEMM_L1x4_BEGIN: | |||
| andi. T2, M, 7 | |||
| ble .LZGEMM_L1x1_END | |||
| ble ZGEMM_L1x1_END | |||
| andi. T1, M, 4 | |||
| ble .LZGEMM_L1x4_END | |||
| mr BO, B | |||
| ble ZGEMM_L1x4_END | |||
| mr BO, BBUFFER | |||
| srawi. L, K, 3 | |||
| ble .LZGEMM_L1x4_SUB0 | |||
| ble ZGEMM_L1x4_SUB0 | |||
| cmpwi cr0, L, 1 | |||
| ble .LZGEMM_L1x4_SUB4 | |||
| ble ZGEMM_L1x4_SUB4 | |||
| .LZGEMM_L1x4_LOOP_START: | |||
| ZGEMM_L1x4_LOOP_START: | |||
| LOAD1x4_1 | |||
| KERNEL1x4_I1 | |||
| @@ -623,11 +668,11 @@ | |||
| KERNEL1x4_2 | |||
| addic. L, L, -2 | |||
| ble .LZGEMM_L1x4_LOOP_END | |||
| ble ZGEMM_L1x4_LOOP_END | |||
| .align 5 | |||
| .LZGEMM_L1x4_LOOP: | |||
| ZGEMM_L1x4_LOOP: | |||
| KERNEL1x4_1 | |||
| KERNEL1x4_2 | |||
| @@ -640,9 +685,9 @@ | |||
| KERNEL1x4_2 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L1x4_LOOP | |||
| bgt ZGEMM_L1x4_LOOP | |||
| .LZGEMM_L1x4_LOOP_END: | |||
| ZGEMM_L1x4_LOOP_END: | |||
| KERNEL1x4_1 | |||
| KERNEL1x4_2 | |||
| @@ -654,9 +699,9 @@ | |||
| KERNEL1x4_1 | |||
| KERNEL1x4_E2 | |||
| b .LZGEMM_L1x4_SUB1 | |||
| b ZGEMM_L1x4_SUB1 | |||
| .LZGEMM_L1x4_SUB4: | |||
| ZGEMM_L1x4_SUB4: | |||
| KERNEL1x4_SUBI1 | |||
| KERNEL1x4_SUB1 | |||
| @@ -668,48 +713,48 @@ | |||
| KERNEL1x4_SUB1 | |||
| KERNEL1x4_SUB1 | |||
| b .LZGEMM_L1x4_SUB1 | |||
| b ZGEMM_L1x4_SUB1 | |||
| .LZGEMM_L1x4_SUB0: | |||
| ZGEMM_L1x4_SUB0: | |||
| andi. L, K, 7 | |||
| KERNEL1x4_SUBI1 | |||
| addic. L, L, -1 | |||
| ble .LZGEMM_L1x4_SAVE | |||
| b .LZGEMM_L1x4_SUB2 | |||
| ble ZGEMM_L1x4_SAVE | |||
| b ZGEMM_L1x4_SUB2 | |||
| .LZGEMM_L1x4_SUB1: | |||
| ZGEMM_L1x4_SUB1: | |||
| andi. L, K, 7 | |||
| ble .LZGEMM_L1x4_SAVE | |||
| ble ZGEMM_L1x4_SAVE | |||
| .LZGEMM_L1x4_SUB2: | |||
| ZGEMM_L1x4_SUB2: | |||
| KERNEL1x4_SUB1 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L1x4_SUB2 | |||
| bgt ZGEMM_L1x4_SUB2 | |||
| .LZGEMM_L1x4_SAVE: | |||
| ZGEMM_L1x4_SAVE: | |||
| SAVE1x4 | |||
| .LZGEMM_L1x4_END: | |||
| ZGEMM_L1x4_END: | |||
| .LZGEMM_L1x2_BEGIN: | |||
| ZGEMM_L1x2_BEGIN: | |||
| andi. T1, M, 2 | |||
| ble .LZGEMM_L1x2_END | |||
| mr BO, B | |||
| ble ZGEMM_L1x2_END | |||
| mr BO, BBUFFER | |||
| srawi. L, K, 3 | |||
| ble .LZGEMM_L1x2_SUB0 | |||
| ble ZGEMM_L1x2_SUB0 | |||
| cmpwi cr0, L, 1 | |||
| ble .LZGEMM_L1x2_SUB4 | |||
| ble ZGEMM_L1x2_SUB4 | |||
| .LZGEMM_L1x2_LOOP_START: | |||
| ZGEMM_L1x2_LOOP_START: | |||
| LOAD1x2_1 | |||
| KERNEL1x2_I1 | |||
| @@ -723,11 +768,11 @@ | |||
| KERNEL1x2_2 | |||
| addic. L, L, -2 | |||
| ble .LZGEMM_L1x2_LOOP_END | |||
| ble ZGEMM_L1x2_LOOP_END | |||
| .align 5 | |||
| .LZGEMM_L1x2_LOOP: | |||
| ZGEMM_L1x2_LOOP: | |||
| KERNEL1x2_1 | |||
| KERNEL1x2_2 | |||
| @@ -740,9 +785,9 @@ | |||
| KERNEL1x2_2 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L1x2_LOOP | |||
| bgt ZGEMM_L1x2_LOOP | |||
| .LZGEMM_L1x2_LOOP_END: | |||
| ZGEMM_L1x2_LOOP_END: | |||
| KERNEL1x2_1 | |||
| KERNEL1x2_2 | |||
| @@ -754,9 +799,9 @@ | |||
| KERNEL1x2_1 | |||
| KERNEL1x2_E2 | |||
| b .LZGEMM_L1x2_SUB1 | |||
| b ZGEMM_L1x2_SUB1 | |||
| .LZGEMM_L1x2_SUB4: | |||
| ZGEMM_L1x2_SUB4: | |||
| KERNEL1x2_SUBI1 | |||
| KERNEL1x2_SUB1 | |||
| @@ -768,48 +813,48 @@ | |||
| KERNEL1x2_SUB1 | |||
| KERNEL1x2_SUB1 | |||
| b .LZGEMM_L1x2_SUB1 | |||
| b ZGEMM_L1x2_SUB1 | |||
| .LZGEMM_L1x2_SUB0: | |||
| ZGEMM_L1x2_SUB0: | |||
| andi. L, K, 7 | |||
| KERNEL1x2_SUBI1 | |||
| addic. L, L, -1 | |||
| ble .LZGEMM_L1x2_SAVE | |||
| b .LZGEMM_L1x2_SUB2 | |||
| ble ZGEMM_L1x2_SAVE | |||
| b ZGEMM_L1x2_SUB2 | |||
| .LZGEMM_L1x2_SUB1: | |||
| ZGEMM_L1x2_SUB1: | |||
| andi. L, K, 7 | |||
| ble .LZGEMM_L1x2_SAVE | |||
| ble ZGEMM_L1x2_SAVE | |||
| .LZGEMM_L1x2_SUB2: | |||
| ZGEMM_L1x2_SUB2: | |||
| KERNEL1x2_SUB1 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L1x2_SUB2 | |||
| bgt ZGEMM_L1x2_SUB2 | |||
| .LZGEMM_L1x2_SAVE: | |||
| ZGEMM_L1x2_SAVE: | |||
| SAVE1x2 | |||
| .LZGEMM_L1x2_END: | |||
| ZGEMM_L1x2_END: | |||
| .LZGEMM_L1x1_BEGIN: | |||
| ZGEMM_L1x1_BEGIN: | |||
| andi. T1, M, 1 | |||
| ble .LZGEMM_L1x1_END | |||
| mr BO, B | |||
| ble ZGEMM_L1x1_END | |||
| mr BO, BBUFFER | |||
| srawi. L, K, 3 | |||
| ble .LZGEMM_L1x1_SUB0 | |||
| ble ZGEMM_L1x1_SUB0 | |||
| cmpwi cr0, L, 1 | |||
| ble .LZGEMM_L1x1_SUB4 | |||
| ble ZGEMM_L1x1_SUB4 | |||
| .LZGEMM_L1x1_LOOP_START: | |||
| ZGEMM_L1x1_LOOP_START: | |||
| LOAD1x1_1 | |||
| KERNEL1x1_I1 | |||
| @@ -823,11 +868,11 @@ | |||
| KERNEL1x1_2 | |||
| addic. L, L, -2 | |||
| ble .LZGEMM_L1x1_LOOP_END | |||
| ble ZGEMM_L1x1_LOOP_END | |||
| .align 5 | |||
| .LZGEMM_L1x1_LOOP: | |||
| ZGEMM_L1x1_LOOP: | |||
| KERNEL1x1_1 | |||
| KERNEL1x1_2 | |||
| @@ -840,9 +885,9 @@ | |||
| KERNEL1x1_2 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L1x1_LOOP | |||
| bgt ZGEMM_L1x1_LOOP | |||
| .LZGEMM_L1x1_LOOP_END: | |||
| ZGEMM_L1x1_LOOP_END: | |||
| KERNEL1x1_1 | |||
| KERNEL1x1_2 | |||
| @@ -854,9 +899,9 @@ | |||
| KERNEL1x1_1 | |||
| KERNEL1x1_E2 | |||
| b .LZGEMM_L1x1_SUB1 | |||
| b ZGEMM_L1x1_SUB1 | |||
| .LZGEMM_L1x1_SUB4: | |||
| ZGEMM_L1x1_SUB4: | |||
| KERNEL1x1_SUBI1 | |||
| KERNEL1x1_SUB1 | |||
| @@ -868,34 +913,34 @@ | |||
| KERNEL1x1_SUB1 | |||
| KERNEL1x1_SUB1 | |||
| b .LZGEMM_L1x1_SUB1 | |||
| b ZGEMM_L1x1_SUB1 | |||
| .LZGEMM_L1x1_SUB0: | |||
| ZGEMM_L1x1_SUB0: | |||
| andi. L, K, 7 | |||
| KERNEL1x1_SUBI1 | |||
| addic. L, L, -1 | |||
| ble .LZGEMM_L1x1_SAVE | |||
| b .LZGEMM_L1x1_SUB2 | |||
| ble ZGEMM_L1x1_SAVE | |||
| b ZGEMM_L1x1_SUB2 | |||
| .LZGEMM_L1x1_SUB1: | |||
| ZGEMM_L1x1_SUB1: | |||
| andi. L, K, 7 | |||
| ble .LZGEMM_L1x1_SAVE | |||
| ble ZGEMM_L1x1_SAVE | |||
| .LZGEMM_L1x1_SUB2: | |||
| ZGEMM_L1x1_SUB2: | |||
| KERNEL1x1_SUB1 | |||
| addic. L, L, -1 | |||
| bgt .LZGEMM_L1x1_SUB2 | |||
| bgt ZGEMM_L1x1_SUB2 | |||
| .LZGEMM_L1x1_SAVE: | |||
| ZGEMM_L1x1_SAVE: | |||
| SAVE1x1 | |||
| .LZGEMM_L1x1_END: | |||
| ZGEMM_L1x1_END: | |||
| .LZGEMM_L1_END: | |||
| ZGEMM_L1_END: | |||
| @@ -1,39 +1,3 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/05 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #if defined(NN) || defined(NT) || defined(TN) || defined(TT) | |||
| #define XSFADD_R1 xsadddp | |||
| @@ -70,12 +34,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro LOAD2x8_1 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvdsx vs18, o16, BO // load real part from B | |||
| lxvdsx vs19, o24, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| lxvd2x vs18, o32, BO // load real part from B | |||
| lxvd2x vs19, o48, BO // load imag part from B | |||
| addi BO, BO, 32 | |||
| addi BO, BO, 64 | |||
| lxvd2x vs0, o0, AO // load real,imag from A | |||
| lxvd2x vs1, o16, AO // load real,imag from A | |||
| @@ -110,12 +74,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 64 | |||
| lxvdsx vs20, o0, BO // load real part from B | |||
| lxvdsx vs21, o8, BO // load imag part from B | |||
| lxvdsx vs22, o16, BO // load real part from B | |||
| lxvdsx vs23, o24, BO // load imag part from B | |||
| lxvd2x vs20, o0, BO // load real part from B | |||
| lxvd2x vs21, o16, BO // load imag part from B | |||
| lxvd2x vs22, o32, BO // load real part from B | |||
| lxvd2x vs23, o48, BO // load imag part from B | |||
| addi BO, BO, 32 | |||
| addi BO, BO, 64 | |||
| xvmuldp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmuldp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -156,36 +120,41 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro KERNEL2x8_1 | |||
| lxvd2x vs8, o0, AO // load real,imag from A | |||
| lxvd2x vs9, o16, AO // load real,imag from A | |||
| lxvd2x vs10, o32, AO // load real,imag from A | |||
| lxvd2x vs11, o48, AO // load real,imag from A | |||
| addi AO, AO, 64 | |||
| lxvd2x vs12, o0, AO // load real,imag from A | |||
| lxvd2x vs13, o16, AO // load real,imag from A | |||
| lxvd2x vs14, o32, AO // load real,imag from A | |||
| lxvd2x vs15, o48, AO // load real,imag from A | |||
| addi AO, AO, 64 | |||
| lxvd2x vs20, o0, BO // load real part from B | |||
| lxvd2x vs21, o16, BO // load imag part from B | |||
| lxvd2x vs22, o32, BO // load real part from B | |||
| lxvd2x vs23, o48, BO // load imag part from B | |||
| addi BO, BO, 64 | |||
| xvmaddadp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag | |||
| xvmaddadp vs34, vs1, vs16 // real*real, imag*real | |||
| xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag | |||
| lxvdsx vs22, o16, BO // load real part from B | |||
| lxvdsx vs23, o24, BO // load imag part from B | |||
| xvmaddadp vs36, vs2, vs16 // real*real, imag*real | |||
| xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag | |||
| xvmaddadp vs38, vs3, vs16 // real*real, imag*real | |||
| xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag | |||
| lxvd2x vs8, o0, AO // load real,imag from A | |||
| lxvd2x vs9, o16, AO // load real,imag from A | |||
| xvmaddadp vs40, vs4, vs16 // real*real, imag*real | |||
| xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag | |||
| xvmaddadp vs42, vs5, vs16 // real*real, imag*real | |||
| xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag | |||
| lxvd2x vs10, o32, AO // load real,imag from A | |||
| lxvd2x vs11, o48, AO // load real,imag from A | |||
| xvmaddadp vs44, vs6, vs16 // real*real, imag*real | |||
| xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag | |||
| addi AO, AO, 64 | |||
| xvmaddadp vs46, vs7, vs16 // real*real, imag*real | |||
| xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag | |||
| @@ -193,101 +162,79 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag | |||
| xvmaddadp vs50, vs1, vs18 // real*real, imag*real | |||
| xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag | |||
| lxvd2x vs12, o0, AO // load real,imag from A | |||
| lxvd2x vs13, o16, AO // load real,imag from A | |||
| xvmaddadp vs52, vs2, vs18 // real*real, imag*real | |||
| xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag | |||
| xvmaddadp vs54, vs3, vs18 // real*real, imag*real | |||
| xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag | |||
| lxvd2x vs14, o32, AO // load real,imag from A | |||
| lxvd2x vs15, o48, AO // load real,imag from A | |||
| xvmaddadp vs56, vs4, vs18 // real*real, imag*real | |||
| xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag | |||
| xvmaddadp vs58, vs5, vs18 // real*real, imag*real | |||
| xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag | |||
| lxvdsx vs20, o0, BO // load real part from B | |||
| lxvdsx vs21, o8, BO // load imag part from B | |||
| xvmaddadp vs60, vs6, vs18 // real*real, imag*real | |||
| xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag | |||
| xvmaddadp vs62, vs7, vs18 // real*real, imag*real | |||
| xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag | |||
| addi AO, AO, 64 | |||
| addi BO, BO, 32 | |||
| .endm | |||
| .macro KERNEL2x8_2 | |||
| lxvd2x vs0, o0, AO // load real,imag from A | |||
| lxvd2x vs1, o16, AO // load real,imag from A | |||
| lxvd2x vs2, o32, AO // load real,imag from A | |||
| lxvd2x vs3, o48, AO // load real,imag from A | |||
| addi AO, AO, 64 | |||
| lxvd2x vs4, o0, AO // load real,imag from A | |||
| lxvd2x vs5, o16, AO // load real,imag from A | |||
| lxvd2x vs6, o32, AO // load real,imag from A | |||
| lxvd2x vs7, o48, AO // load real,imag from A | |||
| addi AO, AO, 64 | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| lxvd2x vs18, o32, BO // load real part from B | |||
| lxvd2x vs19, o48, BO // load imag part from B | |||
| addi BO, BO, 64 | |||
| xvmaddadp vs32, vs8, vs20 // real*real, imag*real | |||
| xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag | |||
| xvmaddadp vs34, vs9, vs20 // real*real, imag*real | |||
| xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| xvmaddadp vs36, vs10, vs20 // real*real, imag*real | |||
| xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag | |||
| xvmaddadp vs38, vs11, vs20 // real*real, imag*real | |||
| xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag | |||
| lxvd2x vs0, o0, AO // load real,imag from A | |||
| lxvd2x vs1, o16, AO // load real,imag from A | |||
| xvmaddadp vs40, vs12, vs20 // real*real, imag*real | |||
| xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag | |||
| xvmaddadp vs42, vs13, vs20 // real*real, imag*real | |||
| xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag | |||
| lxvd2x vs2, o32, AO // load real,imag from A | |||
| lxvd2x vs3, o48, AO // load real,imag from A | |||
| xvmaddadp vs44, vs14, vs20 // real*real, imag*real | |||
| xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag | |||
| xvmaddadp vs46, vs15, vs20 // real*real, imag*real | |||
| xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag | |||
| addi AO, AO, 64 | |||
| xvmaddadp vs48, vs8, vs22 // real*real, imag*real | |||
| xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag | |||
| xvmaddadp vs50, vs9, vs22 // real*real, imag*real | |||
| xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag | |||
| lxvd2x vs4, o0, AO // load real,imag from A | |||
| lxvd2x vs5, o16, AO // load real,imag from A | |||
| xvmaddadp vs52, vs10, vs22 // real*real, imag*real | |||
| xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag | |||
| xvmaddadp vs54, vs11, vs22 // real*real, imag*real | |||
| xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag | |||
| lxvd2x vs6, o32, AO // load real,imag from A | |||
| lxvd2x vs7, o48, AO // load real,imag from A | |||
| xvmaddadp vs56, vs12, vs22 // real*real, imag*real | |||
| xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag | |||
| xvmaddadp vs58, vs13, vs22 // real*real, imag*real | |||
| xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag | |||
| lxvdsx vs18, o16, BO // load real part from B | |||
| lxvdsx vs19, o24, BO // load imag part from B | |||
| xvmaddadp vs60, vs14, vs22 // real*real, imag*real | |||
| xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag | |||
| xvmaddadp vs62, vs15, vs22 // real*real, imag*real | |||
| xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag | |||
| addi AO, AO, 64 | |||
| addi BO, BO, 32 | |||
| .endm | |||
| @@ -347,12 +294,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 64 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvdsx vs18, o16, BO // load real part from B | |||
| lxvdsx vs19, o24, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| lxvd2x vs18, o32, BO // load real part from B | |||
| lxvd2x vs19, o48, BO // load imag part from B | |||
| addi BO, BO, 32 | |||
| addi BO, BO, 64 | |||
| xvmuldp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmuldp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -407,12 +354,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 64 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvdsx vs18, o16, BO // load real part from B | |||
| lxvdsx vs19, o24, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| lxvd2x vs18, o32, BO // load real part from B | |||
| lxvd2x vs19, o48, BO // load imag part from B | |||
| addi BO, BO, 32 | |||
| addi BO, BO, 64 | |||
| xvmaddadp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -927,12 +874,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro LOAD2x4_1 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvdsx vs18, o16, BO // load real part from B | |||
| lxvdsx vs19, o24, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| lxvd2x vs18, o32, BO // load real part from B | |||
| lxvd2x vs19, o48, BO // load imag part from B | |||
| addi BO, BO, 32 | |||
| addi BO, BO, 64 | |||
| lxvd2x vs0, o0, AO // load real,imag from A | |||
| lxvd2x vs1, o16, AO // load real,imag from A | |||
| @@ -953,12 +900,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 64 | |||
| lxvdsx vs20, o0, BO // load real part from B | |||
| lxvdsx vs21, o8, BO // load imag part from B | |||
| lxvdsx vs22, o16, BO // load real part from B | |||
| lxvdsx vs23, o24, BO // load imag part from B | |||
| lxvd2x vs20, o0, BO // load real part from B | |||
| lxvd2x vs21, o16, BO // load imag part from B | |||
| lxvd2x vs22, o32, BO // load real part from B | |||
| lxvd2x vs23, o48, BO // load imag part from B | |||
| addi BO, BO, 32 | |||
| addi BO, BO, 64 | |||
| xvmuldp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmuldp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -990,12 +937,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 64 | |||
| lxvdsx vs20, o0, BO // load real part from B | |||
| lxvdsx vs21, o8, BO // load imag part from B | |||
| lxvdsx vs22, o16, BO // load real part from B | |||
| lxvdsx vs23, o24, BO // load imag part from B | |||
| lxvd2x vs20, o0, BO // load real part from B | |||
| lxvd2x vs21, o16, BO // load imag part from B | |||
| lxvd2x vs22, o32, BO // load real part from B | |||
| lxvd2x vs23, o48, BO // load imag part from B | |||
| addi BO, BO, 32 | |||
| addi BO, BO, 64 | |||
| xvmaddadp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -1027,12 +974,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 64 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvdsx vs18, o16, BO // load real part from B | |||
| lxvdsx vs19, o24, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| lxvd2x vs18, o32, BO // load real part from B | |||
| lxvd2x vs19, o48, BO // load imag part from B | |||
| addi BO, BO, 32 | |||
| addi BO, BO, 64 | |||
| xvmaddadp vs32, vs8, vs20 // real*real, imag*real | |||
| xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag | |||
| @@ -1088,12 +1035,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 64 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvdsx vs18, o16, BO // load real part from B | |||
| lxvdsx vs19, o24, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| lxvd2x vs18, o32, BO // load real part from B | |||
| lxvd2x vs19, o48, BO // load imag part from B | |||
| addi BO, BO, 32 | |||
| addi BO, BO, 64 | |||
| xvmuldp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmuldp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -1125,12 +1072,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 64 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvdsx vs18, o16, BO // load real part from B | |||
| lxvdsx vs19, o24, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| lxvd2x vs18, o32, BO // load real part from B | |||
| lxvd2x vs19, o48, BO // load imag part from B | |||
| addi BO, BO, 32 | |||
| addi BO, BO, 64 | |||
| xvmaddadp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -1410,12 +1357,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro LOAD2x2_1 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvdsx vs18, o16, BO // load real part from B | |||
| lxvdsx vs19, o24, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| lxvd2x vs18, o32, BO // load real part from B | |||
| lxvd2x vs19, o48, BO // load imag part from B | |||
| addi BO, BO, 32 | |||
| addi BO, BO, 64 | |||
| lxvd2x vs0, o0, AO // load real,imag from A | |||
| lxvd2x vs1, o16, AO // load real,imag from A | |||
| @@ -1432,12 +1379,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 32 | |||
| lxvdsx vs20, o0, BO // load real part from B | |||
| lxvdsx vs21, o8, BO // load imag part from B | |||
| lxvdsx vs22, o16, BO // load real part from B | |||
| lxvdsx vs23, o24, BO // load imag part from B | |||
| lxvd2x vs20, o0, BO // load real part from B | |||
| lxvd2x vs21, o16, BO // load imag part from B | |||
| lxvd2x vs22, o32, BO // load real part from B | |||
| lxvd2x vs23, o48, BO // load imag part from B | |||
| addi BO, BO, 32 | |||
| addi BO, BO, 64 | |||
| xvmuldp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmuldp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -1459,12 +1406,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 32 | |||
| lxvdsx vs20, o0, BO // load real part from B | |||
| lxvdsx vs21, o8, BO // load imag part from B | |||
| lxvdsx vs22, o16, BO // load real part from B | |||
| lxvdsx vs23, o24, BO // load imag part from B | |||
| lxvd2x vs20, o0, BO // load real part from B | |||
| lxvd2x vs21, o16, BO // load imag part from B | |||
| lxvd2x vs22, o32, BO // load real part from B | |||
| lxvd2x vs23, o48, BO // load imag part from B | |||
| addi BO, BO, 32 | |||
| addi BO, BO, 64 | |||
| xvmaddadp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -1486,12 +1433,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 32 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvdsx vs18, o16, BO // load real part from B | |||
| lxvdsx vs19, o24, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| lxvd2x vs18, o32, BO // load real part from B | |||
| lxvd2x vs19, o48, BO // load imag part from B | |||
| addi BO, BO, 32 | |||
| addi BO, BO, 64 | |||
| xvmaddadp vs32, vs8, vs20 // real*real, imag*real | |||
| xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag | |||
| @@ -1529,12 +1476,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 32 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvdsx vs18, o16, BO // load real part from B | |||
| lxvdsx vs19, o24, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| lxvd2x vs18, o32, BO // load real part from B | |||
| lxvd2x vs19, o48, BO // load imag part from B | |||
| addi BO, BO, 32 | |||
| addi BO, BO, 64 | |||
| xvmuldp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmuldp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -1556,12 +1503,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 32 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvdsx vs18, o16, BO // load real part from B | |||
| lxvdsx vs19, o24, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| lxvd2x vs18, o32, BO // load real part from B | |||
| lxvd2x vs19, o48, BO // load imag part from B | |||
| addi BO, BO, 32 | |||
| addi BO, BO, 64 | |||
| xvmaddadp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -1725,12 +1672,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro LOAD2x1_1 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvdsx vs18, o16, BO // load real part from B | |||
| lxvdsx vs19, o24, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| lxvd2x vs18, o32, BO // load real part from B | |||
| lxvd2x vs19, o48, BO // load imag part from B | |||
| addi BO, BO, 32 | |||
| addi BO, BO, 64 | |||
| lxvd2x vs0, o0, AO // load real,imag from A | |||
| @@ -1745,12 +1692,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 16 | |||
| lxvdsx vs20, o0, BO // load real part from B | |||
| lxvdsx vs21, o8, BO // load imag part from B | |||
| lxvdsx vs22, o16, BO // load real part from B | |||
| lxvdsx vs23, o24, BO // load imag part from B | |||
| lxvd2x vs20, o0, BO // load real part from B | |||
| lxvd2x vs21, o16, BO // load imag part from B | |||
| lxvd2x vs22, o32, BO // load real part from B | |||
| lxvd2x vs23, o48, BO // load imag part from B | |||
| addi BO, BO, 32 | |||
| addi BO, BO, 64 | |||
| xvmuldp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmuldp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -1767,12 +1714,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 16 | |||
| lxvdsx vs20, o0, BO // load real part from B | |||
| lxvdsx vs21, o8, BO // load imag part from B | |||
| lxvdsx vs22, o16, BO // load real part from B | |||
| lxvdsx vs23, o24, BO // load imag part from B | |||
| lxvd2x vs20, o0, BO // load real part from B | |||
| lxvd2x vs21, o16, BO // load imag part from B | |||
| lxvd2x vs22, o32, BO // load real part from B | |||
| lxvd2x vs23, o48, BO // load imag part from B | |||
| addi BO, BO, 32 | |||
| addi BO, BO, 64 | |||
| xvmaddadp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -1789,12 +1736,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 16 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvdsx vs18, o16, BO // load real part from B | |||
| lxvdsx vs19, o24, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| lxvd2x vs18, o32, BO // load real part from B | |||
| lxvd2x vs19, o48, BO // load imag part from B | |||
| addi BO, BO, 32 | |||
| addi BO, BO, 64 | |||
| xvmaddadp vs32, vs8, vs20 // real*real, imag*real | |||
| xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag | |||
| @@ -1823,12 +1770,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 16 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvdsx vs18, o16, BO // load real part from B | |||
| lxvdsx vs19, o24, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| lxvd2x vs18, o32, BO // load real part from B | |||
| lxvd2x vs19, o48, BO // load imag part from B | |||
| addi BO, BO, 32 | |||
| addi BO, BO, 64 | |||
| xvmuldp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmuldp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -1845,12 +1792,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 16 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvdsx vs18, o16, BO // load real part from B | |||
| lxvdsx vs19, o24, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| lxvd2x vs18, o32, BO // load real part from B | |||
| lxvd2x vs19, o48, BO // load imag part from B | |||
| addi BO, BO, 32 | |||
| addi BO, BO, 64 | |||
| xvmaddadp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -1956,10 +1903,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro LOAD1x8_1 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| addi BO, BO, 16 | |||
| addi BO, BO, 32 | |||
| lxvd2x vs0, o0, AO // load real,imag from A | |||
| lxvd2x vs1, o16, AO // load real,imag from A | |||
| @@ -1994,10 +1941,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 64 | |||
| lxvdsx vs20, o0, BO // load real part from B | |||
| lxvdsx vs21, o8, BO // load imag part from B | |||
| lxvd2x vs20, o0, BO // load real part from B | |||
| lxvd2x vs21, o16, BO // load imag part from B | |||
| addi BO, BO, 16 | |||
| addi BO, BO, 32 | |||
| xvmuldp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmuldp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -2035,10 +1982,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 64 | |||
| lxvdsx vs20, o0, BO // load real part from B | |||
| lxvdsx vs21, o8, BO // load imag part from B | |||
| lxvd2x vs20, o0, BO // load real part from B | |||
| lxvd2x vs21, o16, BO // load imag part from B | |||
| addi BO, BO, 16 | |||
| addi BO, BO, 32 | |||
| xvmaddadp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -2076,10 +2023,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 64 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| addi BO, BO, 16 | |||
| addi BO, BO, 32 | |||
| xvmaddadp vs32, vs8, vs20 // real*real, imag*real | |||
| xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag | |||
| @@ -2140,10 +2087,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 64 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| addi BO, BO, 16 | |||
| addi BO, BO, 32 | |||
| xvmuldp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmuldp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -2181,10 +2128,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 64 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| addi BO, BO, 16 | |||
| addi BO, BO, 32 | |||
| xvmaddadp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -2452,10 +2399,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro LOAD1x4_1 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| addi BO, BO, 16 | |||
| addi BO, BO, 32 | |||
| lxvd2x vs0, o0, AO // load real,imag from A | |||
| lxvd2x vs1, o16, AO // load real,imag from A | |||
| @@ -2476,10 +2423,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 64 | |||
| lxvdsx vs20, o0, BO // load real part from B | |||
| lxvdsx vs21, o8, BO // load imag part from B | |||
| lxvd2x vs20, o0, BO // load real part from B | |||
| lxvd2x vs21, o16, BO // load imag part from B | |||
| addi BO, BO, 16 | |||
| addi BO, BO, 32 | |||
| xvmuldp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmuldp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -2502,10 +2449,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 64 | |||
| lxvdsx vs20, o0, BO // load real part from B | |||
| lxvdsx vs21, o8, BO // load imag part from B | |||
| lxvd2x vs20, o0, BO // load real part from B | |||
| lxvd2x vs21, o16, BO // load imag part from B | |||
| addi BO, BO, 16 | |||
| addi BO, BO, 32 | |||
| xvmaddadp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -2528,10 +2475,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 64 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| addi BO, BO, 16 | |||
| addi BO, BO, 32 | |||
| xvmaddadp vs32, vs8, vs20 // real*real, imag*real | |||
| xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag | |||
| @@ -2569,10 +2516,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 64 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| addi BO, BO, 16 | |||
| addi BO, BO, 32 | |||
| xvmuldp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmuldp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -2595,10 +2542,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 64 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| addi BO, BO, 16 | |||
| addi BO, BO, 32 | |||
| xvmaddadp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -2748,10 +2695,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro LOAD1x2_1 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| addi BO, BO, 16 | |||
| addi BO, BO, 32 | |||
| lxvd2x vs0, o0, AO // load real,imag from A | |||
| lxvd2x vs1, o16, AO // load real,imag from A | |||
| @@ -2768,10 +2715,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 32 | |||
| lxvdsx vs20, o0, BO // load real part from B | |||
| lxvdsx vs21, o8, BO // load imag part from B | |||
| lxvd2x vs20, o0, BO // load real part from B | |||
| lxvd2x vs21, o16, BO // load imag part from B | |||
| addi BO, BO, 16 | |||
| addi BO, BO, 32 | |||
| xvmuldp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmuldp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -2788,10 +2735,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 32 | |||
| lxvdsx vs20, o0, BO // load real part from B | |||
| lxvdsx vs21, o8, BO // load imag part from B | |||
| lxvd2x vs20, o0, BO // load real part from B | |||
| lxvd2x vs21, o16, BO // load imag part from B | |||
| addi BO, BO, 16 | |||
| addi BO, BO, 32 | |||
| xvmaddadp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -2808,10 +2755,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 32 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| addi BO, BO, 16 | |||
| addi BO, BO, 32 | |||
| xvmaddadp vs32, vs8, vs20 // real*real, imag*real | |||
| xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag | |||
| @@ -2839,10 +2786,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 32 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| addi BO, BO, 16 | |||
| addi BO, BO, 32 | |||
| xvmuldp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmuldp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -2859,10 +2806,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 32 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| addi BO, BO, 16 | |||
| addi BO, BO, 32 | |||
| xvmaddadp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -2954,10 +2901,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| .macro LOAD1x1_1 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| addi BO, BO, 16 | |||
| addi BO, BO, 32 | |||
| lxvd2x vs0, o0, AO // load real,imag from A | |||
| @@ -2972,10 +2919,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 16 | |||
| lxvdsx vs20, o0, BO // load real part from B | |||
| lxvdsx vs21, o8, BO // load imag part from B | |||
| lxvd2x vs20, o0, BO // load real part from B | |||
| lxvd2x vs21, o16, BO // load imag part from B | |||
| addi BO, BO, 16 | |||
| addi BO, BO, 32 | |||
| xvmuldp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmuldp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -2989,10 +2936,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 16 | |||
| lxvdsx vs20, o0, BO // load real part from B | |||
| lxvdsx vs21, o8, BO // load imag part from B | |||
| lxvd2x vs20, o0, BO // load real part from B | |||
| lxvd2x vs21, o16, BO // load imag part from B | |||
| addi BO, BO, 16 | |||
| addi BO, BO, 32 | |||
| xvmaddadp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -3006,10 +2953,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 16 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| addi BO, BO, 16 | |||
| addi BO, BO, 32 | |||
| xvmaddadp vs32, vs8, vs20 // real*real, imag*real | |||
| xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag | |||
| @@ -3032,10 +2979,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 16 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| addi BO, BO, 16 | |||
| addi BO, BO, 32 | |||
| xvmuldp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmuldp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -3049,10 +2996,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| addi AO, AO, 16 | |||
| lxvdsx vs16, o0, BO // load real part from B | |||
| lxvdsx vs17, o8, BO // load imag part from B | |||
| lxvd2x vs16, o0, BO // load real part from B | |||
| lxvd2x vs17, o16, BO // load imag part from B | |||
| addi BO, BO, 16 | |||
| addi BO, BO, 32 | |||
| xvmaddadp vs32, vs0, vs16 // real*real, imag*real | |||
| xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag | |||
| @@ -0,0 +1,176 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/27 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #pragma GCC optimize "O1" | |||
| #if defined(POWER8) | |||
| #include "zscal_microk_power8.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_8 | |||
| static void zscal_kernel_8(BLASLONG n, FLOAT *x, FLOAT *alpha) | |||
| { | |||
| BLASLONG i=0; | |||
| FLOAT *x1=x; | |||
| FLOAT alpha_r1=alpha[0]; | |||
| FLOAT alpha_r2=alpha[1]; | |||
| FLOAT alpha_i1=alpha[2]; | |||
| FLOAT alpha_i2=alpha[3]; | |||
| FLOAT temp00, temp01, temp10, temp11, temp20, temp21, temp30, temp31; | |||
| FLOAT x0_r, x0_i, x1_r, x1_i, x2_r, x2_i, x3_r, x3_i; | |||
| while ( i<n ) | |||
| { | |||
| x0_r = x1[0]; | |||
| x0_i = x1[1]; | |||
| x1_r = x1[2]; | |||
| x1_i = x1[3]; | |||
| x2_r = x1[4]; | |||
| x2_i = x1[5]; | |||
| x3_r = x1[6]; | |||
| x3_i = x1[7]; | |||
| temp00 = x0_r * alpha_r1; | |||
| temp10 = x1_r * alpha_r1; | |||
| temp20 = x2_r * alpha_r1; | |||
| temp30 = x3_r * alpha_r1; | |||
| temp01 = x0_i * alpha_r2; | |||
| temp11 = x1_i * alpha_r2; | |||
| temp21 = x2_i * alpha_r2; | |||
| temp31 = x3_i * alpha_r2; | |||
| temp00 += x0_i * alpha_i1; | |||
| temp10 += x1_i * alpha_i1; | |||
| temp20 += x2_i * alpha_i1; | |||
| temp30 += x3_i * alpha_i1; | |||
| temp01 += x0_r * alpha_i2; | |||
| temp11 += x1_r * alpha_i2; | |||
| temp21 += x2_r * alpha_i2; | |||
| temp31 += x3_r * alpha_i2; | |||
| x1[0] = temp00; | |||
| x1[1] = temp01; | |||
| x1[2] = temp10; | |||
| x1[3] = temp11; | |||
| x1[4] = temp20; | |||
| x1[5] = temp21; | |||
| x1[6] = temp30; | |||
| x1[7] = temp31; | |||
| x1 += 8; | |||
| i+=4; | |||
| } | |||
| return; | |||
| } | |||
| #endif | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG inc_x2; | |||
| BLASLONG ip = 0; | |||
| FLOAT temp; | |||
| FLOAT alpha[4] __attribute__ ((aligned (16)));; | |||
| BLASLONG n1; | |||
| if ( n <= 0 ) | |||
| return(0); | |||
| if ( inc_x <= 0 ) | |||
| return(0); | |||
| if ( inc_x == 1 ) | |||
| { | |||
| n1 = n & -8; | |||
| if ( n1 > 0 ) | |||
| { | |||
| alpha[0] = da_r; | |||
| alpha[1] = da_r; | |||
| alpha[2] = -da_i; | |||
| alpha[3] = da_i; | |||
| zscal_kernel_8(n1, x, alpha); | |||
| i=n1; | |||
| ip = n1 * 2; | |||
| } | |||
| while ( i < n ) | |||
| { | |||
| temp = da_r * x[ip] - da_i * x[ip+1] ; | |||
| x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; | |||
| x[ip] = temp; | |||
| ip += 2; | |||
| i++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| inc_x2 = 2 * inc_x; | |||
| while ( i < n ) | |||
| { | |||
| temp = da_r * x[ip] - da_i * x[ip+1] ; | |||
| x[ip+1] = da_r * x[ip+1] + da_i * x[ip] ; | |||
| x[ip] = temp; | |||
| ip += inc_x2; | |||
| i++; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,224 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/25 Werner Saar (wernsaar@googlemail.com) | |||
| * | |||
| * I don't use fused multiply-add ( lapack precision problems ) | |||
| * | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define HAVE_KERNEL_8 1 | |||
| static void zscal_kernel_8( BLASLONG n, FLOAT *x, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void zscal_kernel_8( BLASLONG n, FLOAT *x, FLOAT *alpha) | |||
| { | |||
| BLASLONG i = n; | |||
| BLASLONG o16 = 16; | |||
| BLASLONG o32 = 32; | |||
| BLASLONG o48 = 48; | |||
| BLASLONG o64 = 64; | |||
| BLASLONG o80 = 80; | |||
| BLASLONG o96 = 96; | |||
| BLASLONG o112 = 112; | |||
| FLOAT *x1=x; | |||
| FLOAT *x2=x+1; | |||
| BLASLONG pre = 384; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "lxvd2x 32, 0, %3 \n\t" // alpha_r , alpha_r | |||
| "lxvd2x 33, %5, %3 \n\t" // -alpha_i , alpha_i | |||
| "addi %1, %1, -8 \n\t" | |||
| "dcbt %2, %4 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "lxvd2x 44, %8, %2 \n\t" | |||
| "lxvd2x 45, %9, %2 \n\t" | |||
| "lxvd2x 46, %10, %2 \n\t" | |||
| "lxvd2x 47, %11, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addic. %0 , %0 , -8 \n\t" | |||
| "ble 2f \n\t" | |||
| ".align 5 \n\t" | |||
| "1: \n\t" | |||
| "dcbt %2, %4 \n\t" | |||
| "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r | |||
| "xvmuldp 49, 41, 32 \n\t" | |||
| "xvmuldp 50, 42, 32 \n\t" | |||
| "xvmuldp 51, 43, 32 \n\t" | |||
| "xvmuldp 52, 44, 32 \n\t" | |||
| "xvmuldp 53, 45, 32 \n\t" | |||
| "xvmuldp 54, 46, 32 \n\t" | |||
| "xvmuldp 55, 47, 32 \n\t" | |||
| "xxswapd 56, 40 \n\t" | |||
| "xxswapd 57, 41 \n\t" | |||
| "xxswapd 58, 42 \n\t" | |||
| "xxswapd 59, 43 \n\t" | |||
| "xxswapd 60, 44 \n\t" | |||
| "xxswapd 61, 45 \n\t" | |||
| "xxswapd 62, 46 \n\t" | |||
| "xxswapd 63, 47 \n\t" | |||
| "xvmuldp 56, 56, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i | |||
| "xvmuldp 57, 57, 33 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "xvmuldp 58, 58, 33 \n\t" | |||
| "xvmuldp 59, 59, 33 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "xvmuldp 60, 60, 33 \n\t" | |||
| "xvmuldp 61, 61, 33 \n\t" | |||
| "lxvd2x 44, %8, %2 \n\t" | |||
| "lxvd2x 45, %9, %2 \n\t" | |||
| "xvmuldp 62, 62, 33 \n\t" | |||
| "xvmuldp 63, 63, 33 \n\t" | |||
| "lxvd2x 46, %10, %2 \n\t" | |||
| "lxvd2x 47, %11, %2 \n\t" | |||
| "xvadddp 48, 48 , 56 \n\t" | |||
| "xvadddp 49, 49 , 57 \n\t" | |||
| "xvadddp 50, 50 , 58 \n\t" | |||
| "xvadddp 51, 51 , 59 \n\t" | |||
| "stxvd2x 48, 0, %1 \n\t" | |||
| "stxvd2x 49, %5, %1 \n\t" | |||
| "xvadddp 52, 52 , 60 \n\t" | |||
| "xvadddp 53, 53 , 61 \n\t" | |||
| "stxvd2x 50, %6, %1 \n\t" | |||
| "stxvd2x 51, %7, %1 \n\t" | |||
| "xvadddp 54, 54 , 62 \n\t" | |||
| "xvadddp 55, 55 , 63 \n\t" | |||
| "stxvd2x 52, %8, %1 \n\t" | |||
| "stxvd2x 53, %9, %1 \n\t" | |||
| "stxvd2x 54, %10, %1 \n\t" | |||
| "stxvd2x 55, %11, %1 \n\t" | |||
| "addi %1, %1, 128 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "addic. %0 , %0 , -8 \n\t" | |||
| "bgt 1b \n\t" | |||
| "2: \n\t" | |||
| "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r | |||
| "xvmuldp 49, 41, 32 \n\t" | |||
| "xvmuldp 50, 42, 32 \n\t" | |||
| "xvmuldp 51, 43, 32 \n\t" | |||
| "xvmuldp 52, 44, 32 \n\t" | |||
| "xvmuldp 53, 45, 32 \n\t" | |||
| "xvmuldp 54, 46, 32 \n\t" | |||
| "xvmuldp 55, 47, 32 \n\t" | |||
| "xxswapd 56, 40 \n\t" | |||
| "xxswapd 57, 41 \n\t" | |||
| "xxswapd 58, 42 \n\t" | |||
| "xxswapd 59, 43 \n\t" | |||
| "xxswapd 60, 44 \n\t" | |||
| "xxswapd 61, 45 \n\t" | |||
| "xxswapd 62, 46 \n\t" | |||
| "xxswapd 63, 47 \n\t" | |||
| "xvmuldp 56, 56, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i | |||
| "xvmuldp 57, 57, 33 \n\t" | |||
| "xvmuldp 58, 58, 33 \n\t" | |||
| "xvmuldp 59, 59, 33 \n\t" | |||
| "xvmuldp 60, 60, 33 \n\t" | |||
| "xvmuldp 61, 61, 33 \n\t" | |||
| "xvmuldp 62, 62, 33 \n\t" | |||
| "xvmuldp 63, 63, 33 \n\t" | |||
| "xvadddp 48, 48 , 56 \n\t" | |||
| "xvadddp 49, 49 , 57 \n\t" | |||
| "xvadddp 50, 50 , 58 \n\t" | |||
| "xvadddp 51, 51 , 59 \n\t" | |||
| "xvadddp 52, 52 , 60 \n\t" | |||
| "xvadddp 53, 53 , 61 \n\t" | |||
| "xvadddp 54, 54 , 62 \n\t" | |||
| "xvadddp 55, 55 , 63 \n\t" | |||
| "stxvd2x 48, 0, %1 \n\t" | |||
| "stxvd2x 49, %5, %1 \n\t" | |||
| "stxvd2x 50, %6, %1 \n\t" | |||
| "stxvd2x 51, %7, %1 \n\t" | |||
| "stxvd2x 52, %8, %1 \n\t" | |||
| "stxvd2x 53, %9, %1 \n\t" | |||
| "stxvd2x 54, %10, %1 \n\t" | |||
| "stxvd2x 55, %11, %1 \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (x2), // 1 | |||
| "r" (x1), // 2 | |||
| "r" (alpha), // 3 | |||
| "r" (pre), // 4 | |||
| "r" (o16), // 5 | |||
| "r" (o32), // 6 | |||
| "r" (o48), // 7 | |||
| "r" (o64), // 8 | |||
| "r" (o80), // 9 | |||
| "r" (o96), // 10 | |||
| "r" (o112) // 11 | |||
| : "cr0", "%0", "%2" , "%1", "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,175 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/27 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(POWER8) | |||
| #include "zswap_microk_power8.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_16 | |||
| static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG i=0; | |||
| FLOAT f0, f1, f2, f3, f4, f5, f6, f7; | |||
| FLOAT g0, g1, g2, g3, g4, g5, g6, g7; | |||
| FLOAT *x1=x; | |||
| FLOAT *y1=y; | |||
| while ( i<n ) | |||
| { | |||
| f0 = x1[0]; | |||
| f1 = x1[1]; | |||
| f2 = x1[2]; | |||
| f3 = x1[3]; | |||
| f4 = x1[4]; | |||
| f5 = x1[5]; | |||
| f6 = x1[6]; | |||
| f7 = x1[7]; | |||
| g0 = y1[0]; | |||
| g1 = y1[1]; | |||
| g2 = y1[2]; | |||
| g3 = y1[3]; | |||
| g4 = y1[4]; | |||
| g5 = y1[5]; | |||
| g6 = y1[6]; | |||
| g7 = y1[7]; | |||
| y1[0] = f0; | |||
| y1[1] = f1; | |||
| y1[2] = f2; | |||
| y1[3] = f3; | |||
| y1[4] = f4; | |||
| y1[5] = f5; | |||
| y1[6] = f6; | |||
| y1[7] = f7; | |||
| x1[0] = g0; | |||
| x1[1] = g1; | |||
| x1[2] = g2; | |||
| x1[3] = g3; | |||
| x1[4] = g4; | |||
| x1[5] = g5; | |||
| x1[6] = g6; | |||
| x1[7] = g7; | |||
| x1 += 8; | |||
| y1 += 8; | |||
| i+=4; | |||
| } | |||
| return; | |||
| } | |||
| #endif | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| FLOAT temp[2]; | |||
| BLASLONG inc_x2, inc_y2; | |||
| if ( n <= 0 ) return(0); | |||
| if ( (inc_x == 1) && (inc_y == 1 )) | |||
| { | |||
| BLASLONG n1 = n & -16; | |||
| if ( n1 > 0 ) | |||
| { | |||
| zswap_kernel_16(n1, x, y); | |||
| i=n1; | |||
| ix = 2* n1; | |||
| iy = 2* n1; | |||
| } | |||
| while(i < n) | |||
| { | |||
| temp[0] = x[ix] ; | |||
| temp[1] = x[ix+1] ; | |||
| x[ix] = y[iy] ; | |||
| x[ix+1] = y[iy+1] ; | |||
| y[iy] = temp[0] ; | |||
| y[iy+1] = temp[1] ; | |||
| ix += 2 ; | |||
| iy += 2 ; | |||
| i++ ; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| inc_x2 = 2 * inc_x; | |||
| inc_y2 = 2 * inc_y; | |||
| while(i < n) | |||
| { | |||
| temp[0] = x[ix] ; | |||
| temp[1] = x[ix+1] ; | |||
| x[ix] = y[iy] ; | |||
| x[ix+1] = y[iy+1] ; | |||
| y[iy] = temp[0] ; | |||
| y[iy+1] = temp[1] ; | |||
| ix += inc_x2 ; | |||
| iy += inc_y2 ; | |||
| i++ ; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,180 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/27 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
| #define HAVE_KERNEL_16 1 | |||
| static void zswap_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void zswap_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG i = n; | |||
| BLASLONG o16 = 16; | |||
| BLASLONG o32 = 32; | |||
| BLASLONG o48 = 48; | |||
| BLASLONG o64 = 64; | |||
| BLASLONG o80 = 80; | |||
| BLASLONG o96 = 96; | |||
| BLASLONG o112 = 112; | |||
| FLOAT *x1=x; | |||
| FLOAT *y1=y; | |||
| FLOAT *x2=x+1; | |||
| FLOAT *y2=y+1; | |||
| BLASLONG pre = 384; | |||
| BLASLONG alpha=0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "addi %3, %3, -8 \n\t" | |||
| "addi %4, %4, -8 \n\t" | |||
| ".align 5 \n\t" | |||
| "1: \n\t" | |||
| "lxvd2x 32, 0, %2 \n\t" | |||
| "lxvd2x 33, %5, %2 \n\t" | |||
| "lxvd2x 34, %6, %2 \n\t" | |||
| "lxvd2x 35, %7, %2 \n\t" | |||
| "lxvd2x 36, %8, %2 \n\t" | |||
| "lxvd2x 37, %9, %2 \n\t" | |||
| "lxvd2x 38, %10, %2 \n\t" | |||
| "lxvd2x 39, %11, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "lxvd2x 40, 0, %2 \n\t" | |||
| "lxvd2x 41, %5, %2 \n\t" | |||
| "lxvd2x 42, %6, %2 \n\t" | |||
| "lxvd2x 43, %7, %2 \n\t" | |||
| "lxvd2x 44, %8, %2 \n\t" | |||
| "lxvd2x 45, %9, %2 \n\t" | |||
| "lxvd2x 46, %10, %2 \n\t" | |||
| "lxvd2x 47, %11, %2 \n\t" | |||
| "addi %2, %2, 128 \n\t" | |||
| "lxvd2x 48, 0, %1 \n\t" | |||
| "lxvd2x 49, %5, %1 \n\t" | |||
| "lxvd2x 50, %6, %1 \n\t" | |||
| "lxvd2x 51, %7, %1 \n\t" | |||
| "lxvd2x 52, %8, %1 \n\t" | |||
| "lxvd2x 53, %9, %1 \n\t" | |||
| "lxvd2x 54, %10, %1 \n\t" | |||
| "lxvd2x 55, %11, %1 \n\t" | |||
| "addi %1, %1, 128 \n\t" | |||
| "lxvd2x 56, 0, %1 \n\t" | |||
| "lxvd2x 57, %5, %1 \n\t" | |||
| "lxvd2x 58, %6, %1 \n\t" | |||
| "lxvd2x 59, %7, %1 \n\t" | |||
| "lxvd2x 60, %8, %1 \n\t" | |||
| "lxvd2x 61, %9, %1 \n\t" | |||
| "lxvd2x 62, %10, %1 \n\t" | |||
| "lxvd2x 63, %11, %1 \n\t" | |||
| "addi %1, %1, 128 \n\t" | |||
| "stxvd2x 32, 0, %3 \n\t" | |||
| "stxvd2x 33, %5, %3 \n\t" | |||
| "stxvd2x 34, %6, %3 \n\t" | |||
| "stxvd2x 35, %7, %3 \n\t" | |||
| "stxvd2x 36, %8, %3 \n\t" | |||
| "stxvd2x 37, %9, %3 \n\t" | |||
| "stxvd2x 38, %10, %3 \n\t" | |||
| "stxvd2x 39, %11, %3 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "stxvd2x 40, 0, %3 \n\t" | |||
| "stxvd2x 41, %5, %3 \n\t" | |||
| "stxvd2x 42, %6, %3 \n\t" | |||
| "stxvd2x 43, %7, %3 \n\t" | |||
| "stxvd2x 44, %8, %3 \n\t" | |||
| "stxvd2x 45, %9, %3 \n\t" | |||
| "stxvd2x 46, %10, %3 \n\t" | |||
| "stxvd2x 47, %11, %3 \n\t" | |||
| "addi %3, %3, 128 \n\t" | |||
| "stxvd2x 48, 0, %4 \n\t" | |||
| "stxvd2x 49, %5, %4 \n\t" | |||
| "stxvd2x 50, %6, %4 \n\t" | |||
| "stxvd2x 51, %7, %4 \n\t" | |||
| "stxvd2x 52, %8, %4 \n\t" | |||
| "stxvd2x 53, %9, %4 \n\t" | |||
| "stxvd2x 54, %10, %4 \n\t" | |||
| "stxvd2x 55, %11, %4 \n\t" | |||
| "addi %4, %4, 128 \n\t" | |||
| "stxvd2x 56, 0, %4 \n\t" | |||
| "stxvd2x 57, %5, %4 \n\t" | |||
| "stxvd2x 58, %6, %4 \n\t" | |||
| "stxvd2x 59, %7, %4 \n\t" | |||
| "stxvd2x 60, %8, %4 \n\t" | |||
| "stxvd2x 61, %9, %4 \n\t" | |||
| "stxvd2x 62, %10, %4 \n\t" | |||
| "stxvd2x 63, %11, %4 \n\t" | |||
| "addi %4, %4, 128 \n\t" | |||
| "addic. %0 , %0 , -16 \n\t" | |||
| "bgt 1b \n\t" | |||
| "2: \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (y1), // 1 | |||
| "r" (x1), // 2 | |||
| "r" (y2), // 3 | |||
| "r" (x2), // 4 | |||
| "r" (o16), // 5 | |||
| "r" (o32), // 6 | |||
| "r" (o48), // 7 | |||
| "r" (o64), // 8 | |||
| "r" (o80), // 9 | |||
| "r" (o96), // 10 | |||
| "r" (o112) // 11 | |||
| : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory" | |||
| ); | |||
| } | |||
| @@ -271,7 +271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #endif | |||
| #include "zgemm_macros_8x2_power8.S" | |||
| #include "ztrmm_macros_8x2_power8.S" | |||
| cmpwi cr0, M, 0 | |||
| ble .L999 | |||
| @@ -24,7 +24,7 @@ SGEMVTKERNEL = sgemv_t_4.c | |||
| DGEMVNKERNEL = dgemv_n_4.c | |||
| DGEMVTKERNEL = dgemv_t_4.c | |||
| ZGEMVNKERNEL = zgemv_t_4.c | |||
| ZGEMVNKERNEL = zgemv_n_4.c | |||
| ZGEMVTKERNEL = zgemv_t_4.c | |||
| DCOPYKERNEL = dcopy_bulldozer.S | |||
| @@ -72,18 +72,20 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| double dot = 0.0 ; | |||
| FLOAT dot = 0.0 ; | |||
| FLOAT mydot=0.0; | |||
| BLASLONG n1; | |||
| if ( n <= 0 ) return(dot); | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| BLASLONG n1 = n & -32; | |||
| n1 = n & (BLASLONG)(-32); | |||
| if ( n1 ) | |||
| sdot_kernel_16(n1, x, y , &dot ); | |||
| sdot_kernel_16(n1, x, y , &mydot ); | |||
| i = n1; | |||
| @@ -94,12 +96,13 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| i++ ; | |||
| } | |||
| dot+=mydot; | |||
| return(dot); | |||
| } | |||
| BLASLONG n1 = n & -2; | |||
| n1 = n & (BLASLONG)(-2); | |||
| while(i < n1) | |||
| { | |||
| @@ -124,4 +127,3 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| } | |||
| @@ -1961,35 +1961,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #if defined(POWER8) | |||
| #define SNUMOPT 4 | |||
| #define SNUMOPT 16 | |||
| #define DNUMOPT 8 | |||
| #define GEMM_DEFAULT_OFFSET_A 384 | |||
| #define GEMM_DEFAULT_OFFSET_B 1024 | |||
| #define GEMM_DEFAULT_OFFSET_A 4096 | |||
| #define GEMM_DEFAULT_OFFSET_B 4096 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||
| #define SGEMM_DEFAULT_UNROLL_N 8 | |||
| #define DGEMM_DEFAULT_UNROLL_M 16 | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||
| #define CGEMM_DEFAULT_UNROLL_N 2 | |||
| #define CGEMM_DEFAULT_UNROLL_M 8 | |||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 8 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||
| #define SGEMM_DEFAULT_P 992 | |||
| #define SGEMM_DEFAULT_P 960 | |||
| #define DGEMM_DEFAULT_P 480 | |||
| #define CGEMM_DEFAULT_P 488 | |||
| #define ZGEMM_DEFAULT_P 240 | |||
| #define CGEMM_DEFAULT_P 720 | |||
| #define ZGEMM_DEFAULT_P 480 | |||
| #define SGEMM_DEFAULT_Q 504 | |||
| #define SGEMM_DEFAULT_Q 720 | |||
| #define DGEMM_DEFAULT_Q 720 | |||
| #define CGEMM_DEFAULT_Q 400 | |||
| #define ZGEMM_DEFAULT_Q 360 | |||
| #define CGEMM_DEFAULT_Q 720 | |||
| #define ZGEMM_DEFAULT_Q 720 | |||
| #define SGEMM_DEFAULT_R 28800 | |||
| #define SGEMM_DEFAULT_R 21600 | |||
| #define DGEMM_DEFAULT_R 14400 | |||
| #define ZGEMM_DEFAULT_R 7200 | |||
| #define CGEMM_DEFAULT_R 16200 | |||
| #define ZGEMM_DEFAULT_R 21600 | |||
| #define SYMV_P 8 | |||