Add new targets for ARM64tags/v0.2.20^2
| @@ -9,3 +9,17 @@ CCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 | |||
| FCOMMON_OPT += -march=armv8-a+crc+crypto+fp+simd -mtune=cortex-a57 | |||
| endif | |||
| ifeq ($(CORE), VULCAN) | |||
| CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan | |||
| FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan | |||
| endif | |||
| ifeq ($(CORE), THUNDERX) | |||
| CCOMMON_OPT += -mtune=thunderx -mcpu=thunderx | |||
| FCOMMON_OPT += -mtune=thunderx -mcpu=thunderx | |||
| endif | |||
| ifeq ($(CORE), THUNDERX2T99) | |||
| CCOMMON_OPT += -mtune=vulcan -mcpu=vulcan | |||
| FCOMMON_OPT += -mtune=vulcan -mcpu=vulcan | |||
| endif | |||
| @@ -80,4 +80,7 @@ ARMV5 | |||
| 8.ARM 64-bit CPU: | |||
| ARMV8 | |||
| CORTEXA57 | |||
| VULCAN | |||
| THUNDERX | |||
| THUNDERX2T99 | |||
| @@ -2193,7 +2193,7 @@ | |||
| #endif | |||
| #ifndef ASSEMBLER | |||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) | |||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) | |||
| extern BLASLONG gemm_offset_a; | |||
| extern BLASLONG gemm_offset_b; | |||
| extern BLASLONG sgemm_p; | |||
| @@ -30,17 +30,26 @@ | |||
| #define CPU_UNKNOWN 0 | |||
| #define CPU_ARMV8 1 | |||
| #define CPU_CORTEXA57 2 | |||
| #define CPU_VULCAN 3 | |||
| #define CPU_THUNDERX 4 | |||
| #define CPU_THUNDERX2T99 5 | |||
| static char *cpuname[] = { | |||
| "UNKNOWN", | |||
| "ARMV8" , | |||
| "CORTEXA57" | |||
| "CORTEXA57", | |||
| "VULCAN", | |||
| "THUNDERX", | |||
| "THUNDERX2T99" | |||
| }; | |||
| static char *cpuname_lower[] = { | |||
| "unknown", | |||
| "armv8" , | |||
| "cortexa57" | |||
| "cortexa57", | |||
| "vulcan", | |||
| "thunderx", | |||
| "thunderx2t99" | |||
| }; | |||
| int get_feature(char *search) | |||
| @@ -85,25 +94,34 @@ int detect(void) | |||
| #ifdef linux | |||
| FILE *infile; | |||
| char buffer[512], *p; | |||
| p = (char *) NULL ; | |||
| infile = fopen("/proc/cpuinfo", "r"); | |||
| while (fgets(buffer, sizeof(buffer), infile)) | |||
| { | |||
| char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL; | |||
| p = (char *) NULL ; | |||
| if (!strncmp("CPU part", buffer, 8)) | |||
| { | |||
| p = strchr(buffer, ':') + 2; | |||
| infile = fopen("/proc/cpuinfo", "r"); | |||
| while (fgets(buffer, sizeof(buffer), infile)) { | |||
| if ((cpu_part != NULL) && (cpu_implementer != NULL)) { | |||
| break; | |||
| } | |||
| if ((cpu_part == NULL) && !strncmp("CPU part", buffer, 8)) { | |||
| cpu_part = strchr(buffer, ':') + 2; | |||
| cpu_part = strdup(cpu_part); | |||
| } else if ((cpu_implementer == NULL) && !strncmp("CPU implementer", buffer, 15)) { | |||
| cpu_implementer = strchr(buffer, ':') + 2; | |||
| cpu_implementer = strdup(cpu_implementer); | |||
| } | |||
| } | |||
| fclose(infile); | |||
| if(p != NULL) { | |||
| if (strstr(p, "0xd07")) { | |||
| return CPU_CORTEXA57; | |||
| } | |||
| if(cpu_part != NULL && cpu_implementer != NULL) { | |||
| if (strstr(cpu_part, "0xd07") && strstr(cpu_implementer, "0x41")) | |||
| return CPU_CORTEXA57; | |||
| else if (strstr(cpu_part, "0x516") && strstr(cpu_implementer, "0x42")) | |||
| return CPU_VULCAN; | |||
| else if (strstr(cpu_part, "0x0a1") && strstr(cpu_implementer, "0x43")) | |||
| return CPU_THUNDERX; | |||
| else if (strstr(cpu_part, "0xFFF") && strstr(cpu_implementer, "0x43")) /* TODO */ | |||
| return CPU_THUNDERX2T99; | |||
| } | |||
| p = (char *) NULL ; | |||
| @@ -176,6 +194,28 @@ void get_cpuconfig(void) | |||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||
| break; | |||
| case CPU_VULCAN: | |||
| printf("#define VULCAN \n"); | |||
| printf("#define HAVE_VFP \n"); | |||
| printf("#define HAVE_VFPV3 \n"); | |||
| printf("#define HAVE_NEON \n"); | |||
| printf("#define HAVE_VFPV4 \n"); | |||
| printf("#define L1_CODE_SIZE 32768 \n"); | |||
| printf("#define L1_CODE_LINESIZE 64 \n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 8 \n"); | |||
| printf("#define L1_DATA_SIZE 32768 \n"); | |||
| printf("#define L1_DATA_LINESIZE 64 \n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 8 \n"); | |||
| printf("#define L2_SIZE 262144 \n"); | |||
| printf("#define L2_LINESIZE 64 \n"); | |||
| printf("#define L2_ASSOCIATIVE 8 \n"); | |||
| printf("#define L3_SIZE 33554432 \n"); | |||
| printf("#define L3_LINESIZE 64 \n"); | |||
| printf("#define L3_ASSOCIATIVE 32 \n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| case CPU_CORTEXA57: | |||
| printf("#define CORTEXA57\n"); | |||
| printf("#define HAVE_VFP\n"); | |||
| @@ -191,8 +231,42 @@ void get_cpuconfig(void) | |||
| printf("#define L2_SIZE 2097152\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| case CPU_THUNDERX: | |||
| printf("#define ARMV8\n"); | |||
| printf("#define THUNDERX\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 128\n"); | |||
| printf("#define L2_SIZE 16777216\n"); | |||
| printf("#define L2_LINESIZE 128\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| break; | |||
| case CPU_THUNDERX2T99: | |||
| printf("#define VULCAN \n"); | |||
| printf("#define HAVE_VFP \n"); | |||
| printf("#define HAVE_VFPV3 \n"); | |||
| printf("#define HAVE_NEON \n"); | |||
| printf("#define HAVE_VFPV4 \n"); | |||
| printf("#define L1_CODE_SIZE 32768 \n"); | |||
| printf("#define L1_CODE_LINESIZE 64 \n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 8 \n"); | |||
| printf("#define L1_DATA_SIZE 32768 \n"); | |||
| printf("#define L1_DATA_LINESIZE 64 \n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 8 \n"); | |||
| printf("#define L2_SIZE 262144 \n"); | |||
| printf("#define L2_LINESIZE 64 \n"); | |||
| printf("#define L2_ASSOCIATIVE 8 \n"); | |||
| printf("#define L3_SIZE 33554432 \n"); | |||
| printf("#define L3_LINESIZE 64 \n"); | |||
| printf("#define L3_ASSOCIATIVE 32 \n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| } | |||
| } | |||
| @@ -995,7 +995,7 @@ void *blas_memory_alloc(int procpos){ | |||
| if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number(); | |||
| #endif | |||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) | |||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) | |||
| #ifndef DYNAMIC_ARCH | |||
| blas_set_parameter(); | |||
| #endif | |||
| @@ -727,3 +727,26 @@ void blas_set_parameter(void){ | |||
| } | |||
| #endif | |||
| #if defined(ARCH_ARM64) | |||
| #if defined(VULCAN) || defined(THUNDERX2T99) | |||
| unsigned long dgemm_prefetch_size_a; | |||
| unsigned long dgemm_prefetch_size_b; | |||
| unsigned long dgemm_prefetch_size_c; | |||
| #endif | |||
| void blas_set_parameter(void) | |||
| { | |||
| #if defined(VULCAN) || defined(THUNDERX2T99) | |||
| dgemm_p = 160; | |||
| dgemm_q = 128; | |||
| dgemm_r = 4096; | |||
| dgemm_prefetch_size_a = 3584; | |||
| dgemm_prefetch_size_b = 512; | |||
| dgemm_prefetch_size_c = 128; | |||
| #endif | |||
| } | |||
| #endif | |||
| @@ -884,7 +884,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifdef FORCE_CORTEXA57 | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM64" | |||
| #define SUBARCHITECTURE "ARMV8" | |||
| #define SUBARCHITECTURE "CORTEXA57" | |||
| #define SUBDIRNAME "arm64" | |||
| #define ARCHCONFIG "-DCORTEXA57 " \ | |||
| "-DL1_CODE_SIZE=49152 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=3 " \ | |||
| @@ -897,6 +897,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_VULCAN | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM64" | |||
| #define SUBARCHITECTURE "VULCAN" | |||
| #define SUBDIRNAME "arm64" | |||
| #define ARCHCONFIG "-DVULCAN " \ | |||
| "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ | |||
| "-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" | |||
| #define LIBNAME "vulcan" | |||
| #define CORENAME "VULCAN" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_THUNDERX | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM64" | |||
| #define SUBARCHITECTURE "THUNDERX" | |||
| #define SUBDIRNAME "arm64" | |||
| #define ARCHCONFIG "-DTHUNDERX " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ | |||
| "-DL2_SIZE=16777216 -DL2_LINESIZE=128 -DL2_ASSOCIATIVE=16 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " | |||
| #define LIBNAME "thunderx" | |||
| #define CORENAME "THUNDERX" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_THUNDERX2T99 | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM64" | |||
| #define SUBARCHITECTURE "THUNDERX2T99" | |||
| #define SUBDIRNAME "arm64" | |||
| #define ARCHCONFIG "-DTHUNDERX2T99 " \ | |||
| "-DL1_CODE_SIZE=32768 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ | |||
| "-DL3_SIZE=33554432 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" | |||
| #define LIBNAME "thunderx2t99" | |||
| #define CORENAME "THUNDERX2T99" | |||
| #else | |||
| #endif | |||
| #ifndef FORCE | |||
| #if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ | |||
| @@ -75,14 +75,29 @@ SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||
| ifeq ($(DGEMM_UNROLL_M), 8) | |||
| DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S | |||
| DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S | |||
| else | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||
| endif | |||
| DGEMMINCOPYOBJ = dgemm_incopy.o | |||
| DGEMMITCOPYOBJ = dgemm_itcopy.o | |||
| endif | |||
| ifeq ($(DGEMM_UNROLL_N), 4) | |||
| DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| else | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||
| endif | |||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| @@ -0,0 +1,6 @@ | |||
| include $(KERNELDIR)/KERNEL.ARMV8 | |||
| SDOTKERNEL=dot-thunderx.c | |||
| DDOTKERNEL=ddot-thunderx.c | |||
| DAXPYKERNEL=daxpy-thunderx.c | |||
| @@ -0,0 +1,2 @@ | |||
| include $(KERNELDIR)/KERNEL.VULCAN | |||
| @@ -0,0 +1,4 @@ | |||
| include $(KERNELDIR)/KERNEL.CORTEXA57 | |||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_vulcan.S | |||
| @@ -0,0 +1,151 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <arm_neon.h> | |||
| #define prefetch(a) __asm__("prfm PLDL1STRM, [%0]"::"r"(a):"memory"); | |||
| //#define prefetch(a) | |||
| static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| double a = *alpha; | |||
| #if 0 | |||
| prefetch(x + 128/sizeof(*x)); | |||
| prefetch(y + 128/sizeof(*y)); | |||
| #endif | |||
| prefetch(x + 2*128/sizeof(*x)); | |||
| prefetch(y + 2*128/sizeof(*y)); | |||
| prefetch(x + 3*128/sizeof(*x)); | |||
| prefetch(y + 3*128/sizeof(*y)); | |||
| prefetch(x + 4*128/sizeof(*x)); | |||
| prefetch(y + 4*128/sizeof(*y)); | |||
| while(i < n) | |||
| { | |||
| double y0, y1, y2, y3; | |||
| double y4, y5, y6, y7; | |||
| double *xx; | |||
| double *yy; | |||
| y0 = a * x[0] + y[0]; | |||
| y1 = a * x[1] + y[1]; | |||
| y2 = a * x[2] + y[2]; | |||
| y3 = a * x[3] + y[3]; | |||
| y4 = a * x[4] + y[4]; | |||
| y5 = a * x[5] + y[5]; | |||
| y6 = a * x[6] + y[6]; | |||
| y7 = a * x[7] + y[7]; | |||
| asm("":"+w"(y0),"+w"(y1),"+w"(y2),"+w"(y3),"+w"(y4),"+w"(y5),"+w"(y6),"+w"(y7)); | |||
| y[0] = y0; | |||
| y[1] = y1; | |||
| y[2] = y2; | |||
| y[3] = y3; | |||
| y[4] = y4; | |||
| y[5] = y5; | |||
| y[6] = y6; | |||
| y[7] = y7; | |||
| xx = (x + 4*128/sizeof(*x)); | |||
| yy = (y + 4*128/sizeof(*y)); | |||
| asm("":"+r"(yy)::"memory"); | |||
| prefetch(xx); | |||
| prefetch(yy); | |||
| y += 8; | |||
| x += 8; | |||
| i += 8 ; | |||
| } | |||
| } | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| if ( n <= 0 ) return(0); | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| BLASLONG n1 = n & -32; | |||
| if ( n1 ) | |||
| daxpy_kernel_8(n1, x, y , &da ); | |||
| i = n1; | |||
| while(i < n) | |||
| { | |||
| y[i] += da * x[i] ; | |||
| i++ ; | |||
| } | |||
| return(0); | |||
| } | |||
| BLASLONG n1 = n & -4; | |||
| while(i < n1) | |||
| { | |||
| FLOAT m1 = da * x[ix] ; | |||
| FLOAT m2 = da * x[ix+inc_x] ; | |||
| FLOAT m3 = da * x[ix+2*inc_x] ; | |||
| FLOAT m4 = da * x[ix+3*inc_x] ; | |||
| y[iy] += m1 ; | |||
| y[iy+inc_y] += m2 ; | |||
| y[iy+2*inc_y] += m3 ; | |||
| y[iy+3*inc_y] += m4 ; | |||
| ix += inc_x*4 ; | |||
| iy += inc_y*4 ; | |||
| i+=4 ; | |||
| } | |||
| while(i < n) | |||
| { | |||
| y[iy] += da * x[ix] ; | |||
| ix += inc_x ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,119 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <arm_neon.h> | |||
| #define prefetch(a) __asm__("prfm PLDL1STRM, [%0]"::"r"(a):"memory"); | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| FLOAT dot = 0.0 ; | |||
| if ( n < 0 ) return(dot); | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| float64x2_t vdot0 = {0.0, 0.0}; | |||
| float64x2_t vdot1 = {0.0, 0.0}; | |||
| float64x2_t vdot2 = {0.0, 0.0}; | |||
| float64x2_t vdot3 = {0.0, 0.0}; | |||
| float64x2_t *vx = (float64x2_t*)x; | |||
| float64x2_t *vy = (float64x2_t*)y; | |||
| #if 0 | |||
| prefetch(x + 128/sizeof(*x)); | |||
| prefetch(y + 128/sizeof(*y)); | |||
| #endif | |||
| prefetch(x + 2*128/sizeof(*x)); | |||
| prefetch(y + 2*128/sizeof(*y)); | |||
| prefetch(x + 3*128/sizeof(*x)); | |||
| prefetch(y + 3*128/sizeof(*y)); | |||
| int n1 = n&-8; | |||
| while(i < n1) | |||
| { | |||
| #if 0 | |||
| vdot0 = vfmaq_f64 (vdot0, | |||
| vy[0], | |||
| vx[0]); | |||
| vdot1 = vfmaq_f64 (vdot1, | |||
| vy[1], | |||
| vx[1]); | |||
| vdot2 = vfmaq_f64 (vdot2, | |||
| vy[2], | |||
| vx[2]); | |||
| vdot3 = vfmaq_f64 (vdot3, | |||
| vy[3], | |||
| vx[3]); | |||
| #else | |||
| vdot0 = vy[0] * vx[0] + vdot0; | |||
| vdot1 = vy[1] * vx[1] + vdot1; | |||
| vdot2 = vy[2] * vx[2] + vdot2; | |||
| vdot3 = vy[3] * vx[3] + vdot3; | |||
| #endif | |||
| vy += 4; | |||
| vx += 4; | |||
| i += 8; | |||
| prefetch(vx + 3*128/sizeof(*x)); | |||
| prefetch(vy + 3*128/sizeof(*y)); | |||
| } | |||
| dot = vaddvq_f64 (vdot0 + vdot1); | |||
| dot += vaddvq_f64 (vdot2 + vdot3); | |||
| i = n1; | |||
| while(i < n) | |||
| { | |||
| dot += y[i] * x[i] ; | |||
| i++ ; | |||
| } | |||
| return(dot); | |||
| } | |||
| while(i < n) | |||
| { | |||
| dot += y[iy] * x[ix] ; | |||
| ix += inc_x ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| return(dot); | |||
| } | |||
| @@ -0,0 +1,340 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define M x0 | |||
| #define N x1 | |||
| #define A00 x2 | |||
| #define LDA x3 | |||
| #define B00 x4 | |||
| #define A01 x5 | |||
| #define A02 x6 | |||
| #define A03 x7 | |||
| #define A04 x8 | |||
| #define I x9 | |||
| #define J x10 | |||
| #define TEMP1 x11 | |||
| #define TEMP2 x12 | |||
| #define A_PREFETCH 2560 | |||
| /************************************************************************************** | |||
| * Macro definitions | |||
| **************************************************************************************/ | |||
| .macro SAVE_REGS | |||
| add sp, sp, #-(11 * 16) | |||
| stp d8, d9, [sp, #(0 * 16)] | |||
| stp d10, d11, [sp, #(1 * 16)] | |||
| stp d12, d13, [sp, #(2 * 16)] | |||
| stp d14, d15, [sp, #(3 * 16)] | |||
| stp d16, d17, [sp, #(4 * 16)] | |||
| stp x18, x19, [sp, #(5 * 16)] | |||
| stp x20, x21, [sp, #(6 * 16)] | |||
| stp x22, x23, [sp, #(7 * 16)] | |||
| stp x24, x25, [sp, #(8 * 16)] | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| .endm | |||
| .macro RESTORE_REGS | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| ldp d12, d13, [sp, #(2 * 16)] | |||
| ldp d14, d15, [sp, #(3 * 16)] | |||
| ldp d16, d17, [sp, #(4 * 16)] | |||
| ldp x18, x19, [sp, #(5 * 16)] | |||
| ldp x20, x21, [sp, #(6 * 16)] | |||
| ldp x22, x23, [sp, #(7 * 16)] | |||
| ldp x24, x25, [sp, #(8 * 16)] | |||
| ldp x26, x27, [sp, #(9 * 16)] | |||
| ldr x28, [sp, #(10 * 16)] | |||
| add sp, sp, #(11*16) | |||
| .endm | |||
| .macro COPY4x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ins v8.d[0], v0.d[0] | |||
| ins v10.d[0], v0.d[1] | |||
| ins v12.d[0], v1.d[0] | |||
| ins v14.d[0], v1.d[1] | |||
| ldp q2, q3, [A02], #32 | |||
| ins v8.d[1], v2.d[0] | |||
| ins v10.d[1], v2.d[1] | |||
| ins v12.d[1], v3.d[0] | |||
| ins v14.d[1], v3.d[1] | |||
| ldp q4, q5, [A03], #32 | |||
| ins v9.d[0], v4.d[0] | |||
| ins v11.d[0], v4.d[1] | |||
| ins v13.d[0], v5.d[0] | |||
| ins v15.d[0], v5.d[1] | |||
| ldp q6, q7, [A04], #32 | |||
| ins v9.d[1], v6.d[0] | |||
| ins v11.d[1], v6.d[1] | |||
| ins v13.d[1], v7.d[0] | |||
| ins v15.d[1], v7.d[1] | |||
| st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [B00] | |||
| add B00, B00, #64 | |||
| st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [B00] | |||
| add B00, B00, #64 | |||
| .endm | |||
| .macro COPY1x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| ldr d1, [A02], #8 | |||
| ldr d2, [A03], #8 | |||
| ldr d3, [A04], #8 | |||
| st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B00] | |||
| add B00, B00, #32 | |||
| .endm | |||
| .macro COPY4x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ins v8.d[0], v0.d[0] | |||
| ins v9.d[0], v0.d[1] | |||
| ins v10.d[0], v1.d[0] | |||
| ins v11.d[0], v1.d[1] | |||
| ldp q2, q3, [A02], #32 | |||
| ins v8.d[1], v2.d[0] | |||
| ins v9.d[1], v2.d[1] | |||
| ins v10.d[1], v3.d[0] | |||
| ins v11.d[1], v3.d[1] | |||
| st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [B00] | |||
| add B00, B00, #64 | |||
| .endm | |||
| .macro COPY1x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| ldr d1, [A02], #8 | |||
| stp d0, d1, [B00] | |||
| add B00, B00, #16 | |||
| .endm | |||
| .macro COPY4x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| stp q0, q1, [B00], #32 | |||
| .endm | |||
| .macro COPY1x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| str d0, [B00], #8 | |||
| .endm | |||
| /************************************************************************************** | |||
| * End of macro definitions | |||
| **************************************************************************************/ | |||
| PROLOGUE | |||
| .align 5 | |||
| SAVE_REGS | |||
| lsl LDA, LDA, #3 // LDA = LDA * SIZE | |||
| dgemm_ncopy_L4_BEGIN: | |||
| asr J, N, #2 // J = N / 4 | |||
| cmp J, #0 | |||
| ble dgemm_ncopy_L2_BEGIN | |||
| .align 5 | |||
| dgemm_ncopy_L4_M4_BEGIN: | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A03, A02, LDA | |||
| add A04, A03, LDA | |||
| add A00, A04, LDA | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L4_M4_40 | |||
| .align 5 | |||
| dgemm_ncopy_L4_M4_20: | |||
| COPY4x4 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L4_M4_20 | |||
| dgemm_ncopy_L4_M4_40: | |||
| and I, M , #3 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L4_M4_END | |||
| .align 5 | |||
| dgemm_ncopy_L4_M4_60: | |||
| COPY1x4 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L4_M4_60 | |||
| dgemm_ncopy_L4_M4_END: | |||
| subs J , J, #1 // j-- | |||
| bne dgemm_ncopy_L4_M4_BEGIN | |||
| /*********************************************************************************************/ | |||
| dgemm_ncopy_L2_BEGIN: | |||
| tst N, #3 | |||
| ble dgemm_ncopy_L999 | |||
| tst N, #2 | |||
| ble dgemm_ncopy_L1_BEGIN | |||
| dgemm_ncopy_L2_M4_BEGIN: | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A00, A02, LDA | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L2_M4_40 | |||
| .align 5 | |||
| dgemm_ncopy_L2_M4_20: | |||
| COPY4x2 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L2_M4_20 | |||
| dgemm_ncopy_L2_M4_40: | |||
| and I, M , #3 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L2_M4_END | |||
| .align 5 | |||
| dgemm_ncopy_L2_M4_60: | |||
| COPY1x2 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L2_M4_60 | |||
| dgemm_ncopy_L2_M4_END: | |||
| /*********************************************************************************************/ | |||
| dgemm_ncopy_L1_BEGIN: | |||
| tst N, #1 | |||
| ble dgemm_ncopy_L999 | |||
| dgemm_ncopy_L1_M4_BEGIN: | |||
| mov A01, A00 | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L1_M4_40 | |||
| .align 5 | |||
| dgemm_ncopy_L1_M4_20: | |||
| COPY4x1 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L1_M4_20 | |||
| dgemm_ncopy_L1_M4_40: | |||
| and I, M , #3 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L1_M4_END | |||
| .align 5 | |||
| dgemm_ncopy_L1_M4_60: | |||
| COPY1x1 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L1_M4_60 | |||
| dgemm_ncopy_L1_M4_END: | |||
| dgemm_ncopy_L999: | |||
| mov x0, #0 | |||
| RESTORE_REGS | |||
| ret | |||
| EPILOGUE | |||
| @@ -0,0 +1,544 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A00 PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define M x0 | |||
| #define N x1 | |||
| #define A00 x2 | |||
| #define LDA x3 | |||
| #define B00 x4 | |||
| #define A01 x5 | |||
| #define A02 x6 | |||
| #define A03 x7 | |||
| #define A04 x8 | |||
| #define A05 x9 | |||
| #define A06 x10 | |||
| #define A07 x11 | |||
| #define A08 x12 | |||
| #define I x13 | |||
| #define J x14 | |||
| #define TEMP1 x15 | |||
| #define TEMP2 x16 | |||
| #define A_PREFETCH 2560 | |||
| /************************************************************************************** | |||
| * Macro definitions | |||
| **************************************************************************************/ | |||
| .macro SAVE_REGS | |||
| add sp, sp, #-(11 * 16) | |||
| stp d8, d9, [sp, #(0 * 16)] | |||
| stp d10, d11, [sp, #(1 * 16)] | |||
| stp d12, d13, [sp, #(2 * 16)] | |||
| stp d14, d15, [sp, #(3 * 16)] | |||
| stp d16, d17, [sp, #(4 * 16)] | |||
| stp x18, x19, [sp, #(5 * 16)] | |||
| stp x20, x21, [sp, #(6 * 16)] | |||
| stp x22, x23, [sp, #(7 * 16)] | |||
| stp x24, x25, [sp, #(8 * 16)] | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| .endm | |||
| .macro RESTORE_REGS | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| ldp d12, d13, [sp, #(2 * 16)] | |||
| ldp d14, d15, [sp, #(3 * 16)] | |||
| ldp d16, d17, [sp, #(4 * 16)] | |||
| ldp x18, x19, [sp, #(5 * 16)] | |||
| ldp x20, x21, [sp, #(6 * 16)] | |||
| ldp x22, x23, [sp, #(7 * 16)] | |||
| ldp x24, x25, [sp, #(8 * 16)] | |||
| ldp x26, x27, [sp, #(9 * 16)] | |||
| ldr x28, [sp, #(10 * 16)] | |||
| add sp, sp, #(11*16) | |||
| .endm | |||
| /*************************************************************************************/ | |||
| .macro COPY8x8 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A05, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A06, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A07, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A08, #A_PREFETCH] | |||
| COPY4x8 | |||
| COPY4x8 | |||
| .endm | |||
| .macro COPY4x8 | |||
| ldp q0, q1, [A01], #32 | |||
| ins v16.d[0], v0.d[0] | |||
| ins v20.d[0], v0.d[1] | |||
| ins v24.d[0], v1.d[0] | |||
| ins v28.d[0], v1.d[1] | |||
| ldp q2, q3, [A02], #32 | |||
| ins v16.d[1], v2.d[0] | |||
| ins v20.d[1], v2.d[1] | |||
| ins v24.d[1], v3.d[0] | |||
| ins v28.d[1], v3.d[1] | |||
| ldp q4, q5, [A03], #32 | |||
| ins v17.d[0], v4.d[0] | |||
| ins v21.d[0], v4.d[1] | |||
| ins v25.d[0], v5.d[0] | |||
| ins v29.d[0], v5.d[1] | |||
| ldp q6, q7, [A04], #32 | |||
| ins v17.d[1], v6.d[0] | |||
| ins v21.d[1], v6.d[1] | |||
| ins v25.d[1], v7.d[0] | |||
| ins v29.d[1], v7.d[1] | |||
| ldp q8, q9, [A05], #32 | |||
| ins v18.d[0], v8.d[0] | |||
| ins v22.d[0], v8.d[1] | |||
| ins v26.d[0], v9.d[0] | |||
| ins v30.d[0], v9.d[1] | |||
| ldp q10, q11, [A06], #32 | |||
| ins v18.d[1], v10.d[0] | |||
| ins v22.d[1], v10.d[1] | |||
| ins v26.d[1], v11.d[0] | |||
| ins v30.d[1], v11.d[1] | |||
| ldp q12, q13, [A07], #32 | |||
| ins v19.d[0], v12.d[0] | |||
| ins v23.d[0], v12.d[1] | |||
| ins v27.d[0], v13.d[0] | |||
| ins v31.d[0], v13.d[1] | |||
| ldp q14, q15, [A08], #32 | |||
| ins v19.d[1], v14.d[0] | |||
| ins v23.d[1], v14.d[1] | |||
| ins v27.d[1], v15.d[0] | |||
| ins v31.d[1], v15.d[1] | |||
| st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [B00] | |||
| add B00, B00, #64 | |||
| st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [B00] | |||
| add B00, B00, #64 | |||
| st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [B00] | |||
| add B00, B00, #64 | |||
| st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [B00] | |||
| add B00, B00, #64 | |||
| .endm | |||
| .macro COPY1x8 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A05, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A06, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A07, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A08, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| ldr d1, [A02], #8 | |||
| ldr d2, [A03], #8 | |||
| ldr d3, [A04], #8 | |||
| ldr d4, [A05], #8 | |||
| ldr d5, [A06], #8 | |||
| ldr d6, [A07], #8 | |||
| ldr d7, [A08], #8 | |||
| st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B00] | |||
| add B00, B00, #32 | |||
| st1 {v4.1d, v5.1d, v6.1d, v7.1d}, [B00] | |||
| add B00, B00, #32 | |||
| .endm | |||
| /*************************************************************************************/ | |||
| .macro COPY8x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ins v8.d[0], v0.d[0] | |||
| ins v10.d[0], v0.d[1] | |||
| ins v12.d[0], v1.d[0] | |||
| ins v14.d[0], v1.d[1] | |||
| ldp q2, q3, [A02], #32 | |||
| ins v8.d[1], v2.d[0] | |||
| ins v10.d[1], v2.d[1] | |||
| ins v12.d[1], v3.d[0] | |||
| ins v14.d[1], v3.d[1] | |||
| ldp q4, q5, [A03], #32 | |||
| ins v9.d[0], v4.d[0] | |||
| ins v11.d[0], v4.d[1] | |||
| ins v13.d[0], v5.d[0] | |||
| ins v15.d[0], v5.d[1] | |||
| ldp q6, q7, [A04], #32 | |||
| ins v9.d[1], v6.d[0] | |||
| ins v11.d[1], v6.d[1] | |||
| ins v13.d[1], v7.d[0] | |||
| ins v15.d[1], v7.d[1] | |||
| st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [B00] | |||
| add B00, B00, #64 | |||
| st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [B00] | |||
| add B00, B00, #64 | |||
| ldp q16, q17, [A01], #32 | |||
| ins v24.d[0], v16.d[0] | |||
| ins v26.d[0], v16.d[1] | |||
| ins v28.d[0], v17.d[0] | |||
| ins v30.d[0], v17.d[1] | |||
| ldp q18, q19, [A02], #32 | |||
| ins v24.d[1], v18.d[0] | |||
| ins v26.d[1], v18.d[1] | |||
| ins v28.d[1], v19.d[0] | |||
| ins v30.d[1], v19.d[1] | |||
| ldp q20, q21, [A03], #32 | |||
| ins v25.d[0], v20.d[0] | |||
| ins v27.d[0], v20.d[1] | |||
| ins v29.d[0], v21.d[0] | |||
| ins v31.d[0], v21.d[1] | |||
| ldp q22, q23, [A04], #32 | |||
| ins v25.d[1], v22.d[0] | |||
| ins v27.d[1], v22.d[1] | |||
| ins v29.d[1], v23.d[0] | |||
| ins v31.d[1], v23.d[1] | |||
| st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [B00] | |||
| add B00, B00, #64 | |||
| st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [B00] | |||
| add B00, B00, #64 | |||
| .endm | |||
| .macro COPY1x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| ldr d1, [A02], #8 | |||
| ldr d2, [A03], #8 | |||
| ldr d3, [A04], #8 | |||
| st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B00] | |||
| add B00, B00, #32 | |||
| .endm | |||
| /*************************************************************************************/ | |||
| .macro COPY8x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ldp q2, q3, [A01], #32 | |||
| ins v8.d[0], v0.d[0] | |||
| ins v9.d[0], v0.d[1] | |||
| ins v10.d[0], v1.d[0] | |||
| ins v11.d[0], v1.d[1] | |||
| ins v12.d[0], v2.d[0] | |||
| ins v13.d[0], v2.d[1] | |||
| ins v14.d[0], v3.d[0] | |||
| ins v15.d[0], v3.d[1] | |||
| ldp q4, q5, [A02], #32 | |||
| ldp q6, q7, [A02], #32 | |||
| ins v8.d[1], v4.d[0] | |||
| ins v9.d[1], v4.d[1] | |||
| ins v10.d[1], v5.d[0] | |||
| ins v11.d[1], v5.d[1] | |||
| ins v12.d[1], v6.d[0] | |||
| ins v13.d[1], v6.d[1] | |||
| ins v14.d[1], v7.d[0] | |||
| ins v15.d[1], v7.d[1] | |||
| st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [B00] | |||
| add B00, B00, #64 | |||
| st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [B00] | |||
| add B00, B00, #64 | |||
| .endm | |||
| .macro COPY1x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| ldr d1, [A02], #8 | |||
| stp d0, d1, [B00] | |||
| add B00, B00, #16 | |||
| .endm | |||
| /*************************************************************************************/ | |||
| .macro COPY8x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ldp q2, q3, [A01], #32 | |||
| stp q0, q1, [B00], #32 | |||
| stp q2, q3, [B00], #32 | |||
| .endm | |||
| .macro COPY1x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| str d0, [B00], #8 | |||
| .endm | |||
| /************************************************************************************** | |||
| * End of macro definitions | |||
| **************************************************************************************/ | |||
| PROLOGUE | |||
| .align 5 | |||
| SAVE_REGS | |||
| lsl LDA, LDA, #3 // LDA = LDA * SIZE | |||
| dgemm_ncopy_L8_BEGIN: | |||
| asr J, N, #3 // J = N / 8 | |||
| cmp J, #0 | |||
| ble dgemm_ncopy_L4_BEGIN | |||
| dgemm_ncopy_L8_M8_BEGIN: | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A03, A02, LDA | |||
| add A04, A03, LDA | |||
| add A05, A04, LDA | |||
| add A06, A05, LDA | |||
| add A07, A06, LDA | |||
| add A08, A07, LDA | |||
| add A00, A08, LDA | |||
| asr I, M, #3 // I = M / 8 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L8_M8_40 | |||
| dgemm_ncopy_L8_M8_20: | |||
| COPY8x8 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L8_M8_20 | |||
| dgemm_ncopy_L8_M8_40: | |||
| and I, M , #7 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L8_M8_END | |||
| dgemm_ncopy_L8_M8_60: | |||
| COPY1x8 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L8_M8_60 | |||
| dgemm_ncopy_L8_M8_END: | |||
| subs J , J, #1 // j-- | |||
| bne dgemm_ncopy_L8_M8_BEGIN | |||
| /*********************************************************************************************/ | |||
| dgemm_ncopy_L4_BEGIN: | |||
| tst N, #7 | |||
| ble dgemm_ncopy_L999 | |||
| tst N, #4 | |||
| ble dgemm_ncopy_L2_BEGIN | |||
| dgemm_ncopy_L4_M8_BEGIN: | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A03, A02, LDA | |||
| add A04, A03, LDA | |||
| add A00, A04, LDA | |||
| asr I, M, #3 // I = M / 8 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L4_M8_40 | |||
| dgemm_ncopy_L4_M8_20: | |||
| COPY8x4 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L4_M8_20 | |||
| dgemm_ncopy_L4_M8_40: | |||
| and I, M , #7 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L4_M8_END | |||
| dgemm_ncopy_L4_M8_60: | |||
| COPY1x4 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L4_M8_60 | |||
| dgemm_ncopy_L4_M8_END: | |||
| /*********************************************************************************************/ | |||
| dgemm_ncopy_L2_BEGIN: | |||
| tst N, #3 | |||
| ble dgemm_ncopy_L999 | |||
| tst N, #2 | |||
| ble dgemm_ncopy_L1_BEGIN | |||
| dgemm_ncopy_L2_M8_BEGIN: | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A00, A02, LDA | |||
| asr I, M, #3 // I = M / 8 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L2_M8_40 | |||
| dgemm_ncopy_L2_M8_20: | |||
| COPY8x2 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L2_M8_20 | |||
| dgemm_ncopy_L2_M8_40: | |||
| and I, M , #7 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L2_M8_END | |||
| dgemm_ncopy_L2_M8_60: | |||
| COPY1x2 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L2_M8_60 | |||
| dgemm_ncopy_L2_M8_END: | |||
| /*********************************************************************************************/ | |||
| dgemm_ncopy_L1_BEGIN: | |||
| tst N, #1 | |||
| ble dgemm_ncopy_L999 | |||
| dgemm_ncopy_L1_M8_BEGIN: | |||
| mov A01, A00 | |||
| asr I, M, #3 // I = M / 8 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L1_M8_40 | |||
| dgemm_ncopy_L1_M8_20: | |||
| COPY8x1 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L1_M8_20 | |||
| dgemm_ncopy_L1_M8_40: | |||
| and I, M , #7 | |||
| cmp I, #0 | |||
| ble dgemm_ncopy_L1_M8_END | |||
| dgemm_ncopy_L1_M8_60: | |||
| COPY1x1 | |||
| subs I , I , #1 | |||
| bne dgemm_ncopy_L1_M8_60 | |||
| dgemm_ncopy_L1_M8_END: | |||
| dgemm_ncopy_L999: | |||
| mov x0, #0 | |||
| RESTORE_REGS | |||
| ret | |||
| EPILOGUE | |||
| @@ -0,0 +1,402 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define M x0 | |||
| #define N x1 | |||
| #define A x2 | |||
| #define LDA x3 | |||
| #define B x4 | |||
| #define M4 x5 | |||
| #define A01 x6 | |||
| #define A02 x7 | |||
| #define A03 x8 | |||
| #define A04 x9 | |||
| #define B01 x10 | |||
| #define B02 x11 | |||
| #define B03 x12 | |||
| #define B04 x13 | |||
| #define I x14 | |||
| #define J x15 | |||
| #define TEMP1 x16 | |||
| #define TEMP2 x17 | |||
| #define A_PREFETCH 2560 | |||
| #define B_PREFETCH 256 | |||
| /************************************************************************************** | |||
| * Macro definitions | |||
| **************************************************************************************/ | |||
| .macro SAVE_REGS | |||
| add sp, sp, #-(11 * 16) | |||
| stp d8, d9, [sp, #(0 * 16)] | |||
| stp d10, d11, [sp, #(1 * 16)] | |||
| stp d12, d13, [sp, #(2 * 16)] | |||
| stp d14, d15, [sp, #(3 * 16)] | |||
| stp d16, d17, [sp, #(4 * 16)] | |||
| stp x18, x19, [sp, #(5 * 16)] | |||
| stp x20, x21, [sp, #(6 * 16)] | |||
| stp x22, x23, [sp, #(7 * 16)] | |||
| stp x24, x25, [sp, #(8 * 16)] | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| .endm | |||
| .macro RESTORE_REGS | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| ldp d12, d13, [sp, #(2 * 16)] | |||
| ldp d14, d15, [sp, #(3 * 16)] | |||
| ldp d16, d17, [sp, #(4 * 16)] | |||
| ldp x18, x19, [sp, #(5 * 16)] | |||
| ldp x20, x21, [sp, #(6 * 16)] | |||
| ldp x22, x23, [sp, #(7 * 16)] | |||
| ldp x24, x25, [sp, #(8 * 16)] | |||
| ldp x26, x27, [sp, #(9 * 16)] | |||
| ldr x28, [sp, #(10 * 16)] | |||
| add sp, sp, #(11*16) | |||
| .endm | |||
| .macro COPY4x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ldp q2, q3, [A02], #32 | |||
| ////prfm PLDL1KEEP, [B01, #B_PREFETCH] | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B01] | |||
| add TEMP1, B01, #64 | |||
| ldp q4, q5, [A03], #32 | |||
| ldp q6, q7, [A04], #32 | |||
| ////prfm PLDL1KEEP, [B01, #B_PREFETCH] | |||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [TEMP1] | |||
| add B01, B01, M4 | |||
| .endm | |||
| .macro COPY2x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr q0, [A01], #16 | |||
| ldr q1, [A02], #16 | |||
| ldr q2, [A03], #16 | |||
| ldr q3, [A04], #16 | |||
| ////prfm PLDL1KEEP, [B02, #B_PREFETCH] | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B02] | |||
| add B02, B02, #64 | |||
| .endm | |||
| .macro COPY1x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| ldr d1, [A02], #8 | |||
| ldr d2, [A03], #8 | |||
| ldr d3, [A04], #8 | |||
| ////prfm PLDL1KEEP, [B03, #B_PREFETCH] | |||
| st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B03] | |||
| add B03, B03, #32 | |||
| .endm | |||
| /*************************************************************************************************************************/ | |||
| .macro COPY4x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ldp q2, q3, [A02], #32 | |||
| ////prfm PLDL1KEEP, [B01, #B_PREFETCH] | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B01] | |||
| add B01, B01, M4 | |||
| .endm | |||
| .macro COPY2x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr q0, [A01], #16 | |||
| ldr q1, [A02], #16 | |||
| ////prfm PLDL1KEEP, [B02, #B_PREFETCH] | |||
| stp q0, q1, [B02] | |||
| add B02, B02, #32 | |||
| .endm | |||
| .macro COPY1x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| ldr d1, [A02], #8 | |||
| ////prfm PLDL1KEEP, [B03, #B_PREFETCH] | |||
| stp d0, d1, [B03] | |||
| add B03, B03, #16 | |||
| .endm | |||
| /*************************************************************************************************************************/ | |||
| .macro COPY4x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ////prfm PLDL1KEEP, [B01, #B_PREFETCH] | |||
| stp q0, q1, [B01] | |||
| add B01, B01, M4 | |||
| .endm | |||
| .macro COPY2x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr q0, [A01], #16 | |||
| ////prfm PLDL1KEEP, [B02, #B_PREFETCH] | |||
| str q0, [B02] | |||
| add B02, B02, #16 | |||
| .endm | |||
| .macro COPY1x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| ////prfm PLDL1KEEP, [B03, #B_PREFETCH] | |||
| str d0, [B03] | |||
| add B03, B03, #8 | |||
| .endm | |||
| /************************************************************************************** | |||
| * End of macro definitions | |||
| **************************************************************************************/ | |||
| PROLOGUE | |||
| .align 5 | |||
| SAVE_REGS | |||
| lsl LDA, LDA, #3 // LDA = LDA * SIZE | |||
| lsl TEMP1, M, #3 // x12 = M * SIZE | |||
| and B02 , N , #-4 | |||
| and B03 , N , #-2 | |||
| mul B02, B02, TEMP1 | |||
| mul B03, B03, TEMP1 | |||
| add B02 , B02, B | |||
| add B03 , B03, B | |||
| lsl M4, M, #5 // M4 = M * 4 * SIZE | |||
| dgemm_tcopy_L4_BEGIN: | |||
| asr J, M, #2 // J = M / 4 | |||
| cmp J, #0 | |||
| ble dgemm_tcopy_L2_BEGIN | |||
| .align 5 | |||
| dgemm_tcopy_L4_M4_BEGIN: | |||
| mov A01, A | |||
| add A02, A01, LDA | |||
| add A03, A02, LDA | |||
| add A04, A03, LDA | |||
| add A, A04, LDA | |||
| mov B01, B | |||
| add B, B01, #128 // B = B + 16 * SIZE | |||
| asr I, N, #2 // I = N / 4 | |||
| cmp I, #0 | |||
| ble dgemm_tcopy_L4_M4_40 | |||
| .align 5 | |||
| dgemm_tcopy_L4_M4_20: | |||
| COPY4x4 | |||
| subs I , I , #1 | |||
| bne dgemm_tcopy_L4_M4_20 | |||
| dgemm_tcopy_L4_M4_40: | |||
| tst N , #2 | |||
| ble dgemm_tcopy_L4_M4_60 | |||
| COPY2x4 | |||
| dgemm_tcopy_L4_M4_60: | |||
| tst N, #1 | |||
| ble dgemm_tcopy_L4_M4_END | |||
| COPY1x4 | |||
| dgemm_tcopy_L4_M4_END: | |||
| subs J , J, #1 // j-- | |||
| bne dgemm_tcopy_L4_M4_BEGIN | |||
| /*********************************************************************************************/ | |||
| dgemm_tcopy_L2_BEGIN: | |||
| tst M, #3 | |||
| ble dgemm_tcopy_L999 | |||
| tst M, #2 | |||
| ble dgemm_tcopy_L1_BEGIN | |||
| dgemm_tcopy_L2_M4_BEGIN: | |||
| mov A01, A | |||
| add A02, A01, LDA | |||
| add A, A02, LDA | |||
| mov B01, B | |||
| add B, B01, #64 // B = B + 8 * SIZE | |||
| asr I, N, #2 // I = N / 4 | |||
| cmp I, #0 | |||
| ble dgemm_tcopy_L2_M4_40 | |||
| .align 5 | |||
| dgemm_tcopy_L2_M4_20: | |||
| COPY4x2 | |||
| subs I , I , #1 | |||
| bne dgemm_tcopy_L2_M4_20 | |||
| dgemm_tcopy_L2_M4_40: | |||
| tst N , #2 | |||
| ble dgemm_tcopy_L2_M4_60 | |||
| COPY2x2 | |||
| dgemm_tcopy_L2_M4_60: | |||
| tst N , #1 | |||
| ble dgemm_tcopy_L2_M4_END | |||
| COPY1x2 | |||
| dgemm_tcopy_L2_M4_END: | |||
| /*********************************************************************************************/ | |||
| dgemm_tcopy_L1_BEGIN: | |||
| tst M, #1 | |||
| ble dgemm_tcopy_L999 | |||
| dgemm_tcopy_L1_M4_BEGIN: | |||
| mov A01, A // A01 = A | |||
| mov B01, B | |||
| asr I, N, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble dgemm_tcopy_L1_M4_40 | |||
| .align 5 | |||
| dgemm_tcopy_L1_M4_20: | |||
| COPY4x1 | |||
| subs I , I , #1 | |||
| bne dgemm_tcopy_L1_M4_20 | |||
| dgemm_tcopy_L1_M4_40: | |||
| tst N , #2 | |||
| ble dgemm_tcopy_L1_M4_60 | |||
| COPY2x1 | |||
| dgemm_tcopy_L1_M4_60: | |||
| tst N , #1 | |||
| ble dgemm_tcopy_L1_M4_END | |||
| COPY1x1 | |||
| dgemm_tcopy_L1_M4_END: | |||
| dgemm_tcopy_L999: | |||
| mov x0, #0 // set return value | |||
| RESTORE_REGS | |||
| ret | |||
| EPILOGUE | |||
| @@ -0,0 +1,682 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define M x0 | |||
| #define N x1 | |||
| #define A x2 | |||
| #define LDA x3 | |||
| #define B x4 | |||
| #define M8 x5 | |||
| #define A01 x6 | |||
| #define A02 x7 | |||
| #define A03 x8 | |||
| #define A04 x9 | |||
| #define A05 x10 | |||
| #define A06 x11 | |||
| #define A07 x12 | |||
| #define A08 x13 | |||
| #define B01 x14 | |||
| #define B02 x15 | |||
| #define B03 x16 | |||
| #define B04 x17 | |||
| #define I x18 | |||
| #define J x19 | |||
| #define TEMP1 x20 | |||
| #define TEMP2 x21 | |||
| #define A_PREFETCH 2560 | |||
| #define B_PREFETCH 256 | |||
| /************************************************************************************** | |||
| * Macro definitions | |||
| **************************************************************************************/ | |||
| .macro SAVE_REGS | |||
| add sp, sp, #-(11 * 16) | |||
| stp d8, d9, [sp, #(0 * 16)] | |||
| stp d10, d11, [sp, #(1 * 16)] | |||
| stp d12, d13, [sp, #(2 * 16)] | |||
| stp d14, d15, [sp, #(3 * 16)] | |||
| stp d16, d17, [sp, #(4 * 16)] | |||
| stp x18, x19, [sp, #(5 * 16)] | |||
| stp x20, x21, [sp, #(6 * 16)] | |||
| stp x22, x23, [sp, #(7 * 16)] | |||
| stp x24, x25, [sp, #(8 * 16)] | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| .endm | |||
| .macro RESTORE_REGS | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| ldp d12, d13, [sp, #(2 * 16)] | |||
| ldp d14, d15, [sp, #(3 * 16)] | |||
| ldp d16, d17, [sp, #(4 * 16)] | |||
| ldp x18, x19, [sp, #(5 * 16)] | |||
| ldp x20, x21, [sp, #(6 * 16)] | |||
| ldp x22, x23, [sp, #(7 * 16)] | |||
| ldp x24, x25, [sp, #(8 * 16)] | |||
| ldp x26, x27, [sp, #(9 * 16)] | |||
| ldr x28, [sp, #(10 * 16)] | |||
| add sp, sp, #(11*16) | |||
| .endm | |||
| /*************************************************************************************************************************/ | |||
| .macro COPY8x8 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A05, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A06, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A07, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A08, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ldp q2, q3, [A01], #32 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B01] | |||
| add TEMP1, B01, #64 | |||
| ldp q4, q5, [A02], #32 | |||
| ldp q6, q7, [A02], #32 | |||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [TEMP1] | |||
| add TEMP1, TEMP1, #64 | |||
| ldp q8, q9, [A03], #32 | |||
| ldp q10, q11, [A03], #32 | |||
| st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [TEMP1] | |||
| add TEMP1, TEMP1, #64 | |||
| ldp q12, q13, [A04], #32 | |||
| ldp q14, q15, [A04], #32 | |||
| st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [TEMP1] | |||
| add TEMP1, TEMP1, #64 | |||
| ldp q16, q17, [A05], #32 | |||
| ldp q18, q19, [A05], #32 | |||
| st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [TEMP1] | |||
| add TEMP1, TEMP1, #64 | |||
| ldp q20, q21, [A06], #32 | |||
| ldp q22, q23, [A06], #32 | |||
| st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [TEMP1] | |||
| add TEMP1, TEMP1, #64 | |||
| ldp q24, q25, [A07], #32 | |||
| ldp q26, q27, [A07], #32 | |||
| st1 {v24.2d, v25.2d, v26.2d, v27.2d}, [TEMP1] | |||
| add TEMP1, TEMP1, #64 | |||
| ldp q28, q29, [A08], #32 | |||
| ldp q30, q31, [A08], #32 | |||
| st1 {v28.2d, v29.2d, v30.2d, v31.2d}, [TEMP1] | |||
| add TEMP1, TEMP1, #64 | |||
| add B01, B01, M8 | |||
| .endm | |||
| .macro COPY4x8 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A05, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A06, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A07, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A08, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ldp q2, q3, [A02], #32 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B02] | |||
| add B02, B02, #64 | |||
| ldp q4, q5, [A03], #32 | |||
| ldp q6, q7, [A04], #32 | |||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [B02] | |||
| add B02, B02, #64 | |||
| ldp q8, q9, [A05], #32 | |||
| ldp q10, q11, [A06], #32 | |||
| st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [B02] | |||
| add B02, B02, #64 | |||
| ldp q12, q13, [A07], #32 | |||
| ldp q14, q15, [A08], #32 | |||
| st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [B02] | |||
| add B02, B02, #64 | |||
| .endm | |||
| .macro COPY2x8 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A05, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A06, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A07, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A08, #A_PREFETCH] | |||
| ldr q0, [A01], #16 | |||
| ldr q1, [A02], #16 | |||
| ldr q2, [A03], #16 | |||
| ldr q3, [A04], #16 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B03] | |||
| add B03, B03, #64 | |||
| ldr q4, [A05], #16 | |||
| ldr q5, [A06], #16 | |||
| ldr q6, [A07], #16 | |||
| ldr q7, [A08], #16 | |||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [B03] | |||
| add B03, B03, #64 | |||
| .endm | |||
| .macro COPY1x8 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A05, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A06, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A07, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A08, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| ldr d1, [A02], #8 | |||
| ldr d2, [A03], #8 | |||
| ldr d3, [A04], #8 | |||
| st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B04] | |||
| add B04, B04, #32 | |||
| ldr d4, [A05], #8 | |||
| ldr d5, [A06], #8 | |||
| ldr d6, [A07], #8 | |||
| ldr d7, [A08], #8 | |||
| st1 {v4.1d, v5.1d, v6.1d, v7.1d}, [B04] | |||
| add B04, B04, #32 | |||
| .endm | |||
| /*************************************************************************************************************************/ | |||
| .macro COPY8x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ldp q2, q3, [A01], #32 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B01] | |||
| add TEMP1, B01, #64 | |||
| ldp q4, q5, [A02], #32 | |||
| ldp q6, q7, [A02], #32 | |||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [TEMP1] | |||
| add TEMP1, TEMP1, #64 | |||
| ldp q8, q9, [A03], #32 | |||
| ldp q10, q11, [A03], #32 | |||
| st1 {v8.2d, v9.2d, v10.2d, v11.2d}, [TEMP1] | |||
| add TEMP1, TEMP1, #64 | |||
| ldp q12, q13, [A04], #32 | |||
| ldp q14, q15, [A04], #32 | |||
| st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [TEMP1] | |||
| add TEMP1, TEMP1, #64 | |||
| add B01, B01, M8 | |||
| .endm | |||
| .macro COPY4x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ldp q2, q3, [A02], #32 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B02] | |||
| add B02, B02, #64 | |||
| ldp q4, q5, [A03], #32 | |||
| ldp q6, q7, [A04], #32 | |||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [B02] | |||
| add B02, B02, #64 | |||
| .endm | |||
| .macro COPY2x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr q0, [A01], #16 | |||
| ldr q1, [A02], #16 | |||
| ldr q2, [A03], #16 | |||
| ldr q3, [A04], #16 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B03] | |||
| add B03, B03, #64 | |||
| .endm | |||
| .macro COPY1x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| ldr d1, [A02], #8 | |||
| ldr d2, [A03], #8 | |||
| ldr d3, [A04], #8 | |||
| st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [B04] | |||
| add B04, B04, #32 | |||
| .endm | |||
| /*************************************************************************************************************************/ | |||
| .macro COPY8x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ldp q2, q3, [A01], #32 | |||
| ldp q4, q5, [A02], #32 | |||
| ldp q6, q7, [A02], #32 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B01] | |||
| add TEMP1, B01, #64 | |||
| st1 {v4.2d, v5.2d, v6.2d, v7.2d}, [TEMP1] | |||
| add B01, B01, M8 | |||
| .endm | |||
| .macro COPY4x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ldp q2, q3, [A02], #32 | |||
| st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [B02] | |||
| add B02, B02, #64 | |||
| .endm | |||
| .macro COPY2x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr q0, [A01], #16 | |||
| ldr q1, [A02], #16 | |||
| stp q0, q1, [B03] | |||
| add B03, B03, #32 | |||
| .endm | |||
| .macro COPY1x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| ldr d1, [A02], #8 | |||
| stp d0, d1, [B04] | |||
| add B04, B04, #16 | |||
| .endm | |||
| /*************************************************************************************************************************/ | |||
| .macro COPY8x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| ldp q2, q3, [A01], #32 | |||
| stp q0, q1, [B01] | |||
| add TEMP1, B01, #32 | |||
| stp q2, q3, [TEMP1] | |||
| add B01, B01, M8 | |||
| .endm | |||
| .macro COPY4x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldp q0, q1, [A01], #32 | |||
| stp q0, q1, [B02] | |||
| add B02, B02, #32 | |||
| .endm | |||
| .macro COPY2x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr q0, [A01], #16 | |||
| str q0, [B03] | |||
| add B03, B03, #16 | |||
| .endm | |||
| .macro COPY1x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr d0, [A01], #8 | |||
| str d0, [B04] | |||
| add B04, B04, #8 | |||
| .endm | |||
| /************************************************************************************** | |||
| * End of macro definitions | |||
| **************************************************************************************/ | |||
| PROLOGUE | |||
| .align 5 | |||
| SAVE_REGS | |||
| lsl LDA, LDA, #3 // LDA = LDA * SIZE | |||
| lsl TEMP1, M, #3 // TEMP1 = M * SIZE | |||
| and B02 , N , #-8 | |||
| and B03 , N , #-4 | |||
| and B04 , N , #-2 | |||
| mul B02, B02, TEMP1 | |||
| mul B03, B03, TEMP1 | |||
| mul B04, B04, TEMP1 | |||
| add B02 , B02, B | |||
| add B03 , B03, B | |||
| add B04 , B04, B | |||
| lsl M8, M, #6 // M8 = M * 8 * SIZE | |||
| dgemm_tcopy_L8_BEGIN: | |||
| asr J, M, #3 // J = M / 4 | |||
| cmp J, #0 | |||
| ble dgemm_tcopy_L4_BEGIN | |||
| .align 5 | |||
| dgemm_tcopy_L8_M8_BEGIN: | |||
| mov A01, A | |||
| add A02, A01, LDA | |||
| add A03, A02, LDA | |||
| add A04, A03, LDA | |||
| add A05, A04, LDA | |||
| add A06, A05, LDA | |||
| add A07, A06, LDA | |||
| add A08, A07, LDA | |||
| add A, A08, LDA | |||
| mov B01, B | |||
| add B, B01, #512 // B = B + 64 * SIZE | |||
| asr I, N, #3 // I = N / 8 | |||
| cmp I, #0 | |||
| ble dgemm_tcopy_L8_M8_40 | |||
| .align 5 | |||
| dgemm_tcopy_L8_M8_20: | |||
| COPY8x8 | |||
| subs I , I , #1 | |||
| bne dgemm_tcopy_L8_M8_20 | |||
| dgemm_tcopy_L8_M8_40: | |||
| tst N , #4 | |||
| ble dgemm_tcopy_L8_M8_60 | |||
| COPY4x8 | |||
| dgemm_tcopy_L8_M8_60: | |||
| tst N , #2 | |||
| ble dgemm_tcopy_L8_M8_80 | |||
| COPY2x8 | |||
| dgemm_tcopy_L8_M8_80: | |||
| tst N, #1 | |||
| ble dgemm_tcopy_L8_M8_END | |||
| COPY1x8 | |||
| dgemm_tcopy_L8_M8_END: | |||
| subs J , J, #1 // j-- | |||
| bne dgemm_tcopy_L8_M8_BEGIN | |||
| /*********************************************************************************************/ | |||
| dgemm_tcopy_L4_BEGIN: | |||
| tst M, #7 | |||
| ble dgemm_tcopy_L999 | |||
| tst M, #4 | |||
| ble dgemm_tcopy_L2_BEGIN | |||
| dgemm_tcopy_L4_M8_BEGIN: | |||
| mov A01, A | |||
| add A02, A01, LDA | |||
| add A03, A02, LDA | |||
| add A04, A03, LDA | |||
| add A, A04, LDA | |||
| mov B01, B | |||
| add B, B01, #256 // B = B + 32 * SIZE | |||
| asr I, N, #3 // I = N / 8 | |||
| cmp I, #0 | |||
| ble dgemm_tcopy_L4_M8_40 | |||
| .align 5 | |||
| dgemm_tcopy_L4_M8_20: | |||
| COPY8x4 | |||
| subs I , I , #1 | |||
| bne dgemm_tcopy_L4_M8_20 | |||
| dgemm_tcopy_L4_M8_40: | |||
| tst N , #4 | |||
| ble dgemm_tcopy_L4_M8_60 | |||
| COPY4x4 | |||
| dgemm_tcopy_L4_M8_60: | |||
| tst N , #2 | |||
| ble dgemm_tcopy_L4_M8_80 | |||
| COPY2x4 | |||
| dgemm_tcopy_L4_M8_80: | |||
| tst N, #1 | |||
| ble dgemm_tcopy_L4_M8_END | |||
| COPY1x4 | |||
| dgemm_tcopy_L4_M8_END: | |||
| /*********************************************************************************************/ | |||
| dgemm_tcopy_L2_BEGIN: | |||
| tst M, #3 | |||
| ble dgemm_tcopy_L999 | |||
| tst M, #2 | |||
| ble dgemm_tcopy_L1_BEGIN | |||
| dgemm_tcopy_L2_M8_BEGIN: | |||
| mov A01, A | |||
| add A02, A01, LDA | |||
| add A, A02, LDA | |||
| mov B01, B | |||
| add B, B01, #128 // B = B + 16 * SIZE | |||
| asr I, N, #3 // I = N / 8 | |||
| cmp I, #0 | |||
| ble dgemm_tcopy_L2_M8_40 | |||
| .align 5 | |||
| dgemm_tcopy_L2_M8_20: | |||
| COPY8x2 | |||
| subs I , I , #1 | |||
| bne dgemm_tcopy_L2_M8_20 | |||
| dgemm_tcopy_L2_M8_40: | |||
| tst N , #4 | |||
| ble dgemm_tcopy_L2_M8_60 | |||
| COPY4x2 | |||
| dgemm_tcopy_L2_M8_60: | |||
| tst N , #2 | |||
| ble dgemm_tcopy_L2_M8_80 | |||
| COPY2x2 | |||
| dgemm_tcopy_L2_M8_80: | |||
| tst N , #1 | |||
| ble dgemm_tcopy_L2_M8_END | |||
| COPY1x2 | |||
| dgemm_tcopy_L2_M8_END: | |||
| /*********************************************************************************************/ | |||
| dgemm_tcopy_L1_BEGIN: | |||
| tst M, #1 | |||
| ble dgemm_tcopy_L999 | |||
| dgemm_tcopy_L1_M8_BEGIN: | |||
| mov A01, A // A01 = A | |||
| mov B01, B | |||
| asr I, N, #3 // I = M / 8 | |||
| cmp I, #0 | |||
| ble dgemm_tcopy_L1_M8_40 | |||
| .align 5 | |||
| dgemm_tcopy_L1_M8_20: | |||
| COPY8x1 | |||
| subs I , I , #1 | |||
| bne dgemm_tcopy_L1_M8_20 | |||
| dgemm_tcopy_L1_M8_40: | |||
| tst N , #4 | |||
| ble dgemm_tcopy_L1_M8_60 | |||
| COPY4x1 | |||
| dgemm_tcopy_L1_M8_60: | |||
| tst N , #2 | |||
| ble dgemm_tcopy_L1_M8_80 | |||
| COPY2x1 | |||
| dgemm_tcopy_L1_M8_80: | |||
| tst N , #1 | |||
| ble dgemm_tcopy_L1_M8_END | |||
| COPY1x1 | |||
| dgemm_tcopy_L1_M8_END: | |||
| dgemm_tcopy_L999: | |||
| mov x0, #0 // set return value | |||
| RESTORE_REGS | |||
| ret | |||
| EPILOGUE | |||
| @@ -0,0 +1,104 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(DSDOT) | |||
| double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| #else | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| #endif | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| #if defined(DSDOT) | |||
| double dot = 0.0 ; | |||
| #else | |||
| FLOAT dot = 0.0 ; | |||
| #endif | |||
| if ( n < 0 ) return(dot); | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| int n1 = n & -4; | |||
| while(i < n1) | |||
| { | |||
| #if defined(DSDOT) | |||
| dot += (double) y[i] * (double) x[i] | |||
| + (double) y[i+1] * (double) x[i+1] | |||
| + (double) y[i+2] * (double) x[i+2] | |||
| + (double) y[i+3] * (double) x[i+3] ; | |||
| #else | |||
| dot += y[i] * x[i] | |||
| + y[i+1] * x[i+1] | |||
| + y[i+2] * x[i+2] | |||
| + y[i+3] * x[i+3] ; | |||
| #endif | |||
| i+=4 ; | |||
| } | |||
| while(i < n) | |||
| { | |||
| #if defined(DSDOT) | |||
| dot += (double) y[i] * (double) x[i] ; | |||
| #else | |||
| dot += y[i] * x[i] ; | |||
| #endif | |||
| i++ ; | |||
| } | |||
| return(dot); | |||
| } | |||
| while(i < n) | |||
| { | |||
| #if defined(DSDOT) | |||
| dot += (double) y[iy] * (double) x[ix] ; | |||
| #else | |||
| dot += y[iy] * x[ix] ; | |||
| #endif | |||
| ix += inc_x ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| return(dot); | |||
| } | |||
| @@ -2303,6 +2303,44 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define ZGEMM_DEFAULT_R 4096 | |||
| #define SYMV_P 16 | |||
| #endif | |||
| #if defined(VULCAN) | |||
| #define SNUMOPT 2 | |||
| #define DNUMOPT 2 | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| #define DGEMM_DEFAULT_UNROLL_M 8 | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| #define CGEMM_DEFAULT_UNROLL_M 8 | |||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | |||
| #define SGEMM_DEFAULT_P 512 | |||
| #define DGEMM_DEFAULT_P dgemm_p | |||
| #define CGEMM_DEFAULT_P 256 | |||
| #define ZGEMM_DEFAULT_P 128 | |||
| #define SGEMM_DEFAULT_Q 1024 | |||
| #define DGEMM_DEFAULT_Q dgemm_q | |||
| #define CGEMM_DEFAULT_Q 512 | |||
| #define ZGEMM_DEFAULT_Q 512 | |||
| #define SGEMM_DEFAULT_R 4096 | |||
| #define DGEMM_DEFAULT_R dgemm_r | |||
| #define CGEMM_DEFAULT_R 4096 | |||
| #define ZGEMM_DEFAULT_R 2048 | |||
| #define SYMV_P 16 | |||
| #endif | |||
| @@ -2385,6 +2423,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define SYMV_P 16 | |||
| #endif | |||
| #if defined(THUNDERX) | |||
| #define SNUMOPT 2 | |||
| #define DNUMOPT 2 | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 4 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| #define DGEMM_DEFAULT_UNROLL_M 2 | |||
| #define DGEMM_DEFAULT_UNROLL_N 2 | |||
| #define CGEMM_DEFAULT_UNROLL_M 2 | |||
| #define CGEMM_DEFAULT_UNROLL_N 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 2 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 2 | |||
| #define SGEMM_DEFAULT_P 128 | |||
| #define DGEMM_DEFAULT_P 128 | |||
| #define CGEMM_DEFAULT_P 96 | |||
| #define ZGEMM_DEFAULT_P 64 | |||
| #define SGEMM_DEFAULT_Q 240 | |||
| #define DGEMM_DEFAULT_Q 120 | |||
| #define CGEMM_DEFAULT_Q 120 | |||
| #define ZGEMM_DEFAULT_Q 120 | |||
| #define SGEMM_DEFAULT_R 12288 | |||
| #define DGEMM_DEFAULT_R 8192 | |||
| #define CGEMM_DEFAULT_R 4096 | |||
| #define ZGEMM_DEFAULT_R 4096 | |||
| #define SYMV_P 16 | |||
| #endif | |||
| #if defined(THUNDERX2T99) | |||
| #define SNUMOPT 2 | |||
| #define DNUMOPT 2 | |||
| #define GEMM_DEFAULT_OFFSET_A 0 | |||
| #define GEMM_DEFAULT_OFFSET_B 0 | |||
| #define GEMM_DEFAULT_ALIGN 0x03fffUL | |||
| #define SGEMM_DEFAULT_UNROLL_M 16 | |||
| #define SGEMM_DEFAULT_UNROLL_N 4 | |||
| #define DGEMM_DEFAULT_UNROLL_M 8 | |||
| #define DGEMM_DEFAULT_UNROLL_N 4 | |||
| #define CGEMM_DEFAULT_UNROLL_M 8 | |||
| #define CGEMM_DEFAULT_UNROLL_N 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_M 4 | |||
| #define ZGEMM_DEFAULT_UNROLL_N 4 | |||
| #define SGEMM_DEFAULT_P 512 | |||
| #define DGEMM_DEFAULT_P dgemm_p | |||
| #define CGEMM_DEFAULT_P 256 | |||
| #define ZGEMM_DEFAULT_P 128 | |||
| #define SGEMM_DEFAULT_Q 1024 | |||
| #define DGEMM_DEFAULT_Q dgemm_q | |||
| #define CGEMM_DEFAULT_Q 512 | |||
| #define ZGEMM_DEFAULT_Q 512 | |||
| #define SGEMM_DEFAULT_R 4096 | |||
| #define DGEMM_DEFAULT_R dgemm_r | |||
| #define CGEMM_DEFAULT_R 4096 | |||
| #define ZGEMM_DEFAULT_R 2048 | |||
| #define SYMV_P 16 | |||
| #endif | |||
| #if defined(ARMV5) | |||
| #define SNUMOPT 2 | |||