| @@ -2,6 +2,10 @@ ZGEMVNKERNEL = zgemv_n_dup.S | |||||
| ZGEMVTKERNEL = zgemv_t_dup.S | ZGEMVTKERNEL = zgemv_t_dup.S | ||||
| DGEMVNKERNEL = dgemv_n_bulldozer.S | DGEMVNKERNEL = dgemv_n_bulldozer.S | ||||
| DGEMVTKERNEL = dgemv_t_bulldozer.S | |||||
| DAXPYKERNEL = daxpy_bulldozer.S | |||||
| DDOTKERNEL = ddot_bulldozer.S | |||||
| DCOPYKERNEL = dcopy_bulldozer.S | |||||
| SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S | SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S | ||||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | SGEMMINCOPY = ../generic/gemm_ncopy_16.c | ||||
| @@ -0,0 +1,408 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #ifndef WINDOWS_ABI /* register aliases used when WINDOWS_ABI is not defined */ | |||||
| #define M ARG1 /* element count; decremented as elements are consumed */ | |||||
| #define X ARG4 | |||||
| #define INCX ARG5 | |||||
| #define Y ARG6 | |||||
| #define INCY ARG2 /* filled from the stack in the prologue below */ | |||||
| #else /* WINDOWS_ABI: X, INCX, Y and INCY are all loaded from the stack below */ | |||||
| #define M ARG1 | |||||
| #define X ARG2 | |||||
| #define INCX ARG3 | |||||
| #define Y ARG4 | |||||
| #define INCY %r10 | |||||
| #endif | |||||
| #define YY %r11 /* second cursor into Y, used for the loads of the strided loop */ | |||||
| #define ALPHA %xmm15 /* scale factor; both lanes hold the same value after the prologue */ | |||||
| #define A_PRE 640 /* displacement used by the prefetchnta instructions */ | |||||
| #include "l1param.h" | |||||
| PROLOGUE | |||||
| PROFCODE | |||||
| #ifndef WINDOWS_ABI | |||||
| #ifndef XDOUBLE | |||||
| movq 8(%rsp), INCY /* INCY is read from the stack */ | |||||
| #else | |||||
| movq 24(%rsp), INCY /* a different stack offset is used when XDOUBLE is defined */ | |||||
| #endif | |||||
| vmovups %xmm0, ALPHA /* alpha is taken from xmm0 */ | |||||
| #else | |||||
| vmovups %xmm3, ALPHA /* alpha is taken from xmm3 */ | |||||
| movq 40(%rsp), X | |||||
| movq 48(%rsp), INCX | |||||
| movq 56(%rsp), Y | |||||
| movq 64(%rsp), INCY | |||||
| #endif | |||||
| SAVEREGISTERS | |||||
| unpcklpd ALPHA, ALPHA /* copy the low double of alpha into the high lane as well */ | |||||
| leaq (, INCX, SIZE), INCX /* INCX = INCX * SIZE (SIZE is defined outside this file; presumably the element size in bytes) */ | |||||
| leaq (, INCY, SIZE), INCY /* INCY = INCY * SIZE */ | |||||
| testq M, M | |||||
| jle .L47 /* M <= 0: go straight to the exit */ | |||||
| cmpq $SIZE, INCX | |||||
| jne .L40 /* scaled INCX != SIZE: use the strided path */ | |||||
| cmpq $SIZE, INCY | |||||
| jne .L40 /* scaled INCY != SIZE: use the strided path */ | |||||
| testq $SIZE, Y /* is the SIZE bit of the Y address set? */ | |||||
| je .L10 /* no: go directly to the vector loop */ | |||||
| vmovsd (X), %xmm0 /* peel one element so that Y advances by SIZE before the vector loop */ | |||||
| vfmaddsd (Y), ALPHA, %xmm0 , %xmm0 /* xmm0 = x[0] * alpha + y[0], fused like every other path of this kernel so the result does not depend on the address of Y */ | |||||
| vmovsd %xmm0, (Y) | |||||
| addq $1 * SIZE, X | |||||
| addq $1 * SIZE, Y | |||||
| decq M | |||||
| jle .L19 /* that was the only element: return */ | |||||
| ALIGN_4 | |||||
| .L10: /* contiguous path: 16 elements per iteration, software pipelined */ | |||||
| subq $-16 * SIZE, X /* X += 16 * SIZE; all accesses below use offsets relative to this biased pointer */ | |||||
| subq $-16 * SIZE, Y /* Y += 16 * SIZE */ | |||||
| movq M, %rax | |||||
| sarq $4, %rax /* rax = M / 16 = number of 16-element groups */ | |||||
| jle .L13 /* no full group: go to the tails */ | |||||
| vmovups -16 * SIZE(X), %xmm0 /* preload the first four x vectors of the first group */ | |||||
| vmovups -14 * SIZE(X), %xmm1 | |||||
| vmovups -12 * SIZE(X), %xmm2 | |||||
| vmovups -10 * SIZE(X), %xmm3 | |||||
| decq %rax | |||||
| jle .L12 /* exactly one group: only the drain code at .L12 is needed */ | |||||
| ALIGN_3 | |||||
| .L11: /* steady state: finish the current group while loading the start of the next one */ | |||||
| prefetchnta A_PRE(Y) | |||||
| vmovups -8 * SIZE(X), %xmm4 | |||||
| vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0 /* xmm0 = xmm0 * ALPHA + y */ | |||||
| vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1 | |||||
| vmovups -6 * SIZE(X), %xmm5 | |||||
| vmovups -4 * SIZE(X), %xmm6 | |||||
| vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2 | |||||
| vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3 | |||||
| vmovups -2 * SIZE(X), %xmm7 | |||||
| vmovups %xmm0, -16 * SIZE(Y) | |||||
| vmovups %xmm1, -14 * SIZE(Y) | |||||
| prefetchnta A_PRE(X) | |||||
| nop | |||||
| vmovups %xmm2, -12 * SIZE(Y) | |||||
| vmovups %xmm3, -10 * SIZE(Y) | |||||
| prefetchnta A_PRE+64(Y) | |||||
| vmovups 0 * SIZE(X), %xmm0 /* start loading x for the next group (xmm0..xmm3 are free again) */ | |||||
| vfmaddpd -8 * SIZE(Y), ALPHA, %xmm4 , %xmm4 | |||||
| vfmaddpd -6 * SIZE(Y), ALPHA, %xmm5 , %xmm5 | |||||
| vmovups 2 * SIZE(X), %xmm1 | |||||
| vmovups 4 * SIZE(X), %xmm2 | |||||
| vfmaddpd -4 * SIZE(Y), ALPHA, %xmm6 , %xmm6 | |||||
| vfmaddpd -2 * SIZE(Y), ALPHA, %xmm7 , %xmm7 | |||||
| vmovups 6 * SIZE(X), %xmm3 | |||||
| vmovups %xmm4, -8 * SIZE(Y) | |||||
| vmovups %xmm5, -6 * SIZE(Y) | |||||
| prefetchnta A_PRE+64(X) | |||||
| nop | |||||
| vmovups %xmm6, -4 * SIZE(Y) | |||||
| vmovups %xmm7, -2 * SIZE(Y) | |||||
| subq $-16 * SIZE, Y /* advance both pointers by 16 elements */ | |||||
| subq $-16 * SIZE, X | |||||
| decq %rax | |||||
| jg .L11 | |||||
| ALIGN_3 | |||||
| .L12: /* drain: last full group, xmm0..xmm3 already hold its first four x vectors */ | |||||
| vmovups -8 * SIZE(X), %xmm4 | |||||
| vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0 | |||||
| vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1 | |||||
| vmovups -6 * SIZE(X), %xmm5 | |||||
| vmovups -4 * SIZE(X), %xmm6 | |||||
| vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2 | |||||
| vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3 | |||||
| vmovups -2 * SIZE(X), %xmm7 | |||||
| vmovups %xmm0, -16 * SIZE(Y) | |||||
| vmovups %xmm1, -14 * SIZE(Y) | |||||
| vmovups %xmm2, -12 * SIZE(Y) | |||||
| vmovups %xmm3, -10 * SIZE(Y) | |||||
| vfmaddpd -8 * SIZE(Y), ALPHA, %xmm4 , %xmm4 | |||||
| vfmaddpd -6 * SIZE(Y), ALPHA, %xmm5 , %xmm5 | |||||
| vfmaddpd -4 * SIZE(Y), ALPHA, %xmm6 , %xmm6 | |||||
| vfmaddpd -2 * SIZE(Y), ALPHA, %xmm7 , %xmm7 | |||||
| vmovups %xmm4, -8 * SIZE(Y) | |||||
| vmovups %xmm5, -6 * SIZE(Y) | |||||
| vmovups %xmm6, -4 * SIZE(Y) | |||||
| vmovups %xmm7, -2 * SIZE(Y) | |||||
| subq $-16 * SIZE, Y | |||||
| subq $-16 * SIZE, X | |||||
| ALIGN_3 | |||||
| .L13: /* tails: handle the remaining M mod 16 elements as 8 + 4 + 2 + 1 */ | |||||
| movq M, %rax | |||||
| andq $8, %rax | |||||
| jle .L14 /* taken when the AND result is zero, i.e. bit 3 of M is clear */ | |||||
| ALIGN_3 | |||||
| vmovups -16 * SIZE(X), %xmm0 /* 8 elements: four 2-double vectors */ | |||||
| vmovups -14 * SIZE(X), %xmm1 | |||||
| vmovups -12 * SIZE(X), %xmm2 | |||||
| vmovups -10 * SIZE(X), %xmm3 | |||||
| vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0 | |||||
| vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1 | |||||
| vfmaddpd -12 * SIZE(Y), ALPHA, %xmm2 , %xmm2 | |||||
| vfmaddpd -10 * SIZE(Y), ALPHA, %xmm3 , %xmm3 | |||||
| vmovups %xmm0, -16 * SIZE(Y) | |||||
| vmovups %xmm1, -14 * SIZE(Y) | |||||
| vmovups %xmm2, -12 * SIZE(Y) | |||||
| vmovups %xmm3, -10 * SIZE(Y) | |||||
| addq $8 * SIZE, X | |||||
| addq $8 * SIZE, Y | |||||
| ALIGN_3 | |||||
| .L14: | |||||
| movq M, %rax | |||||
| andq $4, %rax | |||||
| jle .L15 /* bit 2 of M clear: no 4-element tail */ | |||||
| ALIGN_3 | |||||
| vmovups -16 * SIZE(X), %xmm0 /* 4 elements */ | |||||
| vmovups -14 * SIZE(X), %xmm1 | |||||
| vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0 | |||||
| vfmaddpd -14 * SIZE(Y), ALPHA, %xmm1 , %xmm1 | |||||
| vmovups %xmm0, -16 * SIZE(Y) | |||||
| vmovups %xmm1, -14 * SIZE(Y) | |||||
| addq $4 * SIZE, X | |||||
| addq $4 * SIZE, Y | |||||
| ALIGN_3 | |||||
| .L15: | |||||
| movq M, %rax | |||||
| andq $2, %rax | |||||
| jle .L16 /* bit 1 of M clear: no 2-element tail */ | |||||
| ALIGN_3 | |||||
| vmovups -16 * SIZE(X), %xmm0 /* 2 elements */ | |||||
| vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0 | |||||
| vmovups %xmm0, -16 * SIZE(Y) | |||||
| addq $2 * SIZE, X | |||||
| addq $2 * SIZE, Y | |||||
| ALIGN_3 | |||||
| .L16: | |||||
| movq M, %rax | |||||
| andq $1, %rax | |||||
| jle .L19 /* bit 0 of M clear: done */ | |||||
| ALIGN_3 | |||||
| vmovsd -16 * SIZE(X), %xmm0 /* last single element, scalar fused multiply-add */ | |||||
| vfmaddsd -16 * SIZE(Y), ALPHA, %xmm0 , %xmm0 | |||||
| vmovsd %xmm0, -16 * SIZE(Y) | |||||
| ALIGN_3 | |||||
| .L19: /* exit of the contiguous path */ | |||||
| xorq %rax,%rax /* rax = 0 on return */ | |||||
| RESTOREREGISTERS | |||||
| ret | |||||
| ALIGN_3 | |||||
| .L40: /* strided path: at least one scaled stride differs from SIZE */ | |||||
| movq Y, YY /* YY walks Y for loads, Y itself is used for the stores */ | |||||
| movq M, %rax | |||||
| // If incx == 0 or incy == 0, skip the unrolled loop and run all M elements through the scalar loop at .L46. | |||||
| // The unrolled loop loads several Y elements before storing any, so it must not run when successive elements share an address. | |||||
| cmpq $0, INCX | |||||
| je .L46 | |||||
| cmpq $0, INCY | |||||
| je .L46 | |||||
| sarq $3, %rax /* rax = M / 8 = number of unrolled iterations */ | |||||
| jle .L45 | |||||
| prefetchnta 512(X) | |||||
| prefetchnta 512+64(X) | |||||
| prefetchnta 512+128(X) | |||||
| prefetchnta 512+192(X) | |||||
| prefetchnta 512(Y) | |||||
| prefetchnta 512+64(Y) | |||||
| prefetchnta 512+128(Y) | |||||
| prefetchnta 512+192(Y) | |||||
| ALIGN_3 | |||||
| .L41: /* 8 elements per iteration: gather pairs into xmm lanes, FMA, scatter back */ | |||||
| vmovsd 0 * SIZE(X), %xmm0 /* xmm0 = { x[0], x[1] } */ | |||||
| addq INCX, X | |||||
| vmovhpd 0 * SIZE(X), %xmm0 , %xmm0 | |||||
| addq INCX, X | |||||
| vmovsd 0 * SIZE(YY), %xmm6 /* xmm6 = { y[0], y[1] } */ | |||||
| addq INCY, YY | |||||
| vmovhpd 0 * SIZE(YY), %xmm6 , %xmm6 | |||||
| addq INCY, YY | |||||
| vmovsd 0 * SIZE(X), %xmm1 /* xmm1 = { x[2], x[3] } */ | |||||
| addq INCX, X | |||||
| vmovhpd 0 * SIZE(X), %xmm1 , %xmm1 | |||||
| addq INCX, X | |||||
| vmovsd 0 * SIZE(YY), %xmm7 /* xmm7 = { y[2], y[3] } */ | |||||
| addq INCY, YY | |||||
| vmovhpd 0 * SIZE(YY), %xmm7 , %xmm7 | |||||
| addq INCY, YY | |||||
| vfmaddpd %xmm6 , ALPHA , %xmm0 , %xmm0 /* xmm0 = xmm0 * ALPHA + xmm6 */ | |||||
| vmovsd 0 * SIZE(X), %xmm2 /* xmm2 = { x[4], x[5] } */ | |||||
| addq INCX, X | |||||
| vmovhpd 0 * SIZE(X), %xmm2 , %xmm2 | |||||
| addq INCX, X | |||||
| vmovsd 0 * SIZE(YY), %xmm8 /* xmm8 = { y[4], y[5] } */ | |||||
| addq INCY, YY | |||||
| vmovhpd 0 * SIZE(YY), %xmm8 , %xmm8 | |||||
| addq INCY, YY | |||||
| vfmaddpd %xmm7 , ALPHA , %xmm1 , %xmm1 | |||||
| vmovsd 0 * SIZE(X), %xmm3 /* xmm3 = { x[6], x[7] } */ | |||||
| addq INCX, X | |||||
| vmovhpd 0 * SIZE(X), %xmm3 , %xmm3 | |||||
| addq INCX, X | |||||
| vfmaddpd %xmm8 , ALPHA , %xmm2 , %xmm2 | |||||
| vmovsd 0 * SIZE(YY), %xmm9 /* xmm9 = { y[6], y[7] } */ | |||||
| addq INCY, YY | |||||
| vmovhpd 0 * SIZE(YY), %xmm9 , %xmm9 | |||||
| addq INCY, YY | |||||
| vmovsd %xmm0, 0 * SIZE(Y) /* scatter the results back through the store cursor Y */ | |||||
| addq INCY, Y | |||||
| vmovhpd %xmm0, 0 * SIZE(Y) | |||||
| addq INCY, Y | |||||
| vmovsd %xmm1, 0 * SIZE(Y) | |||||
| addq INCY, Y | |||||
| vmovhpd %xmm1, 0 * SIZE(Y) | |||||
| addq INCY, Y | |||||
| vmovsd %xmm2, 0 * SIZE(Y) | |||||
| addq INCY, Y | |||||
| vmovhpd %xmm2, 0 * SIZE(Y) | |||||
| addq INCY, Y | |||||
| vfmaddpd %xmm9 , ALPHA , %xmm3 , %xmm3 | |||||
| vmovsd %xmm3, 0 * SIZE(Y) | |||||
| addq INCY, Y | |||||
| vmovhpd %xmm3, 0 * SIZE(Y) | |||||
| addq INCY, Y | |||||
| decq %rax | |||||
| jg .L41 | |||||
| ALIGN_3 | |||||
| .L45: | |||||
| movq M, %rax | |||||
| andq $7, %rax /* rax = M mod 8 leftover elements */ | |||||
| jle .L47 | |||||
| ALIGN_3 | |||||
| .L46: /* scalar loop: one element per iteration, rax = elements left */ | |||||
| vmovsd (X), %xmm0 | |||||
| addq INCX, X | |||||
| vfmaddsd (Y) , ALPHA , %xmm0 , %xmm0 /* xmm0 = x * alpha + y */ | |||||
| vmovsd %xmm0, (Y) | |||||
| addq INCY, Y | |||||
| decq %rax | |||||
| jg .L46 | |||||
| ALIGN_3 | |||||
| .L47: /* common exit, also used for M <= 0 */ | |||||
| xorq %rax, %rax /* rax = 0 on return */ | |||||
| RESTOREREGISTERS | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,291 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define M ARG1 /* rdi */ /* element count */ | |||||
| #define X ARG2 /* rsi */ /* source pointer */ | |||||
| #define INCX ARG3 /* rdx */ /* source stride, scaled by SIZE in the prologue */ | |||||
| #define Y ARG4 /* rcx */ /* destination pointer */ | |||||
| #ifndef WINDOWS_ABI | |||||
| #define INCY ARG5 /* r8 */ /* destination stride, scaled by SIZE in the prologue */ | |||||
| #else | |||||
| #define INCY %r10 /* loaded from the stack in the prologue when WINDOWS_ABI is defined */ | |||||
| #endif | |||||
| #include "l1param.h" | |||||
| #define VLOAD(OFFSET, ADDR, REG) vmovups OFFSET(ADDR), REG /* unaligned 128-bit load from OFFSET(ADDR) into REG */ | |||||
| #define VSHUFPD_1(REG1 , REG2) vshufpd $0x01, REG1, REG2, REG2 /* not referenced in the kernel body below */ | |||||
| #define A_PRE 640 /* prefetch displacement applied to X */ | |||||
| #define B_PRE 640 /* prefetch displacement applied to Y */ | |||||
| PROLOGUE | |||||
| PROFCODE | |||||
| #ifdef WINDOWS_ABI | |||||
| movq 40(%rsp), INCY /* INCY is read from the stack when WINDOWS_ABI is defined */ | |||||
| #endif | |||||
| SAVEREGISTERS | |||||
| testq M, M /* guard against M <= 0, as the daxpy and ddot kernels do */ | |||||
| jle .L47 /* without it the alignment peel and the bit-tested tails would copy elements for M <= 0 */ | |||||
| leaq (, INCX, SIZE), INCX /* INCX = INCX * SIZE */ | |||||
| leaq (, INCY, SIZE), INCY /* INCY = INCY * SIZE */ | |||||
| cmpq $SIZE, INCX | |||||
| jne .L40 /* scaled INCX != SIZE: use the strided path */ | |||||
| cmpq $SIZE, INCY | |||||
| jne .L40 /* scaled INCY != SIZE: use the strided path */ | |||||
| testq $SIZE, X /* is the SIZE bit of the X address set? */ | |||||
| je .L10 /* no: go directly to the vector loop */ | |||||
| vmovsd (X), %xmm0 /* copy one element so that X advances by SIZE before the vector loop */ | |||||
| vmovsd %xmm0, (Y) | |||||
| addq $1 * SIZE, X | |||||
| addq $1 * SIZE, Y | |||||
| decq M | |||||
| jle .L19 /* that was the only element: return */ | |||||
| ALIGN_4 | |||||
| .L10: /* contiguous path: 16 elements per iteration, software pipelined */ | |||||
| subq $-16 * SIZE, X /* X += 16 * SIZE; accesses below use offsets relative to this biased pointer */ | |||||
| subq $-16 * SIZE, Y /* Y += 16 * SIZE */ | |||||
| movq M, %rax | |||||
| sarq $4, %rax /* rax = M / 16 = number of 16-element groups */ | |||||
| jle .L13 /* no full group: go to the tails */ | |||||
| vmovups -16 * SIZE(X), %xmm0 /* preload the whole first group into xmm0..xmm7 */ | |||||
| vmovups -14 * SIZE(X), %xmm1 | |||||
| vmovups -12 * SIZE(X), %xmm2 | |||||
| vmovups -10 * SIZE(X), %xmm3 | |||||
| vmovups -8 * SIZE(X), %xmm4 | |||||
| vmovups -6 * SIZE(X), %xmm5 | |||||
| vmovups -4 * SIZE(X), %xmm6 | |||||
| vmovups -2 * SIZE(X), %xmm7 | |||||
| decq %rax | |||||
| jle .L12 /* exactly one group: only the drain stores at .L12 are needed */ | |||||
| ALIGN_4 | |||||
| .L11: /* steady state: store the current group while loading the next one */ | |||||
| prefetchnta A_PRE(X) | |||||
| nop | |||||
| vmovups %xmm0, -16 * SIZE(Y) | |||||
| vmovups %xmm1, -14 * SIZE(Y) | |||||
| prefetchnta B_PRE(Y) | |||||
| nop | |||||
| vmovups %xmm2, -12 * SIZE(Y) | |||||
| vmovups %xmm3, -10 * SIZE(Y) | |||||
| VLOAD( 0 * SIZE, X, %xmm0) /* reload xmm0..xmm3 with the next group's first half */ | |||||
| VLOAD( 2 * SIZE, X, %xmm1) | |||||
| VLOAD( 4 * SIZE, X, %xmm2) | |||||
| VLOAD( 6 * SIZE, X, %xmm3) | |||||
| prefetchnta A_PRE+64(X) | |||||
| nop | |||||
| vmovups %xmm4, -8 * SIZE(Y) | |||||
| vmovups %xmm5, -6 * SIZE(Y) | |||||
| prefetchnta B_PRE+64(Y) | |||||
| nop | |||||
| vmovups %xmm6, -4 * SIZE(Y) | |||||
| vmovups %xmm7, -2 * SIZE(Y) | |||||
| VLOAD( 8 * SIZE, X, %xmm4) /* reload xmm4..xmm7 with the next group's second half */ | |||||
| VLOAD(10 * SIZE, X, %xmm5) | |||||
| subq $-16 * SIZE, Y /* Y is advanced here; all stores of this iteration are already issued */ | |||||
| VLOAD(12 * SIZE, X, %xmm6) | |||||
| VLOAD(14 * SIZE, X, %xmm7) | |||||
| subq $-16 * SIZE, X | |||||
| decq %rax | |||||
| jg .L11 | |||||
| ALIGN_3 | |||||
| .L12: /* drain: store the last preloaded group */ | |||||
| vmovups %xmm0, -16 * SIZE(Y) | |||||
| vmovups %xmm1, -14 * SIZE(Y) | |||||
| vmovups %xmm2, -12 * SIZE(Y) | |||||
| vmovups %xmm3, -10 * SIZE(Y) | |||||
| vmovups %xmm4, -8 * SIZE(Y) | |||||
| vmovups %xmm5, -6 * SIZE(Y) | |||||
| vmovups %xmm6, -4 * SIZE(Y) | |||||
| vmovups %xmm7, -2 * SIZE(Y) | |||||
| subq $-16 * SIZE, Y | |||||
| subq $-16 * SIZE, X | |||||
| ALIGN_3 | |||||
| .L13: /* tails: copy the remaining M mod 16 elements as 8 + 4 + 2 + 1 */ | |||||
| testq $8, M | |||||
| jle .L14 /* taken when bit 3 of M is clear (test result is zero) */ | |||||
| ALIGN_3 | |||||
| vmovups -16 * SIZE(X), %xmm0 /* 8 elements */ | |||||
| vmovups -14 * SIZE(X), %xmm1 | |||||
| vmovups -12 * SIZE(X), %xmm2 | |||||
| vmovups -10 * SIZE(X), %xmm3 | |||||
| vmovups %xmm0, -16 * SIZE(Y) | |||||
| vmovups %xmm1, -14 * SIZE(Y) | |||||
| vmovups %xmm2, -12 * SIZE(Y) | |||||
| vmovups %xmm3, -10 * SIZE(Y) | |||||
| addq $8 * SIZE, X | |||||
| addq $8 * SIZE, Y | |||||
| ALIGN_3 | |||||
| .L14: | |||||
| testq $4, M | |||||
| jle .L15 /* bit 2 of M clear: no 4-element tail */ | |||||
| ALIGN_3 | |||||
| vmovups -16 * SIZE(X), %xmm0 /* 4 elements */ | |||||
| vmovups -14 * SIZE(X), %xmm1 | |||||
| vmovups %xmm0, -16 * SIZE(Y) | |||||
| vmovups %xmm1, -14 * SIZE(Y) | |||||
| addq $4 * SIZE, X | |||||
| addq $4 * SIZE, Y | |||||
| ALIGN_3 | |||||
| .L15: | |||||
| testq $2, M | |||||
| jle .L16 /* bit 1 of M clear: no 2-element tail */ | |||||
| ALIGN_3 | |||||
| vmovups -16 * SIZE(X), %xmm0 /* 2 elements */ | |||||
| vmovups %xmm0, -16 * SIZE(Y) | |||||
| addq $2 * SIZE, X | |||||
| addq $2 * SIZE, Y | |||||
| ALIGN_3 | |||||
| .L16: | |||||
| testq $1, M | |||||
| jle .L19 /* bit 0 of M clear: done */ | |||||
| ALIGN_3 | |||||
| vmovsd -16 * SIZE(X), %xmm0 /* last single element */ | |||||
| vmovsd %xmm0, -16 * SIZE(Y) | |||||
| ALIGN_3 | |||||
| .L19: /* exit of the contiguous path */ | |||||
| xorq %rax,%rax /* rax = 0 on return */ | |||||
| RESTOREREGISTERS | |||||
| ret | |||||
| ALIGN_3 | |||||
| .L40: /* strided path: at least one scaled stride differs from SIZE */ | |||||
| movq M, %rax | |||||
| sarq $3, %rax /* rax = M / 8 = number of unrolled iterations */ | |||||
| jle .L45 | |||||
| ALIGN_3 | |||||
| .L41: /* 8 elements per iteration: eight scalar loads, then eight scalar stores */ | |||||
| vmovsd (X), %xmm0 | |||||
| addq INCX, X | |||||
| vmovsd (X), %xmm4 | |||||
| addq INCX, X | |||||
| vmovsd (X), %xmm1 | |||||
| addq INCX, X | |||||
| vmovsd (X), %xmm5 | |||||
| addq INCX, X | |||||
| vmovsd (X), %xmm2 | |||||
| addq INCX, X | |||||
| vmovsd (X), %xmm6 | |||||
| addq INCX, X | |||||
| vmovsd (X), %xmm3 | |||||
| addq INCX, X | |||||
| vmovsd (X), %xmm7 | |||||
| addq INCX, X | |||||
| vmovsd %xmm0, (Y) /* stores follow the same register order as the loads above */ | |||||
| addq INCY, Y | |||||
| vmovsd %xmm4, (Y) | |||||
| addq INCY, Y | |||||
| vmovsd %xmm1, (Y) | |||||
| addq INCY, Y | |||||
| vmovsd %xmm5, (Y) | |||||
| addq INCY, Y | |||||
| vmovsd %xmm2, (Y) | |||||
| addq INCY, Y | |||||
| vmovsd %xmm6, (Y) | |||||
| addq INCY, Y | |||||
| vmovsd %xmm3, (Y) | |||||
| addq INCY, Y | |||||
| vmovsd %xmm7, (Y) | |||||
| addq INCY, Y | |||||
| decq %rax | |||||
| jg .L41 | |||||
| ALIGN_3 | |||||
| .L45: | |||||
| movq M, %rax | |||||
| andq $7, %rax /* rax = M mod 8 leftover elements */ | |||||
| jle .L47 | |||||
| ALIGN_3 | |||||
| .L46: /* scalar loop: one element per iteration */ | |||||
| vmovsd (X), %xmm0 | |||||
| addq INCX, X | |||||
| vmovsd %xmm0, (Y) | |||||
| addq INCY, Y | |||||
| decq %rax | |||||
| jg .L46 | |||||
| ALIGN_3 | |||||
| .L47: /* exit of the strided path */ | |||||
| xorq %rax, %rax /* rax = 0 on return */ | |||||
| RESTOREREGISTERS | |||||
| ret | |||||
| EPILOGUE | |||||
| @@ -0,0 +1,311 @@ | |||||
| /*********************************************************************/ | |||||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||||
| /* All rights reserved. */ | |||||
| /* */ | |||||
| /* Redistribution and use in source and binary forms, with or */ | |||||
| /* without modification, are permitted provided that the following */ | |||||
| /* conditions are met: */ | |||||
| /* */ | |||||
| /* 1. Redistributions of source code must retain the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer. */ | |||||
| /* */ | |||||
| /* 2. Redistributions in binary form must reproduce the above */ | |||||
| /* copyright notice, this list of conditions and the following */ | |||||
| /* disclaimer in the documentation and/or other materials */ | |||||
| /* provided with the distribution. */ | |||||
| /* */ | |||||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||||
| /* */ | |||||
| /* The views and conclusions contained in the software and */ | |||||
| /* documentation are those of the authors and should not be */ | |||||
| /* interpreted as representing official policies, either expressed */ | |||||
| /* or implied, of The University of Texas at Austin. */ | |||||
| /*********************************************************************/ | |||||
| #define ASSEMBLER | |||||
| #include "common.h" | |||||
| #define N ARG1 /* rdi */ /* element count */ | |||||
| #define X ARG2 /* rsi */ /* first input vector */ | |||||
| #define INCX ARG3 /* rdx */ /* stride of X, scaled by SIZE in the prologue */ | |||||
| #define Y ARG4 /* rcx */ /* second input vector */ | |||||
| #ifndef WINDOWS_ABI | |||||
| #define INCY ARG5 /* r8 */ /* stride of Y, scaled by SIZE in the prologue */ | |||||
| #else | |||||
| #define INCY %r10 /* loaded from the stack in the prologue when WINDOWS_ABI is defined */ | |||||
| #endif | |||||
| #define A_PRE 512 /* displacement used by the prefetchnta instructions */ | |||||
| #include "l1param.h" | |||||
| PROLOGUE | |||||
| PROFCODE | |||||
| #ifdef WINDOWS_ABI | |||||
| movq 40(%rsp), INCY | |||||
| #endif | |||||
| SAVEREGISTERS | |||||
| leaq (, INCX, SIZE), INCX /* INCX = INCX * SIZE */ | |||||
| leaq (, INCY, SIZE), INCY /* INCY = INCY * SIZE */ | |||||
| vxorps %xmm0, %xmm0 , %xmm0 /* clear the four partial-sum accumulators xmm0..xmm3 */ | |||||
| vxorps %xmm1, %xmm1 , %xmm1 | |||||
| vxorps %xmm2, %xmm2 , %xmm2 | |||||
| vxorps %xmm3, %xmm3 , %xmm3 | |||||
| cmpq $0, N | |||||
| jle .L999 /* N <= 0: reduce the zeroed accumulators and return */ | |||||
| cmpq $SIZE, INCX | |||||
| jne .L50 /* scaled INCX != SIZE: use the strided path */ | |||||
| cmpq $SIZE, INCY | |||||
| jne .L50 /* scaled INCY != SIZE: use the strided path */ | |||||
| subq $-16 * SIZE, X /* X += 16 * SIZE; accesses below use offsets relative to this biased pointer */ | |||||
| subq $-16 * SIZE, Y /* Y += 16 * SIZE */ | |||||
| testq $SIZE, Y /* is the SIZE bit of the Y address set? */ | |||||
| je .L10 | |||||
| vmovsd -16 * SIZE(X), %xmm0 /* peel one element: xmm0 = x[0] * y[0] seeds the first accumulator */ | |||||
| vmulsd -16 * SIZE(Y), %xmm0 , %xmm0 | |||||
| addq $1 * SIZE, X | |||||
| addq $1 * SIZE, Y | |||||
| decq N /* no branch needed here: the checks at .L10 and .L14 handle N == 0 */ | |||||
| ALIGN_2 | |||||
| .L10: /* contiguous path: 16 elements per iteration, software pipelined */ | |||||
| movq N, %rax | |||||
| sarq $4, %rax /* rax = N / 16 = number of 16-element groups */ | |||||
| jle .L14 | |||||
| vmovups -16 * SIZE(X), %xmm4 /* preload the whole first group of x into xmm4..xmm11 */ | |||||
| vmovups -14 * SIZE(X), %xmm5 | |||||
| vmovups -12 * SIZE(X), %xmm6 | |||||
| vmovups -10 * SIZE(X), %xmm7 | |||||
| vmovups -8 * SIZE(X), %xmm8 | |||||
| vmovups -6 * SIZE(X), %xmm9 | |||||
| vmovups -4 * SIZE(X), %xmm10 | |||||
| vmovups -2 * SIZE(X), %xmm11 | |||||
| decq %rax | |||||
| jle .L12 /* exactly one group: only the drain code at .L12 is needed */ | |||||
| ALIGN_3 | |||||
| .L11: /* steady state: accumulate the current group while loading the next x group */ | |||||
| prefetchnta A_PRE(Y) | |||||
| vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0 /* xmm0 = xmm4 * y + xmm0 */ | |||||
| vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1 | |||||
| prefetchnta A_PRE(X) | |||||
| vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2 | |||||
| vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3 | |||||
| vmovups 0 * SIZE(X), %xmm4 | |||||
| vfmaddpd %xmm0 , -8 * SIZE(Y), %xmm8 , %xmm0 | |||||
| vfmaddpd %xmm1 , -6 * SIZE(Y), %xmm9 , %xmm1 | |||||
| vmovups 2 * SIZE(X), %xmm5 | |||||
| vmovups 4 * SIZE(X), %xmm6 | |||||
| vfmaddpd %xmm2 , -4 * SIZE(Y), %xmm10, %xmm2 | |||||
| vfmaddpd %xmm3 , -2 * SIZE(Y), %xmm11, %xmm3 | |||||
| vmovups 6 * SIZE(X), %xmm7 | |||||
| prefetchnta A_PRE+64(Y) | |||||
| vmovups 8 * SIZE(X), %xmm8 | |||||
| vmovups 10 * SIZE(X), %xmm9 | |||||
| prefetchnta A_PRE+64(X) | |||||
| vmovups 12 * SIZE(X), %xmm10 | |||||
| vmovups 14 * SIZE(X), %xmm11 | |||||
| subq $-16 * SIZE, X /* advance both pointers by 16 elements */ | |||||
| subq $-16 * SIZE, Y | |||||
| decq %rax | |||||
| jg .L11 | |||||
| ALIGN_3 | |||||
| .L12: /* drain: accumulate the last preloaded group */ | |||||
| vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0 | |||||
| vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1 | |||||
| vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2 | |||||
| vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3 | |||||
| vfmaddpd %xmm0 , -8 * SIZE(Y), %xmm8 , %xmm0 | |||||
| vfmaddpd %xmm1 , -6 * SIZE(Y), %xmm9 , %xmm1 | |||||
| vfmaddpd %xmm2 , -4 * SIZE(Y), %xmm10, %xmm2 | |||||
| vfmaddpd %xmm3 , -2 * SIZE(Y), %xmm11, %xmm3 | |||||
| subq $-16 * SIZE, X | |||||
| subq $-16 * SIZE, Y | |||||
| ALIGN_3 | |||||
| .L14: /* tails: accumulate the remaining N mod 16 elements as 8 + 4 + 2 + 1 */ | |||||
| testq $15, N | |||||
| jle .L999 /* low four bits of N are all clear: nothing left */ | |||||
| testq $8, N | |||||
| jle .L15 /* bit 3 of N clear: no 8-element tail */ | |||||
| vmovups -16 * SIZE(X), %xmm4 | |||||
| vmovups -14 * SIZE(X), %xmm5 | |||||
| vmovups -12 * SIZE(X), %xmm6 | |||||
| vmovups -10 * SIZE(X), %xmm7 | |||||
| vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0 | |||||
| vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1 | |||||
| vfmaddpd %xmm2 , -12 * SIZE(Y), %xmm6 , %xmm2 | |||||
| vfmaddpd %xmm3 , -10 * SIZE(Y), %xmm7 , %xmm3 | |||||
| addq $8 * SIZE, X | |||||
| addq $8 * SIZE, Y | |||||
| ALIGN_3 | |||||
| .L15: | |||||
| testq $4, N | |||||
| jle .L16 /* bit 2 of N clear: no 4-element tail */ | |||||
| vmovups -16 * SIZE(X), %xmm4 | |||||
| vmovups -14 * SIZE(X), %xmm5 | |||||
| vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0 | |||||
| vfmaddpd %xmm1 , -14 * SIZE(Y), %xmm5 , %xmm1 | |||||
| addq $4 * SIZE, X | |||||
| addq $4 * SIZE, Y | |||||
| ALIGN_3 | |||||
| .L16: | |||||
| testq $2, N | |||||
| jle .L17 /* bit 1 of N clear: no 2-element tail */ | |||||
| vmovups -16 * SIZE(X), %xmm4 | |||||
| vfmaddpd %xmm0 , -16 * SIZE(Y), %xmm4 , %xmm0 | |||||
| addq $2 * SIZE, X | |||||
| addq $2 * SIZE, Y | |||||
| ALIGN_3 | |||||
| .L17: | |||||
| testq $1, N | |||||
| jle .L999 /* bit 0 of N clear: done */ | |||||
| vmovsd -16 * SIZE(X), %xmm4 /* vmovsd from memory zeroes the high lane */ | |||||
| vmovsd -16 * SIZE(Y), %xmm5 | |||||
| vfmaddpd %xmm0, %xmm4 , %xmm5 , %xmm0 /* packed form is used; the high lane only adds 0 * 0 */ | |||||
| jmp .L999 | |||||
| ALIGN_3 | |||||
| .L50: /* strided path: at least one scaled stride differs from SIZE */ | |||||
| movq N, %rax | |||||
| sarq $3, %rax /* rax = N / 8 = number of unrolled iterations */ | |||||
| jle .L55 | |||||
| ALIGN_3 | |||||
| .L53: /* 8 elements per iteration, as two rounds of four scalar x/y pairs */ | |||||
| vmovsd 0 * SIZE(X), %xmm4 /* scalar loads leave the high lanes zero */ | |||||
| addq INCX, X | |||||
| vmovsd 0 * SIZE(Y), %xmm8 | |||||
| addq INCY, Y | |||||
| vmovsd 0 * SIZE(X), %xmm5 | |||||
| addq INCX, X | |||||
| vmovsd 0 * SIZE(Y), %xmm9 | |||||
| addq INCY, Y | |||||
| vmovsd 0 * SIZE(X), %xmm6 | |||||
| addq INCX, X | |||||
| vmovsd 0 * SIZE(Y), %xmm10 | |||||
| addq INCY, Y | |||||
| vmovsd 0 * SIZE(X), %xmm7 | |||||
| addq INCX, X | |||||
| vmovsd 0 * SIZE(Y), %xmm11 | |||||
| addq INCY, Y | |||||
| vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0 /* xmm0 = xmm8 * xmm4 + xmm0; one pair per accumulator */ | |||||
| vfmaddpd %xmm1 , %xmm5 , %xmm9 , %xmm1 | |||||
| vfmaddpd %xmm2 , %xmm6 , %xmm10, %xmm2 | |||||
| vfmaddpd %xmm3 , %xmm7 , %xmm11, %xmm3 | |||||
| vmovsd 0 * SIZE(X), %xmm4 /* second round of four pairs */ | |||||
| addq INCX, X | |||||
| vmovsd 0 * SIZE(Y), %xmm8 | |||||
| addq INCY, Y | |||||
| vmovsd 0 * SIZE(X), %xmm5 | |||||
| addq INCX, X | |||||
| vmovsd 0 * SIZE(Y), %xmm9 | |||||
| addq INCY, Y | |||||
| vmovsd 0 * SIZE(X), %xmm6 | |||||
| addq INCX, X | |||||
| vmovsd 0 * SIZE(Y), %xmm10 | |||||
| addq INCY, Y | |||||
| vmovsd 0 * SIZE(X), %xmm7 | |||||
| addq INCX, X | |||||
| vmovsd 0 * SIZE(Y), %xmm11 | |||||
| addq INCY, Y | |||||
| vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0 | |||||
| vfmaddpd %xmm1 , %xmm5 , %xmm9 , %xmm1 | |||||
| vfmaddpd %xmm2 , %xmm6 , %xmm10, %xmm2 | |||||
| vfmaddpd %xmm3 , %xmm7 , %xmm11, %xmm3 | |||||
| decq %rax | |||||
| jg .L53 | |||||
| ALIGN_3 | |||||
| .L55: | |||||
| movq N, %rax | |||||
| andq $7, %rax /* rax = N mod 8 leftover elements */ | |||||
| jle .L999 | |||||
| ALIGN_3 | |||||
| .L56: /* scalar loop: one x/y pair per iteration, accumulated into xmm0 */ | |||||
| vmovsd 0 * SIZE(X), %xmm4 | |||||
| addq INCX, X | |||||
| vmovsd 0 * SIZE(Y), %xmm8 | |||||
| addq INCY, Y | |||||
| vfmaddpd %xmm0 , %xmm4 , %xmm8 , %xmm0 | |||||
| decq %rax | |||||
| jg .L56 | |||||
| ALIGN_3 | |||||
| .L999: /* common exit: fold the four accumulators into one value */ | |||||
| vaddpd %xmm1, %xmm0 , %xmm0 /* xmm0 += xmm1 */ | |||||
| vaddpd %xmm3, %xmm2 , %xmm2 /* xmm2 += xmm3 */ | |||||
| vaddpd %xmm2, %xmm0 , %xmm0 /* xmm0 = sum of all four accumulators, per lane */ | |||||
| vhaddpd %xmm0, %xmm0 , %xmm0 /* add the two lanes; the total ends up in both lanes of xmm0 */ | |||||
| RESTOREREGISTERS | |||||
| ret /* the result is left in xmm0 */ | |||||
| EPILOGUE | |||||