@@ -2,6 +2,10 @@ ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S
DGEMVNKERNEL = dgemv_n_bulldozer.S
DGEMVTKERNEL = dgemv_t_bulldozer.S
DAXPYKERNEL = daxpy_bulldozer.S
DDOTKERNEL = ddot_bulldozer.S
DCOPYKERNEL = dcopy_bulldozer.S
SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
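
For anyone reviewing the three new level-1 kernels wired up above, here is an illustrative C sketch of the semantics each assembly file implements (the function names are ours, not part of the OpenBLAS API, and the sketch assumes non-negative increments); the assembly additionally special-cases incx == incy == 1 with unrolled, prefetched SIMD loops:

/* Reference semantics only; not how the kernels are actually coded. */
#include <stddef.h>

/* daxpy_bulldozer.S: y[i] := alpha * x[i] + y[i] */
static void daxpy_ref(size_t n, double alpha, const double *x, long incx,
                      double *y, long incy) {
    for (size_t i = 0; i < n; i++, x += incx, y += incy)
        *y += alpha * *x;
}

/* dcopy_bulldozer.S: y[i] := x[i] */
static void dcopy_ref(size_t n, const double *x, long incx,
                      double *y, long incy) {
    for (size_t i = 0; i < n; i++, x += incx, y += incy)
        *y = *x;
}

/* ddot_bulldozer.S: returns the sum over i of x[i] * y[i] */
static double ddot_ref(size_t n, const double *x, long incx,
                       const double *y, long incy) {
    double s = 0.0;
    for (size_t i = 0; i < n; i++, x += incx, y += incy)
        s += *x * *y;
    return s;
}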
@@ -0,0 +1,408 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
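
/* daxpy_bulldozer.S (see the Makefile hunk above): computes
   y := alpha * x + y for doubles on AMD Bulldozer, using the FMA4
   instruction vfmaddpd. The unit-stride path is unrolled to 16
   elements per iteration; .L40 handles arbitrary (or zero)
   increments. */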
#define ASSEMBLER
#include "common.h"

#ifndef WINDOWS_ABI
#define M	ARG1
#define X	ARG4
#define INCX	ARG5
#define Y	ARG6
#define INCY	ARG2
#else
#define M	ARG1
#define X	ARG2
#define INCX	ARG3
#define Y	ARG4
#define INCY	%r10
#endif

#define YY	%r11
#define ALPHA	%xmm15

#define A_PRE	640

#include "l1param.h"

	PROLOGUE
	PROFCODE

#ifndef WINDOWS_ABI
#ifndef XDOUBLE
	movq	8(%rsp), INCY
#else
	movq	24(%rsp), INCY
#endif
	vmovups	%xmm0, ALPHA
#else
	vmovups	%xmm3, ALPHA

	movq	40(%rsp), X
	movq	48(%rsp), INCX
	movq	56(%rsp), Y
	movq	64(%rsp), INCY
#endif

	SAVEREGISTERS

	unpcklpd ALPHA, ALPHA

	leaq	(, INCX, SIZE), INCX
	leaq	(, INCY, SIZE), INCY

	testq	M, M
	jle	.L47

	cmpq	$SIZE, INCX
	jne	.L40
	cmpq	$SIZE, INCY
	jne	.L40

	testq	$SIZE, Y
	je	.L10

	movsd	(X), %xmm0
	mulsd	ALPHA, %xmm0
	addsd	(Y), %xmm0
	movsd	%xmm0, (Y)

	addq	$1 * SIZE, X
	addq	$1 * SIZE, Y
	decq	M
	jle	.L19
	ALIGN_4
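/* Y is now 16-byte aligned; main unit-stride loop, 16 doubles per pass */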
.L10:
	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y

	movq	M, %rax
	sarq	$4, %rax
	jle	.L13

	vmovups	-16 * SIZE(X), %xmm0
	vmovups	-14 * SIZE(X), %xmm1
	vmovups	-12 * SIZE(X), %xmm2
	vmovups	-10 * SIZE(X), %xmm3

	decq	%rax
	jle	.L12
	ALIGN_3

.L11:
	prefetchnta	A_PRE(Y)
	vmovups	-8 * SIZE(X), %xmm4
	vfmaddpd	-16 * SIZE(Y), ALPHA, %xmm0, %xmm0
	vfmaddpd	-14 * SIZE(Y), ALPHA, %xmm1, %xmm1
	vmovups	-6 * SIZE(X), %xmm5
	vmovups	-4 * SIZE(X), %xmm6
	vfmaddpd	-12 * SIZE(Y), ALPHA, %xmm2, %xmm2
	vfmaddpd	-10 * SIZE(Y), ALPHA, %xmm3, %xmm3
	vmovups	-2 * SIZE(X), %xmm7
	vmovups	%xmm0, -16 * SIZE(Y)
	vmovups	%xmm1, -14 * SIZE(Y)
	prefetchnta	A_PRE(X)
	nop
	vmovups	%xmm2, -12 * SIZE(Y)
	vmovups	%xmm3, -10 * SIZE(Y)
	prefetchnta	A_PRE+64(Y)
	vmovups	0 * SIZE(X), %xmm0
	vfmaddpd	-8 * SIZE(Y), ALPHA, %xmm4, %xmm4
	vfmaddpd	-6 * SIZE(Y), ALPHA, %xmm5, %xmm5
	vmovups	2 * SIZE(X), %xmm1
	vmovups	4 * SIZE(X), %xmm2
	vfmaddpd	-4 * SIZE(Y), ALPHA, %xmm6, %xmm6
	vfmaddpd	-2 * SIZE(Y), ALPHA, %xmm7, %xmm7
	vmovups	6 * SIZE(X), %xmm3
	vmovups	%xmm4, -8 * SIZE(Y)
	vmovups	%xmm5, -6 * SIZE(Y)
	prefetchnta	A_PRE+64(X)
	nop
	vmovups	%xmm6, -4 * SIZE(Y)
	vmovups	%xmm7, -2 * SIZE(Y)

	subq	$-16 * SIZE, Y
	subq	$-16 * SIZE, X
	decq	%rax
	jg	.L11
	ALIGN_3

.L12:
	vmovups	-8 * SIZE(X), %xmm4
	vfmaddpd	-16 * SIZE(Y), ALPHA, %xmm0, %xmm0
	vfmaddpd	-14 * SIZE(Y), ALPHA, %xmm1, %xmm1
	vmovups	-6 * SIZE(X), %xmm5
	vmovups	-4 * SIZE(X), %xmm6
	vfmaddpd	-12 * SIZE(Y), ALPHA, %xmm2, %xmm2
	vfmaddpd	-10 * SIZE(Y), ALPHA, %xmm3, %xmm3
	vmovups	-2 * SIZE(X), %xmm7
	vmovups	%xmm0, -16 * SIZE(Y)
	vmovups	%xmm1, -14 * SIZE(Y)
	vmovups	%xmm2, -12 * SIZE(Y)
	vmovups	%xmm3, -10 * SIZE(Y)
	vfmaddpd	-8 * SIZE(Y), ALPHA, %xmm4, %xmm4
	vfmaddpd	-6 * SIZE(Y), ALPHA, %xmm5, %xmm5
	vfmaddpd	-4 * SIZE(Y), ALPHA, %xmm6, %xmm6
	vfmaddpd	-2 * SIZE(Y), ALPHA, %xmm7, %xmm7
	vmovups	%xmm4, -8 * SIZE(Y)
	vmovups	%xmm5, -6 * SIZE(Y)
	vmovups	%xmm6, -4 * SIZE(Y)
	vmovups	%xmm7, -2 * SIZE(Y)

	subq	$-16 * SIZE, Y
	subq	$-16 * SIZE, X
	ALIGN_3

.L13:
	movq	M, %rax
	andq	$8, %rax
	jle	.L14
	ALIGN_3

	vmovups	-16 * SIZE(X), %xmm0
	vmovups	-14 * SIZE(X), %xmm1
	vmovups	-12 * SIZE(X), %xmm2
	vmovups	-10 * SIZE(X), %xmm3

	vfmaddpd	-16 * SIZE(Y), ALPHA, %xmm0, %xmm0
	vfmaddpd	-14 * SIZE(Y), ALPHA, %xmm1, %xmm1
	vfmaddpd	-12 * SIZE(Y), ALPHA, %xmm2, %xmm2
	vfmaddpd	-10 * SIZE(Y), ALPHA, %xmm3, %xmm3

	vmovups	%xmm0, -16 * SIZE(Y)
	vmovups	%xmm1, -14 * SIZE(Y)
	vmovups	%xmm2, -12 * SIZE(Y)
	vmovups	%xmm3, -10 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L14:
	movq	M, %rax
	andq	$4, %rax
	jle	.L15
	ALIGN_3

	vmovups	-16 * SIZE(X), %xmm0
	vmovups	-14 * SIZE(X), %xmm1

	vfmaddpd	-16 * SIZE(Y), ALPHA, %xmm0, %xmm0
	vfmaddpd	-14 * SIZE(Y), ALPHA, %xmm1, %xmm1

	vmovups	%xmm0, -16 * SIZE(Y)
	vmovups	%xmm1, -14 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L15:
	movq	M, %rax
	andq	$2, %rax
	jle	.L16
	ALIGN_3

	vmovups	-16 * SIZE(X), %xmm0
	vfmaddpd	-16 * SIZE(Y), ALPHA, %xmm0, %xmm0
	vmovups	%xmm0, -16 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L16:
	movq	M, %rax
	andq	$1, %rax
	jle	.L19
	ALIGN_3

	vmovsd	-16 * SIZE(X), %xmm0
	vfmaddsd	-16 * SIZE(Y), ALPHA, %xmm0, %xmm0
	vmovsd	%xmm0, -16 * SIZE(Y)
	ALIGN_3

.L19:
	xorq	%rax, %rax

	RESTOREREGISTERS
	ret
	ALIGN_3
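/* strided path: pack two elements at a time with vmovsd/vmovhpd,
   apply the FMA, then store them back element by element */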
.L40:
	movq	Y, YY
	movq	M, %rax
	// if incx == 0 or incy == 0, avoid the unrolled loop and use the scalar loop at .L46
	cmpq	$0, INCX
	je	.L46
	cmpq	$0, INCY
	je	.L46

	sarq	$3, %rax
	jle	.L45

	prefetchnta	512(X)
	prefetchnta	512+64(X)
	prefetchnta	512+128(X)
	prefetchnta	512+192(X)
	prefetchnta	512(Y)
	prefetchnta	512+64(Y)
	prefetchnta	512+128(Y)
	prefetchnta	512+192(Y)
	ALIGN_3

.L41:
	vmovsd	0 * SIZE(X), %xmm0
	addq	INCX, X
	vmovhpd	0 * SIZE(X), %xmm0, %xmm0
	addq	INCX, X
	vmovsd	0 * SIZE(YY), %xmm6
	addq	INCY, YY
	vmovhpd	0 * SIZE(YY), %xmm6, %xmm6
	addq	INCY, YY

	vmovsd	0 * SIZE(X), %xmm1
	addq	INCX, X
	vmovhpd	0 * SIZE(X), %xmm1, %xmm1
	addq	INCX, X
	vmovsd	0 * SIZE(YY), %xmm7
	addq	INCY, YY
	vmovhpd	0 * SIZE(YY), %xmm7, %xmm7
	addq	INCY, YY

	vfmaddpd	%xmm6, ALPHA, %xmm0, %xmm0

	vmovsd	0 * SIZE(X), %xmm2
	addq	INCX, X
	vmovhpd	0 * SIZE(X), %xmm2, %xmm2
	addq	INCX, X
	vmovsd	0 * SIZE(YY), %xmm8
	addq	INCY, YY
	vmovhpd	0 * SIZE(YY), %xmm8, %xmm8
	addq	INCY, YY

	vfmaddpd	%xmm7, ALPHA, %xmm1, %xmm1

	vmovsd	0 * SIZE(X), %xmm3
	addq	INCX, X
	vmovhpd	0 * SIZE(X), %xmm3, %xmm3
	addq	INCX, X

	vfmaddpd	%xmm8, ALPHA, %xmm2, %xmm2

	vmovsd	0 * SIZE(YY), %xmm9
	addq	INCY, YY
	vmovhpd	0 * SIZE(YY), %xmm9, %xmm9
	addq	INCY, YY

	vmovsd	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	vmovhpd	%xmm0, 0 * SIZE(Y)
	addq	INCY, Y
	vmovsd	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y
	vmovhpd	%xmm1, 0 * SIZE(Y)
	addq	INCY, Y
	vmovsd	%xmm2, 0 * SIZE(Y)
	addq	INCY, Y
	vmovhpd	%xmm2, 0 * SIZE(Y)
	addq	INCY, Y

	vfmaddpd	%xmm9, ALPHA, %xmm3, %xmm3

	vmovsd	%xmm3, 0 * SIZE(Y)
	addq	INCY, Y
	vmovhpd	%xmm3, 0 * SIZE(Y)
	addq	INCY, Y

	decq	%rax
	jg	.L41
	ALIGN_3

.L45:
	movq	M, %rax
	andq	$7, %rax
	jle	.L47
	ALIGN_3

.L46:
	vmovsd	(X), %xmm0
	addq	INCX, X
	vfmaddsd	(Y), ALPHA, %xmm0, %xmm0
	vmovsd	%xmm0, (Y)
	addq	INCY, Y
	decq	%rax
	jg	.L46
	ALIGN_3

.L47:
	xorq	%rax, %rax

	RESTOREREGISTERS
	ret

	EPILOGUE
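
One note for reviewers on the FMA4 operand order used throughout these kernels: in AT&T syntax the operands of vfmaddpd appear reversed relative to the Intel form VFMADDPD dest, src1, src2, src3 (dest = src1 * src2 + src3). A minimal sketch with hypothetical registers, not a line from the kernels:

	vfmaddpd %xmm3, %xmm2, %xmm1, %xmm0	/* xmm0 = xmm1 * xmm2 + xmm3 */

So "vfmaddpd -16 * SIZE(Y), ALPHA, %xmm0, %xmm0" above computes xmm0 = xmm0 * alpha + y, one packed daxpy step per instruction.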
@@ -0,0 +1,291 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
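
/* dcopy_bulldozer.S (see the Makefile hunk above): copies y := x for
   doubles. The unit-stride path streams 16 doubles per iteration with
   prefetchnta; .L40 is the scalar fallback for arbitrary strides. */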
#define ASSEMBLER
#include "common.h"

#define M	ARG1	/* rdi */
#define X	ARG2	/* rsi */
#define INCX	ARG3	/* rdx */
#define Y	ARG4	/* rcx */
#ifndef WINDOWS_ABI
#define INCY	ARG5	/* r8  */
#else
#define INCY	%r10
#endif

#include "l1param.h"

#define VLOAD(OFFSET, ADDR, REG)	vmovups OFFSET(ADDR), REG
#define VSHUFPD_1(REG1, REG2)		vshufpd $0x01, REG1, REG2, REG2

#define A_PRE	640
#define B_PRE	640

	PROLOGUE
	PROFCODE

#ifdef WINDOWS_ABI
	movq	40(%rsp), INCY
#endif

	SAVEREGISTERS

	leaq	(, INCX, SIZE), INCX
	leaq	(, INCY, SIZE), INCY

	cmpq	$SIZE, INCX
	jne	.L40
	cmpq	$SIZE, INCY
	jne	.L40

	testq	$SIZE, X
	je	.L10

	vmovsd	(X), %xmm0
	vmovsd	%xmm0, (Y)

	addq	$1 * SIZE, X
	addq	$1 * SIZE, Y
	decq	M
	jle	.L19
	ALIGN_4
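/* unit-stride copy: 16 doubles per pass, double-buffered through
   xmm0..xmm7 (loads for the next pass overlap this pass's stores) */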
.L10:
	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y

	movq	M, %rax
	sarq	$4, %rax
	jle	.L13

	vmovups	-16 * SIZE(X), %xmm0
	vmovups	-14 * SIZE(X), %xmm1
	vmovups	-12 * SIZE(X), %xmm2
	vmovups	-10 * SIZE(X), %xmm3
	vmovups	-8 * SIZE(X), %xmm4
	vmovups	-6 * SIZE(X), %xmm5
	vmovups	-4 * SIZE(X), %xmm6
	vmovups	-2 * SIZE(X), %xmm7

	decq	%rax
	jle	.L12
	ALIGN_4

.L11:
	prefetchnta	A_PRE(X)
	nop
	vmovups	%xmm0, -16 * SIZE(Y)
	vmovups	%xmm1, -14 * SIZE(Y)
	prefetchnta	B_PRE(Y)
	nop
	vmovups	%xmm2, -12 * SIZE(Y)
	vmovups	%xmm3, -10 * SIZE(Y)

	VLOAD( 0 * SIZE, X, %xmm0)
	VLOAD( 2 * SIZE, X, %xmm1)
	VLOAD( 4 * SIZE, X, %xmm2)
	VLOAD( 6 * SIZE, X, %xmm3)

	prefetchnta	A_PRE+64(X)
	nop
	vmovups	%xmm4, -8 * SIZE(Y)
	vmovups	%xmm5, -6 * SIZE(Y)
	prefetchnta	B_PRE+64(Y)
	nop
	vmovups	%xmm6, -4 * SIZE(Y)
	vmovups	%xmm7, -2 * SIZE(Y)

	VLOAD( 8 * SIZE, X, %xmm4)
	VLOAD(10 * SIZE, X, %xmm5)
	subq	$-16 * SIZE, Y
	VLOAD(12 * SIZE, X, %xmm6)
	VLOAD(14 * SIZE, X, %xmm7)
	subq	$-16 * SIZE, X

	decq	%rax
	jg	.L11
	ALIGN_3

.L12:
	vmovups	%xmm0, -16 * SIZE(Y)
	vmovups	%xmm1, -14 * SIZE(Y)
	vmovups	%xmm2, -12 * SIZE(Y)
	vmovups	%xmm3, -10 * SIZE(Y)
	vmovups	%xmm4, -8 * SIZE(Y)
	vmovups	%xmm5, -6 * SIZE(Y)
	vmovups	%xmm6, -4 * SIZE(Y)
	vmovups	%xmm7, -2 * SIZE(Y)

	subq	$-16 * SIZE, Y
	subq	$-16 * SIZE, X
	ALIGN_3

.L13:
	testq	$8, M
	jle	.L14
	ALIGN_3

	vmovups	-16 * SIZE(X), %xmm0
	vmovups	-14 * SIZE(X), %xmm1
	vmovups	-12 * SIZE(X), %xmm2
	vmovups	-10 * SIZE(X), %xmm3

	vmovups	%xmm0, -16 * SIZE(Y)
	vmovups	%xmm1, -14 * SIZE(Y)
	vmovups	%xmm2, -12 * SIZE(Y)
	vmovups	%xmm3, -10 * SIZE(Y)

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L14:
	testq	$4, M
	jle	.L15
	ALIGN_3

	vmovups	-16 * SIZE(X), %xmm0
	vmovups	-14 * SIZE(X), %xmm1
	vmovups	%xmm0, -16 * SIZE(Y)
	vmovups	%xmm1, -14 * SIZE(Y)

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L15:
	testq	$2, M
	jle	.L16
	ALIGN_3

	vmovups	-16 * SIZE(X), %xmm0
	vmovups	%xmm0, -16 * SIZE(Y)

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L16:
	testq	$1, M
	jle	.L19
	ALIGN_3

	vmovsd	-16 * SIZE(X), %xmm0
	vmovsd	%xmm0, -16 * SIZE(Y)
	ALIGN_3

.L19:
	xorq	%rax, %rax

	RESTOREREGISTERS
	ret
	ALIGN_3
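/* strided copy fallback: one double per vmovsd, eight per loop pass */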
.L40:
	movq	M, %rax
	sarq	$3, %rax
	jle	.L45
	ALIGN_3

.L41:
	vmovsd	(X), %xmm0
	addq	INCX, X
	vmovsd	(X), %xmm4
	addq	INCX, X
	vmovsd	(X), %xmm1
	addq	INCX, X
	vmovsd	(X), %xmm5
	addq	INCX, X
	vmovsd	(X), %xmm2
	addq	INCX, X
	vmovsd	(X), %xmm6
	addq	INCX, X
	vmovsd	(X), %xmm3
	addq	INCX, X
	vmovsd	(X), %xmm7
	addq	INCX, X

	vmovsd	%xmm0, (Y)
	addq	INCY, Y
	vmovsd	%xmm4, (Y)
	addq	INCY, Y
	vmovsd	%xmm1, (Y)
	addq	INCY, Y
	vmovsd	%xmm5, (Y)
	addq	INCY, Y
	vmovsd	%xmm2, (Y)
	addq	INCY, Y
	vmovsd	%xmm6, (Y)
	addq	INCY, Y
	vmovsd	%xmm3, (Y)
	addq	INCY, Y
	vmovsd	%xmm7, (Y)
	addq	INCY, Y

	decq	%rax
	jg	.L41
	ALIGN_3

.L45:
	movq	M, %rax
	andq	$7, %rax
	jle	.L47
	ALIGN_3

.L46:
	vmovsd	(X), %xmm0
	addq	INCX, X
	vmovsd	%xmm0, (Y)
	addq	INCY, Y
	decq	%rax
	jg	.L46
	ALIGN_3

.L47:
	xorq	%rax, %rax

	RESTOREREGISTERS
	ret

	EPILOGUE
@@ -0,0 +1,311 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
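
/* ddot_bulldozer.S (see the Makefile hunk above): returns
   dot(x, y) = sum of x[i] * y[i] for doubles. Four packed FMA4
   accumulators (xmm0..xmm3) run independently to hide FMA latency
   and are reduced to a scalar at .L999. */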
#define ASSEMBLER
#include "common.h"

#define N	ARG1	/* rdi */
#define X	ARG2	/* rsi */
#define INCX	ARG3	/* rdx */
#define Y	ARG4	/* rcx */
#ifndef WINDOWS_ABI
#define INCY	ARG5	/* r8  */
#else
#define INCY	%r10
#endif

#define A_PRE	512

#include "l1param.h"

	PROLOGUE
	PROFCODE

#ifdef WINDOWS_ABI
	movq	40(%rsp), INCY
#endif

	SAVEREGISTERS

	leaq	(, INCX, SIZE), INCX
	leaq	(, INCY, SIZE), INCY

	vxorps	%xmm0, %xmm0, %xmm0
	vxorps	%xmm1, %xmm1, %xmm1
	vxorps	%xmm2, %xmm2, %xmm2
	vxorps	%xmm3, %xmm3, %xmm3

	cmpq	$0, N
	jle	.L999

	cmpq	$SIZE, INCX
	jne	.L50
	cmpq	$SIZE, INCY
	jne	.L50

	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y

	testq	$SIZE, Y
	je	.L10

	vmovsd	-16 * SIZE(X), %xmm0
	vmulsd	-16 * SIZE(Y), %xmm0, %xmm0

	addq	$1 * SIZE, X
	addq	$1 * SIZE, Y
	decq	N
	ALIGN_2

.L10:
	movq	N, %rax
	sarq	$4, %rax
	jle	.L14

	vmovups	-16 * SIZE(X), %xmm4
	vmovups	-14 * SIZE(X), %xmm5
	vmovups	-12 * SIZE(X), %xmm6
	vmovups	-10 * SIZE(X), %xmm7
	vmovups	-8 * SIZE(X), %xmm8
	vmovups	-6 * SIZE(X), %xmm9
	vmovups	-4 * SIZE(X), %xmm10
	vmovups	-2 * SIZE(X), %xmm11

	decq	%rax
	jle	.L12
	ALIGN_3

.L11:
	prefetchnta	A_PRE(Y)
	vfmaddpd	%xmm0, -16 * SIZE(Y), %xmm4, %xmm0
	vfmaddpd	%xmm1, -14 * SIZE(Y), %xmm5, %xmm1
	prefetchnta	A_PRE(X)
	vfmaddpd	%xmm2, -12 * SIZE(Y), %xmm6, %xmm2
	vfmaddpd	%xmm3, -10 * SIZE(Y), %xmm7, %xmm3
	vmovups	0 * SIZE(X), %xmm4
	vfmaddpd	%xmm0, -8 * SIZE(Y), %xmm8, %xmm0
	vfmaddpd	%xmm1, -6 * SIZE(Y), %xmm9, %xmm1
	vmovups	2 * SIZE(X), %xmm5
	vmovups	4 * SIZE(X), %xmm6
	vfmaddpd	%xmm2, -4 * SIZE(Y), %xmm10, %xmm2
	vfmaddpd	%xmm3, -2 * SIZE(Y), %xmm11, %xmm3
	vmovups	6 * SIZE(X), %xmm7
	prefetchnta	A_PRE+64(Y)
	vmovups	8 * SIZE(X), %xmm8
	vmovups	10 * SIZE(X), %xmm9
	prefetchnta	A_PRE+64(X)
	vmovups	12 * SIZE(X), %xmm10
	vmovups	14 * SIZE(X), %xmm11

	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y

	decq	%rax
	jg	.L11
	ALIGN_3

.L12:
	vfmaddpd	%xmm0, -16 * SIZE(Y), %xmm4, %xmm0
	vfmaddpd	%xmm1, -14 * SIZE(Y), %xmm5, %xmm1
	vfmaddpd	%xmm2, -12 * SIZE(Y), %xmm6, %xmm2
	vfmaddpd	%xmm3, -10 * SIZE(Y), %xmm7, %xmm3
	vfmaddpd	%xmm0, -8 * SIZE(Y), %xmm8, %xmm0
	vfmaddpd	%xmm1, -6 * SIZE(Y), %xmm9, %xmm1
	vfmaddpd	%xmm2, -4 * SIZE(Y), %xmm10, %xmm2
	vfmaddpd	%xmm3, -2 * SIZE(Y), %xmm11, %xmm3

	subq	$-16 * SIZE, X
	subq	$-16 * SIZE, Y
	ALIGN_3

.L14:
	testq	$15, N
	jle	.L999

	testq	$8, N
	jle	.L15

	vmovups	-16 * SIZE(X), %xmm4
	vmovups	-14 * SIZE(X), %xmm5
	vmovups	-12 * SIZE(X), %xmm6
	vmovups	-10 * SIZE(X), %xmm7

	vfmaddpd	%xmm0, -16 * SIZE(Y), %xmm4, %xmm0
	vfmaddpd	%xmm1, -14 * SIZE(Y), %xmm5, %xmm1
	vfmaddpd	%xmm2, -12 * SIZE(Y), %xmm6, %xmm2
	vfmaddpd	%xmm3, -10 * SIZE(Y), %xmm7, %xmm3

	addq	$8 * SIZE, X
	addq	$8 * SIZE, Y
	ALIGN_3

.L15:
	testq	$4, N
	jle	.L16

	vmovups	-16 * SIZE(X), %xmm4
	vmovups	-14 * SIZE(X), %xmm5

	vfmaddpd	%xmm0, -16 * SIZE(Y), %xmm4, %xmm0
	vfmaddpd	%xmm1, -14 * SIZE(Y), %xmm5, %xmm1

	addq	$4 * SIZE, X
	addq	$4 * SIZE, Y
	ALIGN_3

.L16:
	testq	$2, N
	jle	.L17

	vmovups	-16 * SIZE(X), %xmm4
	vfmaddpd	%xmm0, -16 * SIZE(Y), %xmm4, %xmm0

	addq	$2 * SIZE, X
	addq	$2 * SIZE, Y
	ALIGN_3

.L17:
	testq	$1, N
	jle	.L999

	vmovsd	-16 * SIZE(X), %xmm4
	vmovsd	-16 * SIZE(Y), %xmm5
	vfmaddpd	%xmm0, %xmm4, %xmm5, %xmm0
	jmp	.L999
	ALIGN_3
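/* strided path: scalar loads feed packed FMAs; the upper lanes stay
   zero, so only the low lane of each accumulator is meaningful here */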
.L50:
	movq	N, %rax
	sarq	$3, %rax
	jle	.L55
	ALIGN_3

.L53:
	vmovsd	0 * SIZE(X), %xmm4
	addq	INCX, X
	vmovsd	0 * SIZE(Y), %xmm8
	addq	INCY, Y
	vmovsd	0 * SIZE(X), %xmm5
	addq	INCX, X
	vmovsd	0 * SIZE(Y), %xmm9
	addq	INCY, Y
	vmovsd	0 * SIZE(X), %xmm6
	addq	INCX, X
	vmovsd	0 * SIZE(Y), %xmm10
	addq	INCY, Y
	vmovsd	0 * SIZE(X), %xmm7
	addq	INCX, X
	vmovsd	0 * SIZE(Y), %xmm11
	addq	INCY, Y

	vfmaddpd	%xmm0, %xmm4, %xmm8, %xmm0
	vfmaddpd	%xmm1, %xmm5, %xmm9, %xmm1
	vfmaddpd	%xmm2, %xmm6, %xmm10, %xmm2
	vfmaddpd	%xmm3, %xmm7, %xmm11, %xmm3

	vmovsd	0 * SIZE(X), %xmm4
	addq	INCX, X
	vmovsd	0 * SIZE(Y), %xmm8
	addq	INCY, Y
	vmovsd	0 * SIZE(X), %xmm5
	addq	INCX, X
	vmovsd	0 * SIZE(Y), %xmm9
	addq	INCY, Y
	vmovsd	0 * SIZE(X), %xmm6
	addq	INCX, X
	vmovsd	0 * SIZE(Y), %xmm10
	addq	INCY, Y
	vmovsd	0 * SIZE(X), %xmm7
	addq	INCX, X
	vmovsd	0 * SIZE(Y), %xmm11
	addq	INCY, Y

	vfmaddpd	%xmm0, %xmm4, %xmm8, %xmm0
	vfmaddpd	%xmm1, %xmm5, %xmm9, %xmm1
	vfmaddpd	%xmm2, %xmm6, %xmm10, %xmm2
	vfmaddpd	%xmm3, %xmm7, %xmm11, %xmm3

	decq	%rax
	jg	.L53
	ALIGN_3

.L55:
	movq	N, %rax
	andq	$7, %rax
	jle	.L999
	ALIGN_3

.L56:
	vmovsd	0 * SIZE(X), %xmm4
	addq	INCX, X
	vmovsd	0 * SIZE(Y), %xmm8
	addq	INCY, Y
	vfmaddpd	%xmm0, %xmm4, %xmm8, %xmm0
	decq	%rax
	jg	.L56
	ALIGN_3
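/* reduce: fold the four packed partial sums into xmm0, then add the
   two halves of xmm0 to produce the scalar result */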
.L999:
	vaddpd	%xmm1, %xmm0, %xmm0
	vaddpd	%xmm3, %xmm2, %xmm2
	vaddpd	%xmm2, %xmm0, %xmm0
	vhaddpd	%xmm0, %xmm0, %xmm0

	RESTOREREGISTERS
	ret

	EPILOGUE