| @@ -18,6 +18,7 @@ ZDOTKERNEL = zdot.c | |||||
| SAXPYKERNEL = saxpy.c | SAXPYKERNEL = saxpy.c | ||||
| DAXPYKERNEL = daxpy.c | DAXPYKERNEL = daxpy.c | ||||
| CAXPYKERNEL = caxpy.c | CAXPYKERNEL = caxpy.c | ||||
| ZAXPYKERNEL = zaxpy.c | |||||
| SGEMMKERNEL = sgemm_kernel_16x4_haswell.S | SGEMMKERNEL = sgemm_kernel_16x4_haswell.S | ||||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | SGEMMINCOPY = ../generic/gemm_ncopy_16.c | ||||
| @@ -12,6 +12,7 @@ ZDOTKERNEL = zdot.c | |||||
| SAXPYKERNEL = saxpy.c | SAXPYKERNEL = saxpy.c | ||||
| DAXPYKERNEL = daxpy.c | DAXPYKERNEL = daxpy.c | ||||
| CAXPYKERNEL = caxpy.c | CAXPYKERNEL = caxpy.c | ||||
| ZAXPYKERNEL = zaxpy.c | |||||
| SGEMMKERNEL = sgemm_kernel_16x4_sandy.S | SGEMMKERNEL = sgemm_kernel_16x4_sandy.S | ||||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | SGEMMINCOPY = ../generic/gemm_ncopy_16.c | ||||
| @@ -29,8 +29,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| #include "common.h" | #include "common.h" | ||||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) | |||||
| #if defined(BULLDOZER) | |||||
| #include "zaxpy_microk_bulldozer-2.c" | #include "zaxpy_microk_bulldozer-2.c" | ||||
| #elif defined(PILEDRIVER) || defined(STEAMROLLER) | |||||
| #include "zaxpy_microk_steamroller-2.c" | |||||
| #elif defined(HASWELL) | |||||
| #include "zaxpy_microk_haswell-2.c" | |||||
| #elif defined(SANDYBRIDGE) | |||||
| #include "zaxpy_microk_sandy-2.c" | |||||
| #endif | #endif | ||||
| @@ -78,13 +84,13 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, | |||||
| if ( (inc_x == 1) && (inc_y == 1) ) | if ( (inc_x == 1) && (inc_y == 1) ) | ||||
| { | { | ||||
| int n1 = n & -4; | |||||
| int n1 = n & -16; | |||||
| if ( n1 ) | if ( n1 ) | ||||
| { | { | ||||
| da[0] = da_r; | da[0] = da_r; | ||||
| da[1] = da_i; | da[1] = da_i; | ||||
| zaxpy_kernel_4(n1, x, y , &da ); | |||||
| zaxpy_kernel_4(n1, x, y , da ); | |||||
| ix = 2 * n1; | ix = 2 * n1; | ||||
| } | } | ||||
| i = n1; | i = n1; | ||||
| @@ -31,89 +31,154 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __att | |||||
| static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | ||||
| { | { | ||||
| #if !defined(CONJ) | |||||
| FLOAT mvec[2] = { -1.0, 1.0 }; | |||||
| #else | |||||
| FLOAT mvec[2] = { 1.0, -1.0 }; | |||||
| #endif | |||||
| BLASLONG register i = 0; | BLASLONG register i = 0; | ||||
| if ( n < 384 ) | |||||
| { | |||||
| __asm__ __volatile__ | __asm__ __volatile__ | ||||
| ( | ( | ||||
| "vzeroupper \n\t" | |||||
| "vmovddup (%4), %%xmm0 \n\t" // real part of alpha | "vmovddup (%4), %%xmm0 \n\t" // real part of alpha | ||||
| "vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha | "vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha | ||||
| #if !defined(CONJ) | |||||
| "vmulpd (%5), %%xmm1 , %%xmm1 \n\t" | |||||
| #else | |||||
| "vmulpd (%5), %%xmm0 , %%xmm0 \n\t" | |||||
| #endif | |||||
| ".align 16 \n\t" | ".align 16 \n\t" | ||||
| "1: \n\t" | "1: \n\t" | ||||
| "prefetcht0 768(%2,%0,8) \n\t" | |||||
| "vmovups (%2,%0,8), %%xmm5 \n\t" // 1 complex values from x | "vmovups (%2,%0,8), %%xmm5 \n\t" // 1 complex values from x | ||||
| ".align 2 \n\t" | |||||
| "vmovups 16(%2,%0,8), %%xmm7 \n\t" // 1 complex values from x | "vmovups 16(%2,%0,8), %%xmm7 \n\t" // 1 complex values from x | ||||
| "vmovups 32(%2,%0,8), %%xmm9 \n\t" // 1 complex values from x | "vmovups 32(%2,%0,8), %%xmm9 \n\t" // 1 complex values from x | ||||
| "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 complex values from x | "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 complex values from x | ||||
| "prefetcht0 768(%3,%0,8) \n\t" | |||||
| #if !defined(CONJ) | |||||
| "vfmaddpd (%3,%0,8), %%xmm0 , %%xmm5, %%xmm12 \n\t" | |||||
| "vpermilpd $0x1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part | |||||
| "vmulpd %%xmm1, %%xmm4 , %%xmm4 \n\t" | |||||
| "vmovups 64(%2,%0,8), %%xmm12 \n\t" // 1 complex values from x | |||||
| "vmovups 80(%2,%0,8), %%xmm13 \n\t" // 1 complex values from x | |||||
| "vmovups 96(%2,%0,8), %%xmm14 \n\t" // 1 complex values from x | |||||
| "vmovups 112(%2,%0,8), %%xmm15 \n\t" // 1 complex values from x | |||||
| "vfmaddpd 16(%3,%0,8), %%xmm0 , %%xmm7, %%xmm13 \n\t" | |||||
| "vpermilpd $0x1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part | "vpermilpd $0x1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part | ||||
| "vmulpd %%xmm1, %%xmm6 , %%xmm6 \n\t" | |||||
| "vfmaddpd 32(%3,%0,8), %%xmm0 , %%xmm9, %%xmm14 \n\t" | |||||
| "vpermilpd $0x1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part | "vpermilpd $0x1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part | ||||
| "vmulpd %%xmm1, %%xmm8 , %%xmm8 \n\t" | |||||
| "vfmaddpd 48(%3,%0,8), %%xmm0 , %%xmm11,%%xmm15 \n\t" | |||||
| "vpermilpd $0x1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part | "vpermilpd $0x1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part | ||||
| "vmulpd %%xmm1, %%xmm10, %%xmm10 \n\t" | |||||
| "vaddsubpd %%xmm4, %%xmm12, %%xmm12 \n\t" | |||||
| "vaddsubpd %%xmm6, %%xmm13, %%xmm13 \n\t" | |||||
| "vaddsubpd %%xmm8, %%xmm14, %%xmm14 \n\t" | |||||
| "vaddsubpd %%xmm10,%%xmm15, %%xmm15 \n\t" | |||||
| #else | |||||
| "vmulpd %%xmm0, %%xmm5, %%xmm4 \n\t" // a_r*x_r, a_r*x_i | |||||
| "vmulpd %%xmm1, %%xmm5, %%xmm5 \n\t" // a_i*x_r, a_i*x_i | |||||
| "vmulpd %%xmm0, %%xmm7, %%xmm6 \n\t" // a_r*x_r, a_r*x_i | |||||
| "vmulpd %%xmm1, %%xmm7, %%xmm7 \n\t" // a_i*x_r, a_i*x_i | |||||
| "vmulpd %%xmm0, %%xmm9, %%xmm8 \n\t" // a_r*x_r, a_r*x_i | |||||
| "vmulpd %%xmm1, %%xmm9, %%xmm9 \n\t" // a_i*x_r, a_i*x_i | |||||
| "vmulpd %%xmm0, %%xmm11, %%xmm10 \n\t" // a_r*x_r, a_r*x_i | |||||
| "vmulpd %%xmm1, %%xmm11, %%xmm11 \n\t" // a_i*x_r, a_i*x_i | |||||
| "vfmaddpd (%3,%0,8), %%xmm0 , %%xmm5, %%xmm5 \n\t" | |||||
| ".align 2 \n\t" | |||||
| "vfmaddpd 16(%3,%0,8), %%xmm0 , %%xmm7, %%xmm7 \n\t" | |||||
| "vfmaddpd 32(%3,%0,8), %%xmm0 , %%xmm9, %%xmm9 \n\t" | |||||
| "vfmaddpd 48(%3,%0,8), %%xmm0 , %%xmm11,%%xmm11 \n\t" | |||||
| "vfmaddpd %%xmm5 , %%xmm1 , %%xmm4 , %%xmm5 \n\t" | |||||
| "vfmaddpd %%xmm7 , %%xmm1 , %%xmm6 , %%xmm7 \n\t" | |||||
| "vfmaddpd %%xmm9 , %%xmm1 , %%xmm8 , %%xmm9 \n\t" | |||||
| "vfmaddpd %%xmm11, %%xmm1 , %%xmm10, %%xmm11 \n\t" | |||||
| "vpermilpd $0x1 , %%xmm12, %%xmm4 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x1 , %%xmm13, %%xmm6 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x1 , %%xmm14, %%xmm8 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x1 , %%xmm15, %%xmm10 \n\t" // exchange real and imag part | |||||
| "vfmaddpd 64(%3,%0,8), %%xmm0 , %%xmm12, %%xmm12 \n\t" | |||||
| "vfmaddpd 80(%3,%0,8), %%xmm0 , %%xmm13, %%xmm13 \n\t" | |||||
| "vfmaddpd 96(%3,%0,8), %%xmm0 , %%xmm14, %%xmm14 \n\t" | |||||
| "vfmaddpd 112(%3,%0,8), %%xmm0 , %%xmm15, %%xmm15 \n\t" | |||||
| "vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" | |||||
| "vfmaddpd %%xmm13, %%xmm1 , %%xmm6 , %%xmm13 \n\t" | |||||
| "vfmaddpd %%xmm14, %%xmm1 , %%xmm8 , %%xmm14 \n\t" | |||||
| "vfmaddpd %%xmm15, %%xmm1 , %%xmm10, %%xmm15 \n\t" | |||||
| "vmovups %%xmm5 , (%3,%0,8) \n\t" | |||||
| ".align 2 \n\t" | |||||
| "vmovups %%xmm7 , 16(%3,%0,8) \n\t" | |||||
| "vmovups %%xmm9 , 32(%3,%0,8) \n\t" | |||||
| "vmovups %%xmm11, 48(%3,%0,8) \n\t" | |||||
| "vmovups %%xmm12, 64(%3,%0,8) \n\t" | |||||
| "vmovups %%xmm13, 80(%3,%0,8) \n\t" | |||||
| "vmovups %%xmm14, 96(%3,%0,8) \n\t" | |||||
| "vmovups %%xmm15,112(%3,%0,8) \n\t" | |||||
| "addq $16, %0 \n\t" | |||||
| "subq $8 , %1 \n\t" | |||||
| "jnz 1b \n\t" | |||||
| "vzeroupper \n\t" | |||||
| "vpermilpd $0x1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part | |||||
| "vaddsubpd %%xmm4 ,%%xmm5 , %%xmm4 \n\t" | |||||
| "vpermilpd $0x1 , %%xmm4 , %%xmm4 \n\t" // exchange real and imag part | |||||
| : | |||||
| : | |||||
| "r" (i), // 0 | |||||
| "r" (n), // 1 | |||||
| "r" (x), // 2 | |||||
| "r" (y), // 3 | |||||
| "r" (alpha), // 4 | |||||
| "r" (mvec) // 5 | |||||
| : "cc", | |||||
| "%xmm0", "%xmm1", | |||||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||||
| "memory" | |||||
| ); | |||||
| return; | |||||
| } | |||||
| "vpermilpd $0x1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part | |||||
| "vaddsubpd %%xmm6 ,%%xmm7 , %%xmm6 \n\t" | |||||
| "vpermilpd $0x1 , %%xmm6 , %%xmm6 \n\t" // exchange real and imag part | |||||
| __asm__ __volatile__ | |||||
| ( | |||||
| "vzeroupper \n\t" | |||||
| "vmovddup (%4), %%xmm0 \n\t" // real part of alpha | |||||
| "vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha | |||||
| #if !defined(CONJ) | |||||
| "vmulpd (%5), %%xmm1 , %%xmm1 \n\t" | |||||
| #else | |||||
| "vmulpd (%5), %%xmm0 , %%xmm0 \n\t" | |||||
| #endif | |||||
| "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part | |||||
| "vaddsubpd %%xmm8 ,%%xmm9 , %%xmm8 \n\t" | |||||
| "vpermilpd $0x1 , %%xmm8 , %%xmm8 \n\t" // exchange real and imag part | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part | |||||
| "vaddsubpd %%xmm10,%%xmm11, %%xmm10 \n\t" | |||||
| "vpermilpd $0x1 , %%xmm10, %%xmm10 \n\t" // exchange real and imag part | |||||
| "prefetcht0 512(%2,%0,8) \n\t" | |||||
| "vmovups (%2,%0,8), %%xmm5 \n\t" // 1 complex values from x | |||||
| ".align 2 \n\t" | |||||
| "vmovups 16(%2,%0,8), %%xmm7 \n\t" // 1 complex values from x | |||||
| "vmovups 32(%2,%0,8), %%xmm9 \n\t" // 1 complex values from x | |||||
| "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 complex values from x | |||||
| "vaddpd (%3,%0,8) ,%%xmm4 , %%xmm12 \n\t" | |||||
| "vaddpd 16(%3,%0,8) ,%%xmm6 , %%xmm13 \n\t" | |||||
| "vaddpd 32(%3,%0,8) ,%%xmm8 , %%xmm14 \n\t" | |||||
| "vaddpd 48(%3,%0,8) ,%%xmm10, %%xmm15 \n\t" | |||||
| "vpermilpd $0x1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part | |||||
| "prefetcht0 512(%3,%0,8) \n\t" | |||||
| "vfmaddpd (%3,%0,8), %%xmm0 , %%xmm5, %%xmm5 \n\t" | |||||
| ".align 2 \n\t" | |||||
| "vfmaddpd 16(%3,%0,8), %%xmm0 , %%xmm7, %%xmm7 \n\t" | |||||
| "vfmaddpd 32(%3,%0,8), %%xmm0 , %%xmm9, %%xmm9 \n\t" | |||||
| "vfmaddpd 48(%3,%0,8), %%xmm0 , %%xmm11,%%xmm11 \n\t" | |||||
| #endif | |||||
| "vfmaddpd %%xmm5 , %%xmm1 , %%xmm4 , %%xmm5 \n\t" | |||||
| "vfmaddpd %%xmm7 , %%xmm1 , %%xmm6 , %%xmm7 \n\t" | |||||
| "vfmaddpd %%xmm9 , %%xmm1 , %%xmm8 , %%xmm9 \n\t" | |||||
| "vfmaddpd %%xmm11, %%xmm1 , %%xmm10, %%xmm11 \n\t" | |||||
| "vmovups %%xmm12, (%3,%0,8) \n\t" | |||||
| "vmovups %%xmm13, 16(%3,%0,8) \n\t" | |||||
| "vmovups %%xmm14, 32(%3,%0,8) \n\t" | |||||
| "vmovups %%xmm15, 48(%3,%0,8) \n\t" | |||||
| "vmovups %%xmm5 , (%3,%0,8) \n\t" | |||||
| ".align 2 \n\t" | |||||
| "vmovups %%xmm7 , 16(%3,%0,8) \n\t" | |||||
| "vmovups %%xmm9 , 32(%3,%0,8) \n\t" | |||||
| "vmovups %%xmm11, 48(%3,%0,8) \n\t" | |||||
| "addq $8 , %0 \n\t" | "addq $8 , %0 \n\t" | ||||
| "subq $4 , %1 \n\t" | |||||
| "subq $4, %1 \n\t" | |||||
| "jnz 1b \n\t" | "jnz 1b \n\t" | ||||
| "vzeroupper \n\t" | |||||
| : | : | ||||
| : | : | ||||
| @@ -121,15 +186,15 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| "r" (n), // 1 | "r" (n), // 1 | ||||
| "r" (x), // 2 | "r" (x), // 2 | ||||
| "r" (y), // 3 | "r" (y), // 3 | ||||
| "r" (alpha) // 4 | |||||
| "r" (alpha), // 4 | |||||
| "r" (mvec) // 5 | |||||
| : "cc", | : "cc", | ||||
| "%xmm0", "%xmm1", | "%xmm0", "%xmm1", | ||||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | "%xmm4", "%xmm5", "%xmm6", "%xmm7", | ||||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | "%xmm8", "%xmm9", "%xmm10", "%xmm11", | ||||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||||
| "memory" | "memory" | ||||
| ); | ); | ||||
| } | |||||
| } | |||||
| @@ -0,0 +1,132 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2014, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define HAVE_KERNEL_4 1 | |||||
| static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); | |||||
| static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| #if !defined(CONJ) | |||||
| FLOAT mvec[4] = { -1.0, 1.0, -1.0, 1.0 }; | |||||
| #else | |||||
| FLOAT mvec[4] = { 1.0, -1.0, 1.0, -1.0 }; | |||||
| #endif | |||||
| BLASLONG register i = 0; | |||||
| __asm__ __volatile__ | |||||
| ( | |||||
| "vzeroupper \n\t" | |||||
| "vbroadcastsd (%4), %%ymm0 \n\t" // real part of alpha | |||||
| "vbroadcastsd 8(%4), %%ymm1 \n\t" // imag part of alpha | |||||
| #if !defined(CONJ) | |||||
| "vmulpd (%5), %%ymm1 , %%ymm1 \n\t" | |||||
| #else | |||||
| "vmulpd (%5), %%ymm0 , %%ymm0 \n\t" | |||||
| #endif | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "vmovups (%2,%0,8), %%ymm5 \n\t" // 2 complex values from x | |||||
| ".align 2 \n\t" | |||||
| "vmovups 32(%2,%0,8), %%ymm7 \n\t" // 2 complex values from x | |||||
| "vmovups 64(%2,%0,8), %%ymm9 \n\t" // 2 complex values from x | |||||
| "vmovups 96(%2,%0,8), %%ymm11 \n\t" // 2 complex values from x | |||||
| "vmovups 128(%2,%0,8), %%ymm12 \n\t" // 2 complex values from x | |||||
| "vmovups 160(%2,%0,8), %%ymm13 \n\t" // 2 complex values from x | |||||
| "vmovups 192(%2,%0,8), %%ymm14 \n\t" // 2 complex values from x | |||||
| "vmovups 224(%2,%0,8), %%ymm15 \n\t" // 2 complex values from x | |||||
| "vpermilpd $0x5 , %%ymm5 , %%ymm4 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x5 , %%ymm7 , %%ymm6 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x5 , %%ymm9 , %%ymm8 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x5 , %%ymm11, %%ymm10 \n\t" // exchange real and imag part | |||||
| "vfmadd213pd (%3,%0,8), %%ymm0 , %%ymm5 \n\t" | |||||
| ".align 2 \n\t" | |||||
| "vfmadd213pd 32(%3,%0,8), %%ymm0 , %%ymm7 \n\t" | |||||
| "vfmadd213pd 64(%3,%0,8), %%ymm0 , %%ymm9 \n\t" | |||||
| "vfmadd213pd 96(%3,%0,8), %%ymm0 , %%ymm11 \n\t" | |||||
| "vfmadd231pd %%ymm1 , %%ymm4 , %%ymm5 \n\t" | |||||
| "vfmadd231pd %%ymm1 , %%ymm6 , %%ymm7 \n\t" | |||||
| "vfmadd231pd %%ymm1 , %%ymm8 , %%ymm9 \n\t" | |||||
| "vfmadd231pd %%ymm1 , %%ymm10, %%ymm11 \n\t" | |||||
| "vpermilpd $0x5 , %%ymm12, %%ymm4 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x5 , %%ymm13, %%ymm6 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x5 , %%ymm14, %%ymm8 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x5 , %%ymm15, %%ymm10 \n\t" // exchange real and imag part | |||||
| "vfmadd213pd 128(%3,%0,8), %%ymm0 , %%ymm12 \n\t" | |||||
| "vfmadd213pd 160(%3,%0,8), %%ymm0 , %%ymm13 \n\t" | |||||
| "vfmadd213pd 192(%3,%0,8), %%ymm0 , %%ymm14 \n\t" | |||||
| "vfmadd213pd 224(%3,%0,8), %%ymm0 , %%ymm15 \n\t" | |||||
| "vfmadd231pd %%ymm1 , %%ymm4 , %%ymm12 \n\t" | |||||
| "vfmadd231pd %%ymm1 , %%ymm6 , %%ymm13 \n\t" | |||||
| "vfmadd231pd %%ymm1 , %%ymm8 , %%ymm14 \n\t" | |||||
| "vfmadd231pd %%ymm1 , %%ymm10, %%ymm15 \n\t" | |||||
| "vmovups %%ymm5 , (%3,%0,8) \n\t" | |||||
| ".align 2 \n\t" | |||||
| "vmovups %%ymm7 , 32(%3,%0,8) \n\t" | |||||
| "vmovups %%ymm9 , 64(%3,%0,8) \n\t" | |||||
| "vmovups %%ymm11, 96(%3,%0,8) \n\t" | |||||
| "vmovups %%ymm12,128(%3,%0,8) \n\t" | |||||
| "vmovups %%ymm13,160(%3,%0,8) \n\t" | |||||
| "vmovups %%ymm14,192(%3,%0,8) \n\t" | |||||
| "vmovups %%ymm15,224(%3,%0,8) \n\t" | |||||
| "addq $32, %0 \n\t" | |||||
| "subq $16, %1 \n\t" | |||||
| "jnz 1b \n\t" | |||||
| "vzeroupper \n\t" | |||||
| : | |||||
| : | |||||
| "r" (i), // 0 | |||||
| "r" (n), // 1 | |||||
| "r" (x), // 2 | |||||
| "r" (y), // 3 | |||||
| "r" (alpha), // 4 | |||||
| "r" (mvec) // 5 | |||||
| : "cc", | |||||
| "%xmm0", "%xmm1", | |||||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||||
| "memory" | |||||
| ); | |||||
| } | |||||
| @@ -0,0 +1,198 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2014, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define HAVE_KERNEL_4 1 | |||||
| static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); | |||||
| static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| #if !defined(CONJ) | |||||
| FLOAT mvec[4] = { -1.0, 1.0, -1.0, 1.0 }; | |||||
| #else | |||||
| FLOAT mvec[4] = { 1.0, -1.0, 1.0, -1.0 }; | |||||
| #endif | |||||
| BLASLONG register i = 0; | |||||
| if ( n < 1280 ) | |||||
| { | |||||
| __asm__ __volatile__ | |||||
| ( | |||||
| "vzeroupper \n\t" | |||||
| "vbroadcastsd (%4), %%ymm0 \n\t" // real part of alpha | |||||
| "vbroadcastsd 8(%4), %%ymm1 \n\t" // imag part of alpha | |||||
| #if !defined(CONJ) | |||||
| "vmulpd (%5), %%ymm1 , %%ymm1 \n\t" | |||||
| #else | |||||
| "vmulpd (%5), %%ymm0 , %%ymm0 \n\t" | |||||
| #endif | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "vmovups (%2,%0,8), %%ymm5 \n\t" // 4 complex values from x | |||||
| ".align 2 \n\t" | |||||
| "vmovups 32(%2,%0,8), %%ymm7 \n\t" // 4 complex values from x | |||||
| "vmovups 64(%2,%0,8), %%ymm9 \n\t" // 4 complex values from x | |||||
| "vmovups 96(%2,%0,8), %%ymm11 \n\t" // 4 complex values from x | |||||
| "vpermilpd $0x5 , %%ymm5 , %%ymm4 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x5 , %%ymm7 , %%ymm6 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x5 , %%ymm9 , %%ymm8 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x5 , %%ymm11, %%ymm10 \n\t" // exchange real and imag part | |||||
| "vmulpd %%ymm5 , %%ymm0 , %%ymm5 \n\t" | |||||
| "vmulpd %%ymm7 , %%ymm0 , %%ymm7 \n\t" | |||||
| "vmulpd %%ymm9 , %%ymm0 , %%ymm9 \n\t" | |||||
| "vmulpd %%ymm11, %%ymm0 , %%ymm11 \n\t" | |||||
| "vaddpd (%3,%0,8), %%ymm5 , %%ymm5 \n\t" | |||||
| "vaddpd 32(%3,%0,8), %%ymm7 , %%ymm7 \n\t" | |||||
| "vaddpd 64(%3,%0,8), %%ymm9 , %%ymm9 \n\t" | |||||
| "vaddpd 96(%3,%0,8), %%ymm11, %%ymm11 \n\t" | |||||
| "vmulpd %%ymm4 , %%ymm1 , %%ymm4 \n\t" | |||||
| "vmulpd %%ymm6 , %%ymm1 , %%ymm6 \n\t" | |||||
| "vmulpd %%ymm8 , %%ymm1 , %%ymm8 \n\t" | |||||
| "vmulpd %%ymm10, %%ymm1 , %%ymm10 \n\t" | |||||
| "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" | |||||
| "vaddpd %%ymm6 , %%ymm7 , %%ymm7 \n\t" | |||||
| "vaddpd %%ymm8 , %%ymm9 , %%ymm9 \n\t" | |||||
| "vaddpd %%ymm10, %%ymm11, %%ymm11 \n\t" | |||||
| "vmovups %%ymm5 , (%3,%0,8) \n\t" | |||||
| ".align 2 \n\t" | |||||
| "vmovups %%ymm7 , 32(%3,%0,8) \n\t" | |||||
| "vmovups %%ymm9 , 64(%3,%0,8) \n\t" | |||||
| "vmovups %%ymm11, 96(%3,%0,8) \n\t" | |||||
| "addq $16, %0 \n\t" | |||||
| "subq $8 , %1 \n\t" | |||||
| "jnz 1b \n\t" | |||||
| "vzeroupper \n\t" | |||||
| : | |||||
| : | |||||
| "r" (i), // 0 | |||||
| "r" (n), // 1 | |||||
| "r" (x), // 2 | |||||
| "r" (y), // 3 | |||||
| "r" (alpha), // 4 | |||||
| "r" (mvec) // 5 | |||||
| : "cc", | |||||
| "%xmm0", "%xmm1", | |||||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||||
| "memory" | |||||
| ); | |||||
| return; | |||||
| } | |||||
| __asm__ __volatile__ | |||||
| ( | |||||
| "vzeroupper \n\t" | |||||
| "vbroadcastsd (%4), %%ymm0 \n\t" // real part of alpha | |||||
| "vbroadcastsd 8(%4), %%ymm1 \n\t" // imag part of alpha | |||||
| #if !defined(CONJ) | |||||
| "vmulpd (%5), %%ymm1 , %%ymm1 \n\t" | |||||
| #else | |||||
| "vmulpd (%5), %%ymm0 , %%ymm0 \n\t" | |||||
| #endif | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "prefetcht0 512(%2,%0,8) \n\t" | |||||
| "prefetcht0 576(%2,%0,8) \n\t" | |||||
| "vmovups (%2,%0,8), %%ymm5 \n\t" // 4 complex values from x | |||||
| ".align 2 \n\t" | |||||
| "vmovups 32(%2,%0,8), %%ymm7 \n\t" // 4 complex values from x | |||||
| "vmovups 64(%2,%0,8), %%ymm9 \n\t" // 4 complex values from x | |||||
| "vmovups 96(%2,%0,8), %%ymm11 \n\t" // 4 complex values from x | |||||
| "vpermilpd $0x5 , %%ymm5 , %%ymm4 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x5 , %%ymm7 , %%ymm6 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x5 , %%ymm9 , %%ymm8 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x5 , %%ymm11, %%ymm10 \n\t" // exchange real and imag part | |||||
| "vmulpd %%ymm5 , %%ymm0 , %%ymm5 \n\t" | |||||
| "vmulpd %%ymm7 , %%ymm0 , %%ymm7 \n\t" | |||||
| "vmulpd %%ymm9 , %%ymm0 , %%ymm9 \n\t" | |||||
| "vmulpd %%ymm11, %%ymm0 , %%ymm11 \n\t" | |||||
| "prefetcht0 512(%3,%0,8) \n\t" | |||||
| "prefetcht0 576(%3,%0,8) \n\t" | |||||
| "vaddpd (%3,%0,8), %%ymm5 , %%ymm5 \n\t" | |||||
| "vaddpd 32(%3,%0,8), %%ymm7 , %%ymm7 \n\t" | |||||
| "vaddpd 64(%3,%0,8), %%ymm9 , %%ymm9 \n\t" | |||||
| "vaddpd 96(%3,%0,8), %%ymm11, %%ymm11 \n\t" | |||||
| "vmulpd %%ymm4 , %%ymm1 , %%ymm4 \n\t" | |||||
| "vmulpd %%ymm6 , %%ymm1 , %%ymm6 \n\t" | |||||
| "vmulpd %%ymm8 , %%ymm1 , %%ymm8 \n\t" | |||||
| "vmulpd %%ymm10, %%ymm1 , %%ymm10 \n\t" | |||||
| "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" | |||||
| "vaddpd %%ymm6 , %%ymm7 , %%ymm7 \n\t" | |||||
| "vaddpd %%ymm8 , %%ymm9 , %%ymm9 \n\t" | |||||
| "vaddpd %%ymm10, %%ymm11, %%ymm11 \n\t" | |||||
| "vmovups %%ymm5 , (%3,%0,8) \n\t" | |||||
| ".align 2 \n\t" | |||||
| "vmovups %%ymm7 , 32(%3,%0,8) \n\t" | |||||
| "vmovups %%ymm9 , 64(%3,%0,8) \n\t" | |||||
| "vmovups %%ymm11, 96(%3,%0,8) \n\t" | |||||
| "addq $16, %0 \n\t" | |||||
| "subq $8 , %1 \n\t" | |||||
| "jnz 1b \n\t" | |||||
| "vzeroupper \n\t" | |||||
| : | |||||
| : | |||||
| "r" (i), // 0 | |||||
| "r" (n), // 1 | |||||
| "r" (x), // 2 | |||||
| "r" (y), // 3 | |||||
| "r" (alpha), // 4 | |||||
| "r" (mvec) // 5 | |||||
| : "cc", | |||||
| "%xmm0", "%xmm1", | |||||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||||
| "memory" | |||||
| ); | |||||
| } | |||||
| @@ -0,0 +1,200 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2014, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define HAVE_KERNEL_4 1 | |||||
| static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline)); | |||||
| static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||||
| { | |||||
| #if !defined(CONJ) | |||||
| FLOAT mvec[2] = { -1.0, 1.0 }; | |||||
| #else | |||||
| FLOAT mvec[2] = { 1.0, -1.0 }; | |||||
| #endif | |||||
| BLASLONG register i = 0; | |||||
| if ( n < 640 ) | |||||
| { | |||||
| __asm__ __volatile__ | |||||
| ( | |||||
| "vzeroupper \n\t" | |||||
| "vmovddup (%4), %%xmm0 \n\t" // real part of alpha | |||||
| "vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha | |||||
| #if !defined(CONJ) | |||||
| "vmulpd (%5), %%xmm1 , %%xmm1 \n\t" | |||||
| #else | |||||
| "vmulpd (%5), %%xmm0 , %%xmm0 \n\t" | |||||
| #endif | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "vmovups (%2,%0,8), %%xmm5 \n\t" // 2 complex values from x | |||||
| ".align 2 \n\t" | |||||
| "vmovups 16(%2,%0,8), %%xmm7 \n\t" // 2 complex values from x | |||||
| "vmovups 32(%2,%0,8), %%xmm9 \n\t" // 2 complex values from x | |||||
| "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 2 complex values from x | |||||
| "vmovups 64(%2,%0,8), %%xmm12 \n\t" // 2 complex values from x | |||||
| "vmovups 80(%2,%0,8), %%xmm13 \n\t" // 2 complex values from x | |||||
| "vmovups 96(%2,%0,8), %%xmm14 \n\t" // 2 complex values from x | |||||
| "vmovups 112(%2,%0,8), %%xmm15 \n\t" // 2 complex values from x | |||||
| "vpermilpd $0x1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part | |||||
| "vfmadd213pd (%3,%0,8), %%xmm0 , %%xmm5 \n\t" | |||||
| ".align 2 \n\t" | |||||
| "vfmadd213pd 16(%3,%0,8), %%xmm0 , %%xmm7 \n\t" | |||||
| "vfmadd213pd 32(%3,%0,8), %%xmm0 , %%xmm9 \n\t" | |||||
| "vfmadd213pd 48(%3,%0,8), %%xmm0 , %%xmm11 \n\t" | |||||
| "vfmadd231pd %%xmm1 , %%xmm4 , %%xmm5 \n\t" | |||||
| "vfmadd231pd %%xmm1 , %%xmm6 , %%xmm7 \n\t" | |||||
| "vfmadd231pd %%xmm1 , %%xmm8 , %%xmm9 \n\t" | |||||
| "vfmadd231pd %%xmm1 , %%xmm10, %%xmm11 \n\t" | |||||
| "vpermilpd $0x1 , %%xmm12, %%xmm4 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x1 , %%xmm13, %%xmm6 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x1 , %%xmm14, %%xmm8 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x1 , %%xmm15, %%xmm10 \n\t" // exchange real and imag part | |||||
| "vfmadd213pd 64(%3,%0,8), %%xmm0 , %%xmm12 \n\t" | |||||
| "vfmadd213pd 80(%3,%0,8), %%xmm0 , %%xmm13 \n\t" | |||||
| "vfmadd213pd 96(%3,%0,8), %%xmm0 , %%xmm14 \n\t" | |||||
| "vfmadd213pd 112(%3,%0,8), %%xmm0 , %%xmm15 \n\t" | |||||
| "vfmadd231pd %%xmm1 , %%xmm4 , %%xmm12 \n\t" | |||||
| "vfmadd231pd %%xmm1 , %%xmm6 , %%xmm13 \n\t" | |||||
| "vfmadd231pd %%xmm1 , %%xmm8 , %%xmm14 \n\t" | |||||
| "vfmadd231pd %%xmm1 , %%xmm10, %%xmm15 \n\t" | |||||
| "vmovups %%xmm5 , (%3,%0,8) \n\t" | |||||
| ".align 2 \n\t" | |||||
| "vmovups %%xmm7 , 16(%3,%0,8) \n\t" | |||||
| "vmovups %%xmm9 , 32(%3,%0,8) \n\t" | |||||
| "vmovups %%xmm11, 48(%3,%0,8) \n\t" | |||||
| "vmovups %%xmm12, 64(%3,%0,8) \n\t" | |||||
| "vmovups %%xmm13, 80(%3,%0,8) \n\t" | |||||
| "vmovups %%xmm14, 96(%3,%0,8) \n\t" | |||||
| "vmovups %%xmm15,112(%3,%0,8) \n\t" | |||||
| "addq $16, %0 \n\t" | |||||
| "subq $8 , %1 \n\t" | |||||
| "jnz 1b \n\t" | |||||
| "vzeroupper \n\t" | |||||
| : | |||||
| : | |||||
| "r" (i), // 0 | |||||
| "r" (n), // 1 | |||||
| "r" (x), // 2 | |||||
| "r" (y), // 3 | |||||
| "r" (alpha), // 4 | |||||
| "r" (mvec) // 5 | |||||
| : "cc", | |||||
| "%xmm0", "%xmm1", | |||||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||||
| "memory" | |||||
| ); | |||||
| return; | |||||
| } | |||||
| __asm__ __volatile__ | |||||
| ( | |||||
| "vzeroupper \n\t" | |||||
| "vmovddup (%4), %%xmm0 \n\t" // real part of alpha | |||||
| "vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha | |||||
| #if !defined(CONJ) | |||||
| "vmulpd (%5), %%xmm1 , %%xmm1 \n\t" | |||||
| #else | |||||
| "vmulpd (%5), %%xmm0 , %%xmm0 \n\t" | |||||
| #endif | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "prefetcht0 512(%2,%0,8) \n\t" | |||||
| "vmovups (%2,%0,8), %%xmm5 \n\t" // 2 complex values from x | |||||
| ".align 2 \n\t" | |||||
| "vmovups 16(%2,%0,8), %%xmm7 \n\t" // 2 complex values from x | |||||
| "vmovups 32(%2,%0,8), %%xmm9 \n\t" // 2 complex values from x | |||||
| "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 2 complex values from x | |||||
| "vpermilpd $0x1 , %%xmm5 , %%xmm4 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x1 , %%xmm7 , %%xmm6 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x1 , %%xmm9 , %%xmm8 \n\t" // exchange real and imag part | |||||
| "vpermilpd $0x1 , %%xmm11, %%xmm10 \n\t" // exchange real and imag part | |||||
| "prefetcht0 512(%3,%0,8) \n\t" | |||||
| "vfmadd213pd (%3,%0,8), %%xmm0 , %%xmm5 \n\t" | |||||
| ".align 2 \n\t" | |||||
| "vfmadd213pd 16(%3,%0,8), %%xmm0 , %%xmm7 \n\t" | |||||
| "vfmadd213pd 32(%3,%0,8), %%xmm0 , %%xmm9 \n\t" | |||||
| "vfmadd213pd 48(%3,%0,8), %%xmm0 , %%xmm11 \n\t" | |||||
| "vfmadd231pd %%xmm1 , %%xmm4 , %%xmm5 \n\t" | |||||
| "vfmadd231pd %%xmm1 , %%xmm6 , %%xmm7 \n\t" | |||||
| "vfmadd231pd %%xmm1 , %%xmm8 , %%xmm9 \n\t" | |||||
| "vfmadd231pd %%xmm1 , %%xmm10, %%xmm11 \n\t" | |||||
| "vmovups %%xmm5 , (%3,%0,8) \n\t" | |||||
| ".align 2 \n\t" | |||||
| "vmovups %%xmm7 , 16(%3,%0,8) \n\t" | |||||
| "vmovups %%xmm9 , 32(%3,%0,8) \n\t" | |||||
| "vmovups %%xmm11, 48(%3,%0,8) \n\t" | |||||
| "addq $8 , %0 \n\t" | |||||
| "subq $4 , %1 \n\t" | |||||
| "jnz 1b \n\t" | |||||
| "vzeroupper \n\t" | |||||
| : | |||||
| : | |||||
| "r" (i), // 0 | |||||
| "r" (n), // 1 | |||||
| "r" (x), // 2 | |||||
| "r" (y), // 3 | |||||
| "r" (alpha), // 4 | |||||
| "r" (mvec) // 5 | |||||
| : "cc", | |||||
| "%xmm0", "%xmm1", | |||||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||||
| "memory" | |||||
| ); | |||||
| } | |||||
| @@ -1,11 +1,11 @@ | |||||
| SEP: Data file for testing Symmetric Eigenvalue Problem routines | SEP: Data file for testing Symmetric Eigenvalue Problem routines | ||||
| 8 Number of values of N | 8 Number of values of N | ||||
| 0 1 2 3 5 19 20 21 Values of N (dimension) | |||||
| 0 1 2 3 5 18 19 21 Values of N (dimension) | |||||
| 5 Number of values of NB | 5 Number of values of NB | ||||
| 1 3 3 3 10 Values of NB (blocksize) | 1 3 3 3 10 Values of NB (blocksize) | ||||
| 2 2 2 2 2 Values of NBMIN (minimum blocksize) | 2 2 2 2 2 Values of NBMIN (minimum blocksize) | ||||
| 1 0 5 9 1 Values of NX (crossover point) | 1 0 5 9 1 Values of NX (crossover point) | ||||
| 170.0 Threshold value | |||||
| 300.0 Threshold value | |||||
| T Put T to test the LAPACK routines | T Put T to test the LAPACK routines | ||||
| T Put T to test the driver routines | T Put T to test the driver routines | ||||
| T Put T to test the error exits | T Put T to test the error exits | ||||
| @@ -1,6 +1,6 @@ | |||||
| Data file for testing ZCGESV/ZCPOSV LAPACK routines | Data file for testing ZCGESV/ZCPOSV LAPACK routines | ||||
| 11 Number of values of M | 11 Number of values of M | ||||
| 0 1 2 13 17 45 78 91 101 120 132 Values of M (row dimension) | |||||
| 0 1 2 13 17 45 78 91 101 121 132 Values of M (row dimension) | |||||
| 4 Number of values of NRHS | 4 Number of values of NRHS | ||||
| 1 2 15 16 Values of NRHS (number of right hand sides) | 1 2 15 16 Values of NRHS (number of right hand sides) | ||||
| 30.0 Threshold value of test ratio | 30.0 Threshold value of test ratio | ||||