| @@ -3,7 +3,8 @@ CAXPYKERNEL = caxpy.c | |||
| ZAXPYKERNEL = zaxpy.c | |||
| SDOTKERNEL = sdot.c | |||
| #DDOTKERNEL = ddot.c | |||
| CDOTKERNEL = cdot.c | |||
| ZDOTKERNEL = zdot.c | |||
| DSYMV_U_KERNEL = dsymv_U.c | |||
| DSYMV_L_KERNEL = dsymv_L.c | |||
| @@ -26,11 +27,11 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_16.c | |||
| SGEMMONCOPY = gemm_ncopy_2_bulldozer.S | |||
| SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S | |||
| DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S | |||
| DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S | |||
| @@ -40,6 +41,7 @@ DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | |||
| @@ -49,6 +51,7 @@ CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S | |||
| ZGEMMINCOPY = | |||
| ZGEMMITCOPY = | |||
| @@ -12,6 +12,9 @@ DGEMVNKERNEL = dgemv_n_bulldozer.S | |||
| DGEMVTKERNEL = dgemv_t_bulldozer.S | |||
| SDOTKERNEL = sdot.c | |||
| CDOTKERNEL = cdot.c | |||
| ZDOTKERNEL = zdot.c | |||
| DDOTKERNEL = ddot_bulldozer.S | |||
| DCOPYKERNEL = dcopy_bulldozer.S | |||
| @@ -3,7 +3,10 @@ CAXPYKERNEL = caxpy.c | |||
| ZAXPYKERNEL = zaxpy.c | |||
| SDOTKERNEL = sdot.c | |||
# Generic C kernel, superseded by the assembly kernel selected below.
#DDOTKERNEL = ddot.c
# The assembly source is spelled "bulldozer"; a misspelled name here would
# point the build at a file that does not exist.
DDOTKERNEL = ddot_bulldozer.S
| CDOTKERNEL = cdot.c | |||
| ZDOTKERNEL = zdot.c | |||
| DSYMV_U_KERNEL = dsymv_U.c | |||
| DSYMV_L_KERNEL = dsymv_L.c | |||
| @@ -0,0 +1,174 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2015, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <complex.h> | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) | |||
| #include "cdot_microk_bulldozer-2.c" | |||
| #elif defined(NEHALEM) | |||
| #include "cdot_microk_nehalem-2.c" | |||
| #elif defined(HASWELL) | |||
| #include "cdot_microk_haswell-2.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "cdot_microk_sandy-2.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_16 | |||
| static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) __attribute__ ((noinline)); | |||
| static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) | |||
| { | |||
| BLASLONG register i = 0; | |||
| FLOAT dot[8] = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; | |||
| BLASLONG j=0; | |||
| while( i < n ) | |||
| { | |||
| dot[0] += x[j] * y[j] ; | |||
| dot[1] += x[j+1] * y[j+1] ; | |||
| dot[4] += x[j] * y[j+1] ; | |||
| dot[5] += x[j+1] * y[j] ; | |||
| dot[2] += x[j+2] * y[j+2] ; | |||
| dot[3] += x[j+3] * y[j+3] ; | |||
| dot[6] += x[j+2] * y[j+3] ; | |||
| dot[7] += x[j+3] * y[j+2] ; | |||
| dot[0] += x[j+4] * y[j+4] ; | |||
| dot[1] += x[j+5] * y[j+5] ; | |||
| dot[4] += x[j+4] * y[j+5] ; | |||
| dot[5] += x[j+5] * y[j+4] ; | |||
| dot[2] += x[j+6] * y[j+6] ; | |||
| dot[3] += x[j+7] * y[j+7] ; | |||
| dot[6] += x[j+6] * y[j+7] ; | |||
| dot[7] += x[j+7] * y[j+6] ; | |||
| j+=8; | |||
| i+=4; | |||
| } | |||
| d[0] = dot[0]; | |||
| d[1] = dot[1]; | |||
| d[2] = dot[2]; | |||
| d[3] = dot[3]; | |||
| d[4] = dot[4]; | |||
| d[5] = dot[5]; | |||
| d[6] = dot[6]; | |||
| d[7] = dot[7]; | |||
| } | |||
| #endif | |||
| FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG ix,iy; | |||
| FLOAT _Complex result; | |||
| FLOAT dot[8] = { 0.0, 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0, 0.0 } ; | |||
| if ( n <= 0 ) | |||
| { | |||
| __real__ result = 0.0 ; | |||
| __imag__ result = 0.0 ; | |||
| return(result); | |||
| } | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| int n1 = n & -16; | |||
| if ( n1 ) | |||
| { | |||
| cdot_kernel_16(n1, x, y , dot ); | |||
| dot[0] += dot[2]; | |||
| dot[1] += dot[3]; | |||
| dot[4] += dot[6]; | |||
| dot[5] += dot[7]; | |||
| } | |||
| i = n1; | |||
| int j = i * 2; | |||
| while( i < n ) | |||
| { | |||
| dot[0] += x[j] * y[j] ; | |||
| dot[1] += x[j+1] * y[j+1] ; | |||
| dot[4] += x[j] * y[j+1] ; | |||
| dot[5] += x[j+1] * y[j] ; | |||
| j+=2; | |||
| i++ ; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| i=0; | |||
| ix=0; | |||
| iy=0; | |||
| inc_x <<= 1; | |||
| inc_y <<= 1; | |||
| while(i < n) | |||
| { | |||
| dot[0] += x[ix] * y[iy] ; | |||
| dot[1] += x[ix+1] * y[iy+1] ; | |||
| dot[4] += x[ix] * y[iy+1] ; | |||
| dot[5] += x[ix+1] * y[iy] ; | |||
| ix += inc_x ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| } | |||
| #if !defined(CONJ) | |||
| __real__ result = dot[0] - dot[1]; | |||
| __imag__ result = dot[4] + dot[5]; | |||
| #else | |||
| __real__ result = dot[0] + dot[1]; | |||
| __imag__ result = dot[4] - dot[5]; | |||
| #endif | |||
| return(result); | |||
| } | |||
| @@ -0,0 +1,196 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_16 1 | |||
| static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); | |||
| static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| { | |||
| BLASLONG register i = 0; | |||
| if ( n <=1024 ) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" | |||
| "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" | |||
| "vxorps %%xmm2, %%xmm2, %%xmm2 \n\t" | |||
| "vxorps %%xmm3, %%xmm3, %%xmm3 \n\t" | |||
| "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" | |||
| "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" | |||
| "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" | |||
| "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" | |||
| ".align 16 \n\t" | |||
| "1: \n\t" | |||
| "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x | |||
| "vmovups 16(%2,%0,4), %%xmm9 \n\t" // 2 * x | |||
| "vmovups (%3,%0,4), %%xmm12 \n\t" // 2 * y | |||
| "vmovups 16(%3,%0,4), %%xmm13 \n\t" // 2 * y | |||
| "vmovups 32(%2,%0,4), %%xmm10 \n\t" // 2 * x | |||
| "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 * x | |||
| "vmovups 32(%3,%0,4), %%xmm14 \n\t" // 2 * y | |||
| "vmovups 48(%3,%0,4), %%xmm15 \n\t" // 2 * y | |||
| "vfmaddps %%xmm0, %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i | |||
| "vfmaddps %%xmm1, %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i | |||
| "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" // exchange real and imag part | |||
| "vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t" | |||
| "vfmaddps %%xmm2, %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i | |||
| "vfmaddps %%xmm3, %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i | |||
| "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" | |||
| "vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t" | |||
| "vfmaddps %%xmm4, %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r | |||
| "addq $16 , %0 \n\t" | |||
| "vfmaddps %%xmm5, %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r | |||
| "vfmaddps %%xmm6, %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r | |||
| "subq $8 , %1 \n\t" | |||
| "vfmaddps %%xmm7, %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r | |||
| "jnz 1b \n\t" | |||
| "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" | |||
| "vaddps %%xmm2, %%xmm3, %%xmm2 \n\t" | |||
| "vaddps %%xmm0, %%xmm2, %%xmm0 \n\t" | |||
| "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" | |||
| "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" | |||
| "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" | |||
| "vmovups %%xmm0, (%4) \n\t" | |||
| "vmovups %%xmm4, 16(%4) \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| return; | |||
| } | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" | |||
| "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" | |||
| "vxorps %%xmm2, %%xmm2, %%xmm2 \n\t" | |||
| "vxorps %%xmm3, %%xmm3, %%xmm3 \n\t" | |||
| "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" | |||
| "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" | |||
| "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" | |||
| "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" | |||
| ".align 16 \n\t" | |||
| "1: \n\t" | |||
| "prefetcht0 384(%2,%0,4) \n\t" | |||
| "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x | |||
| "vmovups 16(%2,%0,4), %%xmm9 \n\t" // 2 * x | |||
| "prefetcht0 384(%3,%0,4) \n\t" | |||
| "vmovups (%3,%0,4), %%xmm12 \n\t" // 2 * y | |||
| "vmovups 16(%3,%0,4), %%xmm13 \n\t" // 2 * y | |||
| "vmovups 32(%2,%0,4), %%xmm10 \n\t" // 2 * x | |||
| "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 * x | |||
| "vmovups 32(%3,%0,4), %%xmm14 \n\t" // 2 * y | |||
| "vmovups 48(%3,%0,4), %%xmm15 \n\t" // 2 * y | |||
| "vfmaddps %%xmm0, %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i | |||
| "vfmaddps %%xmm1, %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i | |||
| "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" // exchange real and imag part | |||
| "vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t" | |||
| "vfmaddps %%xmm2, %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i | |||
| "vfmaddps %%xmm3, %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i | |||
| "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" | |||
| "vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t" | |||
| "vfmaddps %%xmm4, %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r | |||
| "addq $16 , %0 \n\t" | |||
| "vfmaddps %%xmm5, %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r | |||
| "vfmaddps %%xmm6, %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r | |||
| "subq $8 , %1 \n\t" | |||
| "vfmaddps %%xmm7, %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r | |||
| "jnz 1b \n\t" | |||
| "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" | |||
| "vaddps %%xmm2, %%xmm3, %%xmm2 \n\t" | |||
| "vaddps %%xmm0, %%xmm2, %%xmm0 \n\t" | |||
| "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" | |||
| "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" | |||
| "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" | |||
| "vmovups %%xmm0, (%4) \n\t" | |||
| "vmovups %%xmm4, 16(%4) \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,165 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2015, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <complex.h> | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) | |||
| #include "zdot_microk_bulldozer-2.c" | |||
| #elif defined(NEHALEM) | |||
| #include "zdot_microk_nehalem-2.c" | |||
| #elif defined(HASWELL) | |||
| #include "zdot_microk_haswell-2.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "zdot_microk_sandy-2.c" | |||
| #endif | |||
| #ifndef HAVE_KERNEL_8 | |||
| static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) __attribute__ ((noinline)); | |||
| static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) | |||
| { | |||
| BLASLONG register i = 0; | |||
| FLOAT dot[4] = { 0.0, 0.0, 0.0, 0.0 }; | |||
| BLASLONG j=0; | |||
| while( i < n ) | |||
| { | |||
| dot[0] += x[j] * y[j] ; | |||
| dot[1] += x[j+1] * y[j+1] ; | |||
| dot[2] += x[j] * y[j+1] ; | |||
| dot[3] += x[j+1] * y[j] ; | |||
| dot[0] += x[j+2] * y[j+2] ; | |||
| dot[1] += x[j+3] * y[j+3] ; | |||
| dot[2] += x[j+2] * y[j+3] ; | |||
| dot[3] += x[j+3] * y[j+2] ; | |||
| dot[0] += x[j+4] * y[j+4] ; | |||
| dot[1] += x[j+5] * y[j+5] ; | |||
| dot[2] += x[j+4] * y[j+5] ; | |||
| dot[3] += x[j+5] * y[j+4] ; | |||
| dot[0] += x[j+6] * y[j+6] ; | |||
| dot[1] += x[j+7] * y[j+7] ; | |||
| dot[2] += x[j+6] * y[j+7] ; | |||
| dot[3] += x[j+7] * y[j+6] ; | |||
| j+=8; | |||
| i+=4; | |||
| } | |||
| d[0] = dot[0]; | |||
| d[1] = dot[1]; | |||
| d[2] = dot[2]; | |||
| d[3] = dot[3]; | |||
| } | |||
| #endif | |||
| FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG ix,iy; | |||
| FLOAT _Complex result; | |||
| FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; | |||
| if ( n <= 0 ) | |||
| { | |||
| __real__ result = 0.0 ; | |||
| __imag__ result = 0.0 ; | |||
| return(result); | |||
| } | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| int n1 = n & -8; | |||
| if ( n1 ) | |||
| zdot_kernel_8(n1, x, y , dot ); | |||
| i = n1; | |||
| int j = i * 2; | |||
| while( i < n ) | |||
| { | |||
| dot[0] += x[j] * y[j] ; | |||
| dot[1] += x[j+1] * y[j+1] ; | |||
| dot[2] += x[j] * y[j+1] ; | |||
| dot[3] += x[j+1] * y[j] ; | |||
| j+=2; | |||
| i++ ; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| i=0; | |||
| ix=0; | |||
| iy=0; | |||
| inc_x <<= 1; | |||
| inc_y <<= 1; | |||
| while(i < n) | |||
| { | |||
| dot[0] += x[ix] * y[iy] ; | |||
| dot[1] += x[ix+1] * y[iy+1] ; | |||
| dot[2] += x[ix] * y[iy+1] ; | |||
| dot[3] += x[ix+1] * y[iy] ; | |||
| ix += inc_x ; | |||
| iy += inc_y ; | |||
| i++ ; | |||
| } | |||
| } | |||
| #if !defined(CONJ) | |||
| __real__ result = dot[0] - dot[1]; | |||
| __imag__ result = dot[2] + dot[3]; | |||
| #else | |||
| __real__ result = dot[0] + dot[1]; | |||
| __imag__ result = dot[2] - dot[3]; | |||
| #endif | |||
| return(result); | |||
| } | |||
| @@ -0,0 +1,115 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_8 1 | |||
| static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); | |||
| static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t" | |||
| "vxorpd %%xmm1, %%xmm1, %%xmm1 \n\t" | |||
| "vxorpd %%xmm2, %%xmm2, %%xmm2 \n\t" | |||
| "vxorpd %%xmm3, %%xmm3, %%xmm3 \n\t" | |||
| "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" | |||
| "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" | |||
| "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" | |||
| "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" | |||
| ".align 16 \n\t" | |||
| "1: \n\t" | |||
| "prefetcht0 512(%2,%0,8) \n\t" | |||
| "vmovups (%2,%0,8), %%xmm8 \n\t" // 1 * x | |||
| "vmovups 16(%2,%0,8), %%xmm9 \n\t" // 1 * x | |||
| "prefetcht0 512(%3,%0,8) \n\t" | |||
| "vmovups (%3,%0,8), %%xmm12 \n\t" // 1 * y | |||
| "vmovups 16(%3,%0,8), %%xmm13 \n\t" // 1 * y | |||
| "vmovups 32(%2,%0,8), %%xmm10 \n\t" // 1 * x | |||
| "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 * x | |||
| "vmovups 32(%3,%0,8), %%xmm14 \n\t" // 1 * y | |||
| "vmovups 48(%3,%0,8), %%xmm15 \n\t" // 1 * y | |||
| "vfmaddpd %%xmm0, %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i | |||
| "vfmaddpd %%xmm1, %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i | |||
| "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" | |||
| "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" | |||
| "vfmaddpd %%xmm2, %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i | |||
| "vfmaddpd %%xmm3, %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i | |||
| "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" | |||
| "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" | |||
| "vfmaddpd %%xmm4, %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r | |||
| "addq $8 , %0 \n\t" | |||
| "vfmaddpd %%xmm5, %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r | |||
| "vfmaddpd %%xmm6, %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r | |||
| "subq $4 , %1 \n\t" | |||
| "vfmaddpd %%xmm7, %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r | |||
| "jnz 1b \n\t" | |||
| "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" | |||
| "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" | |||
| "vaddpd %%xmm0, %%xmm2, %%xmm0 \n\t" | |||
| "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" | |||
| "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" | |||
| "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" | |||
| "vmovups %%xmm0, (%4) \n\t" | |||
| "vmovups %%xmm4, 16(%4) \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||