| @@ -3,7 +3,8 @@ CAXPYKERNEL = caxpy.c | |||||
| ZAXPYKERNEL = zaxpy.c | ZAXPYKERNEL = zaxpy.c | ||||
| SDOTKERNEL = sdot.c | SDOTKERNEL = sdot.c | ||||
| #DDOTKERNEL = ddot.c | |||||
| CDOTKERNEL = cdot.c | |||||
| ZDOTKERNEL = zdot.c | |||||
| DSYMV_U_KERNEL = dsymv_U.c | DSYMV_U_KERNEL = dsymv_U.c | ||||
| DSYMV_L_KERNEL = dsymv_L.c | DSYMV_L_KERNEL = dsymv_L.c | ||||
| @@ -26,11 +27,11 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||||
| SGEMMITCOPY = ../generic/gemm_tcopy_16.c | SGEMMITCOPY = ../generic/gemm_tcopy_16.c | ||||
| SGEMMONCOPY = gemm_ncopy_2_bulldozer.S | SGEMMONCOPY = gemm_ncopy_2_bulldozer.S | ||||
| SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S | SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S | ||||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | ||||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S | DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S | ||||
| DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S | DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S | ||||
| DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S | DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S | ||||
| @@ -40,6 +41,7 @@ DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S | CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S | ||||
| CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | CGEMMINCOPY = ../generic/zgemm_ncopy_4.c | ||||
| CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | CGEMMITCOPY = ../generic/zgemm_tcopy_4.c | ||||
| @@ -49,6 +51,7 @@ CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | ||||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | ||||
| ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S | ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S | ||||
| ZGEMMINCOPY = | ZGEMMINCOPY = | ||||
| ZGEMMITCOPY = | ZGEMMITCOPY = | ||||
| @@ -12,6 +12,9 @@ DGEMVNKERNEL = dgemv_n_bulldozer.S | |||||
| DGEMVTKERNEL = dgemv_t_bulldozer.S | DGEMVTKERNEL = dgemv_t_bulldozer.S | ||||
| SDOTKERNEL = sdot.c | SDOTKERNEL = sdot.c | ||||
| CDOTKERNEL = cdot.c | |||||
| ZDOTKERNEL = zdot.c | |||||
| DDOTKERNEL = ddot_bulldozer.S | DDOTKERNEL = ddot_bulldozer.S | ||||
| DCOPYKERNEL = dcopy_bulldozer.S | DCOPYKERNEL = dcopy_bulldozer.S | ||||
| @@ -3,7 +3,10 @@ CAXPYKERNEL = caxpy.c | |||||
| ZAXPYKERNEL = zaxpy.c | ZAXPYKERNEL = zaxpy.c | ||||
| SDOTKERNEL = sdot.c | SDOTKERNEL = sdot.c | ||||
| DDOTKERNEL = ddot.c | |||||
| DDOTKERNEL = ddot_bulldozer.S | |||||
| CDOTKERNEL = cdot.c | |||||
| ZDOTKERNEL = zdot.c | |||||
| DSYMV_U_KERNEL = dsymv_U.c | DSYMV_U_KERNEL = dsymv_U.c | ||||
| DSYMV_L_KERNEL = dsymv_L.c | DSYMV_L_KERNEL = dsymv_L.c | ||||
| @@ -0,0 +1,174 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <complex.h> | |||||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) | |||||
| #include "cdot_microk_bulldozer-2.c" | |||||
| #elif defined(NEHALEM) | |||||
| #include "cdot_microk_nehalem-2.c" | |||||
| #elif defined(HASWELL) | |||||
| #include "cdot_microk_haswell-2.c" | |||||
| #elif defined(SANDYBRIDGE) | |||||
| #include "cdot_microk_sandy-2.c" | |||||
| #endif | |||||
| #ifndef HAVE_KERNEL_16 | |||||
| static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) __attribute__ ((noinline)); | |||||
| static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) | |||||
| { | |||||
| BLASLONG register i = 0; | |||||
| FLOAT dot[8] = { 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; | |||||
| BLASLONG j=0; | |||||
| while( i < n ) | |||||
| { | |||||
| dot[0] += x[j] * y[j] ; | |||||
| dot[1] += x[j+1] * y[j+1] ; | |||||
| dot[4] += x[j] * y[j+1] ; | |||||
| dot[5] += x[j+1] * y[j] ; | |||||
| dot[2] += x[j+2] * y[j+2] ; | |||||
| dot[3] += x[j+3] * y[j+3] ; | |||||
| dot[6] += x[j+2] * y[j+3] ; | |||||
| dot[7] += x[j+3] * y[j+2] ; | |||||
| dot[0] += x[j+4] * y[j+4] ; | |||||
| dot[1] += x[j+5] * y[j+5] ; | |||||
| dot[4] += x[j+4] * y[j+5] ; | |||||
| dot[5] += x[j+5] * y[j+4] ; | |||||
| dot[2] += x[j+6] * y[j+6] ; | |||||
| dot[3] += x[j+7] * y[j+7] ; | |||||
| dot[6] += x[j+6] * y[j+7] ; | |||||
| dot[7] += x[j+7] * y[j+6] ; | |||||
| j+=8; | |||||
| i+=4; | |||||
| } | |||||
| d[0] = dot[0]; | |||||
| d[1] = dot[1]; | |||||
| d[2] = dot[2]; | |||||
| d[3] = dot[3]; | |||||
| d[4] = dot[4]; | |||||
| d[5] = dot[5]; | |||||
| d[6] = dot[6]; | |||||
| d[7] = dot[7]; | |||||
| } | |||||
| #endif | |||||
| FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| { | |||||
| BLASLONG i; | |||||
| BLASLONG ix,iy; | |||||
| FLOAT _Complex result; | |||||
| FLOAT dot[8] = { 0.0, 0.0, 0.0 , 0.0, 0.0, 0.0, 0.0, 0.0 } ; | |||||
| if ( n <= 0 ) | |||||
| { | |||||
| __real__ result = 0.0 ; | |||||
| __imag__ result = 0.0 ; | |||||
| return(result); | |||||
| } | |||||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||||
| { | |||||
| int n1 = n & -16; | |||||
| if ( n1 ) | |||||
| { | |||||
| cdot_kernel_16(n1, x, y , dot ); | |||||
| dot[0] += dot[2]; | |||||
| dot[1] += dot[3]; | |||||
| dot[4] += dot[6]; | |||||
| dot[5] += dot[7]; | |||||
| } | |||||
| i = n1; | |||||
| int j = i * 2; | |||||
| while( i < n ) | |||||
| { | |||||
| dot[0] += x[j] * y[j] ; | |||||
| dot[1] += x[j+1] * y[j+1] ; | |||||
| dot[4] += x[j] * y[j+1] ; | |||||
| dot[5] += x[j+1] * y[j] ; | |||||
| j+=2; | |||||
| i++ ; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| i=0; | |||||
| ix=0; | |||||
| iy=0; | |||||
| inc_x <<= 1; | |||||
| inc_y <<= 1; | |||||
| while(i < n) | |||||
| { | |||||
| dot[0] += x[ix] * y[iy] ; | |||||
| dot[1] += x[ix+1] * y[iy+1] ; | |||||
| dot[4] += x[ix] * y[iy+1] ; | |||||
| dot[5] += x[ix+1] * y[iy] ; | |||||
| ix += inc_x ; | |||||
| iy += inc_y ; | |||||
| i++ ; | |||||
| } | |||||
| } | |||||
| #if !defined(CONJ) | |||||
| __real__ result = dot[0] - dot[1]; | |||||
| __imag__ result = dot[4] + dot[5]; | |||||
| #else | |||||
| __real__ result = dot[0] + dot[1]; | |||||
| __imag__ result = dot[4] - dot[5]; | |||||
| #endif | |||||
| return(result); | |||||
| } | |||||
| @@ -0,0 +1,196 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2014, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define HAVE_KERNEL_16 1 | |||||
| static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); | |||||
| static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||||
| { | |||||
| BLASLONG register i = 0; | |||||
| if ( n <=1024 ) | |||||
| { | |||||
| __asm__ __volatile__ | |||||
| ( | |||||
| "vzeroupper \n\t" | |||||
| "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" | |||||
| "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" | |||||
| "vxorps %%xmm2, %%xmm2, %%xmm2 \n\t" | |||||
| "vxorps %%xmm3, %%xmm3, %%xmm3 \n\t" | |||||
| "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" | |||||
| "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" | |||||
| "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" | |||||
| "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x | |||||
| "vmovups 16(%2,%0,4), %%xmm9 \n\t" // 2 * x | |||||
| "vmovups (%3,%0,4), %%xmm12 \n\t" // 2 * y | |||||
| "vmovups 16(%3,%0,4), %%xmm13 \n\t" // 2 * y | |||||
| "vmovups 32(%2,%0,4), %%xmm10 \n\t" // 2 * x | |||||
| "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 * x | |||||
| "vmovups 32(%3,%0,4), %%xmm14 \n\t" // 2 * y | |||||
| "vmovups 48(%3,%0,4), %%xmm15 \n\t" // 2 * y | |||||
| "vfmaddps %%xmm0, %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i | |||||
| "vfmaddps %%xmm1, %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i | |||||
| "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" // exchange real and imag part | |||||
| "vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t" | |||||
| "vfmaddps %%xmm2, %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i | |||||
| "vfmaddps %%xmm3, %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i | |||||
| "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" | |||||
| "vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t" | |||||
| "vfmaddps %%xmm4, %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r | |||||
| "addq $16 , %0 \n\t" | |||||
| "vfmaddps %%xmm5, %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r | |||||
| "vfmaddps %%xmm6, %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r | |||||
| "subq $8 , %1 \n\t" | |||||
| "vfmaddps %%xmm7, %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r | |||||
| "jnz 1b \n\t" | |||||
| "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" | |||||
| "vaddps %%xmm2, %%xmm3, %%xmm2 \n\t" | |||||
| "vaddps %%xmm0, %%xmm2, %%xmm0 \n\t" | |||||
| "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" | |||||
| "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" | |||||
| "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" | |||||
| "vmovups %%xmm0, (%4) \n\t" | |||||
| "vmovups %%xmm4, 16(%4) \n\t" | |||||
| "vzeroupper \n\t" | |||||
| : | |||||
| : | |||||
| "r" (i), // 0 | |||||
| "r" (n), // 1 | |||||
| "r" (x), // 2 | |||||
| "r" (y), // 3 | |||||
| "r" (dot) // 4 | |||||
| : "cc", | |||||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||||
| "memory" | |||||
| ); | |||||
| return; | |||||
| } | |||||
| __asm__ __volatile__ | |||||
| ( | |||||
| "vzeroupper \n\t" | |||||
| "vxorps %%xmm0, %%xmm0, %%xmm0 \n\t" | |||||
| "vxorps %%xmm1, %%xmm1, %%xmm1 \n\t" | |||||
| "vxorps %%xmm2, %%xmm2, %%xmm2 \n\t" | |||||
| "vxorps %%xmm3, %%xmm3, %%xmm3 \n\t" | |||||
| "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" | |||||
| "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" | |||||
| "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" | |||||
| "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "prefetcht0 384(%2,%0,4) \n\t" | |||||
| "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x | |||||
| "vmovups 16(%2,%0,4), %%xmm9 \n\t" // 2 * x | |||||
| "prefetcht0 384(%3,%0,4) \n\t" | |||||
| "vmovups (%3,%0,4), %%xmm12 \n\t" // 2 * y | |||||
| "vmovups 16(%3,%0,4), %%xmm13 \n\t" // 2 * y | |||||
| "vmovups 32(%2,%0,4), %%xmm10 \n\t" // 2 * x | |||||
| "vmovups 48(%2,%0,4), %%xmm11 \n\t" // 2 * x | |||||
| "vmovups 32(%3,%0,4), %%xmm14 \n\t" // 2 * y | |||||
| "vmovups 48(%3,%0,4), %%xmm15 \n\t" // 2 * y | |||||
| "vfmaddps %%xmm0, %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i | |||||
| "vfmaddps %%xmm1, %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i | |||||
| "vpermilps $0xb1 , %%xmm12, %%xmm12 \n\t" // exchange real and imag part | |||||
| "vpermilps $0xb1 , %%xmm13, %%xmm13 \n\t" | |||||
| "vfmaddps %%xmm2, %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i | |||||
| "vfmaddps %%xmm3, %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i | |||||
| "vpermilps $0xb1 , %%xmm14, %%xmm14 \n\t" | |||||
| "vpermilps $0xb1 , %%xmm15, %%xmm15 \n\t" | |||||
| "vfmaddps %%xmm4, %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r | |||||
| "addq $16 , %0 \n\t" | |||||
| "vfmaddps %%xmm5, %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r | |||||
| "vfmaddps %%xmm6, %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r | |||||
| "subq $8 , %1 \n\t" | |||||
| "vfmaddps %%xmm7, %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r | |||||
| "jnz 1b \n\t" | |||||
| "vaddps %%xmm0, %%xmm1, %%xmm0 \n\t" | |||||
| "vaddps %%xmm2, %%xmm3, %%xmm2 \n\t" | |||||
| "vaddps %%xmm0, %%xmm2, %%xmm0 \n\t" | |||||
| "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" | |||||
| "vaddps %%xmm6, %%xmm7, %%xmm6 \n\t" | |||||
| "vaddps %%xmm4, %%xmm6, %%xmm4 \n\t" | |||||
| "vmovups %%xmm0, (%4) \n\t" | |||||
| "vmovups %%xmm4, 16(%4) \n\t" | |||||
| "vzeroupper \n\t" | |||||
| : | |||||
| : | |||||
| "r" (i), // 0 | |||||
| "r" (n), // 1 | |||||
| "r" (x), // 2 | |||||
| "r" (y), // 3 | |||||
| "r" (dot) // 4 | |||||
| : "cc", | |||||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||||
| "memory" | |||||
| ); | |||||
| } | |||||
| @@ -0,0 +1,165 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2015, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <complex.h> | |||||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) | |||||
| #include "zdot_microk_bulldozer-2.c" | |||||
| #elif defined(NEHALEM) | |||||
| #include "zdot_microk_nehalem-2.c" | |||||
| #elif defined(HASWELL) | |||||
| #include "zdot_microk_haswell-2.c" | |||||
| #elif defined(SANDYBRIDGE) | |||||
| #include "zdot_microk_sandy-2.c" | |||||
| #endif | |||||
| #ifndef HAVE_KERNEL_8 | |||||
| static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) __attribute__ ((noinline)); | |||||
| static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) | |||||
| { | |||||
| BLASLONG register i = 0; | |||||
| FLOAT dot[4] = { 0.0, 0.0, 0.0, 0.0 }; | |||||
| BLASLONG j=0; | |||||
| while( i < n ) | |||||
| { | |||||
| dot[0] += x[j] * y[j] ; | |||||
| dot[1] += x[j+1] * y[j+1] ; | |||||
| dot[2] += x[j] * y[j+1] ; | |||||
| dot[3] += x[j+1] * y[j] ; | |||||
| dot[0] += x[j+2] * y[j+2] ; | |||||
| dot[1] += x[j+3] * y[j+3] ; | |||||
| dot[2] += x[j+2] * y[j+3] ; | |||||
| dot[3] += x[j+3] * y[j+2] ; | |||||
| dot[0] += x[j+4] * y[j+4] ; | |||||
| dot[1] += x[j+5] * y[j+5] ; | |||||
| dot[2] += x[j+4] * y[j+5] ; | |||||
| dot[3] += x[j+5] * y[j+4] ; | |||||
| dot[0] += x[j+6] * y[j+6] ; | |||||
| dot[1] += x[j+7] * y[j+7] ; | |||||
| dot[2] += x[j+6] * y[j+7] ; | |||||
| dot[3] += x[j+7] * y[j+6] ; | |||||
| j+=8; | |||||
| i+=4; | |||||
| } | |||||
| d[0] = dot[0]; | |||||
| d[1] = dot[1]; | |||||
| d[2] = dot[2]; | |||||
| d[3] = dot[3]; | |||||
| } | |||||
| #endif | |||||
| FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| { | |||||
| BLASLONG i; | |||||
| BLASLONG ix,iy; | |||||
| FLOAT _Complex result; | |||||
| FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; | |||||
| if ( n <= 0 ) | |||||
| { | |||||
| __real__ result = 0.0 ; | |||||
| __imag__ result = 0.0 ; | |||||
| return(result); | |||||
| } | |||||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||||
| { | |||||
| int n1 = n & -8; | |||||
| if ( n1 ) | |||||
| zdot_kernel_8(n1, x, y , dot ); | |||||
| i = n1; | |||||
| int j = i * 2; | |||||
| while( i < n ) | |||||
| { | |||||
| dot[0] += x[j] * y[j] ; | |||||
| dot[1] += x[j+1] * y[j+1] ; | |||||
| dot[2] += x[j] * y[j+1] ; | |||||
| dot[3] += x[j+1] * y[j] ; | |||||
| j+=2; | |||||
| i++ ; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| i=0; | |||||
| ix=0; | |||||
| iy=0; | |||||
| inc_x <<= 1; | |||||
| inc_y <<= 1; | |||||
| while(i < n) | |||||
| { | |||||
| dot[0] += x[ix] * y[iy] ; | |||||
| dot[1] += x[ix+1] * y[iy+1] ; | |||||
| dot[2] += x[ix] * y[iy+1] ; | |||||
| dot[3] += x[ix+1] * y[iy] ; | |||||
| ix += inc_x ; | |||||
| iy += inc_y ; | |||||
| i++ ; | |||||
| } | |||||
| } | |||||
| #if !defined(CONJ) | |||||
| __real__ result = dot[0] - dot[1]; | |||||
| __imag__ result = dot[2] + dot[3]; | |||||
| #else | |||||
| __real__ result = dot[0] + dot[1]; | |||||
| __imag__ result = dot[2] - dot[3]; | |||||
| #endif | |||||
| return(result); | |||||
| } | |||||
| @@ -0,0 +1,115 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2014, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| #define HAVE_KERNEL_8 1 | |||||
| static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); | |||||
| static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||||
| { | |||||
| BLASLONG register i = 0; | |||||
| __asm__ __volatile__ | |||||
| ( | |||||
| "vzeroupper \n\t" | |||||
| "vxorpd %%xmm0, %%xmm0, %%xmm0 \n\t" | |||||
| "vxorpd %%xmm1, %%xmm1, %%xmm1 \n\t" | |||||
| "vxorpd %%xmm2, %%xmm2, %%xmm2 \n\t" | |||||
| "vxorpd %%xmm3, %%xmm3, %%xmm3 \n\t" | |||||
| "vxorpd %%xmm4, %%xmm4, %%xmm4 \n\t" | |||||
| "vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" | |||||
| "vxorpd %%xmm6, %%xmm6, %%xmm6 \n\t" | |||||
| "vxorpd %%xmm7, %%xmm7, %%xmm7 \n\t" | |||||
| ".align 16 \n\t" | |||||
| "1: \n\t" | |||||
| "prefetcht0 512(%2,%0,8) \n\t" | |||||
| "vmovups (%2,%0,8), %%xmm8 \n\t" // 1 * x | |||||
| "vmovups 16(%2,%0,8), %%xmm9 \n\t" // 1 * x | |||||
| "prefetcht0 512(%3,%0,8) \n\t" | |||||
| "vmovups (%3,%0,8), %%xmm12 \n\t" // 1 * y | |||||
| "vmovups 16(%3,%0,8), %%xmm13 \n\t" // 1 * y | |||||
| "vmovups 32(%2,%0,8), %%xmm10 \n\t" // 1 * x | |||||
| "vmovups 48(%2,%0,8), %%xmm11 \n\t" // 1 * x | |||||
| "vmovups 32(%3,%0,8), %%xmm14 \n\t" // 1 * y | |||||
| "vmovups 48(%3,%0,8), %%xmm15 \n\t" // 1 * y | |||||
| "vfmaddpd %%xmm0, %%xmm8 , %%xmm12, %%xmm0 \n\t" // x_r * y_r, x_i * y_i | |||||
| "vfmaddpd %%xmm1, %%xmm9 , %%xmm13, %%xmm1 \n\t" // x_r * y_r, x_i * y_i | |||||
| "vpermilpd $0x1 , %%xmm12, %%xmm12 \n\t" | |||||
| "vpermilpd $0x1 , %%xmm13, %%xmm13 \n\t" | |||||
| "vfmaddpd %%xmm2, %%xmm10, %%xmm14, %%xmm2 \n\t" // x_r * y_r, x_i * y_i | |||||
| "vfmaddpd %%xmm3, %%xmm11, %%xmm15, %%xmm3 \n\t" // x_r * y_r, x_i * y_i | |||||
| "vpermilpd $0x1 , %%xmm14, %%xmm14 \n\t" | |||||
| "vpermilpd $0x1 , %%xmm15, %%xmm15 \n\t" | |||||
| "vfmaddpd %%xmm4, %%xmm8 , %%xmm12, %%xmm4 \n\t" // x_r * y_i, x_i * y_r | |||||
| "addq $8 , %0 \n\t" | |||||
| "vfmaddpd %%xmm5, %%xmm9 , %%xmm13, %%xmm5 \n\t" // x_r * y_i, x_i * y_r | |||||
| "vfmaddpd %%xmm6, %%xmm10, %%xmm14, %%xmm6 \n\t" // x_r * y_i, x_i * y_r | |||||
| "subq $4 , %1 \n\t" | |||||
| "vfmaddpd %%xmm7, %%xmm11, %%xmm15, %%xmm7 \n\t" // x_r * y_i, x_i * y_r | |||||
| "jnz 1b \n\t" | |||||
| "vaddpd %%xmm0, %%xmm1, %%xmm0 \n\t" | |||||
| "vaddpd %%xmm2, %%xmm3, %%xmm2 \n\t" | |||||
| "vaddpd %%xmm0, %%xmm2, %%xmm0 \n\t" | |||||
| "vaddpd %%xmm4, %%xmm5, %%xmm4 \n\t" | |||||
| "vaddpd %%xmm6, %%xmm7, %%xmm6 \n\t" | |||||
| "vaddpd %%xmm4, %%xmm6, %%xmm4 \n\t" | |||||
| "vmovups %%xmm0, (%4) \n\t" | |||||
| "vmovups %%xmm4, 16(%4) \n\t" | |||||
| "vzeroupper \n\t" | |||||
| : | |||||
| : | |||||
| "r" (i), // 0 | |||||
| "r" (n), // 1 | |||||
| "r" (x), // 2 | |||||
| "r" (y), // 3 | |||||
| "r" (dot) // 4 | |||||
| : "cc", | |||||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||||
| "memory" | |||||
| ); | |||||
| } | |||||