From e50a9330374dba70d406d6be37ed65f46214621a Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Tue, 12 May 2015 12:28:44 +0200 Subject: [PATCH 1/6] added optimized dscal kernel for bulldozer --- kernel/x86_64/KERNEL.BULLDOZER | 2 + kernel/x86_64/dscal.c | 143 ++++++++++++++++ kernel/x86_64/dscal_microk_bulldozer-2.c | 206 +++++++++++++++++++++++ 3 files changed, 351 insertions(+) create mode 100644 kernel/x86_64/dscal.c create mode 100644 kernel/x86_64/dscal_microk_bulldozer-2.c diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index ef1108646..cd1665026 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -1,3 +1,5 @@ +DSCALKERNEL = dscal.c + DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c ZAXPYKERNEL = zaxpy.c diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c new file mode 100644 index 000000000..99001115b --- /dev/null +++ b/kernel/x86_64/dscal.c @@ -0,0 +1,143 @@ +/*************************************************************************** +Copyright (c) 2013 - 2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#if defined(BULLDOZER) +#include "dscal_microk_bulldozer-2.c" +#endif + + +#if !defined(HAVE_KERNEL_8) + +void dscal_kernel_8( BLASLONG n, FLOAT *da , FLOAT *x ) +{ + + BLASLONG i; + FLOAT alpha = *da; + + for( i=0; i 0 ) + { + if ( da == 0.0 ) + dscal_kernel_8_zero(n1 , &da , x); + else + dscal_kernel_8(n1 , &da , x); + } + + if ( da == 0.0 ) + { + for ( i=n1 ; i> 4 ; + BLASLONG n2 = n & 8 ; + + __asm__ __volatile__ + ( + "vmovddup (%2), %%xmm0 \n\t" // alpha + + "addq $128, %1 \n\t" + + "cmpq $0, %0 \n\t" + "je 4f \n\t" + + "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t" + "vmulpd -112(%1), %%xmm0, %%xmm5 \n\t" + "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t" + "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t" + + "vmulpd -64(%1), %%xmm0, %%xmm8 \n\t" + "vmulpd -48(%1), %%xmm0, %%xmm9 \n\t" + "vmulpd -32(%1), %%xmm0, %%xmm10 \n\t" + "vmulpd -16(%1), %%xmm0, %%xmm11 \n\t" + + "subq $1 , %0 \n\t" + "jz 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + "prefetcht0 256(%1) \n\t" + + "vmovups %%xmm4 ,-128(%1) \n\t" + "vmovups %%xmm5 ,-112(%1) \n\t" + "vmulpd 0(%1), %%xmm0, %%xmm4 \n\t" + "vmovups %%xmm6 , -96(%1) \n\t" + "vmulpd 16(%1), %%xmm0, 
%%xmm5 \n\t" + "vmovups %%xmm7 , -80(%1) \n\t" + "vmulpd 32(%1), %%xmm0, %%xmm6 \n\t" + + "prefetcht0 320(%1) \n\t" + + "vmovups %%xmm8 , -64(%1) \n\t" + "vmulpd 48(%1), %%xmm0, %%xmm7 \n\t" + "vmovups %%xmm9 , -48(%1) \n\t" + "vmulpd 64(%1), %%xmm0, %%xmm8 \n\t" + "vmovups %%xmm10 , -32(%1) \n\t" + "vmulpd 80(%1), %%xmm0, %%xmm9 \n\t" + "vmovups %%xmm11 , -16(%1) \n\t" + + "vmulpd 96(%1), %%xmm0, %%xmm10 \n\t" + "vmulpd 112(%1), %%xmm0, %%xmm11 \n\t" + + + "addq $128, %1 \n\t" + "subq $1 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "vmovups %%xmm4 ,-128(%1) \n\t" + "vmovups %%xmm5 ,-112(%1) \n\t" + "vmovups %%xmm6 , -96(%1) \n\t" + "vmovups %%xmm7 , -80(%1) \n\t" + + "vmovups %%xmm8 , -64(%1) \n\t" + "vmovups %%xmm9 , -48(%1) \n\t" + "vmovups %%xmm10 , -32(%1) \n\t" + "vmovups %%xmm11 , -16(%1) \n\t" + + "addq $128, %1 \n\t" + + "4: \n\t" + + "cmpq $8 ,%3 \n\t" + "jne 5f \n\t" + + "vmulpd -128(%1), %%xmm0, %%xmm4 \n\t" + "vmulpd -112(%1), %%xmm0, %%xmm5 \n\t" + "vmulpd -96(%1), %%xmm0, %%xmm6 \n\t" + "vmulpd -80(%1), %%xmm0, %%xmm7 \n\t" + + "vmovups %%xmm4 ,-128(%1) \n\t" + "vmovups %%xmm5 ,-112(%1) \n\t" + "vmovups %%xmm6 , -96(%1) \n\t" + "vmovups %%xmm7 , -80(%1) \n\t" + + "5: \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "r" (x), // 1 + "r" (alpha), // 2 + "r" (n2) // 3 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + +static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline)); + +static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + + + BLASLONG n1 = n >> 4 ; + BLASLONG n2 = n & 8 ; + + __asm__ __volatile__ + ( + "vxorpd %%xmm0, %%xmm0 , %%xmm0 \n\t" + + "addq $128, %1 \n\t" + + "cmpq $0, %0 \n\t" + "je 2f \n\t" + + ".align 16 \n\t" + "1: \n\t" + + "vmovups %%xmm0 ,-128(%1) \n\t" + "vmovups %%xmm0 ,-112(%1) \n\t" + "vmovups %%xmm0 , -96(%1) \n\t" + 
"vmovups %%xmm0 , -80(%1) \n\t" + + "vmovups %%xmm0 , -64(%1) \n\t" + "vmovups %%xmm0 , -48(%1) \n\t" + "vmovups %%xmm0 , -32(%1) \n\t" + "vmovups %%xmm0 , -16(%1) \n\t" + + "addq $128, %1 \n\t" + "subq $1 , %0 \n\t" + "jnz 1b \n\t" + + "2: \n\t" + + "cmpq $8 ,%3 \n\t" + "jne 4f \n\t" + + "vmovups %%xmm0 ,-128(%1) \n\t" + "vmovups %%xmm0 ,-112(%1) \n\t" + "vmovups %%xmm0 , -96(%1) \n\t" + "vmovups %%xmm0 , -80(%1) \n\t" + + "4: \n\t" + + "vzeroupper \n\t" + + : + : + "r" (n1), // 0 + "r" (x), // 1 + "r" (alpha), // 2 + "r" (n2) // 3 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", + "%xmm8", "%xmm9", "%xmm10", "%xmm11", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From 7aee9139910a33a53718bd2c43db101caceefebe Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Tue, 12 May 2015 16:27:43 +0200 Subject: [PATCH 2/6] added optimized dscal kernel for sandybridge --- kernel/x86_64/KERNEL.SANDYBRIDGE | 2 + kernel/x86_64/dscal.c | 2 + kernel/x86_64/dscal_microk_sandy-2.c | 206 +++++++++++++++++++++++++++ 3 files changed, 210 insertions(+) create mode 100644 kernel/x86_64/dscal_microk_sandy-2.c diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE index 129d7e5c4..ea81979ac 100644 --- a/kernel/x86_64/KERNEL.SANDYBRIDGE +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -1,3 +1,5 @@ +DSCALKERNEL = dscal.c + SGERKERNEL = sger.c DGERKERNEL = dger.c diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index 99001115b..be486a48e 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -29,6 +29,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(BULLDOZER) #include "dscal_microk_bulldozer-2.c" +#elif defined(SANDYBRIDGE) +#include "dscal_microk_sandy-2.c" #endif diff --git a/kernel/x86_64/dscal_microk_sandy-2.c b/kernel/x86_64/dscal_microk_sandy-2.c new file mode 100644 index 000000000..f5bf5932f --- /dev/null +++ b/kernel/x86_64/dscal_microk_sandy-2.c @@ -0,0 +1,206 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/
+
+#define HAVE_KERNEL_8 1
+// dscal_kernel_8: x[i] *= *alpha, in place. The asm runs (n >> 4) passes over 16 doubles each, then one 8-double tail when (n & 8) is set.
+static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline));
+// Elements counted by (n & 7) are never touched here. NOTE(review): presumably the caller passes n as a multiple of 8 and handles the rest -- confirm in dscal.c.
+static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
+{
+	// n1 counts the 128-byte (16-double) loop passes; n2 is 8 when one extra 64-byte group follows them.
+	// The asm decrements n1 and advances x, so both are bound as read-write ("+r") operands rather than inputs.
+	BLASLONG n1 = n >> 4 ;
+	BLASLONG n2 = n & 8 ;
+
+	__asm__ __volatile__
+	(
+	"vmovddup (%2), %%xmm0 \n\t" // alpha
+	// bias x by +128 so the group being processed is addressed as -128(%1) .. -16(%1)
+	"addq $128, %1 \n\t"
+
+	"cmpq $0, %0 \n\t"
+	"je 4f \n\t"
+	// prologue: products of the first 16 doubles go to xmm4..xmm11
+	"vmulpd -128(%1), %%xmm0, %%xmm4 \n\t"
+	"vmulpd -112(%1), %%xmm0, %%xmm5 \n\t"
+	"vmulpd -96(%1), %%xmm0, %%xmm6 \n\t"
+	"vmulpd -80(%1), %%xmm0, %%xmm7 \n\t"
+
+	"vmulpd -64(%1), %%xmm0, %%xmm8 \n\t"
+	"vmulpd -48(%1), %%xmm0, %%xmm9 \n\t"
+	"vmulpd -32(%1), %%xmm0, %%xmm10 \n\t"
+	"vmulpd -16(%1), %%xmm0, %%xmm11 \n\t"
+
+	"subq $1 , %0 \n\t"
+	"jz 2f \n\t"
+	// main loop: store the previous pass's products while multiplying the next 16 doubles
+	".align 16 \n\t"
+	"1: \n\t"
+	"prefetcht0 640(%1) \n\t"
+
+	"vmovups %%xmm4 ,-128(%1) \n\t"
+	"vmovups %%xmm5 ,-112(%1) \n\t"
+	"vmulpd 0(%1), %%xmm0, %%xmm4 \n\t"
+	"vmovups %%xmm6 , -96(%1) \n\t"
+	"vmulpd 16(%1), %%xmm0, %%xmm5 \n\t"
+	"vmovups %%xmm7 , -80(%1) \n\t"
+	"vmulpd 32(%1), %%xmm0, %%xmm6 \n\t"
+
+	"prefetcht0 704(%1) \n\t"
+
+	"vmovups %%xmm8 , -64(%1) \n\t"
+	"vmulpd 48(%1), %%xmm0, %%xmm7 \n\t"
+	"vmovups %%xmm9 , -48(%1) \n\t"
+	"vmulpd 64(%1), %%xmm0, %%xmm8 \n\t"
+	"vmovups %%xmm10 , -32(%1) \n\t"
+	"vmulpd 80(%1), %%xmm0, %%xmm9 \n\t"
+	"vmovups %%xmm11 , -16(%1) \n\t"
+
+	"vmulpd 96(%1), %%xmm0, %%xmm10 \n\t"
+	"vmulpd 112(%1), %%xmm0, %%xmm11 \n\t"
+
+
+	"addq $128, %1 \n\t"
+	"subq $1 , %0 \n\t"
+	"jnz 1b \n\t"
+
+	"2: \n\t"
+	// epilogue: store the products still held in xmm4..xmm11
+	"vmovups %%xmm4 ,-128(%1) \n\t"
+	"vmovups %%xmm5 ,-112(%1) \n\t"
+	"vmovups %%xmm6 , -96(%1) \n\t"
+	"vmovups %%xmm7 , -80(%1) \n\t"
+
+	"vmovups %%xmm8 , -64(%1) \n\t"
+	"vmovups %%xmm9 , -48(%1) \n\t"
+	"vmovups %%xmm10 , -32(%1) \n\t"
+	"vmovups %%xmm11 , -16(%1) \n\t"
+
+	"addq $128, %1 \n\t"
+
+	"4: \n\t"
+	// tail: one more group of 8 doubles when n2 == 8
+	"cmpq $8 ,%3 \n\t"
+	"jne 5f \n\t"
+
+	"vmulpd -128(%1), %%xmm0, %%xmm4 \n\t"
+	"vmulpd -112(%1), %%xmm0, %%xmm5 \n\t"
+	"vmulpd -96(%1), %%xmm0, %%xmm6 \n\t"
+	"vmulpd -80(%1), %%xmm0, %%xmm7 \n\t"
+
+	"vmovups %%xmm4 ,-128(%1) \n\t"
+	"vmovups %%xmm5 ,-112(%1) \n\t"
+	"vmovups %%xmm6 , -96(%1) \n\t"
+	"vmovups %%xmm7 , -80(%1) \n\t"
+
+	"5: \n\t"
+
+	"vzeroupper \n\t"
+
+	:
+	  "+r" (n1), // 0: pass counter, decremented by the asm
+	  "+r" (x) // 1: data cursor, advanced by the asm
+	:
+	  "r" (alpha), // 2
+	  "r" (n2) // 3
+	: "cc",
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+	  "memory"
+	);
+
+}
+
+// dscal_kernel_8_zero: store 0.0 over the same element ranges dscal_kernel_8 would scale; x is never read.
+static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline));
+// alpha is passed to the asm as operand 2, but the template never references it.
+static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
+{
+	// n1 counts the 128-byte loop passes; n2 is 8 when one extra 64-byte group follows them.
+	// The asm decrements n1 and advances x, so both are bound as read-write ("+r") operands rather than inputs.
+	BLASLONG n1 = n >> 4 ;
+	BLASLONG n2 = n & 8 ;
+
+	__asm__ __volatile__
+	(
+	"vxorpd %%xmm0, %%xmm0 , %%xmm0 \n\t"
+	// bias x by +128 so each group is addressed as -128(%1) .. -16(%1)
+	"addq $128, %1 \n\t"
+
+	"cmpq $0, %0 \n\t"
+	"je 2f \n\t"
+
+	".align 16 \n\t"
+	"1: \n\t"
+
+	"vmovups %%xmm0 ,-128(%1) \n\t"
+	"vmovups %%xmm0 ,-112(%1) \n\t"
+	"vmovups %%xmm0 , -96(%1) \n\t"
+	"vmovups %%xmm0 , -80(%1) \n\t"
+
+	"vmovups %%xmm0 , -64(%1) \n\t"
+	"vmovups %%xmm0 , -48(%1) \n\t"
+	"vmovups %%xmm0 , -32(%1) \n\t"
+	"vmovups %%xmm0 , -16(%1) \n\t"
+
+	"addq $128, %1 \n\t"
+	"subq $1 , %0 \n\t"
+	"jnz 1b \n\t"
+
+	"2: \n\t"
+	// tail: one more group of 8 doubles when n2 == 8
+	"cmpq $8 ,%3 \n\t"
+	"jne 4f \n\t"
+
+	"vmovups %%xmm0 ,-128(%1) \n\t"
+	"vmovups %%xmm0 ,-112(%1) \n\t"
+	"vmovups %%xmm0 , -96(%1) \n\t"
+	"vmovups %%xmm0 , -80(%1) \n\t"
+
+	"4: \n\t"
+
+	"vzeroupper \n\t"
+
+	:
+	  "+r" (n1), // 0: pass counter, decremented by the asm
+	  "+r" (x) // 1: data cursor, advanced by the asm
+	:
+	  "r" (alpha), // 2
+	  "r" (n2) // 3
+	: "cc",
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+	  "memory"
+	);
+
+}
+
+
From 02e772c7e42c24fd84169787aca88eb257a535d7 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Tue, 12 May 2015 17:19:58 +0200 Subject: [PATCH 3/6] added optimized dscal kernel for haswell --- 
kernel/x86_64/KERNEL.HASWELL | 2 + kernel/x86_64/dscal.c | 2 + kernel/x86_64/dscal_microk_haswell-2.c | 206 +++++++++++++++++++++++++ 3 files changed, 210 insertions(+) create mode 100644 kernel/x86_64/dscal_microk_haswell-2.c diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL index 6849b05d9..188c51bf2 100644 --- a/kernel/x86_64/KERNEL.HASWELL +++ b/kernel/x86_64/KERNEL.HASWELL @@ -1,3 +1,5 @@ +DSCALKERNEL = dscal.c + SGEMVNKERNEL = sgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index be486a48e..66a04ba8f 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -31,6 +31,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "dscal_microk_bulldozer-2.c" #elif defined(SANDYBRIDGE) #include "dscal_microk_sandy-2.c" +#elif defined(HASWELL) +#include "dscal_microk_haswell-2.c" #endif diff --git a/kernel/x86_64/dscal_microk_haswell-2.c b/kernel/x86_64/dscal_microk_haswell-2.c new file mode 100644 index 000000000..07a9c804c --- /dev/null +++ b/kernel/x86_64/dscal_microk_haswell-2.c @@ -0,0 +1,206 @@ +/*************************************************************************** +Copyright (c) 2014-2015, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define HAVE_KERNEL_8 1
+// dscal_kernel_8: x[i] *= *alpha, in place. The asm runs (n >> 4) passes over 16 doubles each, then one 8-double tail when (n & 8) is set.
+static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline));
+// Elements counted by (n & 7) are never touched here. NOTE(review): presumably the caller passes n as a multiple of 8 and handles the rest -- confirm in dscal.c.
+static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
+{
+	// n1 counts the 128-byte (16-double) loop passes; n2 is 8 when one extra 64-byte group follows them.
+	// The asm decrements n1 and advances x, so both are bound as read-write ("+r") operands rather than inputs.
+	BLASLONG n1 = n >> 4 ;
+	BLASLONG n2 = n & 8 ;
+
+	__asm__ __volatile__
+	(
+	"vmovddup (%2), %%xmm0 \n\t" // alpha
+	// bias x by +128 so the group being processed is addressed as -128(%1) .. -16(%1)
+	"addq $128, %1 \n\t"
+
+	"cmpq $0, %0 \n\t"
+	"je 4f \n\t"
+	// prologue: products of the first 16 doubles go to xmm4..xmm11
+	"vmulpd -128(%1), %%xmm0, %%xmm4 \n\t"
+	"vmulpd -112(%1), %%xmm0, %%xmm5 \n\t"
+	"vmulpd -96(%1), %%xmm0, %%xmm6 \n\t"
+	"vmulpd -80(%1), %%xmm0, %%xmm7 \n\t"
+
+	"vmulpd -64(%1), %%xmm0, %%xmm8 \n\t"
+	"vmulpd -48(%1), %%xmm0, %%xmm9 \n\t"
+	"vmulpd -32(%1), %%xmm0, %%xmm10 \n\t"
+	"vmulpd -16(%1), %%xmm0, %%xmm11 \n\t"
+
+	"subq $1 , %0 \n\t"
+	"jz 2f \n\t"
+	// main loop: store the previous pass's products while multiplying the next 16 doubles
+	".align 16 \n\t"
+	"1: \n\t"
+	// "prefetcht0 640(%1) \n\t"
+
+	"vmovups %%xmm4 ,-128(%1) \n\t"
+	"vmovups %%xmm5 ,-112(%1) \n\t"
+	"vmulpd 0(%1), %%xmm0, %%xmm4 \n\t"
+	"vmovups %%xmm6 , -96(%1) \n\t"
+	"vmulpd 16(%1), %%xmm0, %%xmm5 \n\t"
+	"vmovups %%xmm7 , -80(%1) \n\t"
+	"vmulpd 32(%1), %%xmm0, %%xmm6 \n\t"
+
+	// "prefetcht0 704(%1) \n\t"
+
+	"vmovups %%xmm8 , -64(%1) \n\t"
+	"vmulpd 48(%1), %%xmm0, %%xmm7 \n\t"
+	"vmovups %%xmm9 , -48(%1) \n\t"
+	"vmulpd 64(%1), %%xmm0, %%xmm8 \n\t"
+	"vmovups %%xmm10 , -32(%1) \n\t"
+	"vmulpd 80(%1), %%xmm0, %%xmm9 \n\t"
+	"vmovups %%xmm11 , -16(%1) \n\t"
+
+	"vmulpd 96(%1), %%xmm0, %%xmm10 \n\t"
+	"vmulpd 112(%1), %%xmm0, %%xmm11 \n\t"
+
+
+	"addq $128, %1 \n\t"
+	"subq $1 , %0 \n\t"
+	"jnz 1b \n\t"
+
+	"2: \n\t"
+	// epilogue: store the products still held in xmm4..xmm11
+	"vmovups %%xmm4 ,-128(%1) \n\t"
+	"vmovups %%xmm5 ,-112(%1) \n\t"
+	"vmovups %%xmm6 , -96(%1) \n\t"
+	"vmovups %%xmm7 , -80(%1) \n\t"
+
+	"vmovups %%xmm8 , -64(%1) \n\t"
+	"vmovups %%xmm9 , -48(%1) \n\t"
+	"vmovups %%xmm10 , -32(%1) \n\t"
+	"vmovups %%xmm11 , -16(%1) \n\t"
+
+	"addq $128, %1 \n\t"
+
+	"4: \n\t"
+	// tail: one more group of 8 doubles when n2 == 8
+	"cmpq $8 ,%3 \n\t"
+	"jne 5f \n\t"
+
+	"vmulpd -128(%1), %%xmm0, %%xmm4 \n\t"
+	"vmulpd -112(%1), %%xmm0, %%xmm5 \n\t"
+	"vmulpd -96(%1), %%xmm0, %%xmm6 \n\t"
+	"vmulpd -80(%1), %%xmm0, %%xmm7 \n\t"
+
+	"vmovups %%xmm4 ,-128(%1) \n\t"
+	"vmovups %%xmm5 ,-112(%1) \n\t"
+	"vmovups %%xmm6 , -96(%1) \n\t"
+	"vmovups %%xmm7 , -80(%1) \n\t"
+
+	"5: \n\t"
+
+	"vzeroupper \n\t"
+
+	:
+	  "+r" (n1), // 0: pass counter, decremented by the asm
+	  "+r" (x) // 1: data cursor, advanced by the asm
+	:
+	  "r" (alpha), // 2
+	  "r" (n2) // 3
+	: "cc",
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+	  "memory"
+	);
+
+}
+
+// dscal_kernel_8_zero: store 0.0 over the same element ranges dscal_kernel_8 would scale; x is never read.
+static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline));
+// alpha is passed to the asm as operand 2, but the template never references it.
+static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
+{
+	// n1 counts the 128-byte loop passes; n2 is 8 when one extra 64-byte group follows them.
+	// The asm decrements n1 and advances x, so both are bound as read-write ("+r") operands rather than inputs.
+	BLASLONG n1 = n >> 4 ;
+	BLASLONG n2 = n & 8 ;
+
+	__asm__ __volatile__
+	(
+	"vxorpd %%xmm0, %%xmm0 , %%xmm0 \n\t"
+	// bias x by +128 so each group is addressed as -128(%1) .. -16(%1)
+	"addq $128, %1 \n\t"
+
+	"cmpq $0, %0 \n\t"
+	"je 2f \n\t"
+
+	".align 16 \n\t"
+	"1: \n\t"
+
+	"vmovups %%xmm0 ,-128(%1) \n\t"
+	"vmovups %%xmm0 ,-112(%1) \n\t"
+	"vmovups %%xmm0 , -96(%1) \n\t"
+	"vmovups %%xmm0 , -80(%1) \n\t"
+
+	"vmovups %%xmm0 , -64(%1) \n\t"
+	"vmovups %%xmm0 , -48(%1) \n\t"
+	"vmovups %%xmm0 , -32(%1) \n\t"
+	"vmovups %%xmm0 , -16(%1) \n\t"
+
+	"addq $128, %1 \n\t"
+	"subq $1 , %0 \n\t"
+	"jnz 1b \n\t"
+
+	"2: \n\t"
+	// tail: one more group of 8 doubles when n2 == 8
+	"cmpq $8 ,%3 \n\t"
+	"jne 4f \n\t"
+
+	"vmovups %%xmm0 ,-128(%1) \n\t"
+	"vmovups %%xmm0 ,-112(%1) \n\t"
+	"vmovups %%xmm0 , -96(%1) \n\t"
+	"vmovups %%xmm0 , -80(%1) \n\t"
+
+	"4: \n\t"
+
+	"vzeroupper \n\t"
+
+	:
+	  "+r" (n1), // 0: pass counter, decremented by the asm
+	  "+r" (x) // 1: data cursor, advanced by the asm
+	:
+	  "r" (alpha), // 2
+	  "r" (n2) // 3
+	: "cc",
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+	  "memory"
+	);
+
+}
+
+
From 73f09bf64f9ae6a95a6ff5182d4b4262c95337d2 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 13 May 2015 12:14:39 +0200 Subject: [PATCH 4/6] optimized dscal kernel for increment != 1 --- kernel/x86_64/dscal.c | 91 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 91 insertions(+) diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index 66a04ba8f..d72a24b16 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -81,6 +81,77 @@ void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha , FLOAT *x ) #endif
+
+void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) __attribute__ ((noinline));
+
+void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x)
+{
+
+	FLOAT *x1;
+	BLASLONG inc_x3; // 3 * byte stride, used as an index register by the asm
+	// Scales x[0], x[inc_x], x[2*inc_x], ... in place by *alpha, 8 strided doubles per loop pass. The loop exits only when n reaches exactly 0, so n must be a positive multiple of 8.
+	inc_x <<= 3; // element stride -> byte stride (the asm moves 8-byte values)
+	inc_x3 = (inc_x << 1) + inc_x;
+	// n and x are updated by the asm, hence "+r". x1 is a scratch cursor that the first leaq writes while the inputs are still needed, hence write-only earlyclobber "=&r".
+	__asm__ __volatile__
+	(
+	"movddup (%3), %%xmm0 \n\t" // alpha
+	// x1 = x + 4 strides: cursor for the second half of each 8-element group
+	"leaq (%1,%4,4), %2 \n\t"
+
+	".align 16 \n\t"
+
+	"1: \n\t"
+	"movsd (%1) , %%xmm4 \n\t"
+	"movhpd (%1,%4,1), %%xmm4 \n\t"
+	"movsd (%1,%4,2), %%xmm5 \n\t"
+	"movhpd (%1,%5,1), %%xmm5 \n\t"
+
+	"movsd (%2) , %%xmm6 \n\t"
+	"movhpd (%2,%4,1), %%xmm6 \n\t"
+	"movsd (%2,%4,2), %%xmm7 \n\t"
+	"movhpd (%2,%5,1), %%xmm7 \n\t"
+
+	"mulpd %%xmm0, %%xmm4 \n\t"
+	"mulpd %%xmm0, %%xmm5 \n\t"
+	"mulpd %%xmm0, %%xmm6 \n\t"
+	"mulpd %%xmm0, %%xmm7 \n\t"
+
+	"movsd %%xmm4 , (%1) \n\t"
+	"movhpd %%xmm4 , (%1,%4,1) \n\t"
+	"movsd %%xmm5 , (%1,%4,2) \n\t"
+	"movhpd %%xmm5 , (%1,%5,1) \n\t"
+
+	"movsd %%xmm6 , (%2) \n\t"
+	"movhpd %%xmm6 , (%2,%4,1) \n\t"
+	"movsd %%xmm7 , (%2,%4,2) \n\t"
+	"movhpd %%xmm7 , (%2,%5,1) \n\t"
+	// advance both cursors by 8 strides
+	"leaq (%1,%4,8), %1 \n\t"
+	"leaq (%2,%4,8), %2 \n\t"
+
+	"subq $8, %0 \n\t"
+	"jnz 1b \n\t"
+
+	:
+	  "+r" (n), // 0: remaining element count, decremented by 8 per pass
+	  "+r" (x), // 1: cursor for elements 0..3 of each group
+	  "=&r" (x1) // 2: cursor for elements 4..7 of each group
+	:
+	  "r" (alpha), // 3
+	  "r" (inc_x), // 4
+	  "r" (inc_x3) // 5
+	: "cc",
+	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
+	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",
+	  "%xmm8", "%xmm9", "%xmm10", "%xmm11",
+	  "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+	  "memory"
+	);
+
+
+}
+
 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0,j=0; @@ -91,6 +162,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { + BLASLONG n1 = n & -2; + + while(j < n1) + { + + x[i]=0.0; + x[i+inc_x]=0.0; + i += 2*inc_x ; + j+=2; + + } + while(j < n) { @@ -103,6 +186,14 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { + BLASLONG n1 = n & -8; + if ( n1 > 0 ) + { + dscal_kernel_inc_8(n1, &da, x, inc_x); + i = n1 * inc_x; + j = n1; + } + while(j < n) { From e00cccc41e2615f785cdd6bad45032a78a411564 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 13 May 2015 13:05:35 +0200 Subject: [PATCH 5/6] added optimized dscal kernel for piledriver --- kernel/x86_64/KERNEL.PILEDRIVER | 2 ++ kernel/x86_64/dscal.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/x86_64/KERNEL.PILEDRIVER b/kernel/x86_64/KERNEL.PILEDRIVER index be8b629d9..7c4c7cd43 100644 --- a/kernel/x86_64/KERNEL.PILEDRIVER +++ b/kernel/x86_64/KERNEL.PILEDRIVER @@ -1,3 +1,5 @@ +DSCALKERNEL = dscal.c + SAXPYKERNEL = saxpy.c DAXPYKERNEL = daxpy.c CAXPYKERNEL = caxpy.c diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index d72a24b16..a425cb710 100644 --- a/kernel/x86_64/dscal.c +++ 
b/kernel/x86_64/dscal.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "dscal_microk_bulldozer-2.c" #elif defined(SANDYBRIDGE) #include "dscal_microk_sandy-2.c" From 18e90ee2e390c75eba72a1d9d069da4229b62af5 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Wed, 13 May 2015 13:31:26 +0200 Subject: [PATCH 6/6] bugfix: added static to functions --- kernel/x86_64/dscal.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index a425cb710..e3e2b0d58 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if !defined(HAVE_KERNEL_8) -void dscal_kernel_8( BLASLONG n, FLOAT *da , FLOAT *x ) +static void dscal_kernel_8( BLASLONG n, FLOAT *da , FLOAT *x ) { BLASLONG i; @@ -60,7 +60,7 @@ void dscal_kernel_8( BLASLONG n, FLOAT *da , FLOAT *x ) } -void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha , FLOAT *x ) +static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha , FLOAT *x ) { BLASLONG i; @@ -82,9 +82,9 @@ void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha , FLOAT *x ) #endif -void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) __attribute__ ((noinline)); +static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) __attribute__ ((noinline)); -void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) +static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) { FLOAT *x1;