| @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.4) | |||||
| project(OpenBLAS) | project(OpenBLAS) | ||||
| set(OpenBLAS_MAJOR_VERSION 0) | set(OpenBLAS_MAJOR_VERSION 0) | ||||
| set(OpenBLAS_MINOR_VERSION 2) | set(OpenBLAS_MINOR_VERSION 2) | ||||
| set(OpenBLAS_PATCH_VERSION 17.dev) | |||||
| set(OpenBLAS_PATCH_VERSION 18.dev) | |||||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | ||||
| enable_language(ASM) | enable_language(ASM) | ||||
| @@ -1,4 +1,10 @@ | |||||
| OpenBLAS ChangeLog | OpenBLAS ChangeLog | ||||
| ==================================================================== | |||||
| Version 0.2.17 | |||||
| 20-Mar-2016 | |||||
| common: | |||||
| * Enable BUILD_LAPACK_DEPRECATED=1 by default. | |||||
| ==================================================================== | ==================================================================== | ||||
| Version 0.2.16 | Version 0.2.16 | ||||
| 15-Mar-2016 | 15-Mar-2016 | ||||
| @@ -3,7 +3,7 @@ | |||||
| # | # | ||||
| # This library's version | # This library's version | ||||
| VERSION = 0.2.17.dev | |||||
| VERSION = 0.2.18.dev | |||||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | ||||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | ||||
| @@ -80,7 +80,7 @@ VERSION = 0.2.17.dev | |||||
| # NO_LAPACKE = 1 | # NO_LAPACKE = 1 | ||||
| # Build LAPACK Deprecated functions since LAPACK 3.6.0 | # Build LAPACK Deprecated functions since LAPACK 3.6.0 | ||||
| # BUILD_LAPACK_DEPRECATED = 1 | |||||
| BUILD_LAPACK_DEPRECATED = 1 | |||||
| # If you want to use legacy threaded Level 3 implementation. | # If you want to use legacy threaded Level 3 implementation. | ||||
| # USE_SIMPLE_THREADED_LEVEL3 = 1 | # USE_SIMPLE_THREADED_LEVEL3 = 1 | ||||
| @@ -120,10 +120,10 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||||
| #CCOPYKERNEL = ../arm/zcopy.c | #CCOPYKERNEL = ../arm/zcopy.c | ||||
| #ZCOPYKERNEL = ../arm/zcopy.c | #ZCOPYKERNEL = ../arm/zcopy.c | ||||
| # | # | ||||
| #SDOTKERNEL = ../arm/dot.c | |||||
| #DDOTKERNEL = ../arm/dot.c | |||||
| SDOTKERNEL = sdot.c | |||||
| DDOTKERNEL = ddot.c | |||||
| #CDOTKERNEL = ../arm/zdot.c | #CDOTKERNEL = ../arm/zdot.c | ||||
| #ZDOTKERNEL = ../arm/zdot.c | |||||
| ZDOTKERNEL = zdot.c | |||||
| # | # | ||||
| #SNRM2KERNEL = ../arm/nrm2.c | #SNRM2KERNEL = ../arm/nrm2.c | ||||
| #DNRM2KERNEL = ../arm/nrm2.c | #DNRM2KERNEL = ../arm/nrm2.c | ||||
| @@ -0,0 +1,139 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2016/03/20 Werner Saar (wernsaar@googlemail.com) | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * LAPACK-TEST : OK | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| #if defined(POWER8) | |||||
| #include "ddot_microk_power8.c" | |||||
| #endif | |||||
| #ifndef HAVE_KERNEL_8 | |||||
| static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) | |||||
| { | |||||
| BLASLONG register i = 0; | |||||
| FLOAT dot = 0.0; | |||||
| while(i < n) | |||||
| { | |||||
| dot += y[i] * x[i] | |||||
| + y[i+1] * x[i+1] | |||||
| + y[i+2] * x[i+2] | |||||
| + y[i+3] * x[i+3] | |||||
| + y[i+4] * x[i+4] | |||||
| + y[i+5] * x[i+5] | |||||
| + y[i+6] * x[i+6] | |||||
| + y[i+7] * x[i+7] ; | |||||
| i+=8 ; | |||||
| } | |||||
| *d += dot; | |||||
| } | |||||
| #endif | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0,iy=0; | |||||
| FLOAT dot = 0.0 ; | |||||
| if ( n <= 0 ) return(dot); | |||||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||||
| { | |||||
| BLASLONG n1 = n & -16; | |||||
| if ( n1 ) | |||||
| ddot_kernel_8(n1, x, y , &dot ); | |||||
| i = n1; | |||||
| while(i < n) | |||||
| { | |||||
| dot += y[i] * x[i] ; | |||||
| i++ ; | |||||
| } | |||||
| return(dot); | |||||
| } | |||||
| FLOAT temp1 = 0.0; | |||||
| FLOAT temp2 = 0.0; | |||||
| BLASLONG n1 = n & -4; | |||||
| while(i < n1) | |||||
| { | |||||
| FLOAT m1 = y[iy] * x[ix] ; | |||||
| FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ; | |||||
| FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ; | |||||
| FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ; | |||||
| ix += inc_x*4 ; | |||||
| iy += inc_y*4 ; | |||||
| temp1 += m1+m3; | |||||
| temp2 += m2+m4; | |||||
| i+=4 ; | |||||
| } | |||||
| while(i < n) | |||||
| { | |||||
| temp1 += y[iy] * x[ix] ; | |||||
| ix += inc_x ; | |||||
| iy += inc_y ; | |||||
| i++ ; | |||||
| } | |||||
| dot = temp1 + temp2; | |||||
| return(dot); | |||||
| } | |||||
| @@ -0,0 +1,178 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2016/03/20 Werner Saar (wernsaar@googlemail.com) | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * LAPACK-TEST : OK | |||||
| **************************************************************************************/ | |||||
| #define HAVE_KERNEL_8 1 | |||||
| static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); | |||||
| static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||||
| { | |||||
| BLASLONG i = n; | |||||
| BLASLONG o16 = 16; | |||||
| BLASLONG o32 = 32; | |||||
| BLASLONG o48 = 48; | |||||
| BLASLONG o64 = 64; | |||||
| BLASLONG o80 = 80; | |||||
| BLASLONG o96 = 96; | |||||
| BLASLONG o112 = 112; | |||||
| FLOAT *x1=x; | |||||
| FLOAT *y1=y; | |||||
| BLASLONG pre = 384; | |||||
| __asm__ __volatile__ | |||||
| ( | |||||
| "xxlxor 32,32,32 \n\t" | |||||
| "xxlxor 33,33,33 \n\t" | |||||
| "xxlxor 34,34,34 \n\t" | |||||
| "xxlxor 35,35,35 \n\t" | |||||
| "xxlxor 36,36,36 \n\t" | |||||
| "xxlxor 37,37,37 \n\t" | |||||
| "xxlxor 38,38,38 \n\t" | |||||
| "xxlxor 39,39,39 \n\t" | |||||
| "dcbt %2, %12 \n\t" | |||||
| "dcbt %3, %12 \n\t" | |||||
| "lxvd2x 40, 0, %2 \n\t" | |||||
| "lxvd2x 48, 0, %3 \n\t" | |||||
| "lxvd2x 41, %5, %2 \n\t" | |||||
| "lxvd2x 49, %5, %3 \n\t" | |||||
| "lxvd2x 42, %6, %2 \n\t" | |||||
| "lxvd2x 50, %6, %3 \n\t" | |||||
| "lxvd2x 43, %7, %2 \n\t" | |||||
| "lxvd2x 51, %7, %3 \n\t" | |||||
| "lxvd2x 44, %8, %2 \n\t" | |||||
| "lxvd2x 52, %8, %3 \n\t" | |||||
| "lxvd2x 45, %9, %2 \n\t" | |||||
| "lxvd2x 53, %9, %3 \n\t" | |||||
| "lxvd2x 46, %10, %2 \n\t" | |||||
| "lxvd2x 54, %10, %3 \n\t" | |||||
| "lxvd2x 47, %11, %2 \n\t" | |||||
| "lxvd2x 55, %11, %3 \n\t" | |||||
| "addi %2, %2, 128 \n\t" | |||||
| "addi %3, %3, 128 \n\t" | |||||
| "addic. %0 , %0 , -16 \n\t" | |||||
| "ble 2f \n\t" | |||||
| ".align 5 \n\t" | |||||
| "1: \n\t" | |||||
| "dcbt %2, %12 \n\t" | |||||
| "dcbt %3, %12 \n\t" | |||||
| "xvmaddadp 32, 40, 48 \n\t" | |||||
| "lxvd2x 40, 0, %2 \n\t" | |||||
| "lxvd2x 48, 0, %3 \n\t" | |||||
| "xvmaddadp 33, 41, 49 \n\t" | |||||
| "lxvd2x 41, %5, %2 \n\t" | |||||
| "lxvd2x 49, %5, %3 \n\t" | |||||
| "xvmaddadp 34, 42, 50 \n\t" | |||||
| "lxvd2x 42, %6, %2 \n\t" | |||||
| "lxvd2x 50, %6, %3 \n\t" | |||||
| "xvmaddadp 35, 43, 51 \n\t" | |||||
| "lxvd2x 43, %7, %2 \n\t" | |||||
| "lxvd2x 51, %7, %3 \n\t" | |||||
| "xvmaddadp 36, 44, 52 \n\t" | |||||
| "lxvd2x 44, %8, %2 \n\t" | |||||
| "lxvd2x 52, %8, %3 \n\t" | |||||
| "xvmaddadp 37, 45, 53 \n\t" | |||||
| "lxvd2x 45, %9, %2 \n\t" | |||||
| "lxvd2x 53, %9, %3 \n\t" | |||||
| "xvmaddadp 38, 46, 54 \n\t" | |||||
| "lxvd2x 46, %10, %2 \n\t" | |||||
| "lxvd2x 54, %10, %3 \n\t" | |||||
| "xvmaddadp 39, 47, 55 \n\t" | |||||
| "lxvd2x 47, %11, %2 \n\t" | |||||
| "lxvd2x 55, %11, %3 \n\t" | |||||
| "addi %2, %2, 128 \n\t" | |||||
| "addi %3, %3, 128 \n\t" | |||||
| "addic. %0 , %0 , -16 \n\t" | |||||
| "bgt 1b \n\t" | |||||
| "2: \n\t" | |||||
| "xvmaddadp 32, 40, 48 \n\t" | |||||
| "xvmaddadp 33, 41, 49 \n\t" | |||||
| "xvmaddadp 34, 42, 50 \n\t" | |||||
| "xvmaddadp 35, 43, 51 \n\t" | |||||
| "xvmaddadp 36, 44, 52 \n\t" | |||||
| "xvmaddadp 37, 45, 53 \n\t" | |||||
| "xvmaddadp 38, 46, 54 \n\t" | |||||
| "xvmaddadp 39, 47, 55 \n\t" | |||||
| "xvadddp 32, 32, 33 \n\t" | |||||
| "xvadddp 34, 34, 35 \n\t" | |||||
| "xvadddp 36, 36, 37 \n\t" | |||||
| "xvadddp 38, 38, 39 \n\t" | |||||
| "xvadddp 32, 32, 34 \n\t" | |||||
| "xvadddp 36, 36, 38 \n\t" | |||||
| "xvadddp 32, 32, 36 \n\t" | |||||
| "xxswapd 33, 32 \n\t" | |||||
| "xsadddp 32, 32, 33 \n\t" | |||||
| "stxsdx 32, 0, %4 \n\t" | |||||
| : | |||||
| : | |||||
| "r" (i), // 0 | |||||
| "r" (n), // 1 | |||||
| "r" (x1), // 2 | |||||
| "r" (y1), // 3 | |||||
| "r" (dot), // 4 | |||||
| "r" (o16), // 5 | |||||
| "r" (o32), // 6 | |||||
| "r" (o48), // 7 | |||||
| "r" (o64), // 8 | |||||
| "r" (o80), // 9 | |||||
| "r" (o96), // 10 | |||||
| "r" (o112), // 11 | |||||
| "r" (pre) // 12 | |||||
| : "cr0", "%0", "%2" , "%3", "memory" | |||||
| ); | |||||
| } | |||||
| @@ -0,0 +1,126 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2016/03/21 Werner Saar (wernsaar@googlemail.com) | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * LAPACK-TEST : OK | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| #if defined(POWER8) | |||||
| #include "sdot_microk_power8.c" | |||||
| #endif | |||||
| #ifndef HAVE_KERNEL_16 | |||||
| static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) | |||||
| { | |||||
| BLASLONG register i = 0; | |||||
| FLOAT dot = 0.0; | |||||
| while(i < n) | |||||
| { | |||||
| dot += y[i] * x[i] | |||||
| + y[i+1] * x[i+1] | |||||
| + y[i+2] * x[i+2] | |||||
| + y[i+3] * x[i+3] | |||||
| + y[i+4] * x[i+4] | |||||
| + y[i+5] * x[i+5] | |||||
| + y[i+6] * x[i+6] | |||||
| + y[i+7] * x[i+7] ; | |||||
| i+=8 ; | |||||
| } | |||||
| *d += dot; | |||||
| } | |||||
| #endif | |||||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| { | |||||
| BLASLONG i=0; | |||||
| BLASLONG ix=0,iy=0; | |||||
| FLOAT dot = 0.0 ; | |||||
| if ( n <= 0 ) return(dot); | |||||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||||
| { | |||||
| BLASLONG n1 = n & -32; | |||||
| if ( n1 ) | |||||
| sdot_kernel_16(n1, x, y , &dot ); | |||||
| i = n1; | |||||
| while(i < n) | |||||
| { | |||||
| dot += y[i] * x[i] ; | |||||
| i++ ; | |||||
| } | |||||
| return(dot); | |||||
| } | |||||
| BLASLONG n1 = n & -2; | |||||
| while(i < n1) | |||||
| { | |||||
| dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; | |||||
| ix += inc_x*2 ; | |||||
| iy += inc_y*2 ; | |||||
| i+=2 ; | |||||
| } | |||||
| while(i < n) | |||||
| { | |||||
| dot += y[iy] * x[ix] ; | |||||
| ix += inc_x ; | |||||
| iy += inc_y ; | |||||
| i++ ; | |||||
| } | |||||
| return(dot); | |||||
| } | |||||
| @@ -0,0 +1,179 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2016/03/21 Werner Saar (wernsaar@googlemail.com) | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * LAPACK-TEST : OK | |||||
| **************************************************************************************/ | |||||
| #define HAVE_KERNEL_16 1 | |||||
| static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); | |||||
| static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||||
| { | |||||
| BLASLONG i = n; | |||||
| BLASLONG o16 = 16; | |||||
| BLASLONG o32 = 32; | |||||
| BLASLONG o48 = 48; | |||||
| BLASLONG o64 = 64; | |||||
| BLASLONG o80 = 80; | |||||
| BLASLONG o96 = 96; | |||||
| BLASLONG o112 = 112; | |||||
| FLOAT *x1=x; | |||||
| FLOAT *y1=y; | |||||
| BLASLONG pre = 384; | |||||
| FLOAT tempdot[4]; | |||||
| __asm__ __volatile__ | |||||
| ( | |||||
| "xxlxor 32,32,32 \n\t" | |||||
| "xxlxor 33,33,33 \n\t" | |||||
| "xxlxor 34,34,34 \n\t" | |||||
| "xxlxor 35,35,35 \n\t" | |||||
| "xxlxor 36,36,36 \n\t" | |||||
| "xxlxor 37,37,37 \n\t" | |||||
| "xxlxor 38,38,38 \n\t" | |||||
| "xxlxor 39,39,39 \n\t" | |||||
| "dcbt %2, %12 \n\t" | |||||
| "dcbt %3, %12 \n\t" | |||||
| "lxvw4x 40, 0, %2 \n\t" | |||||
| "lxvw4x 48, 0, %3 \n\t" | |||||
| "lxvw4x 41, %5, %2 \n\t" | |||||
| "lxvw4x 49, %5, %3 \n\t" | |||||
| "lxvw4x 42, %6, %2 \n\t" | |||||
| "lxvw4x 50, %6, %3 \n\t" | |||||
| "lxvw4x 43, %7, %2 \n\t" | |||||
| "lxvw4x 51, %7, %3 \n\t" | |||||
| "lxvw4x 44, %8, %2 \n\t" | |||||
| "lxvw4x 52, %8, %3 \n\t" | |||||
| "lxvw4x 45, %9, %2 \n\t" | |||||
| "lxvw4x 53, %9, %3 \n\t" | |||||
| "lxvw4x 46, %10, %2 \n\t" | |||||
| "lxvw4x 54, %10, %3 \n\t" | |||||
| "lxvw4x 47, %11, %2 \n\t" | |||||
| "lxvw4x 55, %11, %3 \n\t" | |||||
| "addi %2, %2, 128 \n\t" | |||||
| "addi %3, %3, 128 \n\t" | |||||
| "addic. %0 , %0 , -32 \n\t" | |||||
| "ble 2f \n\t" | |||||
| ".align 5 \n\t" | |||||
| "1: \n\t" | |||||
| "dcbt %2, %12 \n\t" | |||||
| "dcbt %3, %12 \n\t" | |||||
| "xvmaddasp 32, 40, 48 \n\t" | |||||
| "lxvw4x 40, 0, %2 \n\t" | |||||
| "lxvw4x 48, 0, %3 \n\t" | |||||
| "xvmaddasp 33, 41, 49 \n\t" | |||||
| "lxvw4x 41, %5, %2 \n\t" | |||||
| "lxvw4x 49, %5, %3 \n\t" | |||||
| "xvmaddasp 34, 42, 50 \n\t" | |||||
| "lxvw4x 42, %6, %2 \n\t" | |||||
| "lxvw4x 50, %6, %3 \n\t" | |||||
| "xvmaddasp 35, 43, 51 \n\t" | |||||
| "lxvw4x 43, %7, %2 \n\t" | |||||
| "lxvw4x 51, %7, %3 \n\t" | |||||
| "xvmaddasp 36, 44, 52 \n\t" | |||||
| "lxvw4x 44, %8, %2 \n\t" | |||||
| "lxvw4x 52, %8, %3 \n\t" | |||||
| "xvmaddasp 37, 45, 53 \n\t" | |||||
| "lxvw4x 45, %9, %2 \n\t" | |||||
| "lxvw4x 53, %9, %3 \n\t" | |||||
| "xvmaddasp 38, 46, 54 \n\t" | |||||
| "lxvw4x 46, %10, %2 \n\t" | |||||
| "lxvw4x 54, %10, %3 \n\t" | |||||
| "xvmaddasp 39, 47, 55 \n\t" | |||||
| "lxvw4x 47, %11, %2 \n\t" | |||||
| "lxvw4x 55, %11, %3 \n\t" | |||||
| "addi %2, %2, 128 \n\t" | |||||
| "addi %3, %3, 128 \n\t" | |||||
| "addic. %0 , %0 , -32 \n\t" | |||||
| "bgt 1b \n\t" | |||||
| "2: \n\t" | |||||
| "xvmaddasp 32, 40, 48 \n\t" | |||||
| "xvmaddasp 33, 41, 49 \n\t" | |||||
| "xvmaddasp 34, 42, 50 \n\t" | |||||
| "xvmaddasp 35, 43, 51 \n\t" | |||||
| "xvmaddasp 36, 44, 52 \n\t" | |||||
| "xvmaddasp 37, 45, 53 \n\t" | |||||
| "xvmaddasp 38, 46, 54 \n\t" | |||||
| "xvmaddasp 39, 47, 55 \n\t" | |||||
| "xvaddsp 32, 32 , 33 \n\t" | |||||
| "xvaddsp 34, 34 , 35 \n\t" | |||||
| "xvaddsp 36, 36 , 37 \n\t" | |||||
| "xvaddsp 38, 38 , 39 \n\t" | |||||
| "xvaddsp 32, 32 , 34 \n\t" | |||||
| "xvaddsp 36, 36 , 38 \n\t" | |||||
| "xvaddsp 32, 32 , 36 \n\t" | |||||
| "stxvw4x 32, 0 , %4 \n\t" | |||||
| : | |||||
| : | |||||
| "r" (i), // 0 | |||||
| "r" (n), // 1 | |||||
| "r" (x1), // 2 | |||||
| "r" (y1), // 3 | |||||
| "r" (tempdot), // 4 | |||||
| "r" (o16), // 5 | |||||
| "r" (o32), // 6 | |||||
| "r" (o48), // 7 | |||||
| "r" (o64), // 8 | |||||
| "r" (o80), // 9 | |||||
| "r" (o96), // 10 | |||||
| "r" (o112), // 11 | |||||
| "r" (pre) // 12 | |||||
| : "cr0", "%0", "%2" , "%3", "memory" | |||||
| ); | |||||
| *dot = tempdot[0] + tempdot[1] + tempdot[2] + tempdot[3]; | |||||
| } | |||||
| @@ -0,0 +1,167 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2016/03/21 Werner Saar (wernsaar@googlemail.com) | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * LAPACK-TEST : OK | |||||
| **************************************************************************************/ | |||||
| #include "common.h" | |||||
| #include <complex.h> | |||||
| #if defined(POWER8) | |||||
| #include "zdot_microk_power8.c" | |||||
| #endif | |||||
| #ifndef HAVE_KERNEL_8 | |||||
| static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) __attribute__ ((noinline)); | |||||
| static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) | |||||
| { | |||||
| BLASLONG register i = 0; | |||||
| FLOAT dot[4] = { 0.0, 0.0, 0.0, 0.0 }; | |||||
| BLASLONG j=0; | |||||
| while( i < n ) | |||||
| { | |||||
| dot[0] += x[j] * y[j] ; | |||||
| dot[1] += x[j+1] * y[j+1] ; | |||||
| dot[2] += x[j] * y[j+1] ; | |||||
| dot[3] += x[j+1] * y[j] ; | |||||
| dot[0] += x[j+2] * y[j+2] ; | |||||
| dot[1] += x[j+3] * y[j+3] ; | |||||
| dot[2] += x[j+2] * y[j+3] ; | |||||
| dot[3] += x[j+3] * y[j+2] ; | |||||
| dot[0] += x[j+4] * y[j+4] ; | |||||
| dot[1] += x[j+5] * y[j+5] ; | |||||
| dot[2] += x[j+4] * y[j+5] ; | |||||
| dot[3] += x[j+5] * y[j+4] ; | |||||
| dot[0] += x[j+6] * y[j+6] ; | |||||
| dot[1] += x[j+7] * y[j+7] ; | |||||
| dot[2] += x[j+6] * y[j+7] ; | |||||
| dot[3] += x[j+7] * y[j+6] ; | |||||
| j+=8; | |||||
| i+=4; | |||||
| } | |||||
| d[0] = dot[0]; | |||||
| d[1] = dot[1]; | |||||
| d[2] = dot[2]; | |||||
| d[3] = dot[3]; | |||||
| } | |||||
| #endif | |||||
| FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) | |||||
| { | |||||
| BLASLONG i; | |||||
| BLASLONG ix,iy; | |||||
| FLOAT _Complex result; | |||||
| FLOAT dot[4] = { 0.0, 0.0, 0.0 , 0.0 } ; | |||||
| if ( n <= 0 ) | |||||
| { | |||||
| __real__ result = 0.0 ; | |||||
| __imag__ result = 0.0 ; | |||||
| return(result); | |||||
| } | |||||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||||
| { | |||||
| BLASLONG n1 = n & -8; | |||||
| if ( n1 ) | |||||
| zdot_kernel_8(n1, x, y , dot ); | |||||
| i = n1; | |||||
| BLASLONG j = i * 2; | |||||
| while( i < n ) | |||||
| { | |||||
| dot[0] += x[j] * y[j] ; | |||||
| dot[1] += x[j+1] * y[j+1] ; | |||||
| dot[2] += x[j] * y[j+1] ; | |||||
| dot[3] += x[j+1] * y[j] ; | |||||
| j+=2; | |||||
| i++ ; | |||||
| } | |||||
| } | |||||
| else | |||||
| { | |||||
| i=0; | |||||
| ix=0; | |||||
| iy=0; | |||||
| inc_x <<= 1; | |||||
| inc_y <<= 1; | |||||
| while(i < n) | |||||
| { | |||||
| dot[0] += x[ix] * y[iy] ; | |||||
| dot[1] += x[ix+1] * y[iy+1] ; | |||||
| dot[2] += x[ix] * y[iy+1] ; | |||||
| dot[3] += x[ix+1] * y[iy] ; | |||||
| ix += inc_x ; | |||||
| iy += inc_y ; | |||||
| i++ ; | |||||
| } | |||||
| } | |||||
| #if !defined(CONJ) | |||||
| __real__ result = dot[0] - dot[1]; | |||||
| __imag__ result = dot[2] + dot[3]; | |||||
| #else | |||||
| __real__ result = dot[0] + dot[1]; | |||||
| __imag__ result = dot[2] - dot[3]; | |||||
| #endif | |||||
| return(result); | |||||
| } | |||||
| @@ -0,0 +1,219 @@ | |||||
| /*************************************************************************** | |||||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||||
| All rights reserved. | |||||
| Redistribution and use in source and binary forms, with or without | |||||
| modification, are permitted provided that the following conditions are | |||||
| met: | |||||
| 1. Redistributions of source code must retain the above copyright | |||||
| notice, this list of conditions and the following disclaimer. | |||||
| 2. Redistributions in binary form must reproduce the above copyright | |||||
| notice, this list of conditions and the following disclaimer in | |||||
| the documentation and/or other materials provided with the | |||||
| distribution. | |||||
| 3. Neither the name of the OpenBLAS project nor the names of | |||||
| its contributors may be used to endorse or promote products | |||||
| derived from this software without specific prior written permission. | |||||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||||
| *****************************************************************************/ | |||||
| /************************************************************************************** | |||||
| * 2016/03/21 Werner Saar (wernsaar@googlemail.com) | |||||
| * BLASTEST : OK | |||||
| * CTEST : OK | |||||
| * TEST : OK | |||||
| * LAPACK-TEST : OK | |||||
| **************************************************************************************/ | |||||
| #define HAVE_KERNEL_8 1 | |||||
| static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline)); | |||||
| static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||||
| { | |||||
| BLASLONG i = n; | |||||
| BLASLONG o16 = 16; | |||||
| BLASLONG o32 = 32; | |||||
| BLASLONG o48 = 48; | |||||
| FLOAT *x1=x; | |||||
| FLOAT *y1=y; | |||||
| BLASLONG pre = 384; | |||||
| __asm__ __volatile__ | |||||
| ( | |||||
| "xxlxor 32,32,32 \n\t" | |||||
| "xxlxor 33,33,33 \n\t" | |||||
| "xxlxor 34,34,34 \n\t" | |||||
| "xxlxor 35,35,35 \n\t" | |||||
| "xxlxor 36,36,36 \n\t" | |||||
| "xxlxor 37,37,37 \n\t" | |||||
| "xxlxor 38,38,38 \n\t" | |||||
| "xxlxor 39,39,39 \n\t" | |||||
| "dcbt %2, %8 \n\t" | |||||
| "dcbt %3, %8 \n\t" | |||||
| "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i | |||||
| "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i | |||||
| "lxvd2x 41, %5, %2 \n\t" // x1_r, x1_i | |||||
| "lxvd2x 49, %5, %3 \n\t" // y1_r, y1_i | |||||
| "lxvd2x 42, %6, %2 \n\t" // x2_r, x2_i | |||||
| "lxvd2x 50, %6, %3 \n\t" // y2_r, y2_i | |||||
| "lxvd2x 43, %7, %2 \n\t" // x3_r, x3_i | |||||
| "lxvd2x 51, %7, %3 \n\t" // y3_r, y3_i | |||||
| "xxswapd 52,48 \n\t" // y0_i, y0_r | |||||
| "xxswapd 53,49 \n\t" // y1_i, y1_r | |||||
| "xxswapd 54,50 \n\t" // y2_i, y2_r | |||||
| "xxswapd 55,51 \n\t" // y3_i, y3_r | |||||
| "addi %2, %2, 64 \n\t" | |||||
| "addi %3, %3, 64 \n\t" | |||||
| "lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i | |||||
| "lxvd2x 56, 0, %3 \n\t" // y0_r, y0_i | |||||
| "lxvd2x 45, %5, %2 \n\t" // x1_r, x1_i | |||||
| "lxvd2x 57, %5, %3 \n\t" // y1_r, y1_i | |||||
| "lxvd2x 46, %6, %2 \n\t" // x2_r, x2_i | |||||
| "lxvd2x 58, %6, %3 \n\t" // y2_r, y2_i | |||||
| "lxvd2x 47, %7, %2 \n\t" // x3_r, x3_i | |||||
| "lxvd2x 59, %7, %3 \n\t" // y3_r, y3_i | |||||
| "xxswapd 60,56 \n\t" // y0_i, y0_r | |||||
| "xxswapd 61,57 \n\t" // y1_i, y1_r | |||||
| "xxswapd 62,58 \n\t" // y2_i, y2_r | |||||
| "xxswapd 63,59 \n\t" // y3_i, y3_r | |||||
| "addi %2, %2, 64 \n\t" | |||||
| "addi %3, %3, 64 \n\t" | |||||
| "addic. %0 , %0 , -8 \n\t" | |||||
| "ble 2f \n\t" | |||||
| ".align 5 \n\t" | |||||
| "1: \n\t" | |||||
| "dcbt %2, %8 \n\t" | |||||
| "dcbt %3, %8 \n\t" | |||||
| "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i | |||||
| "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i | |||||
| "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i | |||||
| "lxvd2x 49, %5, %3 \n\t" // y1_r, y1_i | |||||
| "xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i | |||||
| "lxvd2x 50, %6, %3 \n\t" // y2_r, y2_i | |||||
| "xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i | |||||
| "lxvd2x 51, %7, %3 \n\t" // y3_r, y3_i | |||||
| "xvmaddadp 33, 40, 52 \n\t" // x0_r * y0_i , x0_i * y0_r | |||||
| "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i | |||||
| "xvmaddadp 35, 41, 53 \n\t" // x1_r * y1_i , x1_i * y1_r | |||||
| "lxvd2x 41, %5, %2 \n\t" // x1_r, x1_i | |||||
| "xvmaddadp 37, 42, 54 \n\t" // x2_r * y2_i , x2_i * y2_r | |||||
| "lxvd2x 42, %6, %2 \n\t" // x2_r, x2_i | |||||
| "xvmaddadp 39, 43, 55 \n\t" // x3_r * y3_i , x3_i * y3_r | |||||
| "lxvd2x 43, %7, %2 \n\t" // x3_r, x3_i | |||||
| "xxswapd 52,48 \n\t" // y0_i, y0_r | |||||
| "xxswapd 53,49 \n\t" // y1_i, y1_r | |||||
| "addi %2, %2, 64 \n\t" | |||||
| "addi %3, %3, 64 \n\t" | |||||
| "xxswapd 54,50 \n\t" // y2_i, y2_r | |||||
| "xxswapd 55,51 \n\t" // y3_i, y3_r | |||||
| "xvmaddadp 32, 44, 56 \n\t" // x0_r * y0_r , x0_i * y0_i | |||||
| "lxvd2x 56, 0, %3 \n\t" // y0_r, y0_i | |||||
| "xvmaddadp 34, 45, 57 \n\t" // x1_r * y1_r , x1_i * y1_i | |||||
| "lxvd2x 57, %5, %3 \n\t" // y1_r, y1_i | |||||
| "xvmaddadp 36, 46, 58 \n\t" // x2_r * y2_r , x2_i * y2_i | |||||
| "lxvd2x 58, %6, %3 \n\t" // y2_r, y2_i | |||||
| "xvmaddadp 38, 47, 59 \n\t" // x3_r * y3_r , x3_i * y3_i | |||||
| "lxvd2x 59, %7, %3 \n\t" // y3_r, y3_i | |||||
| "xvmaddadp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r | |||||
| "lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i | |||||
| "xvmaddadp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r | |||||
| "lxvd2x 45, %5, %2 \n\t" // x1_r, x1_i | |||||
| "xvmaddadp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r | |||||
| "lxvd2x 46, %6, %2 \n\t" // x2_r, x2_i | |||||
| "xvmaddadp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r | |||||
| "lxvd2x 47, %7, %2 \n\t" // x3_r, x3_i | |||||
| "xxswapd 60,56 \n\t" // y0_i, y0_r | |||||
| "xxswapd 61,57 \n\t" // y1_i, y1_r | |||||
| "addi %2, %2, 64 \n\t" | |||||
| "addi %3, %3, 64 \n\t" | |||||
| "xxswapd 62,58 \n\t" // y2_i, y2_r | |||||
| "xxswapd 63,59 \n\t" // y3_i, y3_r | |||||
| "addic. %0 , %0 , -8 \n\t" | |||||
| "bgt 1b \n\t" | |||||
| "2: \n\t" | |||||
| "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i | |||||
| "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i | |||||
| "xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i | |||||
| "xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i | |||||
| "xvmaddadp 33, 40, 52 \n\t" // x0_r * y0_i , x0_i * y0_r | |||||
| "xvmaddadp 35, 41, 53 \n\t" // x1_r * y1_i , x1_i * y1_r | |||||
| "xvmaddadp 37, 42, 54 \n\t" // x2_r * y2_i , x2_i * y2_r | |||||
| "xvmaddadp 39, 43, 55 \n\t" // x3_r * y3_i , x3_i * y3_r | |||||
| "xvmaddadp 32, 44, 56 \n\t" // x0_r * y0_r , x0_i * y0_i | |||||
| "xvmaddadp 34, 45, 57 \n\t" // x1_r * y1_r , x1_i * y1_i | |||||
| "xvmaddadp 36, 46, 58 \n\t" // x2_r * y2_r , x2_i * y2_i | |||||
| "xvmaddadp 38, 47, 59 \n\t" // x3_r * y3_r , x3_i * y3_i | |||||
| "xvmaddadp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r | |||||
| "xvmaddadp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r | |||||
| "xvmaddadp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r | |||||
| "xvmaddadp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r | |||||
| "xvadddp 32, 32, 34 \n\t" | |||||
| "xvadddp 36, 36, 38 \n\t" | |||||
| "xvadddp 33, 33, 35 \n\t" | |||||
| "xvadddp 37, 37, 39 \n\t" | |||||
| "xvadddp 32, 32, 36 \n\t" | |||||
| "xvadddp 33, 33, 37 \n\t" | |||||
| "stxvd2x 32, 0, %4 \n\t" | |||||
| "stxvd2x 33, %5, %4 \n\t" | |||||
| : | |||||
| : | |||||
| "r" (i), // 0 | |||||
| "r" (n), // 1 | |||||
| "r" (x1), // 2 | |||||
| "r" (y1), // 3 | |||||
| "r" (dot), // 4 | |||||
| "r" (o16), // 5 | |||||
| "r" (o32), // 6 | |||||
| "r" (o48), // 7 | |||||
| "r" (pre) // 8 | |||||
| : "cr0", "%0", "%2" , "%3", "memory" | |||||
| ); | |||||
| } | |||||