From 23229011db2ab03c7643f1e0a007efc8e0276201 Mon Sep 17 00:00:00 2001
From: maamountki
Date: Mon, 6 Aug 2018 18:20:40 +0300
Subject: [PATCH 001/133] [ZARCH] Add Z14 support, BLAS 1/2 single-precision
 implementations, some missing double-precision implementations, and GEMV
 optimization

---
 Makefile.zarch           |    4 +
 cpuid_zarch.c            |   35 +-
 kernel/zarch/KERNEL.Z13  |   20 +-
 kernel/zarch/KERNEL.Z14  |  146 +++++++
 kernel/zarch/camax.c     |  269 +++++++++++++
 kernel/zarch/camin.c     |  269 +++++++++++++
 kernel/zarch/casum.c     |  167 ++++++++
 kernel/zarch/caxpy.c     |  174 +++++++++
 kernel/zarch/ccopy.c     |   99 +++++
 kernel/zarch/cdot.c      |  182 +++++++++
 kernel/zarch/crot.c      |  256 ++++++++++++
 kernel/zarch/cscal.c     |  456 +++++++++++++++++++++
 kernel/zarch/cswap.c     |  183 +++++++++
 kernel/zarch/damax.c     |  206 ++++++++++
 kernel/zarch/damin.c     |  206 ++++++++++
 kernel/zarch/dasum.c     |  158 ++++----
 kernel/zarch/daxpy.c     |  177 ++++-----
 kernel/zarch/dcopy.c     |  122 +-----
 kernel/zarch/ddot.c      |  155 +++----
 kernel/zarch/dgemv_n_4.c |  516 ++++++++++++++++--------
 kernel/zarch/dgemv_t_4.c |  578 ++++++++++++++++++++-------
 kernel/zarch/dmax.c      |  182 +++++++++
 kernel/zarch/dmin.c      |  182 +++++++++
 kernel/zarch/drot.c      |  338 ++++++++--------
 kernel/zarch/dscal.c     |  200 ++++------
 kernel/zarch/dsdot.c     |  180 +++++++++
 kernel/zarch/dswap.c     |  292 ++++----------
 kernel/zarch/icamax.c    |  319 +++++++++++++++
 kernel/zarch/icamin.c    |  319 +++++++++++++++
 kernel/zarch/idamax.c    |  295 +++++++-------
 kernel/zarch/idamin.c    |  325 ++++++++-------
 kernel/zarch/idmax.c     |  232 +++++++++++
 kernel/zarch/idmin.c     |  232 +++++++++++
 kernel/zarch/isamax.c    |  299 ++++++++++++++
 kernel/zarch/isamin.c    |  299 ++++++++++++++
 kernel/zarch/ismax.c     |  275 +++++++++++++
 kernel/zarch/ismin.c     |  275 +++++++++++++
 kernel/zarch/izamax.c    |  334 ++++++++--------
 kernel/zarch/izamin.c    |  400 +++++++++----------
 kernel/zarch/samax.c     |  210 ++++++++++
 kernel/zarch/samin.c     |  210 ++++++++++
 kernel/zarch/sasum.c     |  174 +++++++++
 kernel/zarch/saxpy.c     |  184 +++++++++
 kernel/zarch/scopy.c     |   85 ++++
 kernel/zarch/sdot.c      |  140 +++++++
 kernel/zarch/sgemv_n_4.c |  668 +++++++++++++++++++++++++++++++
 kernel/zarch/sgemv_t_4.c |  826 +++++++++++++++++++++++++++++++++++++++
 kernel/zarch/smax.c      |  186 +++++++++
 kernel/zarch/smin.c      |  186 +++++++++
 kernel/zarch/srot.c      |  246 ++++++++++++
 kernel/zarch/sscal.c     |  201 ++++++++++
 kernel/zarch/sswap.c     |  164 ++++++++
 kernel/zarch/zamax.c     |  221 +++++++++++
 kernel/zarch/zamin.c     |  221 +++++++++++
 kernel/zarch/zasum.c     |  152 +++----
 kernel/zarch/zaxpy.c     |  216 +++++-----
 kernel/zarch/zcopy.c     |   86 +---
 kernel/zarch/zdot.c      |  213 ++++------
 kernel/zarch/zrot.c      |  339 ++++++++--------
 kernel/zarch/zscal.c     |  460 ++++++++++------------
 kernel/zarch/zswap.c     |  291 ++++----------
 ztest/Makefile           |  437 +++++++++++++++++++++
 ztest/amax.c             |  235 +++++++++++
 ztest/amin.c             |  235 +++++++++++
 ztest/asum.c             |  263 +++++++++++++
 ztest/axpy.c             |  303 ++++++++++++++
 ztest/copy.c             |  291 ++++++++++++++
 ztest/dot.c              |  296 ++++++++++++++
 ztest/dsdot.c            |  229 +++++++++++
 ztest/gemv.c             |  618 +++++++++++++++++++++++++++++
 ztest/iamax.c            |  284 ++++++++++++++
 ztest/iamin.c            |  284 ++++++++++++++
 ztest/imax.c             |  231 +++++++++++
 ztest/imin.c             |  231 +++++++++++
 ztest/max.c              |  229 +++++++++++
 ztest/min.c              |  229 +++++++++++
 ztest/rot.c              |  303 ++++++++++++++
 ztest/scal.c             |  308 +++++++++++++++
 ztest/swap.c             |  306 +++++++++++++++
 79 files changed, 17382 insertions(+), 2965 deletions(-)
 create mode 100644 kernel/zarch/KERNEL.Z14
 create mode 100644 kernel/zarch/camax.c
 create mode 100644 kernel/zarch/camin.c
 create mode 100644 kernel/zarch/casum.c
 create
mode 100644 kernel/zarch/caxpy.c create mode 100644 kernel/zarch/ccopy.c create mode 100644 kernel/zarch/cdot.c create mode 100644 kernel/zarch/crot.c create mode 100644 kernel/zarch/cscal.c create mode 100644 kernel/zarch/cswap.c create mode 100644 kernel/zarch/damax.c create mode 100644 kernel/zarch/damin.c create mode 100644 kernel/zarch/dmax.c create mode 100644 kernel/zarch/dmin.c create mode 100644 kernel/zarch/dsdot.c create mode 100644 kernel/zarch/icamax.c create mode 100644 kernel/zarch/icamin.c create mode 100644 kernel/zarch/idmax.c create mode 100644 kernel/zarch/idmin.c create mode 100644 kernel/zarch/isamax.c create mode 100644 kernel/zarch/isamin.c create mode 100644 kernel/zarch/ismax.c create mode 100644 kernel/zarch/ismin.c create mode 100644 kernel/zarch/samax.c create mode 100644 kernel/zarch/samin.c create mode 100644 kernel/zarch/sasum.c create mode 100644 kernel/zarch/saxpy.c create mode 100644 kernel/zarch/scopy.c create mode 100644 kernel/zarch/sdot.c create mode 100644 kernel/zarch/sgemv_n_4.c create mode 100644 kernel/zarch/sgemv_t_4.c create mode 100644 kernel/zarch/smax.c create mode 100644 kernel/zarch/smin.c create mode 100644 kernel/zarch/srot.c create mode 100644 kernel/zarch/sscal.c create mode 100644 kernel/zarch/sswap.c create mode 100644 kernel/zarch/zamax.c create mode 100644 kernel/zarch/zamin.c create mode 100644 ztest/Makefile create mode 100644 ztest/amax.c create mode 100644 ztest/amin.c create mode 100644 ztest/asum.c create mode 100644 ztest/axpy.c create mode 100644 ztest/copy.c create mode 100644 ztest/dot.c create mode 100644 ztest/dsdot.c create mode 100644 ztest/gemv.c create mode 100644 ztest/iamax.c create mode 100644 ztest/iamin.c create mode 100644 ztest/imax.c create mode 100644 ztest/imin.c create mode 100644 ztest/max.c create mode 100644 ztest/min.c create mode 100644 ztest/rot.c create mode 100644 ztest/scal.c create mode 100644 ztest/swap.c diff --git a/Makefile.zarch b/Makefile.zarch index 9ec9dc79f..47ea1eb71 100644 --- a/Makefile.zarch +++ b/Makefile.zarch @@ -4,3 +4,7 @@ CCOMMON_OPT += -march=z13 -mzvector FCOMMON_OPT += -march=z13 -mzvector endif +ifeq ($(CORE), Z14) +CCOMMON_OPT += -march=z14 -mzvector +FCOMMON_OPT += -march=z14 -mzvector +endif diff --git a/cpuid_zarch.c b/cpuid_zarch.c index 4e1935429..0ae32f27d 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -29,40 +29,25 @@ #define CPU_GENERIC 0 #define CPU_Z13 1 +#define CPU_Z14 2 static char *cpuname[] = { "ZARCH_GENERIC", - "Z13" + "Z13", + "Z14" }; static char *cpuname_lower[] = { "zarch_generic", - "z13" + "z13", + "z14" }; int detect(void) { - FILE *infile; - char buffer[512], *p; - - p = (char *)NULL; - infile = fopen("/proc/sysinfo", "r"); - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("Type", buffer, 4)){ - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); -#endif - break; - } - } - - fclose(infile); - - if (strstr(p, "2964")) return CPU_Z13; - if (strstr(p, "2965")) return CPU_Z13; - - return CPU_GENERIC; + // return CPU_GENERIC; + return CPU_Z14; + } void get_libname(void) @@ -107,5 +92,9 @@ void get_cpuconfig(void) printf("#define Z13\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); break; + case CPU_Z14: + printf("#define Z14\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + break; } } diff --git a/kernel/zarch/KERNEL.Z13 b/kernel/zarch/KERNEL.Z13 index add628bfe..d39b9d904 100644 --- a/kernel/zarch/KERNEL.Z13 +++ b/kernel/zarch/KERNEL.Z13 @@ -1,18 +1,18 @@ SAMAXKERNEL = ../arm/amax.c -DAMAXKERNEL = ../arm/amax.c 
+DAMAXKERNEL = damax.c CAMAXKERNEL = ../arm/zamax.c -ZAMAXKERNEL = ../arm/zamax.c +ZAMAXKERNEL = zamax.c SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c +DAMINKERNEL = damin.c CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = zamin.c SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c +DMAXKERNEL = dmax.c SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c +DMINKERNEL = dmin.c ISAMAXKERNEL = ../arm/iamax.c IDAMAXKERNEL = idamax.c @@ -25,10 +25,10 @@ ICAMINKERNEL = ../arm/izamin.c IZAMINKERNEL = izamin.c ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = idmax.c ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c +IDMINKERNEL = idmin.c SASUMKERNEL = ../arm/asum.c DASUMKERNEL = dasum.c @@ -74,12 +74,12 @@ ZSWAPKERNEL = zswap.c SGEMVNKERNEL = ../arm/gemv_n.c DGEMVNKERNEL = dgemv_n_4.c CGEMVNKERNEL = ../arm/zgemv_n.c -ZGEMVNKERNEL = zgemv_n_4.c +ZGEMVNKERNEL = ../arm/zgemv_n.c SGEMVTKERNEL = ../arm/gemv_t.c DGEMVTKERNEL = dgemv_t_4.c CGEMVTKERNEL = ../arm/zgemv_t.c -ZGEMVTKERNEL = zgemv_t_4.c +ZGEMVTKERNEL = ../arm/zgemv_t.c STRMMKERNEL = strmm8x4V.S DTRMMKERNEL = trmm8x4V.S diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14 new file mode 100644 index 000000000..fa88b6881 --- /dev/null +++ b/kernel/zarch/KERNEL.Z14 @@ -0,0 +1,146 @@ +SAMAXKERNEL = samax.c +DAMAXKERNEL = damax.c +CAMAXKERNEL = camax.c +ZAMAXKERNEL = zamax.c + +SAMINKERNEL = samin.c +DAMINKERNEL = damin.c +CAMINKERNEL = camin.c +ZAMINKERNEL = zamin.c + +SMAXKERNEL = smax.c +DMAXKERNEL = dmax.c + +SMINKERNEL = smin.c +DMINKERNEL = dmin.c + +ISAMAXKERNEL = isamax.c +IDAMAXKERNEL = idamax.c +ICAMAXKERNEL = icamax.c +IZAMAXKERNEL = izamax.c + +ISAMINKERNEL = isamin.c +IDAMINKERNEL = idamin.c +ICAMINKERNEL = icamin.c +IZAMINKERNEL = izamin.c + +ISMAXKERNEL = ismax.c +IDMAXKERNEL = idmax.c + +ISMINKERNEL = ismin.c +IDMINKERNEL = idmin.c + +SASUMKERNEL = sasum.c +DASUMKERNEL = dasum.c +CASUMKERNEL = casum.c +ZASUMKERNEL = zasum.c + +SAXPYKERNEL = saxpy.c +DAXPYKERNEL = daxpy.c +CAXPYKERNEL = caxpy.c +ZAXPYKERNEL = zaxpy.c + +SCOPYKERNEL = scopy.c +DCOPYKERNEL = dcopy.c +CCOPYKERNEL = ccopy.c +ZCOPYKERNEL = zcopy.c + +SDOTKERNEL = sdot.c +DDOTKERNEL = ddot.c +CDOTKERNEL = cdot.c +ZDOTKERNEL = zdot.c +DSDOTKERNEL = dsdot.c + +SNRM2KERNEL = ../arm/nrm2.c +DNRM2KERNEL = ../arm/nrm2.c +CNRM2KERNEL = ../arm/znrm2.c +ZNRM2KERNEL = ../arm/znrm2.c + +SROTKERNEL = srot.c +DROTKERNEL = drot.c +CROTKERNEL = crot.c +ZROTKERNEL = zrot.c + +SSCALKERNEL = sscal.c +DSCALKERNEL = dscal.c +CSCALKERNEL = cscal.c +ZSCALKERNEL = zscal.c + +SSWAPKERNEL = sswap.c +DSWAPKERNEL = dswap.c +CSWAPKERNEL = cswap.c +ZSWAPKERNEL = zswap.c + +SGEMVNKERNEL = sgemv_n_4.c +DGEMVNKERNEL = dgemv_n_4.c +CGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = ../arm/zgemv_n.c + +SGEMVTKERNEL = sgemv_t_4.c +DGEMVTKERNEL = dgemv_t_4.c +CGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = ../arm/zgemv_t.c + +STRMMKERNEL = strmm8x4V.S +DTRMMKERNEL = trmm8x4V.S +CTRMMKERNEL = ctrmm4x4V.S +ZTRMMKERNEL = ztrmm4x4V.S + +SGEMMKERNEL = strmm8x4V.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o + + + +DGEMMKERNEL = gemm8x4V.S +DGEMMINCOPY = ../generic/gemm_ncopy_8.c +DGEMMITCOPY = ../generic/gemm_tcopy_8.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = 
../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o + +CGEMMKERNEL = ctrmm4x4V.S +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o + +ZGEMMKERNEL = ztrmm4x4V.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + + + + diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c new file mode 100644 index 000000000..6394be769 --- /dev/null +++ b/kernel/zarch/camax.c @@ -0,0 +1,269 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
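+*****************************************************************************/
+
+/* Editorial sketch, not part of the original patch: a plain-C reference for
+   the CAMAX semantics that the vector kernel below implements, i.e. the
+   maximum of |re(x_i)| + |im(x_i)| over n interleaved complex singles.
+   The name camax_ref and the bare float/long types are assumptions, chosen
+   so the sketch stays self-contained ahead of the "common.h" include that
+   follows. */
+#include <math.h>
+
+static inline float camax_ref(long n, const float *x)
+{
+    float amax = fabsf(x[0]) + fabsf(x[1]);
+    long i;
+
+    for (i = 1; i < n; i++) {
+        float a = fabsf(x[2 * i]) + fabsf(x[2 * i + 1]);
+        if (a > amax)
+            amax = a; /* track the largest complex 1-norm seen so far */
+    }
+    return amax;
+}
+
+/***************************************************************************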
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+#define ABS fabs
+#else
+#define ABS fabsf
+#endif
+
+#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
+
+/* maximum of CABS1 over a block of 32 complex singles; n is a multiple of 32 */
+static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x)
+{
+    FLOAT amax;
+
+    __asm__ volatile (
+        "vlef %%v0,0(%2),0 \n\t"
+        "vlef %%v16,4(%2),0 \n\t"
+        "vlef %%v0,8(%2),1 \n\t"
+        "vlef %%v16,12(%2),1 \n\t"
+        "vlef %%v0,16(%2),2 \n\t"
+        "vlef %%v16,20(%2),2 \n\t"
+        "vlef %%v0,24(%2),3 \n\t"
+        "vlef %%v16,28(%2),3 \n\t"
+        "vflpsb %%v0,%%v0 \n\t"
+        "vflpsb %%v16,%%v16 \n\t"
+        "vfasb %%v0,%%v0,%%v16 \n\t"
+        "srlg %%r0,%1,5 \n\t"
+        "xgr %%r1,%%r1 \n\t"
+        "0: \n\t"
+        "pfd 1, 1024(%%r1,%2) \n\t"
+
+        "vlef %%v16,0(%%r1,%2),0 \n\t"
+        "vlef %%v17,4(%%r1,%2),0 \n\t"
+        "vlef %%v16,8(%%r1,%2),1 \n\t"
+        "vlef %%v17,12(%%r1,%2),1 \n\t"
+        "vlef %%v16,16(%%r1,%2),2 \n\t"
+        "vlef %%v17,20(%%r1,%2),2 \n\t"
+        "vlef %%v16,24(%%r1,%2),3 \n\t"
+        "vlef %%v17,28(%%r1,%2),3 \n\t"
+
+        "vlef %%v18,32(%%r1,%2),0 \n\t"
+        "vlef %%v19,36(%%r1,%2),0 \n\t"
+        "vlef %%v18,40(%%r1,%2),1 \n\t"
+        "vlef %%v19,44(%%r1,%2),1 \n\t"
+        "vlef %%v18,48(%%r1,%2),2 \n\t"
+        "vlef %%v19,52(%%r1,%2),2 \n\t"
+        "vlef %%v18,56(%%r1,%2),3 \n\t"
+        "vlef %%v19,60(%%r1,%2),3 \n\t"
+
+        "vlef %%v20,64(%%r1,%2),0 \n\t"
+        "vlef %%v21,68(%%r1,%2),0 \n\t"
+        "vlef %%v20,72(%%r1,%2),1 \n\t"
+        "vlef %%v21,76(%%r1,%2),1 \n\t"
+        "vlef %%v20,80(%%r1,%2),2 \n\t"
+        "vlef %%v21,84(%%r1,%2),2 \n\t"
+        "vlef %%v20,88(%%r1,%2),3 \n\t"
+        "vlef %%v21,92(%%r1,%2),3 \n\t"
+
+        "vlef %%v22,96(%%r1,%2),0 \n\t"
+        "vlef %%v23,100(%%r1,%2),0 \n\t"
+        "vlef %%v22,104(%%r1,%2),1 \n\t"
+        "vlef %%v23,108(%%r1,%2),1 \n\t"
+        "vlef %%v22,112(%%r1,%2),2 \n\t"
+        "vlef %%v23,116(%%r1,%2),2 \n\t"
+        "vlef %%v22,120(%%r1,%2),3 \n\t"
+        "vlef %%v23,124(%%r1,%2),3 \n\t"
+
+        "vflpsb %%v16, %%v16 \n\t"
+        "vflpsb %%v17, %%v17 \n\t"
+        "vflpsb %%v18, %%v18 \n\t"
+        "vflpsb %%v19, %%v19 \n\t"
+        "vflpsb %%v20, %%v20 \n\t"
+        "vflpsb %%v21, %%v21 \n\t"
+        "vflpsb %%v22, %%v22 \n\t"
+        "vflpsb %%v23, %%v23 \n\t"
+        "vfasb %%v16,%%v16,%%v17 \n\t"
+        "vfasb %%v17,%%v18,%%v19 \n\t"
+        "vfasb %%v18,%%v20,%%v21 \n\t"
+        "vfasb %%v19,%%v22,%%v23 \n\t"
+
+        "vfchsb %%v24,%%v16,%%v17 \n\t"
+        "vfchsb %%v25,%%v18,%%v19 \n\t"
+        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+
+        "vfchsb %%v26,%%v24,%%v25 \n\t"
+        "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
+
+        "vfchsb %%v27,%%v26,%%v0 \n\t"
+        "vsel %%v0,%%v26,%%v0,%%v27 \n\t"
+
+        "vlef %%v16,128(%%r1,%2),0 \n\t"
+        "vlef %%v17,132(%%r1,%2),0 \n\t"
+        "vlef %%v16,136(%%r1,%2),1 \n\t"
+        "vlef %%v17,140(%%r1,%2),1 \n\t"
+        "vlef %%v16,144(%%r1,%2),2 \n\t"
+        "vlef %%v17,148(%%r1,%2),2 \n\t"
+        "vlef %%v16,152(%%r1,%2),3 \n\t"
+        "vlef %%v17,156(%%r1,%2),3 \n\t"
+
+        "vlef %%v18,160(%%r1,%2),0 \n\t"
+        "vlef %%v19,164(%%r1,%2),0 \n\t"
+        "vlef %%v18,168(%%r1,%2),1 \n\t"
+        "vlef %%v19,172(%%r1,%2),1 \n\t"
+        "vlef %%v18,176(%%r1,%2),2 \n\t"
+        "vlef %%v19,180(%%r1,%2),2 \n\t"
+        "vlef %%v18,184(%%r1,%2),3 \n\t"
+        "vlef %%v19,188(%%r1,%2),3 \n\t"
+
+        "vlef %%v20,192(%%r1,%2),0 \n\t"
+        "vlef %%v21,196(%%r1,%2),0 \n\t"
+        "vlef %%v20,200(%%r1,%2),1 \n\t"
+        "vlef %%v21,204(%%r1,%2),1 \n\t"
+        "vlef %%v20,208(%%r1,%2),2 \n\t"
+        "vlef %%v21,212(%%r1,%2),2 \n\t"
+        "vlef %%v20,216(%%r1,%2),3 \n\t"
+        "vlef %%v21,220(%%r1,%2),3 \n\t"
+
+        "vlef %%v22,224(%%r1,%2),0 \n\t"
+        "vlef %%v23,228(%%r1,%2),0 \n\t"
+        "vlef %%v22,232(%%r1,%2),1 \n\t"
+        "vlef %%v23,236(%%r1,%2),1 \n\t"
+        "vlef %%v22,240(%%r1,%2),2 \n\t"
+        "vlef %%v23,244(%%r1,%2),2 \n\t"
+        "vlef %%v22,248(%%r1,%2),3 \n\t"
+        "vlef %%v23,252(%%r1,%2),3 \n\t"
+
+        "vflpsb %%v16, %%v16 \n\t"
+        "vflpsb %%v17, %%v17 \n\t"
+        "vflpsb %%v18, %%v18 \n\t"
+        "vflpsb %%v19, %%v19 \n\t"
+        "vflpsb %%v20, %%v20 \n\t"
+        "vflpsb %%v21, %%v21 \n\t"
+        "vflpsb %%v22, %%v22 \n\t"
+        "vflpsb %%v23, %%v23 \n\t"
+        "vfasb %%v16,%%v16,%%v17 \n\t"
+        "vfasb %%v17,%%v18,%%v19 \n\t"
+        "vfasb %%v18,%%v20,%%v21 \n\t"
+        "vfasb %%v19,%%v22,%%v23 \n\t"
+
+        "vfchsb %%v24,%%v16,%%v17 \n\t"
+        "vfchsb %%v25,%%v18,%%v19 \n\t"
+        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+
+        "vfchsb %%v26,%%v24,%%v25 \n\t"
+        "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
+
+        "vfchsb %%v27,%%v26,%%v0 \n\t"
+        "vsel %%v0,%%v26,%%v0,%%v27 \n\t"
+
+        "agfi %%r1, 256 \n\t"
+        "brctg %%r0, 0b \n\t"
+
+        "veslg %%v16,%%v0,32 \n\t"
+        "vfchsb %%v17,%%v16,%%v0 \n\t"
+        "vsel %%v0,%%v16,%%v0,%%v17 \n\t"
+
+        "vrepf %%v16,%%v0,2 \n\t"
+        "wfchsb %%v17,%%v16,%%v0 \n\t"
+        "vsel %%v0,%%v16,%%v0,%%v17 \n\t"
+        "ler %0,%%f0 "
+        :"=f"(amax)
+        :"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
+        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
+    );
+
+    return amax;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
+    BLASLONG i = 0;
+    BLASLONG j = 0;
+    FLOAT maxf = 0.0;
+    BLASLONG inc_x2;
+
+    if (n <= 0 || inc_x <= 0) return (maxf);
+
+    if (inc_x == 1) {
+
+        BLASLONG n1 = n & -32;
+        if (n1 > 0) {
+
+            maxf = camax_kernel_32(n1, x);
+
+            i = n1;
+        }
+        else
+        {
+            maxf=CABS1(x,0);
+            i++;
+        }
+
+        while (i < n) {
+            if (CABS1(x,i*2) > maxf) {
+                maxf = CABS1(x,i*2);
+            }
+            i++;
+        }
+        return (maxf);
+
+    } else {
+
+        inc_x2 = 2 * inc_x;
+        maxf=CABS1(x,0);
+        i += inc_x2;
+        j++;
+
+        BLASLONG n1 = (n - 1) & -4;
+        while (j < n1) {
+
+            if (CABS1(x,i) > maxf) {
+                maxf = CABS1(x,i);
+            }
+            if (CABS1(x,i+inc_x2) > maxf) {
+                maxf = CABS1(x,i+inc_x2);
+            }
+            if (CABS1(x,i+inc_x2*2) > maxf) {
+                maxf = CABS1(x,i+inc_x2*2);
+            }
+            if (CABS1(x,i+inc_x2*3) > maxf) {
+                maxf = CABS1(x,i+inc_x2*3);
+            }
+
+            i += inc_x2 * 4;
+
+            j += 4;
+
+        }
+
+
+        while (j < n) {
+            if (CABS1(x,i) > maxf) {
+                maxf = CABS1(x,i);
+            }
+            i += inc_x2;
+            j++;
+        }
+        return (maxf);
+    }
+}
diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c
new file mode 100644
index 000000000..936c300c8
--- /dev/null
+++ b/kernel/zarch/camin.c
@@ -0,0 +1,269 @@
+/***************************************************************************
+Copyright (c) 2013-2017, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+#define ABS fabs
+#else
+#define ABS fabsf
+#endif
+
+#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
+
+/* minimum of CABS1 over a block of 32 complex singles; n is a multiple of 32 */
+static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x)
+{
+    FLOAT amin;
+
+    __asm__ volatile (
+        "vlef %%v0,0(%2),0 \n\t"
+        "vlef %%v16,4(%2),0 \n\t"
+        "vlef %%v0,8(%2),1 \n\t"
+        "vlef %%v16,12(%2),1 \n\t"
+        "vlef %%v0,16(%2),2 \n\t"
+        "vlef %%v16,20(%2),2 \n\t"
+        "vlef %%v0,24(%2),3 \n\t"
+        "vlef %%v16,28(%2),3 \n\t"
+        "vflpsb %%v0,%%v0 \n\t"
+        "vflpsb %%v16,%%v16 \n\t"
+        "vfasb %%v0,%%v0,%%v16 \n\t"
+        "srlg %%r0,%1,5 \n\t"
+        "xgr %%r1,%%r1 \n\t"
+        "0: \n\t"
+        "pfd 1, 1024(%%r1,%2) \n\t"
+
+        "vlef %%v16,0(%%r1,%2),0 \n\t"
+        "vlef %%v17,4(%%r1,%2),0 \n\t"
+        "vlef %%v16,8(%%r1,%2),1 \n\t"
+        "vlef %%v17,12(%%r1,%2),1 \n\t"
+        "vlef %%v16,16(%%r1,%2),2 \n\t"
+        "vlef %%v17,20(%%r1,%2),2 \n\t"
+        "vlef %%v16,24(%%r1,%2),3 \n\t"
+        "vlef %%v17,28(%%r1,%2),3 \n\t"
+
+        "vlef %%v18,32(%%r1,%2),0 \n\t"
+        "vlef %%v19,36(%%r1,%2),0 \n\t"
+        "vlef %%v18,40(%%r1,%2),1 \n\t"
+        "vlef %%v19,44(%%r1,%2),1 \n\t"
+        "vlef %%v18,48(%%r1,%2),2 \n\t"
+        "vlef %%v19,52(%%r1,%2),2 \n\t"
+        "vlef %%v18,56(%%r1,%2),3 \n\t"
+        "vlef %%v19,60(%%r1,%2),3 \n\t"
+
+        "vlef %%v20,64(%%r1,%2),0 \n\t"
+        "vlef %%v21,68(%%r1,%2),0 \n\t"
+        "vlef %%v20,72(%%r1,%2),1 \n\t"
+        "vlef %%v21,76(%%r1,%2),1 \n\t"
+        "vlef %%v20,80(%%r1,%2),2 \n\t"
+        "vlef %%v21,84(%%r1,%2),2 \n\t"
+        "vlef %%v20,88(%%r1,%2),3 \n\t"
+        "vlef %%v21,92(%%r1,%2),3 \n\t"
+
+        "vlef %%v22,96(%%r1,%2),0 \n\t"
+        "vlef %%v23,100(%%r1,%2),0 \n\t"
+        "vlef %%v22,104(%%r1,%2),1 \n\t"
+        "vlef %%v23,108(%%r1,%2),1 \n\t"
+        "vlef %%v22,112(%%r1,%2),2 \n\t"
+        "vlef %%v23,116(%%r1,%2),2 \n\t"
+        "vlef %%v22,120(%%r1,%2),3 \n\t"
+        "vlef %%v23,124(%%r1,%2),3 \n\t"
+
+        "vflpsb %%v16, %%v16 \n\t"
+        "vflpsb %%v17, %%v17 \n\t"
+        "vflpsb %%v18, %%v18 \n\t"
+        "vflpsb %%v19, %%v19 \n\t"
+        "vflpsb %%v20, %%v20 \n\t"
+        "vflpsb %%v21, %%v21 \n\t"
+        "vflpsb %%v22, %%v22 \n\t"
+        "vflpsb %%v23, %%v23 \n\t"
+        "vfasb %%v16,%%v16,%%v17 \n\t"
+        "vfasb %%v17,%%v18,%%v19 \n\t"
+        "vfasb %%v18,%%v20,%%v21 \n\t"
+        "vfasb %%v19,%%v22,%%v23 \n\t"
+
+        "vfchsb %%v24,%%v17,%%v16 \n\t"
+        "vfchsb %%v25,%%v19,%%v18 \n\t"
+        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+
+        "vfchsb %%v26,%%v25,%%v24 \n\t"
+        "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
+
+        "vfchsb %%v27,%%v0,%%v26 \n\t"
+        "vsel %%v0,%%v26,%%v0,%%v27 \n\t"
+
+        "vlef %%v16,128(%%r1,%2),0 \n\t"
+        "vlef %%v17,132(%%r1,%2),0 \n\t"
+        "vlef %%v16,136(%%r1,%2),1 \n\t"
+        "vlef %%v17,140(%%r1,%2),1 \n\t"
+        "vlef %%v16,144(%%r1,%2),2 \n\t"
+        "vlef %%v17,148(%%r1,%2),2 \n\t"
+        "vlef %%v16,152(%%r1,%2),3 \n\t"
+        "vlef %%v17,156(%%r1,%2),3 \n\t"
+
+        "vlef %%v18,160(%%r1,%2),0 \n\t"
+        "vlef %%v19,164(%%r1,%2),0 \n\t"
+        "vlef %%v18,168(%%r1,%2),1 \n\t"
+        "vlef %%v19,172(%%r1,%2),1 \n\t"
+        "vlef %%v18,176(%%r1,%2),2 \n\t"
+        "vlef %%v19,180(%%r1,%2),2 \n\t"
+        "vlef %%v18,184(%%r1,%2),3 \n\t"
+        "vlef %%v19,188(%%r1,%2),3 \n\t"
+
+        "vlef %%v20,192(%%r1,%2),0 \n\t"
+        "vlef %%v21,196(%%r1,%2),0 \n\t"
+        "vlef %%v20,200(%%r1,%2),1 \n\t"
+        "vlef %%v21,204(%%r1,%2),1 \n\t"
+        "vlef %%v20,208(%%r1,%2),2 \n\t"
+        "vlef %%v21,212(%%r1,%2),2 \n\t"
+        "vlef %%v20,216(%%r1,%2),3 \n\t"
+        "vlef %%v21,220(%%r1,%2),3 \n\t"
+
+        "vlef %%v22,224(%%r1,%2),0 \n\t"
+        "vlef %%v23,228(%%r1,%2),0 \n\t"
+        "vlef %%v22,232(%%r1,%2),1 \n\t"
+        "vlef %%v23,236(%%r1,%2),1 \n\t"
+        "vlef %%v22,240(%%r1,%2),2 \n\t"
+        "vlef %%v23,244(%%r1,%2),2 \n\t"
+        "vlef %%v22,248(%%r1,%2),3 \n\t"
+        "vlef %%v23,252(%%r1,%2),3 \n\t"
+
+        "vflpsb %%v16, %%v16 \n\t"
+        "vflpsb %%v17, %%v17 \n\t"
+        "vflpsb %%v18, %%v18 \n\t"
+        "vflpsb %%v19, %%v19 \n\t"
+        "vflpsb %%v20, %%v20 \n\t"
+        "vflpsb %%v21, %%v21 \n\t"
+        "vflpsb %%v22, %%v22 \n\t"
+        "vflpsb %%v23, %%v23 \n\t"
+        "vfasb %%v16,%%v16,%%v17 \n\t"
+        "vfasb %%v17,%%v18,%%v19 \n\t"
+        "vfasb %%v18,%%v20,%%v21 \n\t"
+        "vfasb %%v19,%%v22,%%v23 \n\t"
+
+        "vfchsb %%v24,%%v17,%%v16 \n\t"
+        "vfchsb %%v25,%%v19,%%v18 \n\t"
+        "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
+        "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
+
+        "vfchsb %%v26,%%v25,%%v24 \n\t"
+        "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
+
+        "vfchsb %%v27,%%v0,%%v26 \n\t"
+        "vsel %%v0,%%v26,%%v0,%%v27 \n\t"
+
+        "agfi %%r1, 256 \n\t"
+        "brctg %%r0, 0b \n\t"
+
+        "veslg %%v16,%%v0,32 \n\t"
+        "vfchsb %%v17,%%v0,%%v16 \n\t"
+        "vsel %%v0,%%v16,%%v0,%%v17 \n\t"
+
+        "vrepf %%v16,%%v0,2 \n\t"
+        "wfchsb %%v17,%%v0,%%v16 \n\t"
+        "vsel %%v0,%%v16,%%v0,%%v17 \n\t"
+        "ler %0,%%f0 "
+        :"=f"(amin)
+        :"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
+        :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27"
+    );
+
+    return amin;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) {
+    BLASLONG i = 0;
+    BLASLONG j = 0;
+    FLOAT minf = 0.0;
+    BLASLONG inc_x2;
+
+    if (n <= 0 || inc_x <= 0) return (minf);
+
+    if (inc_x == 1) {
+
+        BLASLONG n1 = n & -32;
+        if (n1 > 0) {
+
+            minf = camin_kernel_32(n1, x);
+
+            i = n1;
+        }
+        else
+        {
+            minf=CABS1(x,0);
+            i++;
+        }
+
+        while (i < n) {
+            if (CABS1(x,i*2) < minf) {
+                minf = CABS1(x,i*2);
+            }
+            i++;
+        }
+        return (minf);
+
+    } else {
+
+        inc_x2 = 2 * inc_x;
+        minf=CABS1(x,0);
+        i += inc_x2;
+        j++;
+
+        BLASLONG n1 = (n - 1) & -4;
+        while (j < n1) {
+
+            if (CABS1(x,i) < minf) {
+                minf = CABS1(x,i);
+            }
+            if (CABS1(x,i+inc_x2) < minf) {
+                minf = CABS1(x,i+inc_x2);
+            }
+            if (CABS1(x,i+inc_x2*2) < minf) {
+                minf = CABS1(x,i+inc_x2*2);
+            }
+            if (CABS1(x,i+inc_x2*3) < minf) {
+                minf = CABS1(x,i+inc_x2*3);
+            }
+
+            i += inc_x2 * 4;
+
+            j += 4;
+
+        }
+
+
+        while (j < n) {
+            if (CABS1(x,i) < minf) {
+                minf = CABS1(x,i);
+            }
+            i += inc_x2;
+            j++;
+        }
+        return (minf);
+    }
+}
diff --git a/kernel/zarch/casum.c b/kernel/zarch/casum.c
new file mode 100644
index 000000000..f4ebc21bd
--- /dev/null
+++ b/kernel/zarch/casum.c
@@ -0,0 +1,167 @@
+/***************************************************************************
+Copyright (c) 2013-2017, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+#define ABS fabs
+#else
+#define ABS fabsf
+#endif
+
+static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x)
+{
+    FLOAT asum;
+
+    __asm__ (
+        "vzero %%v0 \n\t"
+        "vzero %%v1 \n\t"
+        "vzero %%v2 \n\t"
+        "vzero %%v3 \n\t"
+        "srlg %%r0,%1,5 \n\t"
+        "xgr %%r1,%%r1 \n\t"
+        "0: \n\t"
+        "pfd 1, 1024(%%r1,%2) \n\t"
+        "vl %%v16, 0(%%r1,%2) \n\t"
+        "vl %%v17, 16(%%r1,%2) \n\t"
+        "vl %%v18, 32(%%r1,%2) \n\t"
+        "vl %%v19, 48(%%r1,%2) \n\t"
+        "vl %%v20, 64(%%r1,%2) \n\t"
+        "vl %%v21, 80(%%r1,%2) \n\t"
+        "vl %%v22, 96(%%r1,%2) \n\t"
+        "vl %%v23, 112(%%r1,%2) \n\t"
+
+        "vflpsb %%v16, %%v16 \n\t"
+        "vflpsb %%v17, %%v17 \n\t"
+        "vflpsb %%v18, %%v18 \n\t"
+        "vflpsb %%v19, %%v19 \n\t"
+        "vflpsb %%v20, %%v20 \n\t"
+        "vflpsb %%v21, %%v21 \n\t"
+        "vflpsb %%v22, %%v22 \n\t"
+        "vflpsb %%v23, %%v23 \n\t"
+
+        "vfasb %%v0,%%v0,%%v16 \n\t"
+        "vfasb %%v1,%%v1,%%v17 \n\t"
+        "vfasb %%v2,%%v2,%%v18 \n\t"
+        "vfasb %%v3,%%v3,%%v19 \n\t"
+        "vfasb %%v0,%%v0,%%v20 \n\t"
+        "vfasb %%v1,%%v1,%%v21 \n\t"
+        "vfasb %%v2,%%v2,%%v22 \n\t"
+        "vfasb %%v3,%%v3,%%v23 \n\t"
+
+        "vl %%v16, 128(%%r1,%2) \n\t"
+        "vl %%v17, 144(%%r1,%2) \n\t"
+        "vl %%v18, 160(%%r1,%2) \n\t"
+        "vl %%v19, 176(%%r1,%2) \n\t"
+        "vl %%v20, 192(%%r1,%2) \n\t"
+        "vl %%v21, 208(%%r1,%2) \n\t"
+        "vl %%v22, 224(%%r1,%2) \n\t"
+        "vl %%v23, 240(%%r1,%2) \n\t"
+
+        "vflpsb %%v16, %%v16 \n\t"
+        "vflpsb %%v17, %%v17 \n\t"
+        "vflpsb %%v18, %%v18 \n\t"
+        "vflpsb %%v19, %%v19 \n\t"
+        "vflpsb %%v20, %%v20 \n\t"
+        "vflpsb %%v21, %%v21 \n\t"
+        "vflpsb %%v22, %%v22 \n\t"
+        "vflpsb %%v23, %%v23 \n\t"
+
+        "vfasb %%v0,%%v0,%%v16 \n\t"
+        "vfasb %%v1,%%v1,%%v17 \n\t"
+        "vfasb %%v2,%%v2,%%v18 \n\t"
+        "vfasb %%v3,%%v3,%%v19 \n\t"
+        "vfasb %%v0,%%v0,%%v20 \n\t"
+        "vfasb %%v1,%%v1,%%v21 \n\t"
+        "vfasb %%v2,%%v2,%%v22 \n\t"
+        "vfasb %%v3,%%v3,%%v23 \n\t"
+
+        "agfi %%r1,256 \n\t"
+        "brctg %%r0,0b \n\t"
+        "vfasb %%v0,%%v0,%%v1 \n\t"
+        "vfasb %%v0,%%v0,%%v2 \n\t"
+        "vfasb %%v0,%%v0,%%v3 \n\t"
+        "veslg %%v1,%%v0,32 \n\t"
+        "vfasb %%v0,%%v0,%%v1 \n\t"
+        "vrepf %%v1,%%v0,2 \n\t"
+        "aebr %%f0,%%f1 \n\t"
+        "ler %0,%%f0 "
+        :"=f"(asum)
+        :"r"(n),"ZR"((const FLOAT (*)[n * 2])x)
+        :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23"
+    );
+
+    return asum;
+}
+
+FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
+{
+    BLASLONG i=0;
+    BLASLONG ip=0;
+    FLOAT sumf = 0.0;
+    BLASLONG n1;
+    BLASLONG inc_x2;
+
+    if (n <= 0 || inc_x <= 0) return(sumf);
+
+    if ( inc_x == 1 )
+    {
+
+        n1 = n & -32;
+        if ( n1 >
0 ) + { + + sumf = casum_kernel_32(n1, x); + i=n1; + ip=2*n1; + } + + while(i < n) + { + sumf += ABS(x[ip]) + ABS(x[ip+1]); + i++; + ip+=2; + } + + } + else + { + inc_x2 = 2* inc_x; + + while(i < n) + { + sumf += ABS(x[ip]) + ABS(x[ip+1]); + ip+=inc_x2; + i++; + } + + } + return(sumf); +} + + diff --git a/kernel/zarch/caxpy.c b/kernel/zarch/caxpy.c new file mode 100644 index 000000000..2176f3dcd --- /dev/null +++ b/kernel/zarch/caxpy.c @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
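+*****************************************************************************/
+
+/* Editorial sketch, not part of the original patch: the scalar recurrence
+   that the caxpy vector kernel below implements for y := alpha*x + y on
+   interleaved (re,im) single-precision pairs, shown for the non-conjugated
+   case. The name caxpy_ref and the bare float/long types are assumptions,
+   chosen so the sketch stays self-contained ahead of the "common.h" include
+   that follows. */
+static inline void caxpy_ref(long n, float ar, float ai,
+                             const float *x, float *y)
+{
+    long i;
+
+    for (i = 0; i < n; i++) {
+        /* complex multiply-accumulate: y_i += (ar + I*ai) * x_i */
+        y[2 * i]     += ar * x[2 * i]     - ai * x[2 * i + 1];
+        y[2 * i + 1] += ar * x[2 * i + 1] + ai * x[2 * i];
+    }
+}
+
+/***************************************************************************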
+*****************************************************************************/ + +#include "common.h" + +static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile( +#if !defined(CONJ) + "vlrepf %%v0,0(%3) \n\t" + "vlef %%v1,4(%3),0 \n\t" + "vlef %%v1,4(%3),2 \n\t" + "vflcsb %%v1,%%v1 \n\t" + "vlef %%v1,4(%3),1 \n\t" + "vlef %%v1,4(%3),3 \n\t" +#else + "vlef %%v0,0(%3),1 \n\t" + "vlef %%v0,0(%3),3 \n\t" + "vflcsb %%v0,%%v0 \n\t" + "vlef %%v0,0(%3),0 \n\t" + "vlef %%v0,0(%3),2 \n\t" + "vlrepf %%v1,4(%3) \n\t" +#endif + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,0(%%r1,%2) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,32(%%r1,%2) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + "verllg %%v24,%%v16,32 \n\t" + "verllg %%v25,%%v17,32 \n\t" + "verllg %%v26,%%v18,32 \n\t" + "verllg %%v27,%%v19,32 \n\t" + + "vfmasb %%v28,%%v16,%%v0,%%v20 \n\t" + "vfmasb %%v29,%%v17,%%v0,%%v21 \n\t" + "vfmasb %%v30,%%v18,%%v0,%%v22 \n\t" + "vfmasb %%v31,%%v19,%%v0,%%v23 \n\t" + + "vfmasb %%v28,%%v24,%%v1,%%v28 \n\t" + "vfmasb %%v29,%%v25,%%v1,%%v29 \n\t" + "vfmasb %%v30,%%v26,%%v1,%%v30 \n\t" + "vfmasb %%v31,%%v27,%%v1,%%v31 \n\t" + + "vst %%v28,0(%%r1,%2) \n\t" + "vst %%v29,16(%%r1,%2) \n\t" + "vst %%v30,32(%%r1,%2) \n\t" + "vst %%v31,48(%%r1,%2) \n\t" + + "vl %%v16,64(%%r1,%1) \n\t" + "vl %%v17,80(%%r1,%1) \n\t" + "vl %%v18,96(%%r1,%1) \n\t" + "vl %%v19,112(%%r1,%1) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "verllg %%v24,%%v16,32 \n\t" + "verllg %%v25,%%v17,32 \n\t" + "verllg %%v26,%%v18,32 \n\t" + "verllg %%v27,%%v19,32 \n\t" + + "vfmasb %%v28,%%v16,%%v0,%%v20 \n\t" + "vfmasb %%v29,%%v17,%%v0,%%v21 \n\t" + "vfmasb %%v30,%%v18,%%v0,%%v22 \n\t" + "vfmasb %%v31,%%v19,%%v0,%%v23 \n\t" + + "vfmasb %%v28,%%v24,%%v1,%%v28 \n\t" + "vfmasb %%v29,%%v25,%%v1,%%v29 \n\t" + "vfmasb %%v30,%%v26,%%v1,%%v30 \n\t" + "vfmasb %%v31,%%v27,%%v1,%%v31 \n\t" + + "vst %%v28,64(%%r1,%2) \n\t" + "vst %%v29,80(%%r1,%2) \n\t" + "vst %%v30,96(%%r1,%2) \n\t" + "vst %%v31,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"a"(alpha) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT da[2]; + + if (n <= 0) return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -16; + + if (n1) { + da[0] = da_r; + da[1] = da_i; + caxpy_kernel_16(n1, x, y, da); + ix = 2 * n1; + } + i = n1; + while (i < n) { +#if !defined(CONJ) + y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); + y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); +#else + y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); + y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); +#endif + i++; + ix += 2; + + } + return (0); + + + } + + inc_x *= 2; + inc_y *= 2; + + while (i < n) { + +#if !defined(CONJ) + y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); + y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); +#else + y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); + y[iy + 1] -= (da_r * x[ix + 1] - da_i * 
x[ix]);
+#endif
+        ix += inc_x;
+        iy += inc_y;
+        i++;
+
+    }
+    return (0);
+
+}
+
+
diff --git a/kernel/zarch/ccopy.c b/kernel/zarch/ccopy.c
new file mode 100644
index 000000000..fc0b8d648
--- /dev/null
+++ b/kernel/zarch/ccopy.c
@@ -0,0 +1,99 @@
+/***************************************************************************
+Copyright (c) 2013-2018, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+
+/* copy 32 complex singles (256 bytes) per iteration with mvc */
+static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y)
+{
+    __asm__ volatile (
+        "lgr %%r1,%1 \n\t"
+        "lgr %%r2,%2 \n\t"
+        "srlg %%r0,%0,5 \n\t"
+        "0: \n\t"
+        "pfd 1, 1024(%%r1) \n\t"
+        "pfd 2, 1024(%%r2) \n\t"
+        "mvc 0(256,%%r2),0(%%r1) \n\t"
+        "agfi %%r1,256 \n\t"
+        "agfi %%r2,256 \n\t"
+        "brctg %%r0,0b "
+        :
+        :"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y)
+        :"memory","cc","r0","r1","r2"
+    );
+}
+
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
+{
+    BLASLONG i=0;
+    BLASLONG ix=0,iy=0;
+
+    if ( n <= 0 ) return(0);
+
+    if ( (inc_x == 1) && (inc_y == 1 ))
+    {
+
+        BLASLONG n1 = n & -32;
+        if ( n1 > 0 )
+        {
+            ccopy_kernel_32(n1, x, y);
+            i=n1;
+            ix=n1*2;
+            iy=n1*2;
+        }
+
+        while(i < n)
+        {
+            y[iy] = x[ix] ;
+            y[iy+1] = x[ix+1] ;
+            ix+=2;
+            iy+=2;
+            i++ ;
+
+        }
+
+
+    }
+    else
+    {
+
+        BLASLONG inc_x2 = 2 * inc_x;
+        BLASLONG inc_y2 = 2 * inc_y;
+
+        while(i < n)
+        {
+            y[iy] = x[ix] ;
+            y[iy+1] = x[ix+1] ;
+            ix += inc_x2 ;
+            iy += inc_y2 ;
+            i++ ;
+
+        }
+
+    }
+
+    return(0);
+}
diff --git a/kernel/zarch/cdot.c b/kernel/zarch/cdot.c
new file mode 100644
index 000000000..3eda2979b
--- /dev/null
+++ b/kernel/zarch/cdot.c
@@ -0,0 +1,182 @@
+/***************************************************************************
+Copyright (c) 2013-2017, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1.
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ + __asm__ volatile( + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "vzero %%v28 \n\t" + "vzero %%v29 \n\t" + "vzero %%v30 \n\t" + "vzero %%v31 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%1) \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "verllg %%v20,%%v16,32 \n\t" + "verllg %%v21,%%v17,32 \n\t" + "verllg %%v22,%%v18,32 \n\t" + "verllg %%v23,%%v19,32 \n\t" + + "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" + "vfmasb %%v25,%%v20,%%v0,%%v25 \n\t" + "vfmasb %%v26,%%v17,%%v1,%%v26 \n\t" + "vfmasb %%v27,%%v21,%%v1,%%v27 \n\t" + "vfmasb %%v28,%%v18,%%v2,%%v28 \n\t" + "vfmasb %%v29,%%v22,%%v2,%%v29 \n\t" + "vfmasb %%v30,%%v19,%%v3,%%v30 \n\t" + "vfmasb %%v31,%%v23,%%v3,%%v31 \n\t" + + "vl %%v16, 64(%%r1,%1) \n\t" + "vl %%v17, 80(%%r1,%1) \n\t" + "vl %%v18, 96(%%r1,%1) \n\t" + "vl %%v19, 112(%%r1,%1) \n\t" + "vl %%v0, 64(%%r1,%2) \n\t" + "vl %%v1, 80(%%r1,%2) \n\t" + "vl %%v2, 96(%%r1,%2) \n\t" + "vl %%v3, 112(%%r1,%2) \n\t" + "verllg %%v20,%%v16,32 \n\t" + "verllg %%v21,%%v17,32 \n\t" + "verllg %%v22,%%v18,32 \n\t" + "verllg %%v23,%%v19,32 \n\t" + + "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" + "vfmasb %%v25,%%v20,%%v0,%%v25 \n\t" + "vfmasb %%v26,%%v17,%%v1,%%v26 \n\t" + "vfmasb %%v27,%%v21,%%v1,%%v27 \n\t" + "vfmasb %%v28,%%v18,%%v2,%%v28 \n\t" + "vfmasb %%v29,%%v22,%%v2,%%v29 \n\t" + "vfmasb %%v30,%%v19,%%v3,%%v30 \n\t" + "vfmasb %%v31,%%v23,%%v3,%%v31 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + "vfasb %%v24,%%v24,%%v26 \n\t" + "vfasb %%v24,%%v24,%%v28 \n\t" + "vfasb %%v24,%%v24,%%v30 \n\t" + "vrepg %%v26,%%v24,1 \n\t" + "vfasb %%v24,%%v24,%%v26 \n\t" + "vfasb %%v25,%%v25,%%v27 \n\t" + "vfasb %%v25,%%v25,%%v29 \n\t" + "vfasb %%v25,%%v25,%%v31 \n\t" + "vrepg %%v27,%%v25,1 \n\t" + "vfasb %%v25,%%v25,%%v27 \n\t" + "vstef %%v24,0(%3),0 \n\t" + 
"vstef %%v24,4(%3),1 \n\t" + "vstef %%v25,8(%3),1 \n\t" + "vstef %%v25,12(%3),0 " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i; + BLASLONG ix, iy; + OPENBLAS_COMPLEX_FLOAT result; + FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; + + if (n <= 0) { + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + return (result); + + } + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -16; + + if (n1) + cdot_kernel_16(n1, x, y, dot); + + i = n1; + BLASLONG j = i * 2; + + while (i < n) { + + dot[0] += x[j] * y[j]; + dot[1] += x[j + 1] * y[j + 1]; + dot[2] += x[j] * y[j + 1]; + dot[3] += x[j + 1] * y[j]; + + j += 2; + i++; + + } + + + } else { + i = 0; + ix = 0; + iy = 0; + inc_x <<= 1; + inc_y <<= 1; + while (i < n) { + + dot[0] += x[ix] * y[iy]; + dot[1] += x[ix + 1] * y[iy + 1]; + dot[2] += x[ix] * y[iy + 1]; + dot[3] += x[ix + 1] * y[iy]; + + ix += inc_x; + iy += inc_y; + i++; + + } + } + +#if !defined(CONJ) + CREAL(result) = dot[0] - dot[1]; + CIMAG(result) = dot[2] + dot[3]; +#else + CREAL(result) = dot[0] + dot[1]; + CIMAG(result) = dot[2] - dot[3]; + +#endif + + return (result); + +} + + diff --git a/kernel/zarch/crot.c b/kernel/zarch/crot.c new file mode 100644 index 000000000..f04a624ac --- /dev/null +++ b/kernel/zarch/crot.c @@ -0,0 +1,256 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) +{ + __asm__ ( + "vlrepf %%v0,%3 \n\t" + "vlrepf %%v1,%4 \n\t" + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%1) \n\t" + "vl %%v25, 16(%%r1,%1) \n\t" + "vl %%v26, 32(%%r1,%1) \n\t" + "vl %%v27, 48(%%r1,%1) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 0(%%r1,%1) \n\t" + "vst %%v29, 16(%%r1,%1) \n\t" + "vst %%v30, 32(%%r1,%1) \n\t" + "vst %%v31, 48(%%r1,%1) \n\t" + "vst %%v20, 0(%%r1,%2) \n\t" + "vst %%v21, 16(%%r1,%2) \n\t" + "vst %%v22, 32(%%r1,%2) \n\t" + "vst %%v23, 48(%%r1,%2) \n\t" + + "vl %%v24, 64(%%r1,%1) \n\t" + "vl %%v25, 80(%%r1,%1) \n\t" + "vl %%v26, 96(%%r1,%1) \n\t" + "vl %%v27, 112(%%r1,%1) \n\t" + "vl %%v16, 64(%%r1,%2) \n\t" + "vl %%v17, 80(%%r1,%2) \n\t" + "vl %%v18, 96(%%r1,%2) \n\t" + "vl %%v19, 112(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 64(%%r1,%1) \n\t" + "vst %%v29, 80(%%r1,%1) \n\t" + "vst %%v30, 96(%%r1,%1) \n\t" + "vst %%v31, 112(%%r1,%1) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* 
yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 128(%%r1,%1) \n\t" + "vst %%v29, 144(%%r1,%1) \n\t" + "vst %%v30, 160(%%r1,%1) \n\t" + "vst %%v31, 176(%%r1,%1) \n\t" + "vst %%v20, 128(%%r1,%2) \n\t" + "vst %%v21, 144(%%r1,%2) \n\t" + "vst %%v22, 160(%%r1,%2) \n\t" + "vst %%v23, 176(%%r1,%2) \n\t" + + "vl %%v24, 192(%%r1,%1) \n\t" + "vl %%v25, 208(%%r1,%1) \n\t" + "vl %%v26, 224(%%r1,%1) \n\t" + "vl %%v27, 240(%%r1,%1) \n\t" + "vl %%v16, 192(%%r1,%2) \n\t" + "vl %%v17, 208(%%r1,%2) \n\t" + "vl %%v18, 224(%%r1,%2) \n\t" + "vl %%v19, 240(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 192(%%r1,%1) \n\t" + "vst %%v29, 208(%%r1,%1) \n\t" + "vst %%v30, 224(%%r1,%1) \n\t" + "vst %%v31, 240(%%r1,%1) \n\t" + "vst %%v20, 192(%%r1,%2) \n\t" + "vst %%v21, 208(%%r1,%2) \n\t" + "vst %%v22, 224(%%r1,%2) \n\t" + "vst %%v23, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + FLOAT cosa,sina; + cosa=c; + sina=s; + crot_kernel_32(n1, x, y, &cosa, &sina); + i=n1; + ix=2*n1; + } + + while(i < n) + { + temp[0] = c*x[ix] + s*y[ix] ; + temp[1] = c*x[ix+1] + s*y[ix+1] ; + y[ix] = c*y[ix] - s*x[ix] ; + y[ix+1] = c*y[ix+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += 2 ; + i++ ; + + } + + + } + else + { + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + +} + + diff --git a/kernel/zarch/cscal.c b/kernel/zarch/cscal.c new file mode 100644 index 000000000..0c15c5add --- /dev/null +++ b/kernel/zarch/cscal.c @@ -0,0 +1,456 @@ +/*************************************************************************** +Copyright (c) 2013 - 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vlrepf %%v0,0(%1) \n\t" + "vlef %%v1,4(%1),0 \n\t" + "vlef %%v1,4(%1),2 \n\t" + "vflcsb %%v1,%%v1 \n\t" + "vlef %%v1,4(%1),1 \n\t" + "vlef %%v1,4(%1),3 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "verllg %%v24,%%v16,32 \n\t" + "verllg %%v25,%%v17,32 \n\t" + "verllg %%v26,%%v18,32 \n\t" + "verllg %%v27,%%v19,32 \n\t" + "verllg %%v28,%%v20,32 \n\t" + "verllg %%v29,%%v21,32 \n\t" + "verllg %%v30,%%v22,32 \n\t" + "verllg %%v31,%%v23,32 \n\t" + + "vfmsb %%v16,%%v16,%%v0 \n\t" + "vfmsb %%v17,%%v17,%%v0 \n\t" + "vfmsb %%v18,%%v18,%%v0 \n\t" + "vfmsb %%v19,%%v19,%%v0 \n\t" + "vfmsb %%v20,%%v20,%%v0 \n\t" + "vfmsb %%v21,%%v21,%%v0 \n\t" + "vfmsb %%v22,%%v22,%%v0 \n\t" + "vfmsb %%v23,%%v23,%%v0 \n\t" + "vfmasb %%v16,%%v24,%%v1,%%v16 \n\t" + "vfmasb %%v17,%%v25,%%v1,%%v17 \n\t" + "vfmasb %%v18,%%v26,%%v1,%%v18 \n\t" + "vfmasb %%v19,%%v27,%%v1,%%v19 \n\t" + "vfmasb %%v20,%%v28,%%v1,%%v20 \n\t" + "vfmasb %%v21,%%v29,%%v1,%%v21 \n\t" + "vfmasb %%v22,%%v30,%%v1,%%v22 \n\t" + "vfmasb %%v23,%%v31,%%v1,%%v23 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vlef %%v0,4(%1),0 \n\t" + "vlef %%v0,4(%1),2 \n\t" + "vflcsb %%v0,%%v0 \n\t" + "vlef %%v0,4(%1),1 \n\t" + "vlef %%v0,4(%1),3 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) 
\n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "verllg %%v16,%%v16,32 \n\t" + "verllg %%v17,%%v17,32 \n\t" + "verllg %%v18,%%v18,32 \n\t" + "verllg %%v19,%%v19,32 \n\t" + "verllg %%v20,%%v20,32 \n\t" + "verllg %%v21,%%v21,32 \n\t" + "verllg %%v22,%%v22,32 \n\t" + "verllg %%v23,%%v23,32 \n\t" + + "vfmsb %%v16,%%v16,%%v0 \n\t" + "vfmsb %%v17,%%v17,%%v0 \n\t" + "vfmsb %%v18,%%v18,%%v0 \n\t" + "vfmsb %%v19,%%v19,%%v0 \n\t" + "vfmsb %%v20,%%v20,%%v0 \n\t" + "vfmsb %%v21,%%v21,%%v0 \n\t" + "vfmsb %%v22,%%v22,%%v0 \n\t" + "vfmsb %%v23,%%v23,%%v0 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); +} + +static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vlrepf %%v0,0(%1) \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfmsb %%v16,%%v16,%%v0 \n\t" + "vfmsb %%v17,%%v17,%%v0 \n\t" + "vfmsb %%v18,%%v18,%%v0 \n\t" + "vfmsb %%v19,%%v19,%%v0 \n\t" + "vfmsb %%v20,%%v20,%%v0 \n\t" + "vfmsb %%v21,%%v21,%%v0 \n\t" + "vfmsb %%v22,%%v22,%%v0 \n\t" + "vfmsb %%v23,%%v23,%%v0 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); +} + +static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) +{ + __asm__ volatile( + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + + "vst %%v24,0(%%r1,%1) \n\t" + "vst %%v25,16(%%r1,%1) \n\t" + "vst %%v26,32(%%r1,%1) \n\t" + "vst %%v27,48(%%r1,%1) \n\t" + "vst %%v24,64(%%r1,%1) \n\t" + "vst %%v25,80(%%r1,%1) \n\t" + "vst %%v26,96(%%r1,%1) \n\t" + "vst %%v27,112(%%r1,%1) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v24","v25","v26","v27" + ); +} + +static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i; + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_x3 = inc_x2 + inc_x; + FLOAT t0, t1, t2, t3; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; + + for (i = 0; i < n; i += 4) + { + t0 = da_r * x[0] - da_i * x[1]; + t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; + t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; + t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; + + x[1] = da_i * x[0] + da_r * x[1]; + x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; + x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * 
x[inc_x2 + 1]; + x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; + + x[0] = t0; + x[inc_x] = t1; + x[inc_x2] = t2; + x[inc_x3] = t3; + + x += 4 * inc_x; + } +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0, j = 0; + FLOAT temp0; + FLOAT temp1; + FLOAT alpha[2] __attribute__ ((aligned(16))); + + if (inc_x != 1) { + inc_x <<= 1; + + if (da_r == 0.0) { + + BLASLONG n1 = n & -2; + + if (da_i == 0.0) { + + while (j < n1) { + + x[i] = 0.0; + x[i + 1] = 0.0; + x[i + inc_x] = 0.0; + x[i + 1 + inc_x] = 0.0; + i += 2 * inc_x; + j += 2; + + } + + while (j < n) { + + x[i] = 0.0; + x[i + 1] = 0.0; + i += inc_x; + j++; + + } + + } else { + + while (j < n1) { + + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + temp1 = -da_i * x[i + 1 + inc_x]; + x[i + 1 + inc_x] = da_i * x[i + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; + + } + + while (j < n) { + + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; + + } + + + + } + + } else { + + + if (da_i == 0.0) { + BLASLONG n1 = n & -2; + + while (j < n1) { + + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + temp1 = da_r * x[i + inc_x]; + x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; + + } + + while (j < n) { + + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += inc_x; + j++; + + } + + } else { + + BLASLONG n1 = n & -8; + if (n1 > 0) { + alpha[0] = da_r; + alpha[1] = da_i; + cscal_kernel_inc_8(n1, alpha, x, inc_x); + j = n1; + i = n1 * inc_x; + } + + while (j < n) { + + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; + + } + + } + + } + + return (0); + } + + + BLASLONG n1 = n & -16; + if (n1 > 0) { + + alpha[0] = da_r; + alpha[1] = da_i; + + if (da_r == 0.0) + if (da_i == 0) + cscal_kernel_16_zero(n1, x); + else + cscal_kernel_16_zero_r(n1, alpha, x); + else + if (da_i == 0) + cscal_kernel_16_zero_i(n1, alpha, x); + else + cscal_kernel_16(n1, alpha, x); + + i = n1 << 1; + j = n1; + } + + + if (da_r == 0.0) { + + if (da_i == 0.0) { + + while (j < n) { + + x[i] = 0.0; + x[i + 1] = 0.0; + i += 2; + j++; + + } + + } else { + + while (j < n) { + + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += 2; + j++; + + } + + } + + } else { + + if (da_i == 0.0) { + + while (j < n) { + + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += 2; + j++; + + } + + } else { + + while (j < n) { + + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += 2; + j++; + + } + + } + + } + + return (0); +} diff --git a/kernel/zarch/cswap.c b/kernel/zarch/cswap.c new file mode 100644 index 000000000..256995d50 --- /dev/null +++ b/kernel/zarch/cswap.c @@ -0,0 +1,183 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile( + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v20, 64(%%r1,%1) \n\t" + "vl %%v21, 80(%%r1,%1) \n\t" + "vl %%v22, 96(%%r1,%1) \n\t" + "vl %%v23, 112(%%r1,%1) \n\t" + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v28, 192(%%r1,%1) \n\t" + "vl %%v29, 208(%%r1,%1) \n\t" + "vl %%v30, 224(%%r1,%1) \n\t" + "vl %%v31, 240(%%r1,%1) \n\t" + + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vl %%v4, 64(%%r1,%2) \n\t" + "vl %%v5, 80(%%r1,%2) \n\t" + "vl %%v6, 96(%%r1,%2) \n\t" + "vl %%v7, 112(%%r1,%2) \n\t" + "vst %%v0, 0(%%r1,%1) \n\t" + "vst %%v1, 16(%%r1,%1) \n\t" + "vst %%v2, 32(%%r1,%1) \n\t" + "vst %%v3, 48(%%r1,%1) \n\t" + "vst %%v4, 64(%%r1,%1) \n\t" + "vst %%v5, 80(%%r1,%1) \n\t" + "vst %%v6, 96(%%r1,%1) \n\t" + "vst %%v7, 112(%%r1,%1) \n\t" + + "vl %%v0, 128(%%r1,%2) \n\t" + "vl %%v1, 144(%%r1,%2) \n\t" + "vl %%v2, 160(%%r1,%2) \n\t" + "vl %%v3, 176(%%r1,%2) \n\t" + "vl %%v4, 192(%%r1,%2) \n\t" + "vl %%v5, 208(%%r1,%2) \n\t" + "vl %%v6, 224(%%r1,%2) \n\t" + "vl %%v7, 240(%%r1,%2) \n\t" + "vst %%v0, 128(%%r1,%1) \n\t" + "vst %%v1, 144(%%r1,%1) \n\t" + "vst %%v2, 160(%%r1,%1) \n\t" + "vst %%v3, 176(%%r1,%1) \n\t" + "vst %%v4, 192(%%r1,%1) \n\t" + "vst %%v5, 208(%%r1,%1) \n\t" + "vst %%v6, 224(%%r1,%1) \n\t" + "vst %%v7, 240(%%r1,%1) \n\t" + + "vst %%v16, 0(%%r1,%2) \n\t" + "vst %%v17, 16(%%r1,%2) \n\t" + "vst %%v18, 32(%%r1,%2) \n\t" + "vst %%v19, 48(%%r1,%2) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + "vst %%v24, 128(%%r1,%2) \n\t" + "vst %%v25, 144(%%r1,%2) \n\t" + "vst %%v26, 160(%%r1,%2) \n\t" + "vst %%v27, 176(%%r1,%2) \n\t" + "vst %%v28, 192(%%r1,%2) \n\t" + "vst %%v29, 208(%%r1,%2) \n\t" + "vst %%v30, 224(%%r1,%2) \n\t" + "vst %%v31, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + 
:"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2, inc_y2; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1 )) + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + cswap_kernel_32(n1, x, y); + i=n1; + ix = 2* n1; + iy = 2* n1; + } + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += 2 ; + iy += 2 ; + i++ ; + + + } + + + } + else + { + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c new file mode 100644 index 000000000..b74af5d37 --- /dev/null +++ b/kernel/zarch/damax.c @@ -0,0 +1,206 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
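The damax kernel that follows reduces 32 doubles per iteration: vflpdb takes lane-wise absolute values, and each vfchdb/vsel pair is a lane-wise compare-and-select that folds eight vectors into the running maximum in %%v0, before the final vrepg/wfchdb/vsel sequence merges its two lanes. A minimal scalar sketch of the same reduction (damax_ref is a hypothetical name, not part of this patch):

    #include <math.h>

    /* scalar reference for the 32-elements-per-iteration vector kernel below */
    static double damax_ref(long n, const double *x)
    {
        double amax = fabs(x[0]);
        long i;
        for (i = 1; i < n; i++)
            if (fabs(x[i]) > amax)   /* one vfchdb/vsel per vector of lanes in the kernel */
                amax = fabs(x[i]);
        return amax;
    }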
+*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT amax; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "vflpdb %%v0,%%v0 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vfchdb %%v26,%%v20,%%v21 \n\t" + "vfchdb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v24,%%v25 \n\t" + "vfchdb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vfchdb %%v26,%%v20,%%v21 \n\t" + "vfchdb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v24,%%v25 \n\t" + "vfchdb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return amax; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + maxf = damax_kernel_32(n1, x); + + i = n1; + } + else + { + maxf=ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i++; + } + return (maxf); + + } else { + + maxf=ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + maxf = ABS(x[i + inc_x]); + } + if
(ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c new file mode 100644 index 000000000..4cf5e88b1 --- /dev/null +++ b/kernel/zarch/damin.c @@ -0,0 +1,206 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
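damin mirrors damax with the vfchdb operands swapped (vfchdb %%v24,%%v17,%%v16 rather than %%v24,%%v16,%%v17), so the unchanged vsel now keeps the smaller lane. The scalar equivalent, under the same hypothetical naming as the damax sketch above:

    #include <math.h>

    static double damin_ref(long n, const double *x)
    {
        double amin = fabs(x[0]);
        long i;
        for (i = 1; i < n; i++)
            if (fabs(x[i]) < amin)   /* swapped compare, same select */
                amin = fabs(x[i]);
        return amin;
    }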
+*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT amin; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "vflpdb %%v0,%%v0 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return amin; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + minf = damin_kernel_32(n1, x); + + i = n1; + } + else + { + minf=ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i++; + } + return (minf); + + } else { + + minf=ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + minf = ABS(x[i + inc_x]); + } + if
(ABS(x[i + 2 * inc_x]) < minf) { + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/dasum.c b/kernel/zarch/dasum.c index 7a42a0863..fea431c34 100644 --- a/kernel/zarch/dasum.c +++ b/kernel/zarch/dasum.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2018, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,8 +23,7 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ - +*****************************************************************************/ #include "common.h" #include @@ -35,80 +34,89 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ABS fabsf #endif - - - -static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { - FLOAT asum ; - __asm__ ( - "pfd 1, 0(%[ptr_x]) \n\t" - "sllg %%r0,%[n],3 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_temp] ) \n\t" - "vlm %%v24,%%v31, 0(%[ptr_temp] ) \n\t" - - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v0,%%v0,%%v24 \n\t" - "vfadb %%v1,%%v1,%%v25 \n\t" - "vfadb %%v2,%%v2,%%v26 \n\t" - "vfadb %%v3,%%v3,%%v27 \n\t" - "vfadb %%v0,%%v0,%%v28 \n\t" - "vfadb %%v1,%%v1,%%v29 \n\t" - "vfadb %%v2,%%v2,%%v30 \n\t" - "vfadb %%v3,%%v3,%%v31 \n\t" - - "vlm %%v24,%%v31, 128(%[ptr_temp]) \n\t" - - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - "la %[ptr_temp],256(%[ptr_temp]) \n\t" - "vfadb %%v0,%%v0,%%v24 \n\t" - "vfadb %%v1,%%v1,%%v25 \n\t" - "vfadb %%v2,%%v2,%%v26 \n\t" - "vfadb %%v3,%%v3,%%v27 \n\t" - "vfadb %%v0,%%v0,%%v28 \n\t" - "vfadb %%v1,%%v1,%%v29 \n\t" - "vfadb %%v2,%%v2,%%v30 \n\t" - "vfadb %%v3,%%v3,%%v31 \n\t" - - "clgrjl %[ptr_temp],%%r0,1b \n\t" - "vfadb %%v24,%%v0,%%v1 \n\t" - "vfadb %%v25,%%v2,%%v3 \n\t" - "vfadb %%v0,%%v25,%%v24 \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "ldr %[asum],%%f0 \n\t" - : [asum] "=f"(asum),[ptr_temp] "+&a"(x) - : [mem] "m"( *(const double (*)[n])x ), [n] "r"(n), [ptr_x] "a"(x) - : "cc", "r0" ,"f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return asum; - +static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT asum; + + __asm__ ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl 
%%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + "vl %%v20, 64(%%r1,%2) \n\t" + "vl %%v21, 80(%%r1,%2) \n\t" + "vl %%v22, 96(%%r1,%2) \n\t" + "vl %%v23, 112(%%r1,%2) \n\t" + + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfadb %%v0,%%v0,%%v16 \n\t" + "vfadb %%v1,%%v1,%%v17 \n\t" + "vfadb %%v2,%%v2,%%v18 \n\t" + "vfadb %%v3,%%v3,%%v19 \n\t" + "vfadb %%v0,%%v0,%%v20 \n\t" + "vfadb %%v1,%%v1,%%v21 \n\t" + "vfadb %%v2,%%v2,%%v22 \n\t" + "vfadb %%v3,%%v3,%%v23 \n\t" + + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + "vl %%v20, 192(%%r1,%2) \n\t" + "vl %%v21, 208(%%r1,%2) \n\t" + "vl %%v22, 224(%%r1,%2) \n\t" + "vl %%v23, 240(%%r1,%2) \n\t" + + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfadb %%v0,%%v0,%%v16 \n\t" + "vfadb %%v1,%%v1,%%v17 \n\t" + "vfadb %%v2,%%v2,%%v18 \n\t" + "vfadb %%v3,%%v3,%%v19 \n\t" + "vfadb %%v0,%%v0,%%v20 \n\t" + "vfadb %%v1,%%v1,%%v21 \n\t" + "vfadb %%v2,%%v2,%%v22 \n\t" + "vfadb %%v3,%%v3,%%v23 \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b \n\t" + "vfadb %%v0,%%v0,%%v1 \n\t" + "vfadb %%v0,%%v0,%%v2 \n\t" + "vfadb %%v0,%%v0,%%v3 \n\t" + "vrepg %%v1,%%v0,1 \n\t" + "adbr %%f0,%%f1 \n\t" + "ldr %0,%%f0 " + :"=f"(asum) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" + ); + + return asum; } - - - FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; BLASLONG j = 0; diff --git a/kernel/zarch/daxpy.c b/kernel/zarch/daxpy.c index 16f82a587..e8823745e 100644 --- a/kernel/zarch/daxpy.c +++ b/kernel/zarch/daxpy.c @@ -25,98 +25,99 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
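The daxpy hunk below removes the Z13_A vector-intrinsic variant; whatever kernel replaces it must preserve the plain axpy update y := alpha*x + y. A scalar sketch of that contract (daxpy_ref is a hypothetical name, not part of this patch):

    /* y := alpha * x + y; a vectorized kernel fuses this into one multiply-add per element */
    static void daxpy_ref(long n, double alpha, const double *x, double *y)
    {
        long i;
        for (i = 0; i < n; i++)
            y[i] += alpha * x[i];
    }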
*****************************************************************************/ - #include "common.h" -#define PREFETCH_INS 1 -#if defined(Z13_A) -#include - -static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) -{ - BLASLONG i = 0; - __vector double v_a = {alpha,alpha}; - __vector double * v_y=(__vector double *)y; - __vector double * v_x=(__vector double *)x; - - for(; i -#endif - -#ifdef HAVE_KERNEL_4x4 - -#elif HAVE_KERNEL_4x4_VEC - -static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT x0,x1,x2,x3; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - __vector double v_x0 = {x0,x0}; - __vector double v_x1 = {x1,x1}; - __vector double v_x2 = {x2,x2}; - __vector double v_x3 = {x3,x3}; - __vector double* v_y =(__vector double*)y; - __vector double* va0 = (__vector double*)ap[0]; - __vector double* va1 = (__vector double*)ap[1]; - __vector double* va2 = (__vector double*)ap[2]; - __vector double* va3 = (__vector double*)ap[3]; - - for ( i=0; i< n/2; i+=2 ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; - v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] + v_x2 * va2[i+1] + v_x3 * va3[i+1] ; - } -} - -#else - static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) { - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3; - FLOAT x[4] __attribute__ ((aligned (16))); - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - - for ( i=0; i<4; i++) - x[i] = xo[i] * *alpha; - - for ( i=0; i< n; i+=4 ) - { - y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; - y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; - y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; - y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; - } -} - - -#endif - -#ifdef HAVE_KERNEL_4x2 - -#elif HAVE_KERNEL_4x2_VEC - -static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT x0,x1; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - __vector double v_x0 = {x0,x0}; - __vector double v_x1 = {x1,x1}; - __vector double* v_y =(__vector double*)y; - __vector double* va0 = (__vector double*)ap[0]; - __vector double* va1 = (__vector double*)ap[1]; - - for ( i=0; i< n/2; i+=2 ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; - v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ; - } + __asm__ volatile ( + "vlrepg %%v0,0(%5) \n\t" + "vlrepg %%v1,8(%5) \n\t" + "vlrepg %%v2,16(%5) \n\t" + "vlrepg %%v3,24(%5) \n\t" + "vlrepg %%v4,%7 \n\t" + "vfmdb %%v0,%%v0,%%v4 \n\t" + "vfmdb %%v1,%%v1,%%v4 \n\t" + "vfmdb %%v2,%%v2,%%v4 \n\t" + "vfmdb %%v3,%%v3,%%v4 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 2,1024(%%r1,%6) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,0(%%r1,%3) \n\t" + "vl %%v19,0(%%r1,%4) \n\t" + "vl %%v20,16(%%r1,%1) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,16(%%r1,%3) \n\t" + "vl %%v23,16(%%r1,%4) \n\t" + "vl %%v24,32(%%r1,%1) \n\t" + "vl %%v25,32(%%r1,%2) \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vl %%v27,32(%%r1,%4) \n\t" + "vl %%v28,48(%%r1,%1) \n\t" + "vl %%v29,48(%%r1,%2) \n\t" + "vl %%v30,48(%%r1,%3) \n\t" + "vl %%v31,48(%%r1,%4) \n\t" + + "vl %%v4,0(%%r1,%6) \n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4 
\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,0(%%r1,%6) \n\t" + + "vl %%v4,16(%%r1,%6) \n\t" + "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" + "vst %%v4,16(%%r1,%6) \n\t" + + "vl %%v4,32(%%r1,%6) \n\t" + "vfmadb %%v4,%%v24,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v25,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v26,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v27,%%v3,%%v4 \n\t" + "vst %%v4,32(%%r1,%6) \n\t" + + "vl %%v4,48(%%r1,%6) \n\t" + "vfmadb %%v4,%%v28,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v29,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v30,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v31,%%v3,%%v4 \n\t" + "vst %%v4,48(%%r1,%6) \n\t" + + "vl %%v16,64(%%r1,%1) \n\t" + "vl %%v17,64(%%r1,%2) \n\t" + "vl %%v18,64(%%r1,%3) \n\t" + "vl %%v19,64(%%r1,%4) \n\t" + "vl %%v20,80(%%r1,%1) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,80(%%r1,%3) \n\t" + "vl %%v23,80(%%r1,%4) \n\t" + "vl %%v24,96(%%r1,%1) \n\t" + "vl %%v25,96(%%r1,%2) \n\t" + "vl %%v26,96(%%r1,%3) \n\t" + "vl %%v27,96(%%r1,%4) \n\t" + "vl %%v28,112(%%r1,%1) \n\t" + "vl %%v29,112(%%r1,%2) \n\t" + "vl %%v30,112(%%r1,%3) \n\t" + "vl %%v31,112(%%r1,%4) \n\t" + + "vl %%v4,64(%%r1,%6) \n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,64(%%r1,%6) \n\t" + + "vl %%v4,80(%%r1,%6) \n\t" + "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" + "vst %%v4,80(%%r1,%6) \n\t" + + "vl %%v4,96(%%r1,%6) \n\t" + "vfmadb %%v4,%%v24,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v25,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v26,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v27,%%v3,%%v4 \n\t" + "vst %%v4,96(%%r1,%6) \n\t" + + "vl %%v4,112(%%r1,%6) \n\t" + "vfmadb %%v4,%%v28,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v29,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v30,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v31,%%v3,%%v4 \n\t" + "vst %%v4,112(%%r1,%6) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,0(%%r1,%3) \n\t" + "vl %%v19,0(%%r1,%4) \n\t" + "vl %%v20,16(%%r1,%1) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,16(%%r1,%3) \n\t" + "vl %%v23,16(%%r1,%4) \n\t" + + "vl %%v4,0(%%r1,%6) \n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,0(%%r1,%6) \n\t" + + "vl %%v4,16(%%r1,%6) \n\t" + "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" + "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" + "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" + "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" + "vst %%v4,16(%%r1,%6) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) { - BLASLONG i; - FLOAT *a0,*a1; - FLOAT x[4] __attribute__ ((aligned (16))); - a0 = ap[0]; - a1 
= ap[1]; - - for ( i=0; i<2; i++) - x[i] = xo[i] * *alpha; - - for ( i=0; i< n; i+=4 ) - { - y[i] += a0[i]*x[0] + a1[i]*x[1]; - y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1]; - y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1]; - y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1]; - } -} - - -#endif - -#ifdef HAVE_KERNEL_4x1 - -#elif HAVE_KERNEL_4x1_VEC -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0; - x0 = xo[0] * *alpha; - __vector double v_x0 = {x0,x0}; - __vector double* v_y =(__vector double*)y; - __vector double* va0 = (__vector double*)ap; - - for ( i=0; i< n/2; i+=2 ) - { - v_y[i] += v_x0 * va0[i] ; - v_y[i+1] += v_x0 * va0[i+1] ; - } + __asm__ volatile ( + "vlrepg %%v0,0(%3) \n\t" + "vlrepg %%v1,8(%3) \n\t" + "vlrepg %%v2,%5 \n\t" + "vfmdb %%v0,%%v0,%%v2 \n\t" + "vfmdb %%v1,%%v1,%%v2 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%4) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,16(%%r1,%1) \n\t" + "vl %%v19,16(%%r1,%2) \n\t" + "vl %%v20,32(%%r1,%1) \n\t" + "vl %%v21,32(%%r1,%2) \n\t" + "vl %%v22,48(%%r1,%1) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + "vl %%v24,64(%%r1,%1) \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vl %%v26,80(%%r1,%1) \n\t" + "vl %%v27,80(%%r1,%2) \n\t" + "vl %%v28,96(%%r1,%1) \n\t" + "vl %%v29,96(%%r1,%2) \n\t" + "vl %%v30,112(%%r1,%1) \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + + "vl %%v2,0(%%r1,%4) \n\t" + "vfmadb %%v2,%%v16,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v17,%%v1,%%v2 \n\t" + "vst %%v2,0(%%r1,%4) \n\t" + + "vl %%v2,16(%%r1,%4) \n\t" + "vfmadb %%v2,%%v18,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v19,%%v1,%%v2 \n\t" + "vst %%v2,16(%%r1,%4) \n\t" + + "vl %%v2,32(%%r1,%4) \n\t" + "vfmadb %%v2,%%v20,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v21,%%v1,%%v2 \n\t" + "vst %%v2,32(%%r1,%4) \n\t" + + "vl %%v2,48(%%r1,%4) \n\t" + "vfmadb %%v2,%%v22,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v23,%%v1,%%v2 \n\t" + "vst %%v2,48(%%r1,%4) \n\t" + + "vl %%v2,64(%%r1,%4) \n\t" + "vfmadb %%v2,%%v24,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v25,%%v1,%%v2 \n\t" + "vst %%v2,64(%%r1,%4) \n\t" + + "vl %%v2,80(%%r1,%4) \n\t" + "vfmadb %%v2,%%v26,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v27,%%v1,%%v2 \n\t" + "vst %%v2,80(%%r1,%4) \n\t" + + "vl %%v2,96(%%r1,%4) \n\t" + "vfmadb %%v2,%%v28,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v29,%%v1,%%v2 \n\t" + "vst %%v2,96(%%r1,%4) \n\t" + + "vl %%v2,112(%%r1,%4) \n\t" + "vfmadb %%v2,%%v30,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v31,%%v1,%%v2 \n\t" + "vst %%v2,112(%%r1,%4) \n\t" - + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,16(%%r1,%1) \n\t" + "vl %%v19,16(%%r1,%2) \n\t" + + "vl %%v2,0(%%r1,%4) \n\t" + "vfmadb %%v2,%%v16,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v17,%%v1,%%v2 \n\t" + "vst %%v2,0(%%r1,%4) \n\t" + + "vl %%v2,16(%%r1,%4) \n\t" + "vfmadb %%v2,%%v18,%%v0,%%v2 \n\t" + "vfmadb %%v2,%%v19,%%v1,%%v2 \n\t" + "vst %%v2,16(%%r1,%4) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + 
:"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha) { - BLASLONG i; - FLOAT *a0; - FLOAT x[4] __attribute__ ((aligned (16))); - a0 = ap; - - for ( i=0; i<1; i++) - x[i] = xo[i] * *alpha; - - for ( i=0; i< n; i+=4 ) - { - y[i] += a0[i]*x[0]; - y[i+1] += a0[i+1]*x[0]; - y[i+2] += a0[i+2]*x[0]; - y[i+3] += a0[i+3]*x[0]; - } + __asm__ volatile ( + "vlrepg %%v0,0(%2) \n\t" + "vlrepg %%v1,%4 \n\t" + "vfmdb %%v0,%%v0,%%v1 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,64(%%r1,%1) \n\t" + "vl %%v21,80(%%r1,%1) \n\t" + "vl %%v22,96(%%r1,%1) \n\t" + "vl %%v23,112(%%r1,%1) \n\t" + + "vl %%v1,0(%%r1,%3) \n\t" + "vfmadb %%v1,%%v16,%%v0,%%v1 \n\t" + "vst %%v1,0(%%r1,%3) \n\t" + + "vl %%v1,16(%%r1,%3) \n\t" + "vfmadb %%v1,%%v17,%%v0,%%v1 \n\t" + "vst %%v1,16(%%r1,%3) \n\t" + + "vl %%v1,32(%%r1,%3) \n\t" + "vfmadb %%v1,%%v18,%%v0,%%v1 \n\t" + "vst %%v1,32(%%r1,%3) \n\t" + + "vl %%v1,48(%%r1,%3) \n\t" + "vfmadb %%v1,%%v19,%%v0,%%v1 \n\t" + "vst %%v1,48(%%r1,%3) \n\t" + + "vl %%v1,64(%%r1,%3) \n\t" + "vfmadb %%v1,%%v20,%%v0,%%v1 \n\t" + "vst %%v1,64(%%r1,%3) \n\t" + + "vl %%v1,80(%%r1,%3) \n\t" + "vfmadb %%v1,%%v21,%%v0,%%v1 \n\t" + "vst %%v1,80(%%r1,%3) \n\t" + + "vl %%v1,96(%%r1,%3) \n\t" + "vfmadb %%v1,%%v22,%%v0,%%v1 \n\t" + "vst %%v1,96(%%r1,%3) \n\t" + + "vl %%v1,112(%%r1,%3) \n\t" + "vfmadb %%v1,%%v23,%%v0,%%v1 \n\t" + "vst %%v1,112(%%r1,%3) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + + "vl %%v1,0(%%r1,%3) \n\t" + "vfmadb %%v1,%%v16,%%v0,%%v1 \n\t" + "vst %%v1,0(%%r1,%3) \n\t" + + "vl %%v1,16(%%r1,%3) \n\t" + "vfmadb %%v1,%%v17,%%v0,%%v1 \n\t" + "vst %%v1,16(%%r1,%3) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } - -#endif - - - static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { BLASLONG i; - - for ( i=0; i -#endif #define NBMAX 2048 -#ifdef HAVE_KERNEL_4x4 - -#elif HAVE_KERNEL_4x4_VEC - static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - __vector double* va0 = (__vector double*)ap[0]; - __vector double* va1 = (__vector double*)ap[1]; - __vector double* va2 = (__vector double*)ap[2]; - __vector double* va3 = (__vector double*)ap[3]; - __vector double* v_x =(__vector double*)x; - __vector double temp0 = {0,0}; - __vector double temp1 = {0,0}; - __vector double temp2 = {0,0}; - __vector double temp3 = {0,0}; - - for ( i=0; i< n/2; i+=2 ) - { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; - temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1] ; - temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1] ; - temp3 += 
v_x[i] * va3[i] + v_x[i+1] * va3[i+1] ; - } - - y[0] = temp0[0] + temp0[1]; - y[1] = temp1[0] + temp1[1]; - y[2] = temp2[0] + temp2[1]; - y[3] = temp3[0] + temp3[1];; -} -#else -static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - FLOAT temp3 = 0.0; - - for ( i=0; i< n; i+=4 ) - { - temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; - temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; - temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; - temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; - } - y[0] = temp0; - y[1] = temp1; - y[2] = temp2; - y[3] = temp3; + __asm__ volatile ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 1,1024(%%r1,%5) \n\t" + + "vl %%v16,0(%%r1,%5) \n\t" + "vl %%v17,16(%%r1,%5) \n\t" + "vl %%v18,32(%%r1,%5) \n\t" + "vl %%v19,48(%%r1,%5) \n\t" + "vl %%v20,64(%%r1,%5) \n\t" + "vl %%v21,80(%%r1,%5) \n\t" + "vl %%v22,96(%%r1,%5) \n\t" + "vl %%v23,112(%%r1,%5) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" + "vl %%v26,0(%%r1,%3) \n\t" + "vfmadb %%v2,%%v16,%%v26,%%v2 \n\t" + "vl %%v27,0(%%r1,%4) \n\t" + "vfmadb %%v3,%%v16,%%v27,%%v3 \n\t" + + "vl %%v28,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v28,%%v0 \n\t" + "vl %%v29,16(%%r1,%2) \n\t" + "vfmadb %%v1,%%v17,%%v29,%%v1 \n\t" + "vl %%v30,16(%%r1,%3) \n\t" + "vfmadb %%v2,%%v17,%%v30,%%v2 \n\t" + "vl %%v31,16(%%r1,%4) \n\t" + "vfmadb %%v3,%%v17,%%v31,%%v3 \n\t" + + "vl %%v24,32(%%r1,%1) \n\t" + "vfmadb %%v0,%%v18,%%v24,%%v0 \n\t" + "vl %%v25,32(%%r1,%2) \n\t" + "vfmadb %%v1,%%v18,%%v25,%%v1 \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2 \n\t" + "vl %%v27,32(%%r1,%4) \n\t" + "vfmadb %%v3,%%v18,%%v27,%%v3 \n\t" + + "vl %%v28,48(%%r1,%1) \n\t" + "vfmadb %%v0,%%v19,%%v28,%%v0 \n\t" + "vl %%v29,48(%%r1,%2) \n\t" + "vfmadb %%v1,%%v19,%%v29,%%v1 \n\t" + "vl %%v30,48(%%r1,%3) \n\t" + "vfmadb %%v2,%%v19,%%v30,%%v2 \n\t" + "vl %%v31,48(%%r1,%4) \n\t" + "vfmadb %%v3,%%v19,%%v31,%%v3 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vfmadb %%v0,%%v20,%%v24,%%v0 \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vfmadb %%v1,%%v20,%%v25,%%v1 \n\t" + "vl %%v26,64(%%r1,%3) \n\t" + "vfmadb %%v2,%%v20,%%v26,%%v2 \n\t" + "vl %%v27,64(%%r1,%4) \n\t" + "vfmadb %%v3,%%v20,%%v27,%%v3 \n\t" + + "vl %%v28,80(%%r1,%1) \n\t" + "vfmadb %%v0,%%v21,%%v28,%%v0 \n\t" + "vl %%v29,80(%%r1,%2) \n\t" + "vfmadb %%v1,%%v21,%%v29,%%v1 \n\t" + "vl %%v30,80(%%r1,%3) \n\t" + "vfmadb %%v2,%%v21,%%v30,%%v2 \n\t" + "vl %%v31,80(%%r1,%4) \n\t" + "vfmadb %%v3,%%v21,%%v31,%%v3 \n\t" + + "vl %%v24,96(%%r1,%1) \n\t" + "vfmadb %%v0,%%v22,%%v24,%%v0 \n\t" + "vl %%v25,96(%%r1,%2) \n\t" + "vfmadb %%v1,%%v22,%%v25,%%v1 \n\t" + "vl %%v26,96(%%r1,%3) \n\t" + "vfmadb %%v2,%%v22,%%v26,%%v2 \n\t" + "vl %%v27,96(%%r1,%4) \n\t" + "vfmadb %%v3,%%v22,%%v27,%%v3 \n\t" + + "vl %%v28,112(%%r1,%1) \n\t" + "vfmadb %%v0,%%v23,%%v28,%%v0 \n\t" + "vl %%v29,112(%%r1,%2) \n\t" + "vfmadb %%v1,%%v23,%%v29,%%v1 \n\t" + "vl %%v30,112(%%r1,%3) \n\t" + 
"vfmadb %%v2,%%v23,%%v30,%%v2 \n\t" + "vl %%v31,112(%%r1,%4) \n\t" + "vfmadb %%v3,%%v23,%%v31,%%v3 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%5) \n\t" + "vl %%v17,16(%%r1,%5) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" + "vl %%v26,0(%%r1,%3) \n\t" + "vfmadb %%v2,%%v16,%%v26,%%v2 \n\t" + "vl %%v27,0(%%r1,%4) \n\t" + "vfmadb %%v3,%%v16,%%v27,%%v3 \n\t" + + "vl %%v28,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v28,%%v0 \n\t" + "vl %%v29,16(%%r1,%2) \n\t" + "vfmadb %%v1,%%v17,%%v29,%%v1 \n\t" + "vl %%v30,16(%%r1,%3) \n\t" + "vfmadb %%v2,%%v17,%%v30,%%v2 \n\t" + "vl %%v31,16(%%r1,%4) \n\t" + "vfmadb %%v3,%%v17,%%v31,%%v3 \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "vrepg %%v4,%%v0,1 \n\t" + "adbr %%f0,%%f4 \n\t" + "std %%f0,0(%6) \n\t" + "vrepg %%v4,%%v1,1 \n\t" + "adbr %%f1,%%f4 \n\t" + "std %%f1,8(%6) \n\t" + "vrepg %%v4,%%v2,1 \n\t" + "adbr %%f2,%%f4 \n\t" + "std %%f2,16(%6) \n\t" + "vrepg %%v4,%%v3,1 \n\t" + "adbr %%f3,%%f4 \n\t" + "std %%f3,24(%6) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } - -#endif - -#ifdef HAVE_KERNEL_4x2 - -#elif HAVE_KERNEL_4x2_VEC static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - __vector double* va0 = (__vector double*)ap[0]; - __vector double* va1 = (__vector double*)ap[1]; - __vector double* v_x =(__vector double*)x; - __vector double temp0 = {0,0}; - __vector double temp1 = {0,0}; - - for ( i=0; i< n/2; i+=2 ) - { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; - temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1] ; - } - - y[0] = temp0[0] + temp0[1]; - y[1] = temp1[0] + temp1[1]; -} -#else -static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - - BLASLONG i; - FLOAT *a0,*a1; - a0 = ap[0]; - a1 = ap[1]; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - - for ( i=0; i< n; i+=4 ) - { - temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; - temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; - } - y[0] = temp0; - y[1] = temp1; - + __asm__ volatile ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" + + "vl %%v26,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v26,%%v0 \n\t" + "vl %%v27,16(%%r1,%2) \n\t" + "vfmadb %%v1,%%v17,%%v27,%%v1 \n\t" + + "vl %%v28,32(%%r1,%1) \n\t" + "vfmadb %%v0,%%v18,%%v28,%%v0 \n\t" + "vl %%v29,32(%%r1,%2) \n\t" + "vfmadb %%v1,%%v18,%%v29,%%v1 \n\t" + + "vl %%v30,48(%%r1,%1) \n\t" 
+ "vfmadb %%v0,%%v19,%%v30,%%v0 \n\t" + "vl %%v31,48(%%r1,%2) \n\t" + "vfmadb %%v1,%%v19,%%v31,%%v1 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vfmadb %%v0,%%v20,%%v24,%%v0 \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vfmadb %%v1,%%v20,%%v25,%%v1 \n\t" + + "vl %%v26,80(%%r1,%1) \n\t" + "vfmadb %%v0,%%v21,%%v26,%%v0 \n\t" + "vl %%v27,80(%%r1,%2) \n\t" + "vfmadb %%v1,%%v21,%%v27,%%v1 \n\t" + + "vl %%v28,96(%%r1,%1) \n\t" + "vfmadb %%v0,%%v22,%%v28,%%v0 \n\t" + "vl %%v29,96(%%r1,%2) \n\t" + "vfmadb %%v1,%%v22,%%v29,%%v1 \n\t" + + "vl %%v30,112(%%r1,%1) \n\t" + "vfmadb %%v0,%%v23,%%v30,%%v0 \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + "vfmadb %%v1,%%v23,%%v31,%%v1 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" + + "vl %%v26,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v26,%%v0 \n\t" + "vl %%v27,16(%%r1,%2) \n\t" + "vfmadb %%v1,%%v17,%%v27,%%v1 \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "vrepg %%v2,%%v0,1 \n\t" + "adbr %%f0,%%f2 \n\t" + "std %%f0,0(%4) \n\t" + "vrepg %%v2,%%v1,1 \n\t" + "adbr %%f1,%%f2 \n\t" + "std %%f1,8(%4) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y) + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#endif - -#ifdef HAVE_KERNEL_4x1 - -#elif HAVE_KERNEL_4x1_VEC static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { - BLASLONG i; - __vector double* va0 = (__vector double*)a0; - __vector double* v_x =(__vector double*)x; - __vector double temp0 = {0,0}; - - for ( i=0; i< n/2; i+=2 ) - { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; - } - - y[0] = temp0[0] + temp0[1]; + __asm__ volatile ( + "vzero %%v0 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-16 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,4 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + + "vl %%v25,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" + + "vl %%v26,32(%%r1,%1) \n\t" + "vfmadb %%v0,%%v18,%%v26,%%v0 \n\t" + + "vl %%v27,48(%%r1,%1) \n\t" + "vfmadb %%v0,%%v19,%%v27,%%v0 \n\t" + + "vl %%v28,64(%%r1,%1) \n\t" + "vfmadb %%v0,%%v20,%%v28,%%v0 \n\t" + + "vl %%v29,80(%%r1,%1) \n\t" + "vfmadb %%v0,%%v21,%%v29,%%v0 \n\t" + + "vl %%v30,96(%%r1,%1) \n\t" + "vfmadb %%v0,%%v22,%%v30,%%v0 \n\t" + + "vl %%v31,112(%%r1,%1) \n\t" + "vfmadb %%v0,%%v23,%%v31,%%v0 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,12 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + + "vl %%v25,16(%%r1,%1) \n\t" + "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "vrepg 
%%v1,%%v0,1 \n\t" + "adbr %%f0,%%f1 \n\t" + "std %%f0,0(%3) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { BLASLONG i; - - - FLOAT temp0 = 0.0; - - for ( i=0; i< n; i+=4 ) + for (i = 0; i < n; i++) { - temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + dest[i] = *src; + src += inc_src; } - y[0] = temp0; } -#endif - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) + +static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { - BLASLONG i; - for ( i=0; i 0) { + + maxf = dmax_kernel_32(n1, x); + + i = n1; + } + else + { + maxf=x[0]; + i++; + } + + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { + + maxf=x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c new file mode 100644 index 000000000..d7c86735f --- /dev/null +++ b/kernel/zarch/dmin.c @@ -0,0 +1,182 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
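The dmin kernel below reuses the damin compare-and-select tree but has no vflpdb pass, so it reduces signed values directly, and its scalar tails likewise track x[i] rather than ABS(x[i]). A scalar sketch (dmin_ref is a hypothetical name, not part of this patch):

    static double dmin_ref(long n, const double *x)
    {
        double m = x[0];
        long i;
        for (i = 1; i < n; i++)
            if (x[i] < m)   /* signed compare; no fabs(), unlike damin */
                m = x[i];
        return m;
    }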
+*****************************************************************************/ + +#include "common.h" + +static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT min; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(min) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return min; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + minf = dmin_kernel_32(n1, x); + + i = n1; + } + else + { + minf=x[0]; + i++; + } + + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { + + minf=x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/drot.c b/kernel/zarch/drot.c index bf29538c7..c91f95800 100644 --- a/kernel/zarch/drot.c +++ b/kernel/zarch/drot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2018, The OpenBLAS Project All rights reserved. 
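The drot rewrite that follows changes the kernel signature to take c and s by pointer (FLOAT *c, FLOAT *s) instead of loading value arguments through lgdr/vlvgp. Either way the per-element update is the standard Givens rotation, matching the /* yn=x*s */ and /* yn=y*c-yn */ comments in the assembly; a scalar sketch (drot_ref is a hypothetical name, not part of this patch):

    static void drot_ref(long n, double *x, double *y, double c, double s)
    {
        long i;
        for (i = 0; i < n; i++) {
            double t = c * x[i] + s * y[i];   /* vfmdb then vfmadb */
            y[i] = c * y[i] - s * x[i];       /* vfmdb then vfmsdb */
            x[i] = t;
        }
    }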
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,176 +27,166 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA) +static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { - __asm__ ( - "pfd 2, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "lgdr %%r1,%[cos] \n\t" - "vlvgp %%v0,%%r1,%%r1 \n\t" - "lgdr %%r1,%[sin] \n\t" - "vlvgp %%v1,%%r1,%%r1 \n\t" - "srlg %[n_tmp],%[n_tmp],5 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 48(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 112(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 128(%%r1,%[ptr_y]) 
\n\t" - "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 176(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 240(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 240(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 240(%%r1,%[ptr_y]) \n\t" - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[n])x), - [mem_y] "+m" (*(double (*)[n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA) - : "cc", "r1" ,"v0","v1","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - + __asm__ ( + "vlrepg %%v0,%3 \n\t" + "vlrepg %%v1,%4 \n\t" + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%1) \n\t" + "vl %%v25, 16(%%r1,%1) \n\t" + "vl %%v26, 32(%%r1,%1) \n\t" + "vl %%v27, 48(%%r1,%1) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 
2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 0(%%r1,%1) \n\t" + "vst %%v29, 16(%%r1,%1) \n\t" + "vst %%v30, 32(%%r1,%1) \n\t" + "vst %%v31, 48(%%r1,%1) \n\t" + "vst %%v20, 0(%%r1,%2) \n\t" + "vst %%v21, 16(%%r1,%2) \n\t" + "vst %%v22, 32(%%r1,%2) \n\t" + "vst %%v23, 48(%%r1,%2) \n\t" + + "vl %%v24, 64(%%r1,%1) \n\t" + "vl %%v25, 80(%%r1,%1) \n\t" + "vl %%v26, 96(%%r1,%1) \n\t" + "vl %%v27, 112(%%r1,%1) \n\t" + "vl %%v16, 64(%%r1,%2) \n\t" + "vl %%v17, 80(%%r1,%2) \n\t" + "vl %%v18, 96(%%r1,%2) \n\t" + "vl %%v19, 112(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 64(%%r1,%1) \n\t" + "vst %%v29, 80(%%r1,%1) \n\t" + "vst %%v30, 96(%%r1,%1) \n\t" + "vst %%v31, 112(%%r1,%1) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 128(%%r1,%1) \n\t" + "vst %%v29, 144(%%r1,%1) \n\t" + "vst %%v30, 160(%%r1,%1) \n\t" + "vst %%v31, 176(%%r1,%1) \n\t" + "vst %%v20, 128(%%r1,%2) \n\t" + "vst %%v21, 144(%%r1,%2) \n\t" + "vst %%v22, 160(%%r1,%2) \n\t" + "vst %%v23, 176(%%r1,%2) \n\t" + + "vl %%v24, 192(%%r1,%1) \n\t" + "vl %%v25, 208(%%r1,%1) \n\t" + "vl %%v26, 224(%%r1,%1) \n\t" + "vl %%v27, 240(%%r1,%1) \n\t" + "vl %%v16, 192(%%r1,%2) \n\t" + "vl %%v17, 208(%%r1,%2) \n\t" + "vl %%v18, 224(%%r1,%2) \n\t" + "vl %%v19, 240(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb 
%%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 192(%%r1,%1) \n\t" + "vst %%v29, 208(%%r1,%1) \n\t" + "vst %%v30, 224(%%r1,%1) \n\t" + "vst %%v31, 240(%%r1,%1) \n\t" + "vst %%v20, 192(%%r1,%2) \n\t" + "vst %%v21, 208(%%r1,%2) \n\t" + "vst %%v22, 224(%%r1,%2) \n\t" + "vst %%v23, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) @@ -214,8 +204,10 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT BLASLONG n1 = n & -32; if ( n1 > 0 ) { - - drot_kernel_32(n1, x, y, c, s); + FLOAT cosa,sina; + cosa=c; + sina=s; + drot_kernel_32(n1, x, y, &cosa, &sina); i=n1; } @@ -229,6 +221,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } + } else { @@ -250,3 +243,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } + diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c index e29f51012..ccc6dd95d 100644 --- a/kernel/zarch/dscal.c +++ b/kernel/zarch/dscal.c @@ -27,135 +27,75 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
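/* For reference (not part of the diff): the rewritten drot_kernel_32 above
   now takes c and s by address ("m" operands) so the kernel can broadcast
   them with vlrepg instead of moving them through a general register via
   lgdr/vlvgp. Per 32-element block it applies a plain Givens rotation; a
   minimal scalar equivalent, assuming unit stride and n a multiple of 32
   (the helper name drot_ref is illustrative only):

   static void drot_ref(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
   {
       BLASLONG i;
       for (i = 0; i < n; i++) {
           FLOAT xn = c * x[i] + s * y[i];  /* the vfmdb + vfmadb pair */
           FLOAT yn = c * y[i] - s * x[i];  /* the vfmdb + vfmsdb pair */
           x[i] = xn;
           y[i] = yn;
       }
   }
*/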
#include "common.h" -#ifdef Z13_A -static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x ) -{ - - - __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" - "lgdr %%r0,%[alpha] \n\t" - "vlvgp %%v0,%%r0,%%r0 \n\t" - "srlg %[n],%[n],4 \n\t" - "vlr %%v1,%%v0 \n\t" - "vlm %%v16,%%v23, 0(%[x_ptr]) \n\t" - "la %[x_ptr], 128(%[x_ptr]) \n\t" - "aghik %[n], %[n], -1 \n\t" - "jle 2f \n\t" - ".align 16 \n\t" - "1: \n\t" - "vfmdb %%v24, %%v16, %%v0 \n\t" - "vfmdb %%v25, %%v17, %%v0 \n\t" - "vfmdb %%v26, %%v18, %%v0 \n\t" - "vfmdb %%v27, %%v19, %%v1 \n\t" - "vlm %%v16,%%v19, 0(%[x_ptr]) \n\t" - "vfmdb %%v28, %%v20, %%v0 \n\t" - "vfmdb %%v29, %%v21, %%v1 \n\t" - "vfmdb %%v30, %%v22, %%v0 \n\t" - "vfmdb %%v31, %%v23, %%v1 \n\t" - "vlm %%v20,%%v23, 64(%[x_ptr]) \n\t" - "lay %[x_ptr], -128(%[x_ptr]) \n\t" - "vstm %%v24,%%v31, 0(%[x_ptr]) \n\t" - "la %[x_ptr],256(%[x_ptr]) \n\t" - "brctg %[n],1b \n\t" - "2: \n\t" - "vfmdb %%v24, %%v16, %%v0 \n\t" - "vfmdb %%v25, %%v17, %%v1 \n\t" - "vfmdb %%v26, %%v18, %%v0 \n\t" - "vfmdb %%v27, %%v19, %%v1 \n\t" - "lay %[x_ptr] , -128(%[x_ptr]) \n\t" - "vfmdb %%v28, %%v20, %%v0 \n\t" - "vfmdb %%v29, %%v21, %%v1 \n\t" - "vfmdb %%v30, %%v22, %%v0 \n\t" - "vfmdb %%v31, %%v23, %%v1 \n\t" - "vstm %%v24,%%v31, 0(%[x_ptr]) \n\t" - : [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x),[n] "+&r"(n) - : [alpha] "f"(da) - :"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21", - "v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - } -#else -static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x ) +static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) { + __asm__ volatile ( + "vlrepg %%v0,%1 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%2) \n\t" + "vfmdb %%v24,%%v24,%%v0 \n\t" + "vst %%v24, 0(%%r1,%2) \n\t" + "vl %%v25, 16(%%r1,%2) \n\t" + "vfmdb %%v25,%%v25,%%v0 \n\t" + "vst %%v25, 16(%%r1,%2) \n\t" + "vl %%v26, 32(%%r1,%2) \n\t" + "vfmdb %%v26,%%v26,%%v0 \n\t" + "vst %%v26, 32(%%r1,%2) \n\t" + "vl %%v27, 48(%%r1,%2) \n\t" + "vfmdb %%v27,%%v27,%%v0 \n\t" + "vst %%v27, 48(%%r1,%2) \n\t" + "vl %%v24, 64(%%r1,%2) \n\t" + "vfmdb %%v24,%%v24,%%v0 \n\t" + "vst %%v24, 64(%%r1,%2) \n\t" + "vl %%v25, 80(%%r1,%2) \n\t" + "vfmdb %%v25,%%v25,%%v0 \n\t" + "vst %%v25, 80(%%r1,%2) \n\t" + "vl %%v26, 96(%%r1,%2) \n\t" + "vfmdb %%v26,%%v26,%%v0 \n\t" + "vst %%v26, 96(%%r1,%2) \n\t" + "vl %%v27, 112(%%r1,%2) \n\t" + "vfmdb %%v27,%%v27,%%v0 \n\t" + "vst %%v27, 112(%%r1,%2) \n\t" + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v24","v25","v26","v27" + ); +} - /* faster than sequence of triples(vl vfmd vst) (tested OPENBLAS_LOOPS=10000) */ - __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" - "lgdr %%r0,%[alpha] \n\t" - "vlvgp %%v0,%%r0,%%r0 \n\t" - "vlr %%v1,%%v0 \n\t" - "sllg %%r0,%[n],3 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%[x_ptr]) \n\t" - "vlm %%v16,%%v23, 0(%[x_ptr]) \n\t" - "vfmdb %%v16,%%v16,%%v0 \n\t" - "vfmdb %%v17,%%v17,%%v1 \n\t" - "vfmdb %%v18,%%v18,%%v0 \n\t" - "vfmdb %%v19,%%v19,%%v1 \n\t" - "vfmdb %%v20,%%v20,%%v0 \n\t" - "vfmdb %%v21,%%v21,%%v1 \n\t" - "vfmdb %%v22,%%v22,%%v0 \n\t" - "vfmdb %%v23,%%v23,%%v1 \n\t" - "vstm %%v16,%%v23, 0(%[x_ptr]) \n\t" - "vlm %%v24,%%v31,128(%[x_ptr]) \n\t" - "vfmdb %%v24,%%v24,%%v0 \n\t" - "vfmdb %%v25,%%v25,%%v1 \n\t" - "vfmdb %%v26,%%v26,%%v0 \n\t" - "vfmdb %%v27,%%v27,%%v1 \n\t" - "vfmdb %%v28,%%v28,%%v0 \n\t" - "vfmdb %%v29,%%v29,%%v1 \n\t" - "vfmdb %%v30,%%v30,%%v0 \n\t" 
- "vfmdb %%v31,%%v31,%%v1 \n\t" - "vstm %%v24,%%v31,128(%[x_ptr]) \n\t" - "la %[x_ptr], 256(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x) - : [n] "r"(n),[alpha] "f"(da) - :"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21", - "v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - } -#endif -static void dscal_kernel_32_zero( BLASLONG n, FLOAT *x ) +static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) { - - __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" - "vzero %%v24 \n\t" - "sllg %%r0,%[n],3 \n\t" - "vzero %%v25 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%[x_ptr]) \n\t" - "vst %%v24, 0(%[x_ptr]) \n\t" - "vst %%v25, 16(%[x_ptr]) \n\t" - "vst %%v24, 32(%[x_ptr]) \n\t" - "vst %%v25, 48(%[x_ptr]) \n\t" - "vst %%v24, 64(%[x_ptr]) \n\t" - "vst %%v25, 80(%[x_ptr]) \n\t" - "vst %%v24, 96(%[x_ptr]) \n\t" - "vst %%v25, 112(%[x_ptr]) \n\t" - "vst %%v24, 128(%[x_ptr]) \n\t" - "vst %%v25, 144(%[x_ptr]) \n\t" - "vst %%v24, 160(%[x_ptr]) \n\t" - "vst %%v25, 176(%[x_ptr]) \n\t" - "vst %%v24, 192(%[x_ptr]) \n\t" - "vst %%v25, 208(%[x_ptr]) \n\t" - "vst %%v24, 224(%[x_ptr]) \n\t" - "vst %%v25, 240(%[x_ptr]) \n\t" - "la %[x_ptr],256(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "=m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x) - : [n] "r"(n) - :"cc" , "r0", "v24" ,"v25" - ); + __asm__ volatile( + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + + "vst %%v24,0(%%r1,%1) \n\t" + "vst %%v25,16(%%r1,%1) \n\t" + "vst %%v26,32(%%r1,%1) \n\t" + "vst %%v27,48(%%r1,%1) \n\t" + "vst %%v24,64(%%r1,%1) \n\t" + "vst %%v25,80(%%r1,%1) \n\t" + "vst %%v26,96(%%r1,%1) \n\t" + "vst %%v27,112(%%r1,%1) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x) + :"memory","cc","r0","r1","v24","v25","v26","v27" + ); } - - - int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0,j=0; @@ -169,11 +109,11 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { - BLASLONG n1 = n & -32; + BLASLONG n1 = n & -16; if ( n1 > 0 ) { - dscal_kernel_32_zero(n1 , x); + dscal_kernel_16_zero(n1, x); j=n1; } @@ -188,10 +128,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { - BLASLONG n1 = n & -32; + BLASLONG n1 = n & -16; if ( n1 > 0 ) { - dscal_kernel_32(n1 , da , x); + dscal_kernel_16(n1, da, x); j=n1; } while(j < n) @@ -260,4 +200,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS } return 0; -} \ No newline at end of file +} + + diff --git a/kernel/zarch/dsdot.c b/kernel/zarch/dsdot.c new file mode 100644 index 000000000..17461a029 --- /dev/null +++ b/kernel/zarch/dsdot.c @@ -0,0 +1,180 @@ +/*************************************************************************** +Copyright (c) 2013-2018,The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms,with or without +modification,are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice,this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice,this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT LIMITED TO,THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,EXEMPLARY,OR CONSEQUENTIAL +DAMAGES (INCLUDING,BUT NOT LIMITED TO,PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE,DATA,OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static double dsdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + double dot; + + __asm__ volatile ( + "vzero %%v0 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%3) \n\t" + "vfmsb %%v16,%%v16,%%v24 \n\t" + "vl %%v25,16(%%r1,%3) \n\t" + "vfmsb %%v17,%%v17,%%v25 \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vfmsb %%v18,%%v18,%%v26 \n\t" + "vl %%v27,48(%%r1,%3) \n\t" + "vfmsb %%v19,%%v19,%%v27 \n\t" + "vl %%v28,64(%%r1,%3) \n\t" + "vfmsb %%v20,%%v20,%%v28 \n\t" + "vl %%v29,80(%%r1,%3) \n\t" + "vfmsb %%v21,%%v21,%%v29 \n\t" + "vl %%v30,96(%%r1,%3) \n\t" + "vfmsb %%v22,%%v22,%%v30 \n\t" + "vl %%v31,112(%%r1,%3) \n\t" + "vfmsb %%v23,%%v23,%%v31 \n\t" + + "vflls %%v24,%%v16 \n\t" + "vflls %%v25,%%v17 \n\t" + "vflls %%v26,%%v18 \n\t" + "vflls %%v27,%%v19 \n\t" + "vflls %%v28,%%v20 \n\t" + "vflls %%v29,%%v21 \n\t" + "vflls %%v30,%%v22 \n\t" + "vflls %%v31,%%v23 \n\t" + + "veslg %%v16,%%v16,32 \n\t" + "veslg %%v17,%%v17,32 \n\t" + "veslg %%v18,%%v18,32 \n\t" + "veslg %%v19,%%v19,32 \n\t" + "veslg %%v20,%%v20,32 \n\t" + "veslg %%v21,%%v21,32 \n\t" + "veslg %%v22,%%v22,32 \n\t" + "veslg %%v23,%%v23,32 \n\t" + + "vflls %%v16,%%v16 \n\t" + "vflls %%v17,%%v17 \n\t" + "vflls %%v18,%%v18 \n\t" + "vflls %%v19,%%v19 \n\t" + "vflls %%v20,%%v20 \n\t" + "vflls %%v21,%%v21 \n\t" + "vflls %%v22,%%v22 \n\t" + "vflls %%v23,%%v23 \n\t" + + "vfadb %%v16,%%v16,%%v24 \n\t" + "vfadb %%v17,%%v17,%%v25 \n\t" + "vfadb %%v18,%%v18,%%v26 \n\t" + "vfadb %%v19,%%v19,%%v27 \n\t" + "vfadb %%v20,%%v20,%%v28 \n\t" + "vfadb %%v21,%%v21,%%v29 \n\t" + "vfadb %%v22,%%v22,%%v30 \n\t" + "vfadb %%v23,%%v23,%%v31 \n\t" + "vfadb %%v16,%%v16,%%v20 \n\t" + "vfadb %%v17,%%v17,%%v21 \n\t" + "vfadb %%v18,%%v18,%%v22 \n\t" + "vfadb %%v19,%%v19,%%v23 \n\t" + "vfadb %%v16,%%v16,%%v18 \n\t" + "vfadb %%v17,%%v17,%%v19 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v0,%%v16,%%v0 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + "vrepg %%v1,%%v0,1 \n\t" + "adbr %%f0,%%f1 \n\t" + "ldr 
%0,%%f0 " + :"=f"(dot) + :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return dot; +} + +double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + double dot = 0.0 ; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -32; + + if ( n1 ) + dot = dsdot_kernel_32(n1,x,y); + + i = n1; + while(i < n) + { + + dot += y[i] * x[i] ; + i++ ; + + } + return(dot); + + + } + + BLASLONG n1 = n & -2; + + while(i < n1) + { + + dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; + ix += inc_x*2 ; + iy += inc_y*2 ; + i+=2 ; + + } + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); + +} + + diff --git a/kernel/zarch/dswap.c b/kernel/zarch/dswap.c index d7e079147..8070ef41a 100644 --- a/kernel/zarch/dswap.c +++ b/kernel/zarch/dswap.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2018, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,217 +25,93 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - - #include "common.h" - - -#if defined(Z13_SWAP_A) -static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "pfd 1, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],5 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v16, 0(%%r1,%[ptr_x]) \n\t" - - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 16(%%r1,%[ptr_x]) \n\t" - - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 32(%%r1,%[ptr_x]) \n\t" - - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 48(%%r1,%[ptr_x]) \n\t" - - "vl %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_x]) \n\t" - - "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_x]) \n\t" - - "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_x]) \n\t" - - "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_x]) \n\t" - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v16, 128(%%r1,%[ptr_x]) \n\t" - - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 144(%%r1,%[ptr_x]) \n\t" - - "vl %%v26, 
160(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 160(%%r1,%[ptr_x]) \n\t" - - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 176(%%r1,%[ptr_x]) \n\t" - - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 192(%%r1,%[ptr_x]) \n\t" - - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 208(%%r1,%[ptr_x]) \n\t" - - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 224(%%r1,%[ptr_x]) \n\t" - - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 240(%%r1,%[ptr_x]) \n\t" - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[n])x), - [mem_y] "+m" (*(double (*)[n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23" - ,"v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - -} - -#else - -static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { - __asm__ volatile( - "pfd 2, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],5 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v16, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - - - "vl %%v0, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v1, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v2, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v3, 48(%%r1,%[ptr_y]) \n\t" - "vl %%v4, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v5, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v6, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v7, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v0, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v1, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v2, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v3, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v4, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v5, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v6, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v7, 112(%%r1,%[ptr_x]) \n\t" - - "vl %%v0, 128(%%r1,%[ptr_y]) \n\t" - "vl %%v1, 144(%%r1,%[ptr_y]) \n\t" - "vl %%v2, 160(%%r1,%[ptr_y]) \n\t" - "vl %%v3, 176(%%r1,%[ptr_y]) \n\t" - "vl %%v4, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v5, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v6, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v7, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v0, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v1, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v2, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v3, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v4, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v5, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v6, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v7, 240(%%r1,%[ptr_x]) \n\t" - - "vst %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 48(%%r1,%[ptr_y]) \n\t" - 
"vst %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[n])x), - [mem_y] "+m" (*(double (*)[n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - + __asm__ volatile( + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v20, 64(%%r1,%1) \n\t" + "vl %%v21, 80(%%r1,%1) \n\t" + "vl %%v22, 96(%%r1,%1) \n\t" + "vl %%v23, 112(%%r1,%1) \n\t" + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v28, 192(%%r1,%1) \n\t" + "vl %%v29, 208(%%r1,%1) \n\t" + "vl %%v30, 224(%%r1,%1) \n\t" + "vl %%v31, 240(%%r1,%1) \n\t" + + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vl %%v4, 64(%%r1,%2) \n\t" + "vl %%v5, 80(%%r1,%2) \n\t" + "vl %%v6, 96(%%r1,%2) \n\t" + "vl %%v7, 112(%%r1,%2) \n\t" + "vst %%v0, 0(%%r1,%1) \n\t" + "vst %%v1, 16(%%r1,%1) \n\t" + "vst %%v2, 32(%%r1,%1) \n\t" + "vst %%v3, 48(%%r1,%1) \n\t" + "vst %%v4, 64(%%r1,%1) \n\t" + "vst %%v5, 80(%%r1,%1) \n\t" + "vst %%v6, 96(%%r1,%1) \n\t" + "vst %%v7, 112(%%r1,%1) \n\t" + + "vl %%v0, 128(%%r1,%2) \n\t" + "vl %%v1, 144(%%r1,%2) \n\t" + "vl %%v2, 160(%%r1,%2) \n\t" + "vl %%v3, 176(%%r1,%2) \n\t" + "vl %%v4, 192(%%r1,%2) \n\t" + "vl %%v5, 208(%%r1,%2) \n\t" + "vl %%v6, 224(%%r1,%2) \n\t" + "vl %%v7, 240(%%r1,%2) \n\t" + "vst %%v0, 128(%%r1,%1) \n\t" + "vst %%v1, 144(%%r1,%1) \n\t" + "vst %%v2, 160(%%r1,%1) \n\t" + "vst %%v3, 176(%%r1,%1) \n\t" + "vst %%v4, 192(%%r1,%1) \n\t" + "vst %%v5, 208(%%r1,%1) \n\t" + "vst %%v6, 224(%%r1,%1) \n\t" + "vst %%v7, 240(%%r1,%1) \n\t" + + "vst %%v16, 0(%%r1,%2) \n\t" + "vst %%v17, 16(%%r1,%2) \n\t" + "vst %%v18, 32(%%r1,%2) \n\t" + "vst %%v19, 48(%%r1,%2) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + "vst %%v24, 128(%%r1,%2) \n\t" + "vst %%v25, 144(%%r1,%2) \n\t" + "vst %%v26, 160(%%r1,%2) \n\t" + "vst %%v27, 176(%%r1,%2) \n\t" + "vst %%v28, 192(%%r1,%2) \n\t" + "vst %%v29, 208(%%r1,%2) \n\t" + "vst %%v30, 224(%%r1,%2) \n\t" + "vst %%v31, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#endif - int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0; @@ -284,5 +160,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, } - - diff --git 
a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c
new file mode 100644
index 000000000..e7f096e0d
--- /dev/null
+++ b/kernel/zarch/icamax.c
@@ -0,0 +1,319 @@
+/***************************************************************************
+Copyright (c) 2017, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+#define ABS fabs
+#else
+#define ABS fabsf
+#endif
+#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
+
+static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax)
+{
+ BLASLONG iamax;
+
+ __asm__ volatile (
+ "vlef %%v0,0(%3),0 \n\t"
+ "vlef %%v1,4(%3),0 \n\t"
+ "vlef %%v0,8(%3),1 \n\t"
+ "vlef %%v1,12(%3),1 \n\t"
+ "vlef %%v0,16(%3),2 \n\t"
+ "vlef %%v1,20(%3),2 \n\t"
+ "vlef %%v0,24(%3),3 \n\t"
+ "vlef %%v1,28(%3),3 \n\t"
+ "vflpsb %%v0,%%v0 \n\t"
+ "vflpsb %%v1,%%v1 \n\t"
+ "vfasb %%v0,%%v0,%%v1 \n\t"
+ "vleig %%v1,0,0 \n\t"
+ "vleig %%v1,2,1 \n\t"
+ "vleig %%v2,1,0 \n\t"
+ "vleig %%v2,3,1 \n\t"
+ "vrepig %%v3,16 \n\t"
+ "vzero %%v4 \n\t"
+ "vleif %%v24,0,0 \n\t"
+ "vleif %%v24,1,1 \n\t"
+ "vleif %%v24,2,2 \n\t"
+ "vleif %%v24,3,3 \n\t"
+ "vleif %%v25,4,0 \n\t"
+ "vleif %%v25,5,1 \n\t"
+ "vleif %%v25,6,2 \n\t"
+ "vleif %%v25,7,3 \n\t"
+ "vleif %%v26,8,0 \n\t"
+ "vleif %%v26,9,1 \n\t"
+ "vleif %%v26,10,2 \n\t"
+ "vleif %%v26,11,3 \n\t"
+ "vleif %%v27,12,0 \n\t"
+ "vleif %%v27,13,1 \n\t"
+ "vleif %%v27,14,2 \n\t"
+ "vleif %%v27,15,3 \n\t"
+ "srlg %%r0,%2,5 \n\t"
+ "xgr %%r1,%%r1 \n\t"
+ "0: \n\t"
+ "pfd 1, 1024(%%r1,%3) \n\t"
+
+ "vlef %%v16,0(%%r1,%3),0 \n\t"
+ "vlef %%v17,4(%%r1,%3),0 \n\t"
+ "vlef %%v16,8(%%r1,%3),1 \n\t"
+ "vlef %%v17,12(%%r1,%3),1 \n\t"
+ "vlef %%v16,16(%%r1,%3),2 \n\t"
+ "vlef %%v17,20(%%r1,%3),2 \n\t"
+ "vlef %%v16,24(%%r1,%3),3 \n\t"
+ "vlef %%v17,28(%%r1,%3),3 \n\t"
+
+ "vlef %%v18,32(%%r1,%3),0 \n\t"
+ "vlef %%v19,36(%%r1,%3),0 \n\t"
+ "vlef %%v18,40(%%r1,%3),1 \n\t"
+ "vlef %%v19,44(%%r1,%3),1 \n\t"
+ "vlef %%v18,48(%%r1,%3),2 \n\t"
+ "vlef %%v19,52(%%r1,%3),2 \n\t"
+ "vlef %%v18,56(%%r1,%3),3 \n\t"
+ "vlef %%v19,60(%%r1,%3),3 \n\t"
+
+ "vlef %%v20,64(%%r1,%3),0 \n\t"
+ "vlef %%v21,68(%%r1,%3),0 \n\t"
+ "vlef %%v20,72(%%r1,%3),1 \n\t"
+ "vlef %%v21,76(%%r1,%3),1 \n\t"
+ "vlef %%v20,80(%%r1,%3),2 \n\t"
+ "vlef %%v21,84(%%r1,%3),2 \n\t"
+ "vlef %%v20,88(%%r1,%3),3 \n\t"
+ "vlef %%v21,92(%%r1,%3),3 \n\t"
+
+ "vlef %%v22,96(%%r1,%3),0 \n\t"
+ "vlef %%v23,100(%%r1,%3),0 \n\t"
+ "vlef %%v22,104(%%r1,%3),1 \n\t"
+ "vlef %%v23,108(%%r1,%3),1 \n\t"
+ "vlef %%v22,112(%%r1,%3),2 \n\t"
+ "vlef %%v23,116(%%r1,%3),2 \n\t"
+ "vlef %%v22,120(%%r1,%3),3 \n\t"
+ "vlef %%v23,124(%%r1,%3),3 \n\t"
+
+ "vflpsb %%v16, %%v16 \n\t"
+ "vflpsb %%v17, %%v17 \n\t"
+ "vflpsb %%v18, %%v18 \n\t"
+ "vflpsb %%v19, %%v19 \n\t"
+ "vflpsb %%v20, %%v20 \n\t"
+ "vflpsb %%v21, %%v21 \n\t"
+ "vflpsb %%v22, %%v22 \n\t"
+ "vflpsb %%v23, %%v23 \n\t"
+ "vfasb %%v16,%%v16,%%v17 \n\t"
+ "vfasb %%v17,%%v18,%%v19 \n\t"
+ "vfasb %%v18,%%v20,%%v21 \n\t"
+ "vfasb %%v19,%%v22,%%v23 \n\t"
+
+ "vfchsb %%v5,%%v16,%%v17 \n\t"
+ "vfchsb %%v6,%%v18,%%v19 \n\t"
+ "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
+ "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
+ "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
+ "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
+
+ "vfchsb %%v18,%%v16,%%v17 \n\t"
+ "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
+ "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
+ "vsegf %%v6,%%v5 \n\t"
+ "vesrlg %%v5,%%v5,32 \n\t"
+ "vag %%v5,%%v5,%%v4 \n\t"
+ "vag %%v6,%%v6,%%v4 \n\t"
+
+ "vfchsb %%v7,%%v16,%%v0 \n\t"
+ "vsel %%v0,%%v16,%%v0,%%v7 \n\t"
+ "vsegf %%v8,%%v7 \n\t"
+ "vesrlg %%v7,%%v7,32 \n\t"
+ "vsegf %%v7,%%v7 \n\t"
+ "vsel %%v1,%%v5,%%v1,%%v7 \n\t"
+ "vsel %%v2,%%v6,%%v2,%%v8 \n\t"
+ "vag %%v4,%%v4,%%v3 \n\t"
+
+ "vlef %%v16,128(%%r1,%3),0 \n\t"
+ "vlef %%v17,132(%%r1,%3),0 \n\t"
+ "vlef %%v16,136(%%r1,%3),1 \n\t"
+ "vlef %%v17,140(%%r1,%3),1 \n\t"
+ "vlef %%v16,144(%%r1,%3),2 \n\t"
+ "vlef %%v17,148(%%r1,%3),2 \n\t"
+ "vlef %%v16,152(%%r1,%3),3 \n\t"
+ "vlef %%v17,156(%%r1,%3),3 \n\t"
+
+ "vlef %%v18,160(%%r1,%3),0 \n\t"
+ "vlef %%v19,164(%%r1,%3),0 \n\t"
+ "vlef %%v18,168(%%r1,%3),1 \n\t"
+ "vlef %%v19,172(%%r1,%3),1 \n\t"
+ "vlef %%v18,176(%%r1,%3),2 \n\t"
+ "vlef %%v19,180(%%r1,%3),2 \n\t"
+ "vlef %%v18,184(%%r1,%3),3 \n\t"
+ "vlef %%v19,188(%%r1,%3),3 \n\t"
+
+ "vlef %%v20,192(%%r1,%3),0 \n\t"
+ "vlef %%v21,196(%%r1,%3),0 \n\t"
+ "vlef %%v20,200(%%r1,%3),1 \n\t"
+ "vlef %%v21,204(%%r1,%3),1 \n\t"
+ "vlef %%v20,208(%%r1,%3),2 \n\t"
+ "vlef %%v21,212(%%r1,%3),2 \n\t"
+ "vlef %%v20,216(%%r1,%3),3 \n\t"
+ "vlef %%v21,220(%%r1,%3),3 \n\t"
+
+ "vlef %%v22,224(%%r1,%3),0 \n\t"
+ "vlef %%v23,228(%%r1,%3),0 \n\t"
+ "vlef %%v22,232(%%r1,%3),1 \n\t"
+ "vlef %%v23,236(%%r1,%3),1 \n\t"
+ "vlef %%v22,240(%%r1,%3),2 \n\t"
+ "vlef %%v23,244(%%r1,%3),2 \n\t"
+ "vlef %%v22,248(%%r1,%3),3 \n\t"
+ "vlef %%v23,252(%%r1,%3),3 \n\t"
+
+ "vflpsb %%v16, %%v16 \n\t"
+ "vflpsb %%v17, %%v17 \n\t"
+ "vflpsb %%v18, %%v18 \n\t"
+ "vflpsb %%v19, %%v19 \n\t"
+ "vflpsb %%v20, %%v20 \n\t"
+ "vflpsb %%v21, %%v21 \n\t"
+ "vflpsb %%v22, %%v22 \n\t"
+ "vflpsb %%v23, %%v23 \n\t"
+ "vfasb %%v16,%%v16,%%v17 \n\t"
+ "vfasb %%v17,%%v18,%%v19 \n\t"
+ "vfasb %%v18,%%v20,%%v21 \n\t"
+ "vfasb %%v19,%%v22,%%v23 \n\t"
+
+ "vfchsb %%v5,%%v16,%%v17 \n\t"
+ "vfchsb %%v6,%%v18,%%v19 \n\t"
+ "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
+ "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
+ "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
+ "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
+
+ "vfchsb %%v18,%%v16,%%v17 \n\t"
+ "vsel %%v16,%%v16,%%v17,%%v18 \n\t"
+ "vsel %%v5,%%v5,%%v6,%%v18 \n\t"
+ "vsegf %%v6,%%v5 \n\t"
+ "vesrlg %%v5,%%v5,32 \n\t"
+ "vag %%v5,%%v5,%%v4 \n\t"
+ "vag %%v6,%%v6,%%v4 \n\t"
+
+ 
"vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v0,%%v3 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamax),"=m"(*amax) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return iamax; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + max = icamax_kernel_32(n1, x, &maxf); + + i = n1; + } + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (max + 1); + + } else { + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (max + 1); + } +} + + diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c new file mode 100644 index 000000000..b9c1ccd9c --- /dev/null +++ b/kernel/zarch/icamin.c @@ -0,0 +1,319 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#include "common.h"
+#include <math.h>
+
+#if defined(DOUBLE)
+#define ABS fabs
+#else
+#define ABS fabsf
+#endif
+#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
+
+static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin)
+{
+ BLASLONG iamin;
+
+ __asm__ volatile (
+ "vlef %%v0,0(%3),0 \n\t"
+ "vlef %%v1,4(%3),0 \n\t"
+ "vlef %%v0,8(%3),1 \n\t"
+ "vlef %%v1,12(%3),1 \n\t"
+ "vlef %%v0,16(%3),2 \n\t"
+ "vlef %%v1,20(%3),2 \n\t"
+ "vlef %%v0,24(%3),3 \n\t"
+ "vlef %%v1,28(%3),3 \n\t"
+ "vflpsb %%v0,%%v0 \n\t"
+ "vflpsb %%v1,%%v1 \n\t"
+ "vfasb %%v0,%%v0,%%v1 \n\t"
+ "vleig %%v1,0,0 \n\t"
+ "vleig %%v1,2,1 \n\t"
+ "vleig %%v2,1,0 \n\t"
+ "vleig %%v2,3,1 \n\t"
+ "vrepig %%v3,16 \n\t"
+ "vzero %%v4 \n\t"
+ "vleif %%v24,0,0 \n\t"
+ "vleif %%v24,1,1 \n\t"
+ "vleif %%v24,2,2 \n\t"
+ "vleif %%v24,3,3 \n\t"
+ "vleif %%v25,4,0 \n\t"
+ "vleif %%v25,5,1 \n\t"
+ "vleif %%v25,6,2 \n\t"
+ "vleif %%v25,7,3 \n\t"
+ "vleif %%v26,8,0 \n\t"
+ "vleif %%v26,9,1 \n\t"
+ "vleif %%v26,10,2 \n\t"
+ "vleif %%v26,11,3 \n\t"
+ "vleif %%v27,12,0 \n\t"
+ "vleif %%v27,13,1 \n\t"
+ "vleif %%v27,14,2 \n\t"
+ "vleif %%v27,15,3 \n\t"
+ "srlg %%r0,%2,5 \n\t"
+ "xgr %%r1,%%r1 \n\t"
+ "0: \n\t"
+ "pfd 1, 1024(%%r1,%3) \n\t"
+
+ "vlef %%v16,0(%%r1,%3),0 \n\t"
+ "vlef %%v17,4(%%r1,%3),0 \n\t"
+ "vlef %%v16,8(%%r1,%3),1 \n\t"
+ "vlef %%v17,12(%%r1,%3),1 \n\t"
+ "vlef %%v16,16(%%r1,%3),2 \n\t"
+ "vlef %%v17,20(%%r1,%3),2 \n\t"
+ "vlef %%v16,24(%%r1,%3),3 \n\t"
+ "vlef %%v17,28(%%r1,%3),3 \n\t"
+
+ "vlef %%v18,32(%%r1,%3),0 \n\t"
+ "vlef %%v19,36(%%r1,%3),0 \n\t"
+ "vlef %%v18,40(%%r1,%3),1 \n\t"
+ "vlef %%v19,44(%%r1,%3),1 \n\t"
+ "vlef %%v18,48(%%r1,%3),2 \n\t"
+ "vlef %%v19,52(%%r1,%3),2 \n\t"
+ "vlef %%v18,56(%%r1,%3),3 \n\t"
+ "vlef %%v19,60(%%r1,%3),3 \n\t"
+
+ "vlef %%v20,64(%%r1,%3),0 \n\t"
+ "vlef %%v21,68(%%r1,%3),0 \n\t"
+ "vlef %%v20,72(%%r1,%3),1 \n\t"
+ "vlef %%v21,76(%%r1,%3),1 \n\t"
+ "vlef %%v20,80(%%r1,%3),2 \n\t"
+ "vlef %%v21,84(%%r1,%3),2 \n\t"
+ "vlef %%v20,88(%%r1,%3),3 \n\t"
+ "vlef %%v21,92(%%r1,%3),3 \n\t"
+
+ "vlef %%v22,96(%%r1,%3),0 \n\t"
+ "vlef %%v23,100(%%r1,%3),0 \n\t"
+ "vlef %%v22,104(%%r1,%3),1 \n\t"
+ "vlef %%v23,108(%%r1,%3),1 \n\t"
+ "vlef %%v22,112(%%r1,%3),2 \n\t"
+ "vlef %%v23,116(%%r1,%3),2 \n\t"
+ "vlef %%v22,120(%%r1,%3),3 \n\t"
+ "vlef %%v23,124(%%r1,%3),3 \n\t"
+
+ "vflpsb %%v16, %%v16 \n\t"
+ "vflpsb %%v17, %%v17 \n\t"
+ "vflpsb %%v18, %%v18 \n\t"
+ "vflpsb %%v19, %%v19 \n\t"
+ "vflpsb %%v20, %%v20 \n\t"
+ "vflpsb %%v21, %%v21 \n\t"
+ "vflpsb %%v22, %%v22 \n\t"
+ "vflpsb %%v23, %%v23 \n\t"
+ "vfasb %%v16,%%v16,%%v17 \n\t"
+ "vfasb %%v17,%%v18,%%v19 \n\t"
+ "vfasb %%v18,%%v20,%%v21 \n\t"
+ "vfasb %%v19,%%v22,%%v23 \n\t"
+
+ "vfchsb %%v5,%%v17,%%v16 \n\t"
+ "vfchsb %%v6,%%v19,%%v18 \n\t"
+ "vsel %%v16,%%v16,%%v17,%%v5 \n\t"
+ "vsel %%v5,%%v24,%%v25,%%v5 \n\t"
+ "vsel %%v17,%%v18,%%v19,%%v6 \n\t"
+ "vsel %%v6,%%v26,%%v27,%%v6 \n\t"
+
+ "vfchsb %%v18,%%v17,%%v16 \n\t"
+ "vsel 
%%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vlef %%v16,128(%%r1,%3),0 \n\t" + "vlef %%v17,132(%%r1,%3),0 \n\t" + "vlef %%v16,136(%%r1,%3),1 \n\t" + "vlef %%v17,140(%%r1,%3),1 \n\t" + "vlef %%v16,144(%%r1,%3),2 \n\t" + "vlef %%v17,148(%%r1,%3),2 \n\t" + "vlef %%v16,152(%%r1,%3),3 \n\t" + "vlef %%v17,156(%%r1,%3),3 \n\t" + + "vlef %%v18,160(%%r1,%3),0 \n\t" + "vlef %%v19,164(%%r1,%3),0 \n\t" + "vlef %%v18,168(%%r1,%3),1 \n\t" + "vlef %%v19,172(%%r1,%3),1 \n\t" + "vlef %%v18,176(%%r1,%3),2 \n\t" + "vlef %%v19,180(%%r1,%3),2 \n\t" + "vlef %%v18,184(%%r1,%3),3 \n\t" + "vlef %%v19,188(%%r1,%3),3 \n\t" + + "vlef %%v20,192(%%r1,%3),0 \n\t" + "vlef %%v21,196(%%r1,%3),0 \n\t" + "vlef %%v20,200(%%r1,%3),1 \n\t" + "vlef %%v21,204(%%r1,%3),1 \n\t" + "vlef %%v20,208(%%r1,%3),2 \n\t" + "vlef %%v21,212(%%r1,%3),2 \n\t" + "vlef %%v20,216(%%r1,%3),3 \n\t" + "vlef %%v21,220(%%r1,%3),3 \n\t" + + "vlef %%v22,224(%%r1,%3),0 \n\t" + "vlef %%v23,228(%%r1,%3),0 \n\t" + "vlef %%v22,232(%%r1,%3),1 \n\t" + "vlef %%v23,236(%%r1,%3),1 \n\t" + "vlef %%v22,240(%%r1,%3),2 \n\t" + "vlef %%v23,244(%%r1,%3),2 \n\t" + "vlef %%v22,248(%%r1,%3),3 \n\t" + "vlef %%v23,252(%%r1,%3),3 \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" + "vfasb %%v17,%%v18,%%v19 \n\t" + "vfasb %%v18,%%v20,%%v21 \n\t" + "vfasb %%v19,%%v22,%%v23 \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v3,%%v0 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamin),"=m"(*amin) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return iamin; +} + 
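/* For reference (not part of the diff): a scalar model of what
   icamin_kernel_32 computes -- the zero-based index of the first entry
   minimizing CABS1 over n complex elements, with the minimum itself
   written to *amin. Assumes unit stride and n > 0, a multiple of 32, as
   the caller below guarantees; the helper name icamin_ref is
   illustrative only.

   static BLASLONG icamin_ref(BLASLONG n, FLOAT *x, FLOAT *amin)
   {
       BLASLONG i, imin = 0;
       FLOAT minf = CABS1(x, 0);

       for (i = 1; i < n; i++) {
           if (CABS1(x, i * 2) < minf) {
               minf = CABS1(x, i * 2);
               imin = i;
           }
       }

       *amin = minf;
       return imin;   / * CNAME below adds 1 for the Fortran-style result * /
   }
*/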
+BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0; + BLASLONG min = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(min); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + min = icamin_kernel_32(n1, x, &minf); + + i = n1; + } + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (min + 1); + + } else { + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (min + 1); + } +} + + diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index b67091148..aba880949 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -23,164 +23,173 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ +*****************************************************************************/ + #include "common.h" #include #if defined(DOUBLE) - #define ABS fabs - #else - #define ABS fabsf - #endif - -/** - * Find maximum index - * Warning: requirements n>0 and n % 32 == 0 - * @param n - * @param x pointer to the vector - * @param maxf (out) maximum absolute value .( only for output ) - * @return index - */ -static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { - BLASLONG index; - __asm__( - "pfd 1, 0(%[ptr_x]) \n\t" - "sllg %%r0,%[n],3 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vleig %%v20,0,0 \n\t" - "vleig %%v20,1,1 \n\t" - "vleig %%v21,2,0 \n\t" - "vleig %%v21,3,1 \n\t" - "vleig %%v22,4,0 \n\t" - "vleig %%v22,5,1 \n\t" - "vleig %%v23,6,0 \n\t" - "vleig %%v23,7,1 \n\t" - "vrepig %%v4,8 \n\t" - "vzero %%v5 \n\t" - "vzero %%v18 \n\t" - "vzero %%v19 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" - "vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - "vfchdb %%v16,%%v25,%%v24 \n\t " - "vfchdb %%v17,%%v27,%%v26 \n\t " - "vsel %%v1,%%v21,%%v20,%%v16 \n\t" - "vsel %%v0,%%v25,%%v24,%%v16 \n\t" - "vsel %%v2,%%v23,%%v22,%%v17 \n\t" - "vsel %%v3,%%v27,%%v26,%%v17 \n\t" - "vfchdb %%v16,%%v29,%%v28 \n\t " - "vfchdb %%v17,%%v31,%%v30 \n\t" - "vsel %%v24,%%v21,%%v20,%%v16 \n\t" - "vsel %%v25,%%v29,%%v28,%%v16 \n\t" - "vsel %%v26,%%v23,%%v22,%%v17 \n\t" - "vsel %%v27,%%v31,%%v30,%%v17 \n\t" - - "vfchdb %%v28, %%v3,%%v0 \n\t" - "vfchdb %%v29,%%v27, %%v25 \n\t" - "vsel %%v1,%%v2,%%v1,%%v28 \n\t" - "vsel %%v0,%%v3,%%v0,%%v28 \n\t" - "vsel %%v24,%%v26,%%v24,%%v29 \n\t" - "vsel %%v25,%%v27,%%v25,%%v29 \n\t" - "vag %%v1,%%v1,%%v5 \n\t" - "vag %%v24,%%v24,%%v5 \n\t" - "vag %%v24,%%v24,%%v4 \n\t" - "vfchdb %%v16,%%v25 , %%v0 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vsel %%v29,%%v25,%%v0,%%v16 \n\t" - "vsel %%v28,%%v24,%%v1,%%v16 \n\t" - "vfchdb %%v17, %%v29,%%v18 \n\t" - "vsel %%v19,%%v28,%%v19,%%v17 \n\t" - "vsel %%v18,%%v29,%%v18,%%v17 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 
\n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - "vfchdb %%v16,%%v25,%%v24 \n\t " - "vfchdb %%v17,%%v27,%%v26 \n\t " - "vsel %%v1,%%v21,%%v20,%%v16 \n\t" - "vsel %%v0,%%v25,%%v24,%%v16 \n\t" - "vsel %%v2,%%v23,%%v22,%%v17 \n\t" - "vsel %%v3,%%v27,%%v26,%%v17 \n\t" - "vfchdb %%v16,%%v29,%%v28 \n\t " - "vfchdb %%v17,%%v31,%%v30 \n\t" - "vsel %%v24,%%v21,%%v20,%%v16 \n\t" - "vsel %%v25,%%v29,%%v28,%%v16 \n\t" - "vsel %%v26,%%v23,%%v22,%%v17 \n\t" - "vsel %%v27,%%v31,%%v30,%%v17 \n\t" - - "vfchdb %%v28, %%v3,%%v0 \n\t" - "vfchdb %%v29,%%v27, %%v25 \n\t" - "vsel %%v1,%%v2,%%v1,%%v28 \n\t" - "vsel %%v0,%%v3,%%v0,%%v28 \n\t" - "vsel %%v24,%%v26,%%v24,%%v29 \n\t" - "vsel %%v25,%%v27,%%v25,%%v29 \n\t" - "vag %%v1,%%v1,%%v5 \n\t" - "vag %%v24,%%v24,%%v5 \n\t" - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - "vag %%v24,%%v24,%%v4 \n\t" - "vfchdb %%v16,%%v25 , %%v0 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vsel %%v29,%%v25,%%v0,%%v16 \n\t" - "vsel %%v28,%%v24,%%v1,%%v16 \n\t" - "vfchdb %%v17, %%v29,%%v18 \n\t" - "vsel %%v19,%%v28,%%v19,%%v17 \n\t" - "vsel %%v18,%%v29,%%v18,%%v17 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "clgrjl %[ptr_tmp],%%r0,1b \n\t" - - "vrepg %%v26,%%v18,1 \n\t" - "vrepg %%v5,%%v19,1 \n\t" - "wfcdb %%v26,%%v18 \n\t" - "jne 2f \n\t" - "vsteg %%v18,%[maxf],0 \n\t" - "vmnlg %%v1,%%v5,%%v19 \n\t" - "j 3f \n\t" - - "2: \n\t" - "wfchdb %%v16,%%v26,%%v18 \n\t" - "vsel %%v1,%%v5,%%v19,%%v16 \n\t" - "vsel %%v0,%%v26,%%v18,%%v16 \n\t" - "std %%f0,%[maxf] \n\t" - - "3: \n\t" - "vlgvg %[index],%%v1,0 \n\t" - : [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x) - : "cc", "r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return index; +static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) +{ + BLASLONG iamax; -} + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,16 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "vleig %%v28,8,0 \n\t" + "vleig %%v28,9,1 \n\t" + "vleig %%v29,10,0 \n\t" + "vleig %%v29,11,1 \n\t" + "vleig %%v30,12,0 \n\t" + "vleig %%v30,13,1 \n\t" + "vleig %%v31,14,0 \n\t" + "vleig %%v31,15,1 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchdb %%v6,%%v20,%%v21 \n\t" + "vfchdb %%v7,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel 
%%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - + "vfchdb %%v20,%%v16,%%v17 \n\t" + "vfchdb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchdb %%v6,%%v20,%%v21 \n\t" + "vfchdb %%v7,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v16,%%v17 \n\t" + "vfchdb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamax),"=m"(*amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return iamax; +} BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; BLASLONG j = 0; - BLASLONG ix = 0; FLOAT maxf = 0.0; BLASLONG max = 0; @@ -191,7 +200,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG n1 = n & -32; if (n1 > 0) { - max = diamax_kernel_32_TUNED(n1, x, &maxf); + max = idamax_kernel_32(n1, x, &maxf); i = n1; } diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 8a7ff1659..3213efa4d 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -23,192 +23,185 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- *****************************************************************************/ +*****************************************************************************/ + #include "common.h" #include <math.h> #if defined(DOUBLE) - #define ABS fabs - #else - #define ABS fabsf - #endif -/** - * Find minimum index - * Warning: requirements n>0 and n % 32 == 0 - * @param n - * @param x pointer to the vector - * @param minf (out) minimum absolute value .( only for output ) - * @return minimum index - */ -static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { - BLASLONG index; - __asm__( - "pfd 1, 0(%[ptr_x]) \n\t" - "sllg %%r0,%[n],3 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vleig %%v20,0,0 \n\t" - "vleig %%v20,1,1 \n\t" - "vleig %%v21,2,0 \n\t" - "vleig %%v21,3,1 \n\t" - "vleig %%v22,4,0 \n\t" - "vleig %%v22,5,1 \n\t" - "vleig %%v23,6,0 \n\t" - "vleig %%v23,7,1 \n\t" - "vrepig %%v4,8 \n\t" - "vlrepg %%v18,0(%[ptr_x]) \n\t" - "vzero %%v5 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vzero %%v19 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" - "vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t" - - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfchdb %%v16,%%v24,%%v25 \n\t " - "vfchdb %%v17,%%v26 ,%%v27 \n\t " - "vsel %%v1,%%v21,%%v20,%%v16 \n\t" - "vsel %%v0,%%v25,%%v24,%%v16 \n\t" - "vsel %%v2,%%v23,%%v22,%%v17 \n\t" - "vsel %%v3,%%v27,%%v26,%%v17 \n\t" - "vfchdb %%v16,%%v28, %%v29 \n\t " - "vfchdb %%v17,%%v30,%%v31 \n\t" - "vsel %%v24,%%v21,%%v20,%%v16 \n\t" - "vsel %%v25,%%v29,%%v28,%%v16 \n\t" - "vsel %%v26,%%v23,%%v22,%%v17 \n\t" - "vsel %%v27,%%v31,%%v30,%%v17 \n\t" - - - "vfchdb %%v28,%%v0 , %%v3 \n\t" - "vfchdb %%v29, %%v25,%%v27 \n\t" - "vsel %%v1,%%v2,%%v1,%%v28 \n\t" - "vsel %%v0,%%v3,%%v0,%%v28 \n\t" - "vsel %%v24,%%v26,%%v24,%%v29 \n\t" - "vsel %%v25,%%v27,%%v25,%%v29 \n\t" - - "vag %%v1,%%v1,%%v5 \n\t" - "vag %%v24,%%v24,%%v5 \n\t" - "vag %%v24,%%v24,%%v4 \n\t" - - "vfchdb %%v16, %%v0,%%v25 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vsel %%v29,%%v25,%%v0,%%v16 \n\t" - "vsel %%v28,%%v24,%%v1,%%v16 \n\t" - - "vfchdb %%v17,%%v18, %%v29 \n\t" - "vsel %%v19,%%v28,%%v19,%%v17 \n\t" - "vsel %%v18,%%v29,%%v18,%%v17 \n\t" - - "vag %%v5,%%v5,%%v4 \n\t" - - "vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfchdb %%v16,%%v24,%%v25 \n\t" - "vfchdb %%v17,%%v26 ,%%v27 \n\t" - "vsel %%v1,%%v21,%%v20,%%v16 \n\t" - "vsel %%v0,%%v25,%%v24,%%v16 \n\t" - "vsel %%v2,%%v23,%%v22,%%v17 \n\t" - "vsel %%v3,%%v27,%%v26,%%v17 \n\t" - "vfchdb %%v16,%%v28 ,%%v29 \n\t" - "vfchdb %%v17,%%v30,%%v31 \n\t" - "vsel %%v24,%%v21,%%v20,%%v16 \n\t" - "vsel %%v25,%%v29,%%v28,%%v16 \n\t" - "vsel %%v26,%%v23,%%v22,%%v17 \n\t" - "vsel %%v27,%%v31,%%v30,%%v17 \n\t" - - - "vfchdb %%v28,%%v0 , %%v3 \n\t" - "vfchdb %%v29, %%v25,%%v27 \n\t" - "vsel %%v1,%%v2,%%v1,%%v28 \n\t" - "vsel %%v0,%%v3,%%v0,%%v28 \n\t" - "vsel %%v24,%%v26,%%v24,%%v29 \n\t" - "vsel %%v25,%%v27,%%v25,%%v29 \n\t" - - "vag %%v1,%%v1,%%v5 \n\t" - "vag %%v24,%%v24,%%v5 \n\t" - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - "vag %%v24,%%v24,%%v4 \n\t" - - "vfchdb %%v16, %%v0,%%v25 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vsel %%v29,%%v25,%%v0,%%v16 \n\t" - "vsel 
%%v28,%%v24,%%v1,%%v16 \n\t" - - "vfchdb %%v17,%%v18, %%v29 \n\t" - "vsel %%v19,%%v28,%%v19,%%v17 \n\t" - "vsel %%v18,%%v29,%%v18,%%v17 \n\t" - - "vag %%v5,%%v5,%%v4 \n\t" - - "clgrjl %[ptr_tmp],%%r0,1b \n\t" - - - "vrepg %%v26,%%v18,1 \n\t" - "vrepg %%v5,%%v19,1 \n\t" - "wfcdb %%v26,%%v18 \n\t" - "jne 2f \n\t" - "vsteg %%v18,%[minf],0 \n\t" - "vmnlg %%v1,%%v5,%%v19 \n\t" - "j 3f \n\t" - - "2: \n\t" - "wfchdb %%v16,%%v18 ,%%v26 \n\t " - "vsel %%v1,%%v5,%%v19,%%v16 \n\t" - "vsel %%v0,%%v26,%%v18,%%v16 \n\t" - "std %%f0,%[minf] \n\t" - - "3: \n\t" - "vlgvg %[index],%%v1,0 \n\t" - - : [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x) - : "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - - ); - - return index; - +static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) +{ + BLASLONG iamin; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,16 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "vleig %%v28,8,0 \n\t" + "vleig %%v28,9,1 \n\t" + "vleig %%v29,10,0 \n\t" + "vleig %%v29,11,1 \n\t" + "vleig %%v30,12,0 \n\t" + "vleig %%v30,13,1 \n\t" + "vleig %%v31,14,0 \n\t" + "vleig %%v31,15,1 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchdb %%v6,%%v21,%%v20 \n\t" + "vfchdb %%v7,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v17,%%v16 \n\t" + "vfchdb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v4,%%v17,%%v16 \n\t" + 
"vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchdb %%v6,%%v21,%%v20 \n\t" + "vfchdb %%v7,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v17,%%v16 \n\t" + "vfchdb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamin),"=m"(*amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return iamin; } - - - + BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; BLASLONG j = 0; - BLASLONG ix = 0; - BLASLONG min = 0; FLOAT minf = 0.0; - + BLASLONG min = 0; + if (n <= 0 || inc_x <= 0) return (min); - minf = ABS(x[0]); //index's not incremented,though it will make first comparision redundant + if (inc_x == 1) { BLASLONG n1 = n & -32; if (n1 > 0) { - min = diamin_kernel_32(n1, x, &minf); + min = idamin_kernel_32(n1, x, &minf); + i = n1; } diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c new file mode 100644 index 000000000..26fff4eb0 --- /dev/null +++ b/kernel/zarch/idmax.c @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) +{ + BLASLONG imax; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,16 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "vleig %%v28,8,0 \n\t" + "vleig %%v28,9,1 \n\t" + "vleig %%v29,10,0 \n\t" + "vleig %%v29,11,1 \n\t" + "vleig %%v30,12,0 \n\t" + "vleig %%v30,13,1 \n\t" + "vleig %%v31,14,0 \n\t" + "vleig %%v31,15,1 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchdb %%v6,%%v20,%%v21 \n\t" + "vfchdb %%v7,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v16,%%v17 \n\t" + "vfchdb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchdb %%v6,%%v20,%%v21 \n\t" + "vfchdb %%v7,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v16,%%v17 \n\t" + "vfchdb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v16,%%v0 
\n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(imax),"=m"(*max) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return imax; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + max = idmax_kernel_32(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (x[i] > maxf) { + max = i; + maxf = x[i]; + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + max = j + 1; + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + max = j + 2; + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + max = j + 3; + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c new file mode 100644 index 000000000..570b33a15 --- /dev/null +++ b/kernel/zarch/idmin.c @@ -0,0 +1,232 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) +{ + BLASLONG imin; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,16 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "vleig %%v28,8,0 \n\t" + "vleig %%v28,9,1 \n\t" + "vleig %%v29,10,0 \n\t" + "vleig %%v29,11,1 \n\t" + "vleig %%v30,12,0 \n\t" + "vleig %%v30,13,1 \n\t" + "vleig %%v31,14,0 \n\t" + "vleig %%v31,15,1 \n\t" + "srlg %%r0,%2,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchdb %%v6,%%v21,%%v20 \n\t" + "vfchdb %%v7,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v17,%%v16 \n\t" + "vfchdb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchdb %%v6,%%v21,%%v20 \n\t" + "vfchdb %%v7,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + "vsel %%v18,%%v20,%%v21,%%v6 \n\t" + "vsel %%v6,%%v28,%%v29,%%v6 \n\t" + "vsel %%v19,%%v22,%%v23,%%v7 \n\t" + "vsel %%v7,%%v30,%%v31,%%v7 \n\t" + + "vfchdb %%v20,%%v17,%%v16 \n\t" + "vfchdb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v4,%%v4,%%v5,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v5,%%v6,%%v7,%%v21 \n\t" + + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop 
" + :"=r"(imin),"=m"(*min) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return imin; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; + + if (n <= 0 || inc_x <= 0) return (min); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + min = idmin_kernel_32(n1, x, &minf); + + i = n1; + } + + while (i < n) { + if (x[i] < minf) { + min = i; + minf = x[i]; + } + i++; + } + return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] < minf) { + min = j; + minf = x[i]; + } + if (x[i + inc_x] < minf) { + min = j + 1; + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + min = j + 2; + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + min = j + 3; + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] < minf) { + min = j; + minf = x[i]; + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c new file mode 100644 index 000000000..95a665b10 --- /dev/null +++ b/kernel/zarch/isamax.c @@ -0,0 +1,299 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) +{ + BLASLONG iamax; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vflpsb %%v0,%%v0 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,32 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "vleif %%v28,16,0 \n\t" + "vleif %%v28,17,1 \n\t" + "vleif %%v28,18,2 \n\t" + "vleif %%v28,19,3 \n\t" + "vleif %%v29,20,0 \n\t" + "vleif %%v29,21,1 \n\t" + "vleif %%v29,22,2 \n\t" + "vleif %%v29,23,3 \n\t" + "vleif %%v30,24,0 \n\t" + "vleif %%v30,25,1 \n\t" + "vleif %%v30,26,2 \n\t" + "vleif %%v30,27,3 \n\t" + "vleif %%v31,28,0 \n\t" + "vleif %%v31,29,1 \n\t" + "vleif %%v31,30,2 \n\t" + "vleif %%v31,31,3 \n\t" + "srlg %%r0,%2,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchsb %%v7,%%v20,%%v21 \n\t" + "vfchsb %%v8,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v16,%%v17 \n\t" + "vfchsb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchsb 
%%v7,%%v20,%%v21 \n\t" + "vfchsb %%v8,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v16,%%v17 \n\t" + "vfchsb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v0,%%v3 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamax),"=m"(*amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return iamax; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + max = isamax_kernel_64(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c new file mode 100644 index 000000000..640fc02c9 --- /dev/null +++ b/kernel/zarch/isamin.c @@ -0,0 +1,299 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. 
+2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) +{ + BLASLONG iamin; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vflpsb %%v0,%%v0 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,32 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "vleif %%v28,16,0 \n\t" + "vleif %%v28,17,1 \n\t" + "vleif %%v28,18,2 \n\t" + "vleif %%v28,19,3 \n\t" + "vleif %%v29,20,0 \n\t" + "vleif %%v29,21,1 \n\t" + "vleif %%v29,22,2 \n\t" + "vleif %%v29,23,3 \n\t" + "vleif %%v30,24,0 \n\t" + "vleif %%v30,25,1 \n\t" + "vleif %%v30,26,2 \n\t" + "vleif %%v30,27,3 \n\t" + "vleif %%v31,28,0 \n\t" + "vleif %%v31,29,1 \n\t" + "vleif %%v31,30,2 \n\t" + "vleif %%v31,31,3 \n\t" + "srlg %%r0,%2,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchsb %%v7,%%v21,%%v20 \n\t" + "vfchsb %%v8,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v17,%%v16 \n\t" + "vfchsb %%v21,%%v19,%%v18 \n\t" + "vsel 
%%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchsb %%v7,%%v21,%%v20 \n\t" + "vfchsb %%v8,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v17,%%v16 \n\t" + "vfchsb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v3,%%v0 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamin),"=m"(*amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return iamin; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; + + if (n <= 0 || inc_x <= 0) return (min); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + min = isamin_kernel_64(n1, x, &minf); + + i = n1; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + 
return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c new file mode 100644 index 000000000..0eb350315 --- /dev/null +++ b/kernel/zarch/ismax.c @@ -0,0 +1,275 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) +{ + BLASLONG imax; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,32 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "vleif %%v28,16,0 \n\t" + "vleif %%v28,17,1 \n\t" + "vleif %%v28,18,2 \n\t" + "vleif %%v28,19,3 \n\t" + "vleif %%v29,20,0 \n\t" + "vleif %%v29,21,1 \n\t" + "vleif %%v29,22,2 \n\t" + "vleif %%v29,23,3 \n\t" + "vleif %%v30,24,0 \n\t" + "vleif %%v30,25,1 \n\t" + "vleif %%v30,26,2 \n\t" + "vleif %%v30,27,3 \n\t" + "vleif %%v31,28,0 \n\t" + "vleif %%v31,29,1 \n\t" + "vleif %%v31,30,2 \n\t" + "vleif %%v31,31,3 \n\t" + "srlg %%r0,%2,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchsb %%v7,%%v20,%%v21 \n\t" + "vfchsb %%v8,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v16,%%v17 \n\t" + "vfchsb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + + "vfchsb %%v5,%%v16,%%v17 \n\t" + "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchsb %%v7,%%v20,%%v21 \n\t" + "vfchsb %%v8,%%v22,%%v23 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v16,%%v17 \n\t" + "vfchsb %%v21,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb 
%%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v0,%%v3 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(imax),"=m"(*max) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return imax; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) return (max); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + max = ismax_kernel_64(n1, x, &maxf); + + i = n1; + } + + while (i < n) { + if (x[i] > maxf) { + max = i; + maxf = x[i]; + } + i++; + } + return (max + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + max = j + 1; + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + max = j + 2; + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + max = j + 3; + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + i += inc_x; + j++; + } + return (max + 1); + } +} diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c new file mode 100644 index 000000000..f050db8cb --- /dev/null +++ b/kernel/zarch/ismin.c @@ -0,0 +1,275 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. 
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) +{ + BLASLONG imin; + + __asm__ volatile ( + "vl %%v0,0(%3) \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,2,1 \n\t" + "vleig %%v2,1,0 \n\t" + "vleig %%v2,3,1 \n\t" + "vrepig %%v3,32 \n\t" + "vzero %%v4 \n\t" + "vleif %%v24,0,0 \n\t" + "vleif %%v24,1,1 \n\t" + "vleif %%v24,2,2 \n\t" + "vleif %%v24,3,3 \n\t" + "vleif %%v25,4,0 \n\t" + "vleif %%v25,5,1 \n\t" + "vleif %%v25,6,2 \n\t" + "vleif %%v25,7,3 \n\t" + "vleif %%v26,8,0 \n\t" + "vleif %%v26,9,1 \n\t" + "vleif %%v26,10,2 \n\t" + "vleif %%v26,11,3 \n\t" + "vleif %%v27,12,0 \n\t" + "vleif %%v27,13,1 \n\t" + "vleif %%v27,14,2 \n\t" + "vleif %%v27,15,3 \n\t" + "vleif %%v28,16,0 \n\t" + "vleif %%v28,17,1 \n\t" + "vleif %%v28,18,2 \n\t" + "vleif %%v28,19,3 \n\t" + "vleif %%v29,20,0 \n\t" + "vleif %%v29,21,1 \n\t" + "vleif %%v29,22,2 \n\t" + "vleif %%v29,23,3 \n\t" + "vleif %%v30,24,0 \n\t" + "vleif %%v30,25,1 \n\t" + "vleif %%v30,26,2 \n\t" + "vleif %%v30,27,3 \n\t" + "vleif %%v31,28,0 \n\t" + "vleif %%v31,29,1 \n\t" + "vleif %%v31,30,2 \n\t" + "vleif %%v31,31,3 \n\t" + "srlg %%r0,%2,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchsb %%v7,%%v21,%%v20 \n\t" + "vfchsb %%v8,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v17,%%v16 \n\t" + "vfchsb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v17,144(%%r1,%3) \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v19,176(%%r1,%3) \n\t" + "vl %%v20,192(%%r1,%3) 
\n\t" + "vl %%v21,208(%%r1,%3) \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v23,240(%%r1,%3) \n\t" + + "vfchsb %%v5,%%v17,%%v16 \n\t" + "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchsb %%v7,%%v21,%%v20 \n\t" + "vfchsb %%v8,%%v23,%%v22 \n\t" + "vsel %%v16,%%v16,%%v17,%%v5 \n\t" + "vsel %%v5,%%v24,%%v25,%%v5 \n\t" + "vsel %%v17,%%v18,%%v19,%%v6 \n\t" + "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + "vsel %%v18,%%v20,%%v21,%%v7 \n\t" + "vsel %%v7,%%v28,%%v29,%%v7 \n\t" + "vsel %%v19,%%v22,%%v23,%%v8 \n\t" + "vsel %%v8,%%v30,%%v31,%%v8 \n\t" + + "vfchsb %%v20,%%v17,%%v16 \n\t" + "vfchsb %%v21,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v20 \n\t" + "vsel %%v5,%%v5,%%v6,%%v20 \n\t" + "vsel %%v17,%%v18,%%v19,%%v21 \n\t" + "vsel %%v6,%%v7,%%v8,%%v21 \n\t" + + "vfchsb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v5,%%v5,%%v6,%%v18 \n\t" + "vsegf %%v6,%%v5 \n\t" + "vesrlg %%v5,%%v5,32 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vag %%v6,%%v6,%%v4 \n\t" + + "vfchsb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vsegf %%v8,%%v7 \n\t" + "vesrlg %%v7,%%v7,32 \n\t" + "vsegf %%v7,%%v7 \n\t" + "vsel %%v1,%%v5,%%v1,%%v7 \n\t" + "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v3,%%v0,32 \n\t" + "vfchsb %%v4,%%v3,%%v0 \n\t" + "vchlg %%v5,%%v2,%%v1 \n\t" + "vfcesb %%v6,%%v0,%%v3 \n\t" + "vn %%v5,%%v5,%%v6 \n\t" + "vo %%v4,%%v4,%%v5 \n\t" + "vsel %%v0,%%v0,%%v3,%%v4 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" + "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + + "vrepf %%v2,%%v0,2 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcsb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vstef %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchsb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "ste %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(imin),"=m"(*min) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return imin; +} + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; + + if (n <= 0 || inc_x <= 0) return (min); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + min = ismin_kernel_64(n1, x, &minf); + + i = n1; + } + + while (i < n) { + if (x[i] < minf) { + min = i; + minf = x[i]; + } + i++; + } + return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] < minf) { + min = j; + minf = x[i]; + } + if (x[i + inc_x] < minf) { + min = j + 1; + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + min = j + 2; + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + min = j + 3; + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] < minf) { + min = j; + minf = x[i]; + } + i += inc_x; + j++; + } + return (min + 1); + } +} diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 216c3414a..bf5f621a7 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -24,190 +24,165 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ - #include "common.h" #include -#define ABS fabs -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) - - - - -/** - * Find maximum index - * Warning: requirements n>0 and n % 16 == 0 - * @param n - * @param x pointer to the vector - * @param maxf (out) maximum absolute value .( only for output ) - * @return index - */ -static BLASLONG ziamax_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { - BLASLONG index; - __asm__( - "pfd 1, 0(%[ptr_x]) \n\t" - "vleig %%v16,0,0 \n\t" - "vleig %%v16,1,1 \n\t" - "vleig %%v17,2,0 \n\t" - "vleig %%v17,3,1 \n\t" - "vleig %%v18,4,0 \n\t" - "vleig %%v18,5,1 \n\t" - "vleig %%v19,6,0 \n\t" - "vleig %%v19,7,1 \n\t" - "vleig %%v20,8,0 \n\t" - "vleig %%v20,9,1 \n\t" - "vleig %%v21,10,0 \n\t" - "vleig %%v21,11,1 \n\t" - "vleig %%v22,12,0 \n\t" - "vleig %%v22,13,1 \n\t" - "vleig %%v23,14,0 \n\t" - "vleig %%v23,15,1 \n\t" - - - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vzero %%v6 \n\t" - "vzero %%v7 \n\t" - "vrepig %%v4,16 \n\t" - "vzero %%v5 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) +{ + BLASLONG iamax; + + __asm__ volatile ( + "vleg %%v0,0(%3),0 \n\t" + "vleg %%v1,8(%3),0 \n\t" + "vleg %%v0,16(%3),1 \n\t" + "vleg %%v1,24(%3),1 \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vflpdb %%v1,%%v1 \n\t" + "vfadb %%v0,%%v0,%%v1 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,8 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "srlg %%r0,%2,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vleg %%v16,0(%%r1,%3),0 \n\t" + "vleg %%v17,8(%%r1,%3),0 \n\t" + "vleg %%v16,16(%%r1,%3),1 \n\t" + "vleg %%v17,24(%%r1,%3),1 \n\t" + "vleg %%v18,32(%%r1,%3),0 \n\t" + "vleg %%v19,40(%%r1,%3),0 \n\t" + "vleg %%v18,48(%%r1,%3),1 \n\t" + "vleg %%v19,56(%%r1,%3),1 \n\t" + "vleg %%v20,64(%%r1,%3),0 \n\t" + "vleg %%v21,72(%%r1,%3),0 \n\t" + "vleg %%v20,80(%%r1,%3),1 \n\t" + "vleg %%v21,88(%%r1,%3),1 \n\t" + "vleg %%v22,96(%%r1,%3),0 \n\t" + "vleg %%v23,104(%%r1,%3),0 \n\t" + "vleg %%v22,112(%%r1,%3),1 \n\t" + "vleg %%v23,120(%%r1,%3),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" - "vleg %%v24 , 0(%[ptr_tmp]),0 \n\t" - "vleg %%v25 , 8(%[ptr_tmp]),0 \n\t" - "vleg %%v24 , 16(%[ptr_tmp]),1 \n\t" - "vleg %%v25 , 24(%[ptr_tmp]),1 \n\t" - "vleg %%v26 , 32(%[ptr_tmp]),0 \n\t" - "vleg %%v27 , 40(%[ptr_tmp]),0 \n\t" - "vleg %%v26 , 48(%[ptr_tmp]),1 \n\t" - "vleg %%v27 , 56(%[ptr_tmp]),1 \n\t" - "vleg %%v28 , 64(%[ptr_tmp]),0 \n\t" - "vleg %%v29 , 72(%[ptr_tmp]),0 \n\t" - "vleg %%v28 , 80(%[ptr_tmp]),1 \n\t" - "vleg %%v29 , 88(%[ptr_tmp]),1 \n\t" - "vleg %%v30 , 96(%[ptr_tmp]),0 \n\t" - "vleg %%v31 ,104(%[ptr_tmp]),0 \n\t" - "vleg %%v30 ,112(%[ptr_tmp]),1 \n\t" - "vleg %%v31 ,120(%[ptr_tmp]),1 \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, 
%%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v0,%%v24,%%v25 \n\t" - "vfadb %%v1,%%v26,%%v27 \n\t" - "vfadb %%v2,%%v28,%%v29 \n\t" - "vfadb %%v3,%%v30,%%v31 \n\t" - - - "vleg %%v24 , 128(%[ptr_tmp]),0 \n\t" - "vleg %%v25 , 136(%[ptr_tmp]),0 \n\t" - "vleg %%v24 , 144(%[ptr_tmp]),1 \n\t" - "vleg %%v25 , 152(%[ptr_tmp]),1 \n\t" - "vleg %%v26 , 160(%[ptr_tmp]),0 \n\t" - "vleg %%v27 , 168(%[ptr_tmp]),0 \n\t" - "vleg %%v26 , 176(%[ptr_tmp]),1 \n\t" - "vleg %%v27 , 184(%[ptr_tmp]),1 \n\t" - "vleg %%v28 , 192(%[ptr_tmp]),0 \n\t" - "vleg %%v29 , 200(%[ptr_tmp]),0 \n\t" - "vleg %%v28 , 208(%[ptr_tmp]),1 \n\t" - "vleg %%v29 , 216(%[ptr_tmp]),1 \n\t" - "vleg %%v30 , 224(%[ptr_tmp]),0 \n\t" - "vleg %%v31 , 232(%[ptr_tmp]),0 \n\t" - "vleg %%v30 , 240(%[ptr_tmp]),1 \n\t" - "vleg %%v31 , 248(%[ptr_tmp]),1 \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v24,%%v24,%%v25 \n\t" - "vfadb %%v26,%%v26,%%v27 \n\t" - "vfadb %%v28,%%v28,%%v29 \n\t" - "vfadb %%v30,%%v30,%%v31 \n\t" - - "vfchdb %%v25,%%v1,%%v0 \n\t" - "vsel %%v29,%%v17,%%v16,%%v25 \n\t" - "vsel %%v31,%%v1,%%v0,%%v25 \n\t" - - "vfchdb %%v27,%%v3,%%v2 \n\t " - "vsel %%v0,%%v19,%%v18,%%v27 \n\t" - "vsel %%v1,%%v3,%%v2,%%v27 \n\t" - - "vfchdb %%v25,%%v26,%%v24 \n\t" - "vsel %%v2,%%v21,%%v20,%%v25 \n\t" - "vsel %%v3,%%v26,%%v24,%%v25 \n\t" - - "vfchdb %%v27,%%v30,%%v28 \n\t" - "vsel %%v25,%%v23,%%v22,%%v27 \n\t" - "vsel %%v27,%%v30,%%v28,%%v27 \n\t" - - "vfchdb %%v24, %%v1,%%v31 \n\t" - "vsel %%v26,%%v0,%%v29,%%v24 \n\t" - "vsel %%v28,%%v1,%%v31,%%v24 \n\t" - - "vfchdb %%v30, %%v27,%%v3 \n\t" - "vsel %%v29,%%v25,%%v2,%%v30 \n\t" - "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t" - - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - - "vfchdb %%v0, %%v31,%%v28 \n\t" - "vsel %%v25,%%v29,%%v26,%%v0 \n\t" - "vsel %%v27,%%v31,%%v28,%%v0 \n\t" - - "vag %%v25,%%v25,%%v5 \n\t" - - //cmp with previous - "vfchdb %%v30, %%v27,%%v6 \n\t" - "vsel %%v7,%%v25,%%v7,%%v30 \n\t" - "vsel %%v6,%%v27,%%v6,%%v30 \n\t" - - "vag %%v5,%%v5,%%v4 \n\t" - - "clgrjl %[ptr_tmp],%%r0,1b \n\t" - - //xtract index - "vrepg %%v26,%%v6,1 \n\t" - "vrepg %%v5,%%v7,1 \n\t" - "wfcdb %%v26,%%v6 \n\t" - "jne 2f \n\t" - "vsteg %%v6,%[maxf],0 \n\t" - "vmnlg %%v1,%%v5,%%v7 \n\t" - "vlgvg %[index],%%v1,0 \n\t" - "j 3 \n\t" - "2: \n\t" - "wfchdb %%v16,%%v26,%%v6 \n\t" - "vsel %%v1,%%v5,%%v7,%%v16 \n\t" - "vsel %%v0,%%v26,%%v6,%%v16 \n\t" - "vlgvg %[index],%%v1,0 \n\t" - "std %%f0,%[maxf] \n\t" - "3: \n\t" - : [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x) - : "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - - ); - return index; - + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "vleg 
%%v16,128(%%r1,%3),0 \n\t" + "vleg %%v17,136(%%r1,%3),0 \n\t" + "vleg %%v16,144(%%r1,%3),1 \n\t" + "vleg %%v17,152(%%r1,%3),1 \n\t" + "vleg %%v18,160(%%r1,%3),0 \n\t" + "vleg %%v19,168(%%r1,%3),0 \n\t" + "vleg %%v18,176(%%r1,%3),1 \n\t" + "vleg %%v19,184(%%r1,%3),1 \n\t" + "vleg %%v20,192(%%r1,%3),0 \n\t" + "vleg %%v21,200(%%r1,%3),0 \n\t" + "vleg %%v20,208(%%r1,%3),1 \n\t" + "vleg %%v21,216(%%r1,%3),1 \n\t" + "vleg %%v22,224(%%r1,%3),0 \n\t" + "vleg %%v23,232(%%r1,%3),0 \n\t" + "vleg %%v22,240(%%r1,%3),1 \n\t" + "vleg %%v23,248(%%r1,%3),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v4,%%v16,%%v17 \n\t" + "vfchdb %%v5,%%v18,%%v19 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + + "vfchdb %%v18,%%v16,%%v17 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v2,%%v0 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamax),"=m"(*amax) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return iamax; } - - - - - BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; @@ -223,9 +198,9 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) BLASLONG n1 = n & -16; if (n1 > 0) { - max = ziamax_kernel_16_TUNED(n1, x, &maxf); + max = izamax_kernel_16(n1, x, &maxf); + i = n1; - ix = n1 << 1; } while(i < n) @@ -260,7 +235,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } return (max + 1); } - } diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 9b2a653a7..3636e8fdf 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -24,253 +24,217 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ - #include "common.h" #include -#define ABS fabs -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) - - -/** - * Find minimum index - * Warning: requirements n>0 and n % 16 == 0 - * @param n - * @param x pointer to the vector - * @param minf (out) minimum absolute value .( only for output ) - * @return minimum index - */ -static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { - BLASLONG index ; - __asm__( - "pfd 1, 0(%[ptr_x]) \n\t" - "vleig %%v16,0,0 \n\t" - "vleig %%v16,1,1 \n\t" - "vleig %%v17,2,0 \n\t" - "vleig %%v17,3,1 \n\t" - "vleig %%v18,4,0 \n\t" - "vleig %%v18,5,1 \n\t" - "vleig %%v19,6,0 \n\t" - "vleig %%v19,7,1 \n\t" - "vleig %%v20,8,0 \n\t" - "vleig %%v20,9,1 \n\t" - "vleig %%v21,10,0 \n\t" - "vleig %%v21,11,1 \n\t" - "vleig %%v22,12,0 \n\t" - "vleig %%v22,13,1 \n\t" - "vleig %%v23,14,0 \n\t" - "vleig %%v23,15,1 \n\t" - "ld %%f6,0(%[ptr_x]) \n\t" - "lpdbr %%f6,%%f6 \n\t" - "ld %%f7,8(%[ptr_x]) \n\t" - "lpdbr %%f7,%%f7 \n\t" - "adbr %%f6,%%f7 \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vrepg %%v6,%%v6,0 \n\t" - "vzero %%v7 \n\t" - "vrepig %%v4,16 \n\t" - "vzero %%v5 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) +{ + BLASLONG iamin; + + __asm__ volatile ( + "vleg %%v0,0(%3),0 \n\t" + "vleg %%v1,8(%3),0 \n\t" + "vleg %%v0,16(%3),1 \n\t" + "vleg %%v1,24(%3),1 \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vflpdb %%v1,%%v1 \n\t" + "vfadb %%v0,%%v0,%%v1 \n\t" + "vleig %%v1,0,0 \n\t" + "vleig %%v1,1,1 \n\t" + "vrepig %%v2,8 \n\t" + "vzero %%v3 \n\t" + "vleig %%v24,0,0 \n\t" + "vleig %%v24,1,1 \n\t" + "vleig %%v25,2,0 \n\t" + "vleig %%v25,3,1 \n\t" + "vleig %%v26,4,0 \n\t" + "vleig %%v26,5,1 \n\t" + "vleig %%v27,6,0 \n\t" + "vleig %%v27,7,1 \n\t" + "srlg %%r0,%2,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" + + "vleg %%v16,0(%%r1,%3),0 \n\t" + "vleg %%v17,8(%%r1,%3),0 \n\t" + "vleg %%v16,16(%%r1,%3),1 \n\t" + "vleg %%v17,24(%%r1,%3),1 \n\t" + "vleg %%v18,32(%%r1,%3),0 \n\t" + "vleg %%v19,40(%%r1,%3),0 \n\t" + "vleg %%v18,48(%%r1,%3),1 \n\t" + "vleg %%v19,56(%%r1,%3),1 \n\t" + "vleg %%v20,64(%%r1,%3),0 \n\t" + "vleg %%v21,72(%%r1,%3),0 \n\t" + "vleg %%v20,80(%%r1,%3),1 \n\t" + "vleg %%v21,88(%%r1,%3),1 \n\t" + "vleg %%v22,96(%%r1,%3),0 \n\t" + "vleg %%v23,104(%%r1,%3),0 \n\t" + "vleg %%v22,112(%%r1,%3),1 \n\t" + "vleg %%v23,120(%%r1,%3),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" - "vleg %%v24 , 0(%[ptr_tmp]),0 \n\t" - "vleg %%v25 , 8(%[ptr_tmp]),0 \n\t" - "vleg %%v24 , 16(%[ptr_tmp]),1 \n\t" - "vleg %%v25 , 24(%[ptr_tmp]),1 \n\t" - "vleg %%v26 , 32(%[ptr_tmp]),0 \n\t" - "vleg %%v27 , 40(%[ptr_tmp]),0 \n\t" - "vleg %%v26 , 48(%[ptr_tmp]),1 \n\t" - "vleg %%v27 , 56(%[ptr_tmp]),1 \n\t" - "vleg %%v28 , 64(%[ptr_tmp]),0 \n\t" - "vleg %%v29 , 72(%[ptr_tmp]),0 \n\t" - "vleg %%v28 , 80(%[ptr_tmp]),1 \n\t" - "vleg %%v29 , 88(%[ptr_tmp]),1 \n\t" - "vleg %%v30 , 96(%[ptr_tmp]),0 \n\t" - "vleg %%v31 ,104(%[ptr_tmp]),0 \n\t" - "vleg %%v30 
,112(%[ptr_tmp]),1 \n\t" - "vleg %%v31 ,120(%[ptr_tmp]),1 \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v0,%%v24,%%v25 \n\t" - "vfadb %%v1,%%v26,%%v27 \n\t" - "vfadb %%v2,%%v28,%%v29 \n\t" - "vfadb %%v3,%%v30,%%v31 \n\t" - - - "vleg %%v24 ,128(%[ptr_tmp]),0 \n\t" - "vleg %%v25 ,136(%[ptr_tmp]),0 \n\t" - "vleg %%v24 ,144(%[ptr_tmp]),1 \n\t" - "vleg %%v25 ,152(%[ptr_tmp]),1 \n\t" - "vleg %%v26 ,160(%[ptr_tmp]),0 \n\t" - "vleg %%v27 ,168(%[ptr_tmp]),0 \n\t" - "vleg %%v26 ,176(%[ptr_tmp]),1 \n\t" - "vleg %%v27 ,184(%[ptr_tmp]),1 \n\t" - "vleg %%v28 ,192(%[ptr_tmp]),0 \n\t" - "vleg %%v29 ,200(%[ptr_tmp]),0 \n\t" - "vleg %%v28 ,208(%[ptr_tmp]),1 \n\t" - "vleg %%v29 ,216(%[ptr_tmp]),1 \n\t" - "vleg %%v30 ,224(%[ptr_tmp]),0 \n\t" - "vleg %%v31 ,232(%[ptr_tmp]),0 \n\t" - "vleg %%v30 ,240(%[ptr_tmp]),1 \n\t" - "vleg %%v31 ,248(%[ptr_tmp]),1 \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v24,%%v24,%%v25 \n\t" - "vfadb %%v26,%%v26,%%v27 \n\t" - "vfadb %%v28,%%v28,%%v29 \n\t" - "vfadb %%v30,%%v30,%%v31 \n\t" - - - "vfchdb %%v25,%%v0 ,%%v1 \n\t" - "vsel %%v29,%%v17,%%v16,%%v25 \n\t" - "vsel %%v31,%%v1,%%v0,%%v25 \n\t" - - "vfchdb %%v27,%%v2,%%v3 \n\t" - "vsel %%v0,%%v19,%%v18,%%v27 \n\t" - "vsel %%v1,%%v3,%%v2,%%v27 \n\t" - - "vfchdb %%v25,%%v24,%%v26 \n\t" - "vsel %%v2,%%v21,%%v20,%%v25 \n\t" - "vsel %%v3,%%v26,%%v24,%%v25 \n\t" - - "vfchdb %%v27,%%v28,%%v30 \n\t" - "vsel %%v25,%%v23,%%v22,%%v27 \n\t" - "vsel %%v27,%%v30,%%v28,%%v27 \n\t" - - "vfchdb %%v24,%%v31, %%v1 \n\t" - "vsel %%v26,%%v0,%%v29,%%v24 \n\t" - "vsel %%v28,%%v1,%%v31,%%v24 \n\t" - - "vfchdb %%v30,%%v3, %%v27 \n\t" - "vsel %%v29,%%v25,%%v2,%%v30 \n\t" - "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t" - - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - - "vfchdb %%v0,%%v28, %%v31 \n\t" - "vsel %%v25,%%v29,%%v26,%%v0 \n\t" - "vsel %%v27,%%v31,%%v28,%%v0 \n\t" - - "vag %%v25,%%v25,%%v5 \n\t" - - //cmp with previous - "vfchdb %%v30,%%v6 , %%v27 \n\t" - "vsel %%v7,%%v25,%%v7,%%v30 \n\t" - "vsel %%v6,%%v27,%%v6,%%v30 \n\t" - - "vag %%v5,%%v5,%%v4 \n\t" - - "clgrjl %[ptr_tmp],%%r0,1b \n\t" - - //xtract index - "vrepg %%v26,%%v6,1 \n\t" - "vrepg %%v5,%%v7,1 \n\t" - "wfcdb %%v26,%%v6 \n\t" - "jne 2f \n\t" - "vsteg %%v6,%[minf],0 \n\t" - "vmnlg %%v1,%%v5,%%v7 \n\t" - "vlgvg %[index],%%v1,0 \n\t" - "j 3f \n\t" - "2: \n\t" - "wfchdb %%v16,%%v6 ,%%v26 \n\t" - "vsel %%v1,%%v5,%%v7,%%v16 \n\t" - "vsel %%v0,%%v26,%%v6,%%v16 \n\t" - "vlgvg %[index],%%v1,0 \n\t" - "std %%f0,%[minf] \n\t" - "3: \n\t" - - : [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x) - : "cc","r0","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - - ); - - return index; + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb 
%%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "vleg %%v16,128(%%r1,%3),0 \n\t" + "vleg %%v17,136(%%r1,%3),0 \n\t" + "vleg %%v16,144(%%r1,%3),1 \n\t" + "vleg %%v17,152(%%r1,%3),1 \n\t" + "vleg %%v18,160(%%r1,%3),0 \n\t" + "vleg %%v19,168(%%r1,%3),0 \n\t" + "vleg %%v18,176(%%r1,%3),1 \n\t" + "vleg %%v19,184(%%r1,%3),1 \n\t" + "vleg %%v20,192(%%r1,%3),0 \n\t" + "vleg %%v21,200(%%r1,%3),0 \n\t" + "vleg %%v20,208(%%r1,%3),1 \n\t" + "vleg %%v21,216(%%r1,%3),1 \n\t" + "vleg %%v22,224(%%r1,%3),0 \n\t" + "vleg %%v23,232(%%r1,%3),0 \n\t" + "vleg %%v22,240(%%r1,%3),1 \n\t" + "vleg %%v23,248(%%r1,%3),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v4,%%v17,%%v16 \n\t" + "vfchdb %%v5,%%v19,%%v18 \n\t" + "vsel %%v16,%%v16,%%v17,%%v4 \n\t" + "vsel %%v4,%%v24,%%v25,%%v4 \n\t" + "vsel %%v17,%%v18,%%v19,%%v5 \n\t" + "vsel %%v5,%%v26,%%v27,%%v5 \n\t" + + "vfchdb %%v18,%%v17,%%v16 \n\t" + "vsel %%v16,%%v16,%%v17,%%v18 \n\t" + "vsel %%v4,%%v4,%%v5,%%v18 \n\t" + "vag %%v4,%%v4,%%v3 \n\t" + + "vfchdb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v5 \n\t" + "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vag %%v3,%%v3,%%v2 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v2,%%v0,1 \n\t" + "vrepg %%v3,%%v1,1 \n\t" + "wfcdb %%v2,%%v0 \n\t" + "jne 1f \n\t" + "vsteg %%v0,%1,0 \n\t" + "vmnlg %%v0,%%v1,%%v3 \n\t" + "vlgvg %0,%%v0,0 \n\t" + "j 2f \n\t" + "1: \n\t" + "wfchdb %%v4,%%v0,%%v2 \n\t" + "vsel %%v1,%%v3,%%v1,%%v4 \n\t" + "vsel %%v0,%%v2,%%v0,%%v4 \n\t" + "vlgvg %0,%%v1,0 \n\t" + "std %%f0,%1 \n\t" + "2: \n\t" + "nop " + :"=r"(iamin),"=m"(*amin) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return iamin; } - - - - - BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0; - BLASLONG ix=0; - FLOAT minf; - BLASLONG min=0; + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0; + BLASLONG min = 0; BLASLONG inc_x2; if (n <= 0 || inc_x <= 0) return(min); - - + if (inc_x == 1) { - BLASLONG n1 = n & -16; - if (n1 > 0) { + BLASLONG n1 = n & -16; + if (n1 > 0) { + + min = izamin_kernel_16(n1, x, &minf); - min = ziamin_kernel_16_TUNED(n1, x, &minf); i = n1; - ix = n1 << 1; - } - else { - //assign minf - minf = CABS1(x,0); - ix += 2; - i++; - } + } - while(i < n) + while(i < n) + { + if( CABS1(x,ix) < minf ) { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += 2; - i++; + min = i; + minf = CABS1(x,ix); } + ix += 2; + i++; + } return (min + 1); } else { - inc_x2 = 2 * inc_x; + inc_x2 = 2 * inc_x; - minf = CABS1(x,0); - ix += inc_x2; - i++; + minf = CABS1(x,0); + ix += inc_x2; + i++; - while(i < n) + while(i < n) + { + if( CABS1(x,ix) < minf ) { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; + min = i; + minf = CABS1(x,ix); } + ix += inc_x2; + i++; + } return (min + 1); } - } diff --git a/kernel/zarch/samax.c b/kernel/zarch/samax.c new file mode 100644 index 000000000..1025cfcbf --- /dev/null +++ b/kernel/zarch/samax.c @@ -0,0 +1,210 @@ 
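A quick reference for reviewers before the new file: SAMAX returns the largest absolute value over n elements with stride inc_x. Below is a minimal scalar sketch of those semantics (the name samax_ref and its framing are illustrative only, not part of the patch); the 64-element vector kernel in this file must agree with it blockwise:

#include <math.h>

/* reference semantics only: max over i of |x[i*inc_x]|, assuming n >= 1 */
static float samax_ref(long n, const float *x, long inc_x)
{
    float m = fabsf(x[0]);
    for (long i = 1; i < n; i++) {
        float a = fabsf(x[i * inc_x]);
        if (a > m)
            m = a;  /* strict '>' keeps the earliest maximum on ties, matching vfchsb+vsel */
    }
    return m;
}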
+/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) +{ + FLOAT amax; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "vflpsb %%v0,%%v0 \n\t" + "srlg %%r0,%1,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v24,%%v16,%%v17 \n\t" + "vfchsb %%v25,%%v18,%%v19 \n\t" + "vfchsb %%v26,%%v20,%%v21 \n\t" + "vfchsb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v24,%%v25 \n\t" + "vfchsb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + 
"vfchsb %%v24,%%v16,%%v17 \n\t" + "vfchsb %%v25,%%v18,%%v19 \n\t" + "vfchsb %%v26,%%v20,%%v21 \n\t" + "vfchsb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v24,%%v25 \n\t" + "vfchsb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v16,%%v0,32 \n\t" + "vfchsb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + + "vrepf %%v16,%%v0,2 \n\t" + "wfchsb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ler %0,%%f0 " + :"=f"(amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return amax; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + maxf = samax_kernel_64(n1, x); + + i = n1; + } + else + { + maxf=ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i++; + } + return (maxf); + + } else { + + maxf=ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c new file mode 100644 index 000000000..3b8f03e6a --- /dev/null +++ b/kernel/zarch/samin.c @@ -0,0 +1,210 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) +{ + FLOAT amin; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "vflpsb %%v0,%%v0 \n\t" + "srlg %%r0,%1,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v24,%%v17,%%v16 \n\t" + "vfchsb %%v25,%%v19,%%v18 \n\t" + "vfchsb %%v26,%%v21,%%v20 \n\t" + "vfchsb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v25,%%v24 \n\t" + "vfchsb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfchsb %%v24,%%v17,%%v16 \n\t" + "vfchsb %%v25,%%v19,%%v18 \n\t" + "vfchsb %%v26,%%v21,%%v20 \n\t" + "vfchsb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v25,%%v24 \n\t" + "vfchsb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v16,%%v0,32 \n\t" + "vfchsb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + + "vrepf %%v16,%%v0,2 \n\t" + "wfchsb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ler %0,%%f0 " + :"=f"(amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return amin; +} + +FLOAT CNAME(BLASLONG 
n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + minf = samin_kernel_64(n1, x); + + i = n1; + } + else + { + minf=ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i++; + } + return (minf); + + } else { + + minf=ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/sasum.c b/kernel/zarch/sasum.c new file mode 100644 index 000000000..2c59ab2e5 --- /dev/null +++ b/kernel/zarch/sasum.c @@ -0,0 +1,174 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) +{ + FLOAT asum; + + __asm__ ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "srlg %%r0,%1,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + "vl %%v20, 64(%%r1,%2) \n\t" + "vl %%v21, 80(%%r1,%2) \n\t" + "vl %%v22, 96(%%r1,%2) \n\t" + "vl %%v23, 112(%%r1,%2) \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfasb %%v0,%%v0,%%v16 \n\t" + "vfasb %%v1,%%v1,%%v17 \n\t" + "vfasb %%v2,%%v2,%%v18 \n\t" + "vfasb %%v3,%%v3,%%v19 \n\t" + "vfasb %%v0,%%v0,%%v20 \n\t" + "vfasb %%v1,%%v1,%%v21 \n\t" + "vfasb %%v2,%%v2,%%v22 \n\t" + "vfasb %%v3,%%v3,%%v23 \n\t" + + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + "vl %%v20, 192(%%r1,%2) \n\t" + "vl %%v21, 208(%%r1,%2) \n\t" + "vl %%v22, 224(%%r1,%2) \n\t" + "vl %%v23, 240(%%r1,%2) \n\t" + + "vflpsb %%v16, %%v16 \n\t" + "vflpsb %%v17, %%v17 \n\t" + "vflpsb %%v18, %%v18 \n\t" + "vflpsb %%v19, %%v19 \n\t" + "vflpsb %%v20, %%v20 \n\t" + "vflpsb %%v21, %%v21 \n\t" + "vflpsb %%v22, %%v22 \n\t" + "vflpsb %%v23, %%v23 \n\t" + + "vfasb %%v0,%%v0,%%v16 \n\t" + "vfasb %%v1,%%v1,%%v17 \n\t" + "vfasb %%v2,%%v2,%%v18 \n\t" + "vfasb %%v3,%%v3,%%v19 \n\t" + "vfasb %%v0,%%v0,%%v20 \n\t" + "vfasb %%v1,%%v1,%%v21 \n\t" + "vfasb %%v2,%%v2,%%v22 \n\t" + "vfasb %%v3,%%v3,%%v23 \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vfasb %%v0,%%v0,%%v2 \n\t" + "vfasb %%v0,%%v0,%%v3 \n\t" + "veslg %%v1,%%v0,32 \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vrepf %%v1,%%v0,2 \n\t" + "aebr %%f0,%%f1 \n\t" + "ler %0,%%f0 " + :"=f"(asum) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" + ); + + return asum; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT sumf = 0.0; + BLASLONG n1; + + if (n <= 0 || inc_x <= 0) return sumf; + + if (inc_x == 1) { + + n1 = n & -64; + + if (n1 > 0) { + + sumf = sasum_kernel_64(n1, x); + i = n1; + } + + while (i < n) { + sumf += ABS(x[i]); + i++; + } + + } else { + BLASLONG n1 = n & -4; + register FLOAT sum1, sum2; + sum1 = 0.0; + sum2 = 0.0; + while (j < n1) { + + sum1 += ABS(x[i]); + sum2 += ABS(x[i + inc_x]); + sum1 += ABS(x[i + 2 * inc_x]); + sum2 += ABS(x[i + 3 * inc_x]); + + i += inc_x * 4; + j += 4; + + } + sumf = sum1 + sum2; + while (j < n) { + + sumf += ABS(x[i]); + i += inc_x; + j++; + } + + + } + return sumf; +} + + diff --git a/kernel/zarch/saxpy.c b/kernel/zarch/saxpy.c new file mode 100644 index 000000000..26ead310c --- /dev/null +++ b/kernel/zarch/saxpy.c @@ -0,0 +1,184 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile( + "vlrepf %%v0,%3 \n\t" + "srlg %%r0,%0,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,0(%%r1,%2) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,32(%%r1,%2) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + + "vfmasb %%v16,%%v0,%%v16,%%v20 \n\t" + "vfmasb %%v17,%%v0,%%v17,%%v21 \n\t" + "vfmasb %%v18,%%v0,%%v18,%%v22 \n\t" + "vfmasb %%v19,%%v0,%%v19,%%v23 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vl %%v25,80(%%r1,%1) \n\t" + "vl %%v26,96(%%r1,%1) \n\t" + "vl %%v27,112(%%r1,%1) \n\t" + "vl %%v28,64(%%r1,%2) \n\t" + "vl %%v29,80(%%r1,%2) \n\t" + "vl %%v30,96(%%r1,%2) \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + + "vfmasb %%v20,%%v0,%%v24,%%v28 \n\t" + "vfmasb %%v21,%%v0,%%v25,%%v29 \n\t" + "vfmasb %%v22,%%v0,%%v26,%%v30 \n\t" + "vfmasb %%v23,%%v0,%%v27,%%v31 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "vl %%v16,128(%%r1,%1) \n\t" + "vl %%v17,144(%%r1,%1) \n\t" + "vl %%v18,160(%%r1,%1) \n\t" + "vl %%v19,176(%%r1,%1) \n\t" + "vl %%v20,128(%%r1,%2) \n\t" + "vl %%v21,144(%%r1,%2) \n\t" + "vl %%v22,160(%%r1,%2) \n\t" + "vl %%v23,176(%%r1,%2) \n\t" + + "vfmasb %%v16,%%v0,%%v16,%%v20 \n\t" + "vfmasb %%v17,%%v0,%%v17,%%v21 \n\t" + "vfmasb %%v18,%%v0,%%v18,%%v22 \n\t" + "vfmasb %%v19,%%v0,%%v19,%%v23 \n\t" + + "vl %%v24,192(%%r1,%1) \n\t" + "vl %%v25,208(%%r1,%1) \n\t" + "vl %%v26,224(%%r1,%1) \n\t" + "vl %%v27,240(%%r1,%1) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" + + "vfmasb %%v20,%%v0,%%v24,%%v28 \n\t" + "vfmasb %%v21,%%v0,%%v25,%%v29 \n\t" + "vfmasb %%v22,%%v0,%%v26,%%v30 \n\t" + "vfmasb %%v23,%%v0,%%v27,%%v31 \n\t" + + "vst %%v16,128(%%r1,%2) \n\t" + "vst 
%%v17,144(%%r1,%2) \n\t" + "vst %%v18,160(%%r1,%2) \n\t" + "vst %%v19,176(%%r1,%2) \n\t" + "vst %%v20,192(%%r1,%2) \n\t" + "vst %%v21,208(%%r1,%2) \n\t" + "vst %%v22,224(%%r1,%2) \n\t" + "vst %%v23,240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return 0 ; + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -64; + + if ( n1 ) + saxpy_kernel_64(n1, x, y , &da); + + i = n1; + while(i < n) + { + + y[i] += da * x[i] ; + i++ ; + + } + return 0 ; + + + } + + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = da * x[ix] ; + FLOAT m2 = da * x[ix+inc_x] ; + FLOAT m3 = da * x[ix+2*inc_x] ; + FLOAT m4 = da * x[ix+3*inc_x] ; + + y[iy] += m1 ; + y[iy+inc_y] += m2 ; + y[iy+2*inc_y] += m3 ; + y[iy+3*inc_y] += m4 ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + i+=4 ; + + } + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return 0 ; + +} + + diff --git a/kernel/zarch/scopy.c b/kernel/zarch/scopy.c new file mode 100644 index 000000000..ff4227595 --- /dev/null +++ b/kernel/zarch/scopy.c @@ -0,0 +1,85 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
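One implementation note ahead of the new file: scopy_kernel_64 moves each 64-float block with a single mvc storage-to-storage instruction (256 bytes at a time) rather than vector load/store pairs. A portable sketch of what one loop iteration does (the helper name scopy_block is illustrative, not part of the patch):

#include <string.h>

/* equivalent of one scopy_kernel_64 iteration: 64 single-precision elements */
static void scopy_block(const float *x, float *y)
{
    memcpy(y, x, 256);
}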
+*****************************************************************************/ + +#include "common.h" + +static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "lgr %%r1,%1 \n\t" + "lgr %%r2,%2 \n\t" + "srlg %%r0,%0,6 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1) \n\t" + "pfd 2, 1024(%%r2) \n\t" + "mvc 0(256,%%r2),0(%%r1) \n\t" + "agfi %%r1,256 \n\t" + "agfi %%r2,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y) + :"memory","cc","r0","r1","r2" + ); +} + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + + if (n <= 0) return 0; + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + scopy_kernel_64(n1, x, y); + i = n1; + } + + while (i < n) { + y[i] = x[i]; + i++; + + } + + + } else { + + while (i < n) { + + y[iy] = x[ix]; + ix += inc_x; + iy += inc_y; + i++; + + } + + } + return 0; + + +} diff --git a/kernel/zarch/sdot.c b/kernel/zarch/sdot.c new file mode 100644 index 000000000..fd8c8e445 --- /dev/null +++ b/kernel/zarch/sdot.c @@ -0,0 +1,140 @@ +/*************************************************************************** +Copyright (c) 2013-2018,The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms,with or without +modification,are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice,this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice,this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT LIMITED TO,THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,EXEMPLARY,OR CONSEQUENTIAL +DAMAGES (INCLUDING,BUT NOT LIMITED TO,PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE,DATA,OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + FLOAT dot; + + __asm__ volatile ( + "vzero %%v0 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%3) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,16(%%r1,%3) \n\t" + "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t" + "vl %%v27,48(%%r1,%3) \n\t" + "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t" + "vl %%v28,64(%%r1,%3) \n\t" + "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t" + "vl %%v29,80(%%r1,%3) \n\t" + "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t" + "vl %%v30,96(%%r1,%3) \n\t" + "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t" + "vl %%v31,112(%%r1,%3) \n\t" + "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + "vrepf %%v1,%%v0,1 \n\t" + "vrepf %%v2,%%v0,2 \n\t" + "vrepf %%v3,%%v0,3 \n\t" + "aebr %%f0,%%f1 \n\t" + "aebr %%f0,%%f2 \n\t" + "aebr %%f0,%%f3 \n\t" + "ler %0,%%f0 " + :"=f"(dot) + :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return dot; +} + +FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + FLOAT dot = 0.0 ; + + if ( n <= 0 ) return(dot); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -32; + + if ( n1 ) + dot = sdot_kernel_32(n1,x,y); + + i = n1; + while(i < n) + { + + dot += y[i] * x[i] ; + i++ ; + + } + return(dot); + + + } + + BLASLONG n1 = n & -2; + + while(i < n1) + { + + dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; + ix += inc_x*2 ; + iy += inc_y*2 ; + i+=2 ; + + } + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); + +} + + diff --git a/kernel/zarch/sgemv_n_4.c b/kernel/zarch/sgemv_n_4.c new file mode 100644 index 000000000..92019d732 --- /dev/null +++ b/kernel/zarch/sgemv_n_4.c @@ -0,0 +1,668 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#define NBMAX 2048 + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vlrepf %%v0,0(%5) \n\t" + "vlrepf %%v1,4(%5) \n\t" + "vlrepf %%v2,8(%5) \n\t" + "vlrepf %%v3,12(%5) \n\t" + "vlrepf %%v4,%7 \n\t" + "vfmsb %%v0,%%v0,%%v4 \n\t" + "vfmsb %%v1,%%v1,%%v4 \n\t" + "vfmsb %%v2,%%v2,%%v4 \n\t" + "vfmsb %%v3,%%v3,%%v4 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 2,1024(%%r1,%6) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,0(%%r1,%3) \n\t" + "vl %%v19,0(%%r1,%4) \n\t" + "vl %%v20,16(%%r1,%1) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,16(%%r1,%3) \n\t" + "vl %%v23,16(%%r1,%4) \n\t" + "vl %%v24,32(%%r1,%1) \n\t" + "vl %%v25,32(%%r1,%2) \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vl %%v27,32(%%r1,%4) \n\t" + "vl %%v28,48(%%r1,%1) \n\t" + "vl %%v29,48(%%r1,%2) \n\t" + "vl %%v30,48(%%r1,%3) \n\t" + "vl %%v31,48(%%r1,%4) \n\t" + + "vl %%v4,0(%%r1,%6) \n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,0(%%r1,%6) \n\t" + + "vl %%v4,16(%%r1,%6) \n\t" + "vfmasb %%v4,%%v20,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v21,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v22,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v23,%%v3,%%v4 \n\t" + "vst %%v4,16(%%r1,%6) \n\t" + + "vl %%v4,32(%%r1,%6) \n\t" + "vfmasb %%v4,%%v24,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v25,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v26,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v27,%%v3,%%v4 \n\t" + "vst %%v4,32(%%r1,%6) \n\t" + + "vl %%v4,48(%%r1,%6) \n\t" + "vfmasb %%v4,%%v28,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v29,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v30,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v31,%%v3,%%v4 \n\t" + "vst %%v4,48(%%r1,%6) \n\t" + + "vl %%v16,64(%%r1,%1) \n\t" + "vl %%v17,64(%%r1,%2) \n\t" + "vl %%v18,64(%%r1,%3) \n\t" + "vl %%v19,64(%%r1,%4) \n\t" + "vl %%v20,80(%%r1,%1) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,80(%%r1,%3) \n\t" + "vl %%v23,80(%%r1,%4) \n\t" + "vl %%v24,96(%%r1,%1) \n\t" + "vl %%v25,96(%%r1,%2) \n\t" + "vl %%v26,96(%%r1,%3) \n\t" + "vl %%v27,96(%%r1,%4) \n\t" + "vl %%v28,112(%%r1,%1) \n\t" + "vl %%v29,112(%%r1,%2) \n\t" + "vl %%v30,112(%%r1,%3) \n\t" + "vl %%v31,112(%%r1,%4) \n\t" + + "vl %%v4,64(%%r1,%6) \n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,64(%%r1,%6) \n\t" + + "vl %%v4,80(%%r1,%6) \n\t" + "vfmasb %%v4,%%v20,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v21,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v22,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v23,%%v3,%%v4 \n\t" + "vst %%v4,80(%%r1,%6) \n\t" + + "vl %%v4,96(%%r1,%6) \n\t" 
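+	/* v0..v3 hold alpha*x[0..3]; the next four FMAs fold the    */
+	/* four column contributions into y[24..27] of this block    */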
+ "vfmasb %%v4,%%v24,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v25,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v26,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v27,%%v3,%%v4 \n\t" + "vst %%v4,96(%%r1,%6) \n\t" + + "vl %%v4,112(%%r1,%6) \n\t" + "vfmasb %%v4,%%v28,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v29,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v30,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v31,%%v3,%%v4 \n\t" + "vst %%v4,112(%%r1,%6) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,0(%%r1,%3) \n\t" + "vl %%v19,0(%%r1,%4) \n\t" + + "vl %%v4,0(%%r1,%6) \n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" + "vst %%v4,0(%%r1,%6) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vlrepf %%v0,0(%3) \n\t" + "vlrepf %%v1,4(%3) \n\t" + "vlrepf %%v2,%5 \n\t" + "vfmsb %%v0,%%v0,%%v2 \n\t" + "vfmsb %%v1,%%v1,%%v2 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%4) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + "vl %%v18,16(%%r1,%1) \n\t" + "vl %%v19,16(%%r1,%2) \n\t" + "vl %%v20,32(%%r1,%1) \n\t" + "vl %%v21,32(%%r1,%2) \n\t" + "vl %%v22,48(%%r1,%1) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + "vl %%v24,64(%%r1,%1) \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vl %%v26,80(%%r1,%1) \n\t" + "vl %%v27,80(%%r1,%2) \n\t" + "vl %%v28,96(%%r1,%1) \n\t" + "vl %%v29,96(%%r1,%2) \n\t" + "vl %%v30,112(%%r1,%1) \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + + "vl %%v2,0(%%r1,%4) \n\t" + "vfmasb %%v2,%%v16,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v17,%%v1,%%v2 \n\t" + "vst %%v2,0(%%r1,%4) \n\t" + + "vl %%v2,16(%%r1,%4) \n\t" + "vfmasb %%v2,%%v18,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v19,%%v1,%%v2 \n\t" + "vst %%v2,16(%%r1,%4) \n\t" + + "vl %%v2,32(%%r1,%4) \n\t" + "vfmasb %%v2,%%v20,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v21,%%v1,%%v2 \n\t" + "vst %%v2,32(%%r1,%4) \n\t" + + "vl %%v2,48(%%r1,%4) \n\t" + "vfmasb %%v2,%%v22,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v23,%%v1,%%v2 \n\t" + "vst %%v2,48(%%r1,%4) \n\t" + + "vl %%v2,64(%%r1,%4) \n\t" + "vfmasb %%v2,%%v24,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v25,%%v1,%%v2 \n\t" + "vst %%v2,64(%%r1,%4) \n\t" + + "vl %%v2,80(%%r1,%4) \n\t" + "vfmasb %%v2,%%v26,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v27,%%v1,%%v2 \n\t" + "vst %%v2,80(%%r1,%4) \n\t" + + "vl %%v2,96(%%r1,%4) \n\t" + "vfmasb %%v2,%%v28,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v29,%%v1,%%v2 \n\t" + "vst %%v2,96(%%r1,%4) \n\t" + + "vl %%v2,112(%%r1,%4) \n\t" + "vfmasb %%v2,%%v30,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v31,%%v1,%%v2 \n\t" + "vst %%v2,112(%%r1,%4) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: 
\n\t" + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,0(%%r1,%2) \n\t" + + "vl %%v2,0(%%r1,%4) \n\t" + "vfmasb %%v2,%%v16,%%v0,%%v2 \n\t" + "vfmasb %%v2,%%v17,%%v1,%%v2 \n\t" + "vst %%v2,0(%%r1,%4) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vlrepf %%v0,0(%2) \n\t" + "vlrepf %%v1,%4 \n\t" + "vfmsb %%v0,%%v0,%%v1 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,64(%%r1,%1) \n\t" + "vl %%v21,80(%%r1,%1) \n\t" + "vl %%v22,96(%%r1,%1) \n\t" + "vl %%v23,112(%%r1,%1) \n\t" + + "vl %%v1,0(%%r1,%3) \n\t" + "vfmasb %%v1,%%v16,%%v0,%%v1 \n\t" + "vst %%v1,0(%%r1,%3) \n\t" + + "vl %%v1,16(%%r1,%3) \n\t" + "vfmasb %%v1,%%v17,%%v0,%%v1 \n\t" + "vst %%v1,16(%%r1,%3) \n\t" + + "vl %%v1,32(%%r1,%3) \n\t" + "vfmasb %%v1,%%v18,%%v0,%%v1 \n\t" + "vst %%v1,32(%%r1,%3) \n\t" + + "vl %%v1,48(%%r1,%3) \n\t" + "vfmasb %%v1,%%v19,%%v0,%%v1 \n\t" + "vst %%v1,48(%%r1,%3) \n\t" + + "vl %%v1,64(%%r1,%3) \n\t" + "vfmasb %%v1,%%v20,%%v0,%%v1 \n\t" + "vst %%v1,64(%%r1,%3) \n\t" + + "vl %%v1,80(%%r1,%3) \n\t" + "vfmasb %%v1,%%v21,%%v0,%%v1 \n\t" + "vst %%v1,80(%%r1,%3) \n\t" + + "vl %%v1,96(%%r1,%3) \n\t" + "vfmasb %%v1,%%v22,%%v0,%%v1 \n\t" + "vst %%v1,96(%%r1,%3) \n\t" + + "vl %%v1,112(%%r1,%3) \n\t" + "vfmasb %%v1,%%v23,%%v0,%%v1 \n\t" + "vst %%v1,112(%%r1,%3) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%1) \n\t" + + "vl %%v1,0(%%r1,%3) \n\t" + "vfmasb %%v1,%%v16,%%v0,%%v1 \n\t" + "vst %%v1,0(%%r1,%3) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + for (i = 0; i < n; i++) + { + *dest += src[i]; + dest += inc_dest; + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[4]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4 = lda << 2; + FLOAT xbuffer[8],*ybuffer; + + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + + ybuffer = buffer; + + n1 = n >> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; 
+ ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*8); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; + + } + + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); + } + + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); + } + + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); + } + + + 
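/* not reached: every m3 remainder case (0..3) returns above */ +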
return(0); +} + + diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c new file mode 100644 index 000000000..efc06297f --- /dev/null +++ b/kernel/zarch/sgemv_t_4.c @@ -0,0 +1,826 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +#define NBMAX 2048 + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 1,1024(%%r1,%5) \n\t" + + "vl %%v16,0(%%r1,%5) \n\t" + "vl %%v17,16(%%r1,%5) \n\t" + "vl %%v18,32(%%r1,%5) \n\t" + "vl %%v19,48(%%r1,%5) \n\t" + "vl %%v20,64(%%r1,%5) \n\t" + "vl %%v21,80(%%r1,%5) \n\t" + "vl %%v22,96(%%r1,%5) \n\t" + "vl %%v23,112(%%r1,%5) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + "vl %%v26,0(%%r1,%3) \n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" + "vl %%v27,0(%%r1,%4) \n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" + + "vl %%v28,16(%%r1,%1) \n\t" + "vfmasb %%v0,%%v17,%%v28,%%v0 \n\t" + "vl %%v29,16(%%r1,%2) \n\t" + "vfmasb %%v1,%%v17,%%v29,%%v1 \n\t" + "vl %%v30,16(%%r1,%3) \n\t" + "vfmasb %%v2,%%v17,%%v30,%%v2 \n\t" + "vl %%v31,16(%%r1,%4) \n\t" + "vfmasb %%v3,%%v17,%%v31,%%v3 \n\t" + + "vl %%v24,32(%%r1,%1) \n\t" + "vfmasb %%v0,%%v18,%%v24,%%v0 \n\t" + "vl %%v25,32(%%r1,%2) \n\t" + "vfmasb %%v1,%%v18,%%v25,%%v1 \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2 \n\t" + "vl %%v27,32(%%r1,%4) \n\t" + "vfmasb %%v3,%%v18,%%v27,%%v3 \n\t" + + "vl %%v28,48(%%r1,%1) \n\t" + "vfmasb %%v0,%%v19,%%v28,%%v0 \n\t" + "vl %%v29,48(%%r1,%2) \n\t" + "vfmasb %%v1,%%v19,%%v29,%%v1 \n\t" + "vl %%v30,48(%%r1,%3) \n\t" + "vfmasb %%v2,%%v19,%%v30,%%v2 \n\t" + "vl %%v31,48(%%r1,%4) \n\t" + "vfmasb %%v3,%%v19,%%v31,%%v3 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" + "vl %%v26,64(%%r1,%3) \n\t" + "vfmasb %%v2,%%v20,%%v26,%%v2 \n\t" + "vl %%v27,64(%%r1,%4) \n\t" + "vfmasb %%v3,%%v20,%%v27,%%v3 \n\t" + + "vl %%v28,80(%%r1,%1) \n\t" + "vfmasb %%v0,%%v21,%%v28,%%v0 \n\t" + "vl %%v29,80(%%r1,%2) \n\t" + "vfmasb %%v1,%%v21,%%v29,%%v1 \n\t" + "vl %%v30,80(%%r1,%3) \n\t" + "vfmasb %%v2,%%v21,%%v30,%%v2 \n\t" + "vl %%v31,80(%%r1,%4) \n\t" + "vfmasb %%v3,%%v21,%%v31,%%v3 \n\t" + + "vl %%v24,96(%%r1,%1) \n\t" + "vfmasb %%v0,%%v22,%%v24,%%v0 \n\t" + "vl %%v25,96(%%r1,%2) \n\t" + "vfmasb %%v1,%%v22,%%v25,%%v1 \n\t" + "vl %%v26,96(%%r1,%3) \n\t" + "vfmasb %%v2,%%v22,%%v26,%%v2 \n\t" + "vl %%v27,96(%%r1,%4) \n\t" + "vfmasb %%v3,%%v22,%%v27,%%v3 \n\t" + + "vl %%v28,112(%%r1,%1) \n\t" + "vfmasb %%v0,%%v23,%%v28,%%v0 \n\t" + "vl %%v29,112(%%r1,%2) \n\t" + "vfmasb %%v1,%%v23,%%v29,%%v1 \n\t" + "vl %%v30,112(%%r1,%3) \n\t" + "vfmasb %%v2,%%v23,%%v30,%%v2 \n\t" + "vl %%v31,112(%%r1,%4) \n\t" + "vfmasb %%v3,%%v23,%%v31,%%v3 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%5) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + "vl %%v26,0(%%r1,%3) \n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" + "vl %%v27,0(%%r1,%4) \n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" + + "agfi %%r1,16 \n\t" + "brctg 
%%r0,2b \n\t" + + "3: \n\t" + "vrepf %%v4,%%v0,1 \n\t" + "aebr %%f0,%%f4 \n\t" + "vrepf %%v4,%%v0,2 \n\t" + "aebr %%f0,%%f4 \n\t" + "vrepf %%v4,%%v0,3 \n\t" + "aebr %%f0,%%f4 \n\t" + "ste %%f0,0(%6) \n\t" + "vrepf %%v4,%%v1,1 \n\t" + "aebr %%f1,%%f4 \n\t" + "vrepf %%v4,%%v1,2 \n\t" + "aebr %%f1,%%f4 \n\t" + "vrepf %%v4,%%v1,3 \n\t" + "aebr %%f1,%%f4 \n\t" + "ste %%f1,4(%6) \n\t" + "vrepf %%v4,%%v2,1 \n\t" + "aebr %%f2,%%f4 \n\t" + "vrepf %%v4,%%v2,2 \n\t" + "aebr %%f2,%%f4 \n\t" + "vrepf %%v4,%%v2,3 \n\t" + "aebr %%f2,%%f4 \n\t" + "ste %%f2,8(%6) \n\t" + "vrepf %%v4,%%v3,1 \n\t" + "aebr %%f3,%%f4 \n\t" + "vrepf %%v4,%%v3,2 \n\t" + "aebr %%f3,%%f4 \n\t" + "vrepf %%v4,%%v3,3 \n\t" + "aebr %%f3,%%f4 \n\t" + "ste %%f3,12(%6) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + + "vl %%v26,16(%%r1,%1) \n\t" + "vfmasb %%v0,%%v17,%%v26,%%v0 \n\t" + "vl %%v27,16(%%r1,%2) \n\t" + "vfmasb %%v1,%%v17,%%v27,%%v1 \n\t" + + "vl %%v28,32(%%r1,%1) \n\t" + "vfmasb %%v0,%%v18,%%v28,%%v0 \n\t" + "vl %%v29,32(%%r1,%2) \n\t" + "vfmasb %%v1,%%v18,%%v29,%%v1 \n\t" + + "vl %%v30,48(%%r1,%1) \n\t" + "vfmasb %%v0,%%v19,%%v30,%%v0 \n\t" + "vl %%v31,48(%%r1,%2) \n\t" + "vfmasb %%v1,%%v19,%%v31,%%v1 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" + + "vl %%v26,80(%%r1,%1) \n\t" + "vfmasb %%v0,%%v21,%%v26,%%v0 \n\t" + "vl %%v27,80(%%r1,%2) \n\t" + "vfmasb %%v1,%%v21,%%v27,%%v1 \n\t" + + "vl %%v28,96(%%r1,%1) \n\t" + "vfmasb %%v0,%%v22,%%v28,%%v0 \n\t" + "vl %%v29,96(%%r1,%2) \n\t" + "vfmasb %%v1,%%v22,%%v29,%%v1 \n\t" + + "vl %%v30,112(%%r1,%1) \n\t" + "vfmasb %%v0,%%v23,%%v30,%%v0 \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + "vfmasb %%v1,%%v23,%%v31,%%v1 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "vrepf %%v2,%%v0,1 \n\t" + "aebr %%f0,%%f2 \n\t" + "vrepf %%v2,%%v0,2 \n\t" + "aebr %%f0,%%f2 \n\t" + "vrepf %%v2,%%v0,3 \n\t" + "aebr %%f0,%%f2 \n\t" + "ste %%f0,0(%4) \n\t" + "vrepf %%v2,%%v1,1 \n\t" + "aebr %%f1,%%f2 \n\t" + "vrepf %%v2,%%v1,2 \n\t" + "aebr %%f1,%%f2 \n\t" + "vrepf 
%%v2,%%v1,3 \n\t" + "aebr %%f1,%%f2 \n\t" + "ste %%f1,4(%4) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y) + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vzero %%v0 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + + "vl %%v25,16(%%r1,%1) \n\t" + "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t" + + "vl %%v26,32(%%r1,%1) \n\t" + "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t" + + "vl %%v27,48(%%r1,%1) \n\t" + "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t" + + "vl %%v28,64(%%r1,%1) \n\t" + "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t" + + "vl %%v29,80(%%r1,%1) \n\t" + "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t" + + "vl %%v30,96(%%r1,%1) \n\t" + "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t" + + "vl %%v31,112(%%r1,%1) \n\t" + "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "vrepf %%v1,%%v0,1 \n\t" + "aebr %%f0,%%f1 \n\t" + "vrepf %%v1,%%v0,2 \n\t" + "aebr %%f0,%%f1 \n\t" + "vrepf %%v1,%%v0,3 \n\t" + "aebr %%f0,%%f1 \n\t" + "ste %%f0,0(%3) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for (i = 0; i < n; i++) + { + dest[i] = *src; + src += inc_src; + } +} + +static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) +{ + __asm__ volatile ( + "vlrepf %%v0,%1 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24, 0(%%r1,%3) \n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" + "vst %%v24, 0(%%r1,%3) \n\t" + "vl %%v25, 16(%%r1,%3) \n\t" + "vfmasb %%v25,%%v17,%%v0,%%v25 \n\t" + "vst %%v25, 16(%%r1,%3) \n\t" + "vl %%v26, 32(%%r1,%3) \n\t" + "vfmasb %%v26,%%v18,%%v0,%%v26 \n\t" + "vst %%v26, 32(%%r1,%3) \n\t" + "vl %%v27, 48(%%r1,%3) \n\t" + "vfmasb %%v27,%%v19,%%v0,%%v27 \n\t" + "vst %%v27, 48(%%r1,%3) \n\t" + "vl %%v28, 64(%%r1,%3) \n\t" + "vfmasb %%v28,%%v20,%%v0,%%v28 \n\t" + "vst %%v28, 64(%%r1,%3) \n\t" + "vl %%v29, 80(%%r1,%3) \n\t" + "vfmasb %%v29,%%v21,%%v0,%%v29 \n\t" + "vst %%v29, 80(%%r1,%3) \n\t" + "vl %%v30, 96(%%r1,%3) \n\t" + 
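/* dest += da * src; v0 holds the splatted scalar da */ +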
"vfmasb %%v30,%%v22,%%v0,%%v30 \n\t" + "vst %%v30, 96(%%r1,%3) \n\t" + "vl %%v31, 112(%%r1,%3) \n\t" + "vfmasb %%v31,%%v23,%%v0,%%v31 \n\t" + "vst %%v31, 112(%%r1,%3) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + + "vl %%v24, 0(%%r1,%3) \n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" + "vst %%v24, 0(%%r1,%3) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} +static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + if (inc_dest == 1) + add_y_kernel_4(n, da, src, dest); + else + { + BLASLONG i; + for (i = 0; i < n; i++) + { + *dest += src[i] * da; + dest += inc_dest; + } + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG register i; + BLASLONG register j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + BLASLONG n0; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[2] __attribute__ ((aligned(16))); + FLOAT *xbuffer; + FLOAT *ytemp; + + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + + xbuffer = buffer; + ytemp = buffer + (m < NBMAX ? m : NBMAX); + + n0 = n / NBMAX; + n1 = (n % NBMAX) >> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if ( inc_x == 1 ) + xbuffer = x_ptr; + else + copy_x(NB,x_ptr,xbuffer,inc_x); + + + FLOAT *ap[4]; + FLOAT *yp; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( n0 > 0 ) + { + BLASLONG nb1 = NBMAX / 4; + for( j=0; j 0 ) + { + add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4 ; + } + + if ( n2 & 2 ) + { + + sgemv_kernel_4x2(NB,ap,xbuffer,ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; + + } + + if ( n2 & 1 ) + { + + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); + a_ptr += lda; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + + } + a += NB; + x += NB * inc_x; + } + + if ( m3 == 0 ) return(0); + + x_ptr = x; + a_ptr = a; + if ( m3 == 3 ) + { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if ( lda == 3 && inc_y == 1 ) + { + + for ( j=0; j< ( n & -4) ; j+=4 ) + { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for ( ; j 0) { + + maxf = smax_kernel_64(n1, x); + + i = n1; + } + else + { + maxf=x[0]; + i++; + } + + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { + + maxf=x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if 
(x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c new file mode 100644 index 000000000..e882b7ff1 --- /dev/null +++ b/kernel/zarch/smin.c @@ -0,0 +1,186 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) +{ + FLOAT min; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "srlg %%r0,%1,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfchsb %%v24,%%v17,%%v16 \n\t" + "vfchsb %%v25,%%v19,%%v18 \n\t" + "vfchsb %%v26,%%v21,%%v20 \n\t" + "vfchsb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v25,%%v24 \n\t" + "vfchsb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + + "vfchsb %%v24,%%v17,%%v16 \n\t" + "vfchsb %%v25,%%v19,%%v18 \n\t" + "vfchsb %%v26,%%v21,%%v20 \n\t" + "vfchsb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchsb %%v28,%%v25,%%v24 \n\t" + "vfchsb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchsb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchsb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "veslg %%v16,%%v0,32 \n\t" + "vfchsb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + + "vrepf %%v16,%%v0,2 \n\t" + "wfchsb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ler %0,%%f0 " + :"=f"(min) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return min; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -64; + if (n1 > 0) { + + minf = smin_kernel_64(n1, x); + + i = n1; + } + else + { + minf=x[0]; + i++; + } + + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { + + minf=x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/srot.c b/kernel/zarch/srot.c new file mode 100644 index 000000000..763cc664a --- /dev/null +++ b/kernel/zarch/srot.c @@ -0,0 +1,246 @@ +/*************************************************************************** +Copyright (c) 
2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) +{ + __asm__ ( + "vlrepf %%v0,%3 \n\t" + "vlrepf %%v1,%4 \n\t" + "srlg %%r0,%0,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%1) \n\t" + "vl %%v25, 16(%%r1,%1) \n\t" + "vl %%v26, 32(%%r1,%1) \n\t" + "vl %%v27, 48(%%r1,%1) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 0(%%r1,%1) \n\t" + "vst %%v29, 16(%%r1,%1) \n\t" + "vst %%v30, 32(%%r1,%1) \n\t" + "vst %%v31, 48(%%r1,%1) \n\t" + "vst %%v20, 0(%%r1,%2) \n\t" + "vst %%v21, 16(%%r1,%2) \n\t" + "vst %%v22, 32(%%r1,%2) \n\t" + "vst %%v23, 48(%%r1,%2) \n\t" + + "vl %%v24, 64(%%r1,%1) \n\t" + "vl %%v25, 80(%%r1,%1) \n\t" + "vl %%v26, 96(%%r1,%1) \n\t" + "vl %%v27, 112(%%r1,%1) \n\t" + "vl %%v16, 64(%%r1,%2) \n\t" + "vl %%v17, 80(%%r1,%2) \n\t" + "vl %%v18, 96(%%r1,%2) \n\t" + "vl %%v19, 112(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s 
*/ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 64(%%r1,%1) \n\t" + "vst %%v29, 80(%%r1,%1) \n\t" + "vst %%v30, 96(%%r1,%1) \n\t" + "vst %%v31, 112(%%r1,%1) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 128(%%r1,%1) \n\t" + "vst %%v29, 144(%%r1,%1) \n\t" + "vst %%v30, 160(%%r1,%1) \n\t" + "vst %%v31, 176(%%r1,%1) \n\t" + "vst %%v20, 128(%%r1,%2) \n\t" + "vst %%v21, 144(%%r1,%2) \n\t" + "vst %%v22, 160(%%r1,%2) \n\t" + "vst %%v23, 176(%%r1,%2) \n\t" + + "vl %%v24, 192(%%r1,%1) \n\t" + "vl %%v25, 208(%%r1,%1) \n\t" + "vl %%v26, 224(%%r1,%1) \n\t" + "vl %%v27, 240(%%r1,%1) \n\t" + "vl %%v16, 192(%%r1,%2) \n\t" + "vl %%v17, 208(%%r1,%2) \n\t" + "vl %%v18, 224(%%r1,%2) \n\t" + "vl %%v19, 240(%%r1,%2) \n\t" + + "vfmsb %%v28,%%v24,%%v0 \n\t" + "vfmsb %%v29,%%v25,%%v0 \n\t" + "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0 \n\t" + "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0 \n\t" + "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 192(%%r1,%1) \n\t" + "vst %%v29, 208(%%r1,%1) \n\t" + "vst %%v30, 224(%%r1,%1) \n\t" + "vst %%v31, 240(%%r1,%1) \n\t" + "vst %%v20, 192(%%r1,%2) \n\t" + "vst %%v21, 208(%%r1,%2) \n\t" + "vst %%v22, 224(%%r1,%2) \n\t" + "vst %%v23, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG 
i=0; + BLASLONG ix=0,iy=0; + + FLOAT temp; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + + BLASLONG n1 = n & -64; + if ( n1 > 0 ) + { + FLOAT cosa,sina; + cosa=c; + sina=s; + srot_kernel_64(n1, x, y, &cosa, &sina); + i=n1; + } + + while(i < n) + { + temp = c*x[i] + s*y[i] ; + y[i] = c*y[i] - s*x[i] ; + x[i] = temp ; + + i++ ; + + } + + + } + else + { + + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + +} + + diff --git a/kernel/zarch/sscal.c b/kernel/zarch/sscal.c new file mode 100644 index 000000000..c18a7e56f --- /dev/null +++ b/kernel/zarch/sscal.c @@ -0,0 +1,201 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + +static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) +{ + __asm__ volatile ( + "vlrepf %%v0,%1 \n\t" + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%2) \n\t" + "vfmsb %%v24,%%v24,%%v0 \n\t" + "vst %%v24, 0(%%r1,%2) \n\t" + "vl %%v25, 16(%%r1,%2) \n\t" + "vfmsb %%v25,%%v25,%%v0 \n\t" + "vst %%v25, 16(%%r1,%2) \n\t" + "vl %%v26, 32(%%r1,%2) \n\t" + "vfmsb %%v26,%%v26,%%v0 \n\t" + "vst %%v26, 32(%%r1,%2) \n\t" + "vl %%v27, 48(%%r1,%2) \n\t" + "vfmsb %%v27,%%v27,%%v0 \n\t" + "vst %%v27, 48(%%r1,%2) \n\t" + "vl %%v24, 64(%%r1,%2) \n\t" + "vfmsb %%v24,%%v24,%%v0 \n\t" + "vst %%v24, 64(%%r1,%2) \n\t" + "vl %%v25, 80(%%r1,%2) \n\t" + "vfmsb %%v25,%%v25,%%v0 \n\t" + "vst %%v25, 80(%%r1,%2) \n\t" + "vl %%v26, 96(%%r1,%2) \n\t" + "vfmsb %%v26,%%v26,%%v0 \n\t" + "vst %%v26, 96(%%r1,%2) \n\t" + "vl %%v27, 112(%%r1,%2) \n\t" + "vfmsb %%v27,%%v27,%%v0 \n\t" + "vst %%v27, 112(%%r1,%2) \n\t" + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v24","v25","v26","v27" + ); +} + +static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) +{ + __asm__ volatile( + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "srlg %%r0,%0,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + + "vst %%v24,0(%%r1,%1) \n\t" + "vst %%v25,16(%%r1,%1) \n\t" + "vst %%v26,32(%%r1,%1) \n\t" + "vst %%v27,48(%%r1,%1) \n\t" + "vst %%v24,64(%%r1,%1) \n\t" + "vst %%v25,80(%%r1,%1) \n\t" + "vst %%v26,96(%%r1,%1) \n\t" + "vst %%v27,112(%%r1,%1) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n])x) + :"memory","cc","r0","r1","v24","v25","v26","v27" + ); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0,j=0; + if ( n <= 0 || inc_x <=0 ) + return(0); + + + if ( inc_x == 1 ) + { + + if ( da == 0.0 ) + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + + sscal_kernel_32_zero(n1, x); + j=n1; + } + + while(j < n) + { + + x[j]=0.0; + j++; + } + + } + else + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + sscal_kernel_32(n1, da, x); + j=n1; + } + while(j < n) + { + + x[j] = da * x[j] ; + j++; + } + } + + + } + else + { + + if ( da == 0.0 ) + { + + BLASLONG n1 = n & -2; + + while (j < n1) { + + x[i]=0.0; + x[i + inc_x]=0.0; + + i += inc_x * 2; + j += 2; + + } + while(j < n) + { + + x[i]=0.0; + i += inc_x ; + j++; + } + + } + else + { + BLASLONG n1 = n & -2; + + while (j < n1) { + + x[i] = da * x[i] ; + x[i + inc_x] = da * x[i + inc_x]; + + i += inc_x * 2; + j += 2; + + } + + while(j < n) + { + + x[i] = da * x[i] ; + i += inc_x ; + j++; + } + } + + } + return 0; + +} + + diff --git a/kernel/zarch/sswap.c b/kernel/zarch/sswap.c new file mode 100644 index 000000000..d0c0dc3f4 --- /dev/null +++ b/kernel/zarch/sswap.c @@ -0,0 +1,164 @@ +/*************************************************************************** +Copyright (c) 2013-2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile( + "srlg %%r0,%0,6 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v20, 64(%%r1,%1) \n\t" + "vl %%v21, 80(%%r1,%1) \n\t" + "vl %%v22, 96(%%r1,%1) \n\t" + "vl %%v23, 112(%%r1,%1) \n\t" + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v28, 192(%%r1,%1) \n\t" + "vl %%v29, 208(%%r1,%1) \n\t" + "vl %%v30, 224(%%r1,%1) \n\t" + "vl %%v31, 240(%%r1,%1) \n\t" + + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vl %%v4, 64(%%r1,%2) \n\t" + "vl %%v5, 80(%%r1,%2) \n\t" + "vl %%v6, 96(%%r1,%2) \n\t" + "vl %%v7, 112(%%r1,%2) \n\t" + "vst %%v0, 0(%%r1,%1) \n\t" + "vst %%v1, 16(%%r1,%1) \n\t" + "vst %%v2, 32(%%r1,%1) \n\t" + "vst %%v3, 48(%%r1,%1) \n\t" + "vst %%v4, 64(%%r1,%1) \n\t" + "vst %%v5, 80(%%r1,%1) \n\t" + "vst %%v6, 96(%%r1,%1) \n\t" + "vst %%v7, 112(%%r1,%1) \n\t" + + "vl %%v0, 128(%%r1,%2) \n\t" + "vl %%v1, 144(%%r1,%2) \n\t" + "vl %%v2, 160(%%r1,%2) \n\t" + "vl %%v3, 176(%%r1,%2) \n\t" + "vl %%v4, 192(%%r1,%2) \n\t" + "vl %%v5, 208(%%r1,%2) \n\t" + "vl %%v6, 224(%%r1,%2) \n\t" + "vl %%v7, 240(%%r1,%2) \n\t" + "vst %%v0, 128(%%r1,%1) \n\t" + "vst %%v1, 144(%%r1,%1) \n\t" + "vst %%v2, 160(%%r1,%1) \n\t" + "vst %%v3, 176(%%r1,%1) \n\t" + "vst %%v4, 192(%%r1,%1) \n\t" + "vst %%v5, 208(%%r1,%1) \n\t" + "vst %%v6, 224(%%r1,%1) \n\t" + "vst %%v7, 240(%%r1,%1) \n\t" + + "vst %%v16, 0(%%r1,%2) \n\t" + "vst %%v17, 16(%%r1,%2) \n\t" + "vst %%v18, 32(%%r1,%2) \n\t" + "vst %%v19, 48(%%r1,%2) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + "vst %%v24, 128(%%r1,%2) \n\t" + "vst %%v25, 144(%%r1,%2) \n\t" + "vst %%v26, 160(%%r1,%2) \n\t" + "vst %%v27, 176(%%r1,%2) \n\t" + "vst %%v28, 192(%%r1,%2) \n\t" + "vst %%v29, 208(%%r1,%2) \n\t" + "vst %%v30, 224(%%r1,%2) \n\t" + "vst %%v31, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + 
:"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1 )) + { + + BLASLONG n1 = n & -64; + if ( n1 > 0 ) + { + sswap_kernel_64(n1, x, y); + i=n1; + } + + while(i < n) + { + temp = y[i]; + y[i] = x[i] ; + x[i] = temp; + i++ ; + + } + + + } + else + { + + while(i < n) + { + temp = y[iy]; + y[iy] = x[ix] ; + x[ix] = temp; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + + } + return(0); + + +} + + diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c new file mode 100644 index 000000000..6393b099b --- /dev/null +++ b/kernel/zarch/zamax.c @@ -0,0 +1,221 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) +{ + FLOAT amax; + + __asm__ volatile ( + "vleg %%v0,0(%2),0 \n\t" + "vleg %%v16,8(%2),0 \n\t" + "vleg %%v0,16(%2),1 \n\t" + "vleg %%v16,24(%2),1 \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vflpdb %%v16,%%v16 \n\t" + "vfadb %%v0,%%v0,%%v16 \n\t" + "srlg %%r0,%1,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vleg %%v16,0(%%r1,%2),0 \n\t" + "vleg %%v17,8(%%r1,%2),0 \n\t" + "vleg %%v16,16(%%r1,%2),1 \n\t" + "vleg %%v17,24(%%r1,%2),1 \n\t" + "vleg %%v18,32(%%r1,%2),0 \n\t" + "vleg %%v19,40(%%r1,%2),0 \n\t" + "vleg %%v18,48(%%r1,%2),1 \n\t" + "vleg %%v19,56(%%r1,%2),1 \n\t" + "vleg %%v20,64(%%r1,%2),0 \n\t" + "vleg %%v21,72(%%r1,%2),0 \n\t" + "vleg %%v20,80(%%r1,%2),1 \n\t" + "vleg %%v21,88(%%r1,%2),1 \n\t" + "vleg %%v22,96(%%r1,%2),0 \n\t" + "vleg %%v23,104(%%r1,%2),0 \n\t" + "vleg %%v22,112(%%r1,%2),1 \n\t" + "vleg %%v23,120(%%r1,%2),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchdb %%v26,%%v24,%%v25 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchdb %%v27,%%v26,%%v0 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "vleg %%v16,128(%%r1,%2),0 \n\t" + "vleg %%v17,136(%%r1,%2),0 \n\t" + "vleg %%v16,144(%%r1,%2),1 \n\t" + "vleg %%v17,152(%%r1,%2),1 \n\t" + "vleg %%v18,160(%%r1,%2),0 \n\t" + "vleg %%v19,168(%%r1,%2),0 \n\t" + "vleg %%v18,176(%%r1,%2),1 \n\t" + "vleg %%v19,184(%%r1,%2),1 \n\t" + "vleg %%v20,192(%%r1,%2),0 \n\t" + "vleg %%v21,200(%%r1,%2),0 \n\t" + "vleg %%v20,208(%%r1,%2),1 \n\t" + "vleg %%v21,216(%%r1,%2),1 \n\t" + "vleg %%v22,224(%%r1,%2),0 \n\t" + "vleg %%v23,232(%%r1,%2),0 \n\t" + "vleg %%v22,240(%%r1,%2),1 \n\t" + "vleg %%v23,248(%%r1,%2),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchdb %%v26,%%v24,%%v25 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchdb %%v27,%%v26,%%v0 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return amax; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return 
(maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + + maxf = zamax_kernel_16(n1, x); + + i = n1; + } + else + { + maxf=CABS1(x,0); + i++; + } + + while (i < n) { + if (CABS1(x,i*2) > maxf) { + maxf = CABS1(x,i*2); + } + i++; + } + return (maxf); + + } else { + + inc_x2 = 2 * inc_x; + maxf=CABS1(x,0); + i += inc_x2; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (CABS1(x,i) > maxf) { + maxf = CABS1(x,i); + } + if (CABS1(x,i+inc_x2) > maxf) { + maxf = CABS1(x,i+inc_x2); + } + if (CABS1(x,i+inc_x2*2) > maxf) { + maxf = CABS1(x,i+inc_x2*2); + } + if (CABS1(x,i+inc_x2*3) > maxf) { + maxf = CABS1(x,i+inc_x2*3); + } + + i += inc_x2 * 4; + + j += 4; + + } + + + while (j < n) { + if (CABS1(x,i) > maxf) { + maxf = CABS1(x,i); + } + i += inc_x2; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c new file mode 100644 index 000000000..b15774bb9 --- /dev/null +++ b/kernel/zarch/zamin.c @@ -0,0 +1,221 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) +{ + FLOAT amin; + + __asm__ volatile ( + "vleg %%v0,0(%2),0 \n\t" + "vleg %%v16,8(%2),0 \n\t" + "vleg %%v0,16(%2),1 \n\t" + "vleg %%v16,24(%2),1 \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vflpdb %%v16,%%v16 \n\t" + "vfadb %%v0,%%v0,%%v16 \n\t" + "srlg %%r0,%1,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vleg %%v16,0(%%r1,%2),0 \n\t" + "vleg %%v17,8(%%r1,%2),0 \n\t" + "vleg %%v16,16(%%r1,%2),1 \n\t" + "vleg %%v17,24(%%r1,%2),1 \n\t" + "vleg %%v18,32(%%r1,%2),0 \n\t" + "vleg %%v19,40(%%r1,%2),0 \n\t" + "vleg %%v18,48(%%r1,%2),1 \n\t" + "vleg %%v19,56(%%r1,%2),1 \n\t" + "vleg %%v20,64(%%r1,%2),0 \n\t" + "vleg %%v21,72(%%r1,%2),0 \n\t" + "vleg %%v20,80(%%r1,%2),1 \n\t" + "vleg %%v21,88(%%r1,%2),1 \n\t" + "vleg %%v22,96(%%r1,%2),0 \n\t" + "vleg %%v23,104(%%r1,%2),0 \n\t" + "vleg %%v22,112(%%r1,%2),1 \n\t" + "vleg %%v23,120(%%r1,%2),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchdb %%v26,%%v25,%%v24 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchdb %%v27,%%v0,%%v26 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "vleg %%v16,128(%%r1,%2),0 \n\t" + "vleg %%v17,136(%%r1,%2),0 \n\t" + "vleg %%v16,144(%%r1,%2),1 \n\t" + "vleg %%v17,152(%%r1,%2),1 \n\t" + "vleg %%v18,160(%%r1,%2),0 \n\t" + "vleg %%v19,168(%%r1,%2),0 \n\t" + "vleg %%v18,176(%%r1,%2),1 \n\t" + "vleg %%v19,184(%%r1,%2),1 \n\t" + "vleg %%v20,192(%%r1,%2),0 \n\t" + "vleg %%v21,200(%%r1,%2),0 \n\t" + "vleg %%v20,208(%%r1,%2),1 \n\t" + "vleg %%v21,216(%%r1,%2),1 \n\t" + "vleg %%v22,224(%%r1,%2),0 \n\t" + "vleg %%v23,232(%%r1,%2),0 \n\t" + "vleg %%v22,240(%%r1,%2),1 \n\t" + "vleg %%v23,248(%%r1,%2),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchdb %%v26,%%v25,%%v24 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchdb %%v27,%%v0,%%v26 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return amin; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return 
(minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + + minf = zamin_kernel_16(n1, x); + + i = n1; + } + else + { + minf=CABS1(x,0); + i++; + } + + while (i < n) { + if (CABS1(x,i*2) < minf) { + minf = CABS1(x,i*2); + } + i++; + } + return (minf); + + } else { + + inc_x2 = 2 * inc_x; + minf=CABS1(x,0); + i += inc_x2; + j++; + + BLASLONG n1 = (n - 1) & -4; + while (j < n1) { + + if (CABS1(x,i) < minf) { + minf = CABS1(x,i); + } + if (CABS1(x,i+inc_x2) < minf) { + minf = CABS1(x,i+inc_x2); + } + if (CABS1(x,i+inc_x2*2) < minf) { + minf = CABS1(x,i+inc_x2*2); + } + if (CABS1(x,i+inc_x2*3) < minf) { + minf = CABS1(x,i+inc_x2*3); + } + + i += inc_x2 * 4; + + j += 4; + + } + + + while (j < n) { + if (CABS1(x,i) < minf) { + minf = CABS1(x,i); + } + i += inc_x2; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/zasum.c b/kernel/zarch/zasum.c index 0fc5c9ecb..8faaf20eb 100644 --- a/kernel/zarch/zasum.c +++ b/kernel/zarch/zasum.c @@ -25,92 +25,98 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" #include <math.h> #if defined(DOUBLE) - #define ABS fabs - #else - #define ABS fabsf - #endif - -static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { - +static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) +{ FLOAT asum; - __asm__ ( - "pfd 1, 0(%[ptr_x]) \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[ptr_x] \n\t" - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v22 \n\t" - "vzero %%v23 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%[ptr_tmp] ) \n\t" - "vlm %%v24,%%v31,0(%[ptr_tmp]) \n\t" - - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - - "vfadb %%v0,%%v0,%%v24 \n\t" - "vfadb %%v1,%%v1,%%v25 \n\t" - "vfadb %%v23,%%v23,%%v26 \n\t" - "vfadb %%v22,%%v22,%%v27 \n\t" - "vfadb %%v0,%%v0,%%v28 \n\t" - "vfadb %%v1,%%v1,%%v29 \n\t" - "vfadb %%v23,%%v23,%%v30 \n\t" - "vfadb %%v22,%%v22,%%v31 \n\t" - - "vlm %%v24,%%v31, 128(%[ptr_tmp]) \n\t" - "vflpdb %%v24, %%v24 \n\t" - "vflpdb %%v25, %%v25 \n\t" - "vflpdb %%v26, %%v26 \n\t" - "vflpdb %%v27, %%v27 \n\t" - "vflpdb %%v28, %%v28 \n\t" - "vflpdb %%v29, %%v29 \n\t" - "vflpdb %%v30, %%v30 \n\t" - "vflpdb %%v31, %%v31 \n\t" - "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" - "vfadb %%v0,%%v0,%%v24 \n\t" - "vfadb %%v1,%%v1,%%v25 \n\t" - "vfadb %%v23,%%v23,%%v26 \n\t" - "vfadb %%v22,%%v22,%%v27 \n\t" - "vfadb %%v0,%%v0,%%v28 \n\t" - "vfadb %%v1,%%v1,%%v29 \n\t" - "vfadb %%v23,%%v23,%%v30 \n\t" - "vfadb %%v22,%%v22,%%v31 \n\t" - - "clgrjl %[ptr_tmp],%%r0,1b \n\t" - "vfadb %%v24,%%v0,%%v1 \n\t" - "vfadb %%v25,%%v23,%%v22 \n\t" - "vfadb %%v0,%%v25,%%v24 \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "ldr %[asum] ,%%f0" - : [asum] "=f"(asum),[ptr_tmp] "+&a"(x) - : [mem] "m"( *(const double (*)[2*n])x ), [n] "r"(n), [ptr_x] "a"(x) - : "cc", "r0","f0","f1","v0","v1","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return asum; + __asm__ ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "srlg %%r0,%1,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + "vl %%v20, 64(%%r1,%2) \n\t" +
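/* note: each pass of this loop covers 256 bytes, i.e. 16 complex elements; vflpdb takes absolute values before the vfadb accumulation into v0..v3 */ +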
"vl %%v21, 80(%%r1,%2) \n\t" + "vl %%v22, 96(%%r1,%2) \n\t" + "vl %%v23, 112(%%r1,%2) \n\t" + + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfadb %%v0,%%v0,%%v16 \n\t" + "vfadb %%v1,%%v1,%%v17 \n\t" + "vfadb %%v2,%%v2,%%v18 \n\t" + "vfadb %%v3,%%v3,%%v19 \n\t" + "vfadb %%v0,%%v0,%%v20 \n\t" + "vfadb %%v1,%%v1,%%v21 \n\t" + "vfadb %%v2,%%v2,%%v22 \n\t" + "vfadb %%v3,%%v3,%%v23 \n\t" + + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + "vl %%v20, 192(%%r1,%2) \n\t" + "vl %%v21, 208(%%r1,%2) \n\t" + "vl %%v22, 224(%%r1,%2) \n\t" + "vl %%v23, 240(%%r1,%2) \n\t" + + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfadb %%v0,%%v0,%%v16 \n\t" + "vfadb %%v1,%%v1,%%v17 \n\t" + "vfadb %%v2,%%v2,%%v18 \n\t" + "vfadb %%v3,%%v3,%%v19 \n\t" + "vfadb %%v0,%%v0,%%v20 \n\t" + "vfadb %%v1,%%v1,%%v21 \n\t" + "vfadb %%v2,%%v2,%%v22 \n\t" + "vfadb %%v3,%%v3,%%v23 \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b \n\t" + "vfadb %%v0,%%v0,%%v1 \n\t" + "vfadb %%v0,%%v0,%%v2 \n\t" + "vfadb %%v0,%%v0,%%v3 \n\t" + "vrepg %%v1,%%v0,1 \n\t" + "adbr %%f0,%%f1 \n\t" + "ldr %0,%%f0 " + :"=f"(asum) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" + ); + return asum; } - - FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i=0; @@ -128,7 +134,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( n1 > 0 ) { - sumf=zasum_kernel_16(n1, x ); + sumf = zasum_kernel_16(n1, x); i=n1; ip=2*n1; } diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index 212de25c8..6ba44a27c 100644 --- a/kernel/zarch/zaxpy.c +++ b/kernel/zarch/zaxpy.c @@ -23,142 +23,98 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- *****************************************************************************/ - +*****************************************************************************/ #include "common.h" - -static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) { - - BLASLONG tempR1 ; - __asm__ ("pfd 1, 0(%[x_tmp]) \n\t" - "pfd 2, 0(%[y_tmp]) \n\t" +static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile( #if !defined(CONJ) - "lgdr %[t1],%[alpha_r] \n\t" - "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint - "lgdr %[t1],%[alpha_i] \n\t" - "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint - "vflcdb %%v29,%%v29 \n\t" //complement both - "vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i} - -#else - "lgdr %[t1],%[alpha_i] \n\t" - "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint - "lgdr %[t1],%[alpha_r] \n\t" - "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint - "vflcdb %%v28,%%v28 \n\t" //complement both - "vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r} -#endif - - "xgr %[t1],%[t1] \n\t" - "sllg %[tmp],%[tmp],4 \n\t" - "vl %%v30 , 0(%[t1],%[y_tmp]) \n\t" - "vl %%v31 , 16(%[t1],%[y_tmp]) \n\t" - "vl %%v6 , 32(%[t1],%[y_tmp]) \n\t" - "vl %%v7 , 48(%[t1],%[y_tmp]) \n\t" - "vl %%v20 , 0(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 16(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 32(%[t1],%[x_tmp]) \n\t" - "vl %%v23 , 48(%[t1],%[x_tmp]) \n\t" - "lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition - "j 2f \n\t" - ".align 16 \n\t" - "1: \n\t" - - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - "vfmadb %%v16, %%v20, %%v28, %%v16 \n\t" - "vfmadb %%v17, %%v21, %%v28, %%v17 \n\t" - "vfmadb %%v18, %%v22, %%v28, %%v18 \n\t" - "vfmadb %%v19, %%v23, %%v28, %%v19 \n\t" - "vl %%v30, 64(%[t1],%[y_tmp]) \n\t" - "vl %%v31, 80(%[t1],%[y_tmp]) \n\t" - "vl %%v6 , 96(%[t1],%[y_tmp]) \n\t" - "vl %%v7 , 112(%[t1],%[y_tmp]) \n\t" - "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" - "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" - "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" - "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" - "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" - "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" - - "vst %%v16 , 0(%[t1],%[y_tmp]) \n\t" - "vst %%v17 , 16(%[t1],%[y_tmp]) \n\t" - "vst %%v18 , 32(%[t1],%[y_tmp]) \n\t" - "vst %%v19 , 48(%[t1],%[y_tmp]) \n\t" - - "la %[t1],64(%[t1] ) \n\t" - "2: \n\t" - "pfd 1, 256(%[t1],%[x_tmp]) \n\t" - "pfd 2, 256(%[t1],%[y_tmp]) \n\t" - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - - "vfmadb %%v30, %%v20, %%v28, %%v30 \n\t" - "vfmadb %%v31, %%v21, %%v28, %%v31 \n\t" - "vfmadb %%v6, %%v22, %%v28, %%v6 \n\t" - "vfmadb %%v7, %%v23, %%v28, %%v7 \n\t" - "vl %%v16, 64(%[t1],%[y_tmp]) \n\t" - "vl %%v17, 80(%[t1],%[y_tmp]) \n\t" - "vl %%v18, 96(%[t1],%[y_tmp]) \n\t" - "vl %%v19, 112(%[t1],%[y_tmp]) \n\t" - "vfmadb %%v30, %%v24, %%v29, %%v30 \n\t" - "vfmadb %%v31, %%v25, %%v29, %%v31 \n\t" - "vfmadb %%v6, %%v26, %%v29, %%v6 \n\t" - "vfmadb %%v7, %%v27, %%v29, %%v7 \n\t" - - "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" - "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" - - "vst %%v30 , 0(%[t1],%[y_tmp]) \n\t" - "vst %%v31 , 16(%[t1],%[y_tmp]) \n\t" - "vst %%v6 , 
32(%[t1],%[y_tmp]) \n\t" - "vst %%v7 , 48(%[t1],%[y_tmp]) \n\t" - - "la %[t1],64(%[t1] ) \n\t" - - - "clgrjl %[t1],%[tmp],1b \n\t" -//---------------------------------------------------------------------- - "vfmadb %%v16, %%v20, %%v28, %%v16 \n\t" - "vfmadb %%v17, %%v21, %%v28, %%v17 \n\t" - "vfmadb %%v18, %%v22, %%v28, %%v18 \n\t" - "vfmadb %%v19, %%v23, %%v28, %%v19 \n\t" - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" - "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" - "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" - "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" - - "vst %%v16 , 0(%[t1],%[y_tmp]) \n\t" - "vst %%v17 , 16(%[t1],%[y_tmp]) \n\t" - "vst %%v18 , 32(%[t1],%[y_tmp]) \n\t" - "vst %%v19 , 48(%[t1],%[y_tmp]) \n\t" - - : [mem_y] "+m" (*(double (*)[2*n])y),[tmp]"+&r"(n) , [t1] "=&a" (tempR1) - : [mem_x] "m" (*(const double (*)[2*n])x), [x_tmp] "a"(x), [y_tmp] "a"(y), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i) - : "cc", "v6","v7", "v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - + "vlrepg %%v0,0(%3) \n\t" + "vleg %%v1,8(%3),0 \n\t" + "wflcdb %%v1,%%v1 \n\t" + "vleg %%v1,8(%3),1 \n\t" +#else + "vleg %%v0,0(%3),1 \n\t" + "vflcdb %%v0,%%v0 \n\t" + "vleg %%v0,0(%3),0 \n\t" + "vlrepg %%v1,8(%3) \n\t" +#endif + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl %%v19,48(%%r1,%1) \n\t" + "vl %%v20,0(%%r1,%2) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,32(%%r1,%2) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + "vpdi %%v24,%%v16,%%v16,4 \n\t" + "vpdi %%v25,%%v17,%%v17,4 \n\t" + "vpdi %%v26,%%v18,%%v18,4 \n\t" + "vpdi %%v27,%%v19,%%v19,4 \n\t" + + "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" + "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" + "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" + "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" + + "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" + "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" + "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" + "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" + + "vst %%v28,0(%%r1,%2) \n\t" + "vst %%v29,16(%%r1,%2) \n\t" + "vst %%v30,32(%%r1,%2) \n\t" + "vst %%v31,48(%%r1,%2) \n\t" + + "vl %%v16,64(%%r1,%1) \n\t" + "vl %%v17,80(%%r1,%1) \n\t" + "vl %%v18,96(%%r1,%1) \n\t" + "vl %%v19,112(%%r1,%1) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vpdi %%v24,%%v16,%%v16,4 \n\t" + "vpdi %%v25,%%v17,%%v17,4 \n\t" + "vpdi %%v26,%%v18,%%v18,4 \n\t" + "vpdi %%v27,%%v19,%%v19,4 \n\t" + + "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" + "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" + "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" + "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" + + "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" + "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" + "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" + "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" + + "vst %%v28,64(%%r1,%2) \n\t" + "vst %%v29,80(%%r1,%2) \n\t" + "vst %%v30,96(%%r1,%2) \n\t" + "vst %%v31,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"a"(alpha) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } - int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT 
*y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0; BLASLONG ix = 0, iy = 0; + FLOAT da[2]; if (n <= 0) return (0); @@ -166,8 +122,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, BLASLONG n1 = n & -8; - if (n1) { - zaxpy_kernel_8(n1, x, y, da_r,da_i); + if (n1) { + da[0] = da_r; + da[1] = da_i; + zaxpy_kernel_8(n1, x, y, da); ix = 2 * n1; } i = n1; diff --git a/kernel/zarch/zcopy.c b/kernel/zarch/zcopy.c index b5bf383f7..8c940bba3 100644 --- a/kernel/zarch/zcopy.c +++ b/kernel/zarch/zcopy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2018, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -24,71 +24,28 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - -#include "common.h" - -static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { - - __asm__ volatile( - "pfd 1, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],4 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" - - "vl %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" - - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_y] "=m" (*(double (*)[2*n])y), [n_tmp] "+&r"(n) - : [mem_x] "m" (*(const double (*)[2*n])x), [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; +#include "common.h" +static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "lgr %%r1,%1 \n\t" + "lgr %%r2,%2 \n\t" + "srlg %%r0,%0,4 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1) \n\t" + "pfd 2, 1024(%%r2) \n\t" + "mvc 0(256,%%r2),0(%%r1) \n\t" + "agfi %%r1,256 \n\t" + "agfi %%r2,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","r2" + ); } - int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG 
inc_y) { BLASLONG i=0; @@ -137,9 +94,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) } } - return(0); - + return(0); } - - diff --git a/kernel/zarch/zdot.c b/kernel/zarch/zdot.c index 61c5d6b98..aab18e2e9 100644 --- a/kernel/zarch/zdot.c +++ b/kernel/zarch/zdot.c @@ -23,137 +23,92 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ - +*****************************************************************************/ #include "common.h" -#if defined(Z13) - -static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { +static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) +{ __asm__ volatile( - "pfd 1, 0(%[ptr_x_tmp]) \n\t" - "pfd 1, 0(%[ptr_y_tmp]) \n\t" - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "srlg %[n_tmp],%[n_tmp],3 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t" - "pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t" - "vpdi %%v20,%%v16,%%v16,4 \n\t" - "vpdi %%v21,%%v17,%%v17,4 \n\t" - "vpdi %%v22,%%v18,%%v18,4 \n\t" - "vpdi %%v23,%%v19,%%v19,4 \n\t" - - - "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" - "vfmadb %%v25,%%v20,%%v28,%%v25 \n\t" - "vfmadb %%v26,%%v17,%%v29,%%v26 \n\t" - "vfmadb %%v27,%%v21,%%v29,%%v27 \n\t" - "vfmadb %%v24,%%v18,%%v30,%%v24 \n\t" - "vfmadb %%v25,%%v22,%%v30,%%v25 \n\t" - "vfmadb %%v26,%%v19,%%v31,%%v26 \n\t" - "vfmadb %%v27,%%v23,%%v31,%%v27 \n\t" - - - - "vl %%v16, 64(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v19,112(%%r1,%[ptr_x_tmp]) \n\t" - "vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t" - "vl %%v31,112(%%r1,%[ptr_y_tmp]) \n\t" - "vpdi %%v20,%%v16,%%v16,4 \n\t" - "vpdi %%v21,%%v17,%%v17,4 \n\t" - "vpdi %%v22,%%v18,%%v18,4 \n\t" - "vpdi %%v23,%%v19,%%v19,4 \n\t" - "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" - "vfmadb %%v25,%%v20,%%v28,%%v25 \n\t" - "vfmadb %%v26,%%v17,%%v29,%%v26 \n\t" - "vfmadb %%v27,%%v21,%%v29,%%v27 \n\t" - "vfmadb %%v24,%%v18,%%v30,%%v24 \n\t" - "vfmadb %%v25,%%v22,%%v30,%%v25 \n\t" - "vfmadb %%v26,%%v19,%%v31,%%v26 \n\t" - "vfmadb %%v27,%%v23,%%v31,%%v27 \n\t" - - - "la %%r1,128(%%r1) \n\t" - "brctg %[n_tmp],1b \n\t" - "vfadb %%v24,%%v26,%%v24 \n\t" - "vfadb %%v25,%%v25,%%v27 \n\t" - "vsteg %%v24, 0(%[ptr_d]),0 \n\t" - "vsteg %%v24, 8(%[ptr_d]),1 \n\t" - "vsteg %%v25,16(%[ptr_d]),1 \n\t" - "vsteg %%v25,24(%[ptr_d]),0 \n\t" - : [mem_out] "=m"(*(double (*)[4])d ) ,[n_tmp] "+&r"(n) - : [mem_x] "m"( *(const double (*)[2*n])x), - [mem_y] "m"( *(const double (*)[2*n])y), - [ptr_x_tmp] "a"(x), [ptr_y_tmp] "a"(y), [ptr_d] "a"(d) - : "cc", "r1","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - -} - -#else - -static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { - 
BLASLONG register i = 0; - FLOAT dot[4] = {0.0, 0.0, 0.0, 0.0}; - BLASLONG j = 0; - - while (i < n) { - - dot[0] += x[j] * y[j]; - dot[1] += x[j + 1] * y[j + 1]; - dot[2] += x[j] * y[j + 1]; - dot[3] += x[j + 1] * y[j]; - - dot[0] += x[j + 2] * y[j + 2]; - dot[1] += x[j + 3] * y[j + 3]; - dot[2] += x[j + 2] * y[j + 3]; - dot[3] += x[j + 3] * y[j + 2]; - - dot[0] += x[j + 4] * y[j + 4]; - dot[1] += x[j + 5] * y[j + 5]; - dot[2] += x[j + 4] * y[j + 5]; - dot[3] += x[j + 5] * y[j + 4]; - - dot[0] += x[j + 6] * y[j + 6]; - dot[1] += x[j + 7] * y[j + 7]; - dot[2] += x[j + 6] * y[j + 7]; - dot[3] += x[j + 7] * y[j + 6]; - - j += 8; - i += 4; - - } - d[0] = dot[0]; - d[1] = dot[1]; - d[2] = dot[2]; - d[3] = dot[3]; - + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "vzero %%v28 \n\t" + "vzero %%v29 \n\t" + "vzero %%v30 \n\t" + "vzero %%v31 \n\t" + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%1) \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vpdi %%v20,%%v16,%%v16,4 \n\t" + "vpdi %%v21,%%v17,%%v17,4 \n\t" + "vpdi %%v22,%%v18,%%v18,4 \n\t" + "vpdi %%v23,%%v19,%%v19,4 \n\t" + + "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" + "vfmadb %%v25,%%v20,%%v0,%%v25 \n\t" + "vfmadb %%v26,%%v17,%%v1,%%v26 \n\t" + "vfmadb %%v27,%%v21,%%v1,%%v27 \n\t" + "vfmadb %%v28,%%v18,%%v2,%%v28 \n\t" + "vfmadb %%v29,%%v22,%%v2,%%v29 \n\t" + "vfmadb %%v30,%%v19,%%v3,%%v30 \n\t" + "vfmadb %%v31,%%v23,%%v3,%%v31 \n\t" + + "vl %%v16, 64(%%r1,%1) \n\t" + "vl %%v17, 80(%%r1,%1) \n\t" + "vl %%v18, 96(%%r1,%1) \n\t" + "vl %%v19, 112(%%r1,%1) \n\t" + "vl %%v0, 64(%%r1,%2) \n\t" + "vl %%v1, 80(%%r1,%2) \n\t" + "vl %%v2, 96(%%r1,%2) \n\t" + "vl %%v3, 112(%%r1,%2) \n\t" + "vpdi %%v20,%%v16,%%v16,4 \n\t" + "vpdi %%v21,%%v17,%%v17,4 \n\t" + "vpdi %%v22,%%v18,%%v18,4 \n\t" + "vpdi %%v23,%%v19,%%v19,4 \n\t" + + "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" + "vfmadb %%v25,%%v20,%%v0,%%v25 \n\t" + "vfmadb %%v26,%%v17,%%v1,%%v26 \n\t" + "vfmadb %%v27,%%v21,%%v1,%%v27 \n\t" + "vfmadb %%v28,%%v18,%%v2,%%v28 \n\t" + "vfmadb %%v29,%%v22,%%v2,%%v29 \n\t" + "vfmadb %%v30,%%v19,%%v3,%%v30 \n\t" + "vfmadb %%v31,%%v23,%%v3,%%v31 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + "vfadb %%v24,%%v24,%%v26 \n\t" + "vfadb %%v24,%%v24,%%v28 \n\t" + "vfadb %%v24,%%v24,%%v30 \n\t" + "vfadb %%v25,%%v25,%%v27 \n\t" + "vfadb %%v25,%%v25,%%v29 \n\t" + "vfadb %%v25,%%v25,%%v31 \n\t" + "vsteg %%v24,0(%3),0 \n\t" + "vsteg %%v24,8(%3),1 \n\t" + "vsteg %%v25,16(%3),1 \n\t" + "vsteg %%v25,24(%3),0 " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d) + :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#endif - OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i = 0; - BLASLONG ix=0, iy=0; + BLASLONG i; + BLASLONG ix, iy; OPENBLAS_COMPLEX_FLOAT result; FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; @@ -167,14 +122,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA if ((inc_x == 1) && (inc_y == 1)) { BLASLONG n1 = n & -8; - BLASLONG j=0; - if (n1){ + if (n1) zdot_kernel_8(n1, x, y, dot); - i = n1; - j = n1 <<1; - } - + + i = 
n1; + BLASLONG j = i * 2; while (i < n) { diff --git a/kernel/zarch/zrot.c b/kernel/zarch/zrot.c index 380f0140e..75027a06c 100644 --- a/kernel/zarch/zrot.c +++ b/kernel/zarch/zrot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2013-2018, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,176 +27,166 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA) +static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { - __asm__ ( - "pfd 2, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "lgdr %%r1,%[cos] \n\t" - "vlvgp %%v0,%%r1,%%r1 \n\t" - "lgdr %%r1,%[sin] \n\t" - "vlvgp %%v1,%%r1,%%r1 \n\t" - "sllg %[tmp],%[tmp],4 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 48(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v27,112(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v19,112(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_x]) 
\n\t" - "vst %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 176(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 240(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v17, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v18, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v19, 240(%%r1,%[ptr_y]) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vst %%v20, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 240(%%r1,%[ptr_y]) \n\t" - - "la %%r1,256(%%r1) \n\t" - "clgrjl %%r1,%[tmp],1b \n\t" - : [mem_x] "+m" (*(double (*)[2*n])x), - [mem_y] "+m" (*(double (*)[2*n])y), - [tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA) - : "cc","r1" ,"v0","v1","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - + __asm__ ( + "vlrepg %%v0,%3 \n\t" + "vlrepg %%v1,%4 \n\t" + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + "vl %%v24, 0(%%r1,%1) \n\t" + "vl %%v25, 16(%%r1,%1) \n\t" + "vl %%v26, 32(%%r1,%1) \n\t" + "vl %%v27, 48(%%r1,%1) \n\t" + "vl %%v16, 0(%%r1,%2) \n\t" + "vl %%v17, 
16(%%r1,%2) \n\t" + "vl %%v18, 32(%%r1,%2) \n\t" + "vl %%v19, 48(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 0(%%r1,%1) \n\t" + "vst %%v29, 16(%%r1,%1) \n\t" + "vst %%v30, 32(%%r1,%1) \n\t" + "vst %%v31, 48(%%r1,%1) \n\t" + "vst %%v20, 0(%%r1,%2) \n\t" + "vst %%v21, 16(%%r1,%2) \n\t" + "vst %%v22, 32(%%r1,%2) \n\t" + "vst %%v23, 48(%%r1,%2) \n\t" + + "vl %%v24, 64(%%r1,%1) \n\t" + "vl %%v25, 80(%%r1,%1) \n\t" + "vl %%v26, 96(%%r1,%1) \n\t" + "vl %%v27, 112(%%r1,%1) \n\t" + "vl %%v16, 64(%%r1,%2) \n\t" + "vl %%v17, 80(%%r1,%2) \n\t" + "vl %%v18, 96(%%r1,%2) \n\t" + "vl %%v19, 112(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 64(%%r1,%1) \n\t" + "vst %%v29, 80(%%r1,%1) \n\t" + "vst %%v30, 96(%%r1,%1) \n\t" + "vst %%v31, 112(%%r1,%1) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v16, 128(%%r1,%2) \n\t" + "vl %%v17, 144(%%r1,%2) \n\t" + "vl %%v18, 160(%%r1,%2) \n\t" + "vl %%v19, 176(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 128(%%r1,%1) \n\t" + "vst %%v29, 144(%%r1,%1) \n\t" + "vst %%v30, 160(%%r1,%1) \n\t" + "vst %%v31, 176(%%r1,%1) \n\t" + "vst %%v20, 128(%%r1,%2) \n\t" + "vst %%v21, 144(%%r1,%2) \n\t" + "vst %%v22, 160(%%r1,%2) \n\t" + "vst %%v23, 176(%%r1,%2) \n\t" + + "vl %%v24, 192(%%r1,%1) \n\t" + "vl %%v25, 208(%%r1,%1) \n\t" + "vl 
%%v26, 224(%%r1,%1) \n\t" + "vl %%v27, 240(%%r1,%1) \n\t" + "vl %%v16, 192(%%r1,%2) \n\t" + "vl %%v17, 208(%%r1,%2) \n\t" + "vl %%v18, 224(%%r1,%2) \n\t" + "vl %%v19, 240(%%r1,%2) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 192(%%r1,%1) \n\t" + "vst %%v29, 208(%%r1,%1) \n\t" + "vst %%v30, 224(%%r1,%1) \n\t" + "vst %%v31, 240(%%r1,%1) \n\t" + "vst %%v20, 192(%%r1,%2) \n\t" + "vst %%v21, 208(%%r1,%2) \n\t" + "vst %%v22, 224(%%r1,%2) \n\t" + "vst %%v23, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) @@ -214,8 +204,11 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT BLASLONG n1 = n & -16; if ( n1 > 0 ) - { - zrot_kernel_16(n1, x, y, c, s); + { + FLOAT cosa,sina; + cosa=c; + sina=s; + zrot_kernel_16(n1, x, y, &cosa, &sina); i=n1; ix=2*n1; } @@ -234,6 +227,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } + } else { @@ -259,3 +253,4 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT } + diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index 4764c0a52..4d8ee960f 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -23,270 +23,211 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- *****************************************************************************/ +*****************************************************************************/ #include "common.h" - - -static void zscal_kernel_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x) { - BLASLONG tempR1 ; - __asm__ ( - "pfd 2, 0(%[x_tmp]) \n\t" -#if !defined(CONJ) - "lgdr %[t1],%[alpha_r] \n\t" - "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint - "lgdr %[t1],%[alpha_i] \n\t" - "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint - "vflcdb %%v29,%%v29 \n\t" //complement both - "vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i} - -#else - "lgdr %[t1],%[alpha_i] \n\t" - "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint - "lgdr %[t1],%[alpha_r] \n\t" - "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint - "vflcdb %%v28,%%v28 \n\t" //complement both - "vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r} -#endif - - "xgr %[t1],%[t1] \n\t" - "sllg %[tmp],%[tmp],4 \n\t" - "vl %%v20 , 0(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 16(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 32(%[t1],%[x_tmp]) \n\t" - "vl %%v23 , 48(%[t1],%[x_tmp]) \n\t" - - "lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition - "j 2f \n\t" - ".align 16 \n\t" - "1: \n\t" - - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - "vfmdb %%v16, %%v20, %%v28 \n\t" - "vfmdb %%v17, %%v21, %%v28 \n\t" - "vfmdb %%v18, %%v22, %%v28 \n\t" - "vfmdb %%v19, %%v23, %%v28 \n\t" - "vl %%v20, 64(%[t1],%[x_tmp]) \n\t" - "vl %%v21, 80(%[t1],%[x_tmp]) \n\t" - "vl %%v22, 96(%[t1],%[x_tmp]) \n\t" - "vl %%v23, 112(%[t1],%[x_tmp]) \n\t" - "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" - "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" - "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" - "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" - - - "vst %%v16 , 0(%[t1],%[x_tmp]) \n\t" - "vst %%v17 , 16(%[t1],%[x_tmp]) \n\t" - "vst %%v18 , 32(%[t1],%[x_tmp]) \n\t" - "vst %%v19 , 48(%[t1],%[x_tmp]) \n\t" - - "la %[t1],64(%[t1] ) \n\t" - "2: \n\t" - "pfd 2, 256(%[t1],%[x_tmp]) \n\t" - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - - "vfmdb %%v30, %%v20, %%v28 \n\t" - "vfmdb %%v31, %%v21, %%v28 \n\t" - "vfmdb %%v6, %%v22, %%v28 \n\t" - "vfmdb %%v7, %%v23, %%v28 \n\t" - - "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" - "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" - "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" - "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" - - "vfmadb %%v30, %%v24, %%v29, %%v30 \n\t" - "vfmadb %%v31, %%v25, %%v29, %%v31 \n\t" - "vfmadb %%v6, %%v26, %%v29, %%v6 \n\t" - "vfmadb %%v7, %%v27, %%v29, %%v7 \n\t" - - - "vst %%v30 , 0(%[t1],%[x_tmp]) \n\t" - "vst %%v31 , 16(%[t1],%[x_tmp]) \n\t" - "vst %%v6 , 32(%[t1],%[x_tmp]) \n\t" - "vst %%v7 , 48(%[t1],%[x_tmp]) \n\t" - - "la %[t1],64(%[t1] ) \n\t" - - - "clgrjl %[t1],%[tmp],1b \n\t" -//---------------------------------------------------------------------- - "vfmdb %%v16, %%v20, %%v28 \n\t" - "vfmdb %%v17, %%v21, %%v28 \n\t" - "vfmdb %%v18, %%v22, %%v28 \n\t" - "vfmdb %%v19, %%v23, %%v28 \n\t" - "vpdi %%v24 , %%v20, %%v20, 4 \n\t" - "vpdi %%v25 , %%v21, %%v21, 4 \n\t" - "vpdi %%v26 , %%v22, %%v22, 4 \n\t" - "vpdi %%v27 , %%v23, %%v23, 4 \n\t" - "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" - "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" - "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" - "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" - - "vst %%v16 , 
0(%[t1],%[x_tmp]) \n\t" - "vst %%v17 , 16(%[t1],%[x_tmp]) \n\t" - "vst %%v18 , 32(%[t1],%[x_tmp]) \n\t" - "vst %%v19 , 48(%[t1],%[x_tmp]) \n\t" - - : [mem_x] "+m" (*(double (*)[2*n])x),[tmp]"+&r"(n) , [t1] "=&a" (tempR1) - : [x_tmp] "a"(x), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i) - : "cc", "v6","v7", "v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - - +static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vlrepg %%v0,0(%1) \n\t" + "vleg %%v1,8(%1),0 \n\t" + "wflcdb %%v1,%%v1 \n\t" + "vleg %%v1,8(%1),1 \n\t" + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vpdi %%v24,%%v16,%%v16,4 \n\t" + "vpdi %%v25,%%v17,%%v17,4 \n\t" + "vpdi %%v26,%%v18,%%v18,4 \n\t" + "vpdi %%v27,%%v19,%%v19,4 \n\t" + "vpdi %%v28,%%v20,%%v20,4 \n\t" + "vpdi %%v29,%%v21,%%v21,4 \n\t" + "vpdi %%v30,%%v22,%%v22,4 \n\t" + "vpdi %%v31,%%v23,%%v23,4 \n\t" + + "vfmdb %%v16,%%v16,%%v0 \n\t" + "vfmdb %%v17,%%v17,%%v0 \n\t" + "vfmdb %%v18,%%v18,%%v0 \n\t" + "vfmdb %%v19,%%v19,%%v0 \n\t" + "vfmdb %%v20,%%v20,%%v0 \n\t" + "vfmdb %%v21,%%v21,%%v0 \n\t" + "vfmdb %%v22,%%v22,%%v0 \n\t" + "vfmdb %%v23,%%v23,%%v0 \n\t" + "vfmadb %%v16,%%v24,%%v1,%%v16 \n\t" + "vfmadb %%v17,%%v25,%%v1,%%v17 \n\t" + "vfmadb %%v18,%%v26,%%v1,%%v18 \n\t" + "vfmadb %%v19,%%v27,%%v1,%%v19 \n\t" + "vfmadb %%v20,%%v28,%%v1,%%v20 \n\t" + "vfmadb %%v21,%%v29,%%v1,%%v21 \n\t" + "vfmadb %%v22,%%v30,%%v1,%%v22 \n\t" + "vfmadb %%v23,%%v31,%%v1,%%v23 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT da_i, FLOAT *x) { - - __asm__ ( "pfd 2, 0(%1) \n\t" - "lgdr %%r0,%[alpha] \n\t" - "vlvgp %%v16,%%r0,%%r0 \n\t" //load both from disjoint - "vflcdb %%v16,%%v16 \n\t" //complement both - "vlvgg %%v16,%%r0,0 \n\t" //restore 1st - "vlr %%v17 ,%%v16 \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "vl %%v24, 0(%[x_ptr]) \n\t" - "vfmdb %%v24,%%v24,%%v16 \n\t" - "vsteg %%v24, 0(%[x_ptr]),1 \n\t" - "vsteg %%v24, 8(%[x_ptr]),0 \n\t" - "vl %%v25, 16(%[x_ptr]) \n\t" - "vfmdb %%v25,%%v25,%%v17 \n\t" - "vsteg %%v25, 16(%[x_ptr]),1 \n\t" - "vsteg %%v25, 24(%[x_ptr]),0 \n\t" - "vl %%v26, 32(%[x_ptr]) \n\t" - "vfmdb %%v26,%%v26,%%v16 \n\t" - "vsteg %%v26, 32(%[x_ptr]),1 \n\t" - "vsteg %%v26, 40(%[x_ptr]),0 \n\t" - "vl %%v27, 48(%[x_ptr]) \n\t" - "vfmdb %%v27,%%v27,%%v17 \n\t" - "vsteg %%v27, 48(%[x_ptr]),1 \n\t" - "vsteg %%v27, 56(%[x_ptr]),0 \n\t" - "vl %%v28, 64(%[x_ptr]) \n\t" - "vfmdb %%v28,%%v28,%%v16 \n\t" - "vsteg %%v28, 64(%[x_ptr]),1 \n\t" - "vsteg %%v28, 72(%[x_ptr]),0 \n\t" - "vl %%v29, 80(%[x_ptr]) \n\t" - "vfmdb %%v29,%%v29,%%v17 \n\t" - "vsteg %%v29, 80(%[x_ptr]),1 \n\t" - "vsteg %%v29, 88(%[x_ptr]),0 \n\t" - "vl %%v30, 96(%[x_ptr]) \n\t" - "vfmdb 
%%v30,%%v30,%%v16 \n\t" - "vsteg %%v30, 96(%[x_ptr]),1 \n\t" - "vsteg %%v30, 104(%[x_ptr]),0 \n\t" - "vl %%v31, 112(%[x_ptr]) \n\t" - "vfmdb %%v31,%%v31,%%v17 \n\t" - "vsteg %%v31, 112(%[x_ptr]),1 \n\t" - "vsteg %%v31, 120(%[x_ptr]),0 \n\t" - "la %[x_ptr],128(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x) - : [n] "r"(n),[alpha] "f"(da_i) - :"cc", "r0","f0", "f1","v16","v17" ,"v24","v25","v26","v27","v28","v29","v30","v31" - ); - - +static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vleg %%v0,8(%1),0 \n\t" + "wflcdb %%v0,%%v0 \n\t" + "vleg %%v0,8(%1),1 \n\t" + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vpdi %%v16,%%v16,%%v16,4 \n\t" + "vpdi %%v17,%%v17,%%v17,4 \n\t" + "vpdi %%v18,%%v18,%%v18,4 \n\t" + "vpdi %%v19,%%v19,%%v19,4 \n\t" + "vpdi %%v20,%%v20,%%v20,4 \n\t" + "vpdi %%v21,%%v21,%%v21,4 \n\t" + "vpdi %%v22,%%v22,%%v22,4 \n\t" + "vpdi %%v23,%%v23,%%v23,4 \n\t" + + "vfmdb %%v16,%%v16,%%v0 \n\t" + "vfmdb %%v17,%%v17,%%v0 \n\t" + "vfmdb %%v18,%%v18,%%v0 \n\t" + "vfmdb %%v19,%%v19,%%v0 \n\t" + "vfmdb %%v20,%%v20,%%v0 \n\t" + "vfmdb %%v21,%%v21,%%v0 \n\t" + "vfmdb %%v22,%%v22,%%v0 \n\t" + "vfmdb %%v23,%%v23,%%v0 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); } -static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT da_r, FLOAT *x) { - __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" - "lgdr %%r0,%[alpha] \n\t" - "vlvgp %%v18,%%r0,%%r0 \n\t" - "vlr %%v19,%%v18 \n\t" - "vlr %%v16,%%v18 \n\t" - "vlr %%v17,%%v18 \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "vl %%v24, 0(%[x_ptr]) \n\t" - "vfmdb %%v24,%%v24,%%v18 \n\t" - "vst %%v24, 0(%[x_ptr]) \n\t" - "vl %%v25, 16(%[x_ptr]) \n\t" - "vfmdb %%v25,%%v25,%%v19 \n\t" - "vst %%v25, 16(%[x_ptr]) \n\t" - "vl %%v26, 32(%[x_ptr]) \n\t" - "vfmdb %%v26,%%v26,%%v16 \n\t" - "vst %%v26, 32(%[x_ptr]) \n\t" - "vl %%v27, 48(%[x_ptr]) \n\t" - "vfmdb %%v27,%%v27,%%v17 \n\t" - "vst %%v27, 48(%[x_ptr]) \n\t" - "vl %%v28, 64(%[x_ptr]) \n\t" - "vfmdb %%v28,%%v28,%%v18 \n\t" - "vst %%v28, 64(%[x_ptr]) \n\t" - "vl %%v29, 80(%[x_ptr]) \n\t" - "vfmdb %%v29,%%v29,%%v19 \n\t" - "vst %%v29, 80(%[x_ptr]) \n\t" - "vl %%v30, 96(%[x_ptr]) \n\t" - "vfmdb %%v30,%%v30,%%v16 \n\t" - "vst %%v30, 96(%[x_ptr]) \n\t" - "vl %%v31,112(%[x_ptr]) \n\t" - "vfmdb %%v31,%%v31,%%v17 \n\t" - "vst %%v31,112(%[x_ptr]) \n\t" - "la %[x_ptr],128(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x) - : [n] "r"(n),[alpha] "f"(da_r) - : "cc", "r0","v16", "v17","v18","v19","v24","v25","v26","v27","v28","v29","v30","v31" - ); - +static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) +{ + __asm__ volatile( + "vlrepg %%v0,0(%1) \n\t" + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + 
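/* da_i == 0 path: a plain elementwise scale; the vlrepg above broadcasts da_r into both lanes of v0 */ +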
"vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfmdb %%v16,%%v16,%%v0 \n\t" + "vfmdb %%v17,%%v17,%%v0 \n\t" + "vfmdb %%v18,%%v18,%%v0 \n\t" + "vfmdb %%v19,%%v19,%%v0 \n\t" + "vfmdb %%v20,%%v20,%%v0 \n\t" + "vfmdb %%v21,%%v21,%%v0 \n\t" + "vfmdb %%v22,%%v22,%%v0 \n\t" + "vfmdb %%v23,%%v23,%%v0 \n\t" + + "vst %%v16,0(%%r1,%2) \n\t" + "vst %%v17,16(%%r1,%2) \n\t" + "vst %%v18,32(%%r1,%2) \n\t" + "vst %%v19,48(%%r1,%2) \n\t" + "vst %%v20,64(%%r1,%2) \n\t" + "vst %%v21,80(%%r1,%2) \n\t" + "vst %%v22,96(%%r1,%2) \n\t" + "vst %%v23,112(%%r1,%2) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); } -static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { - - __asm__ ( "pfd 2, 0(%[x_ptr]) \n\t" - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "sllg %%r0,%[n],4 \n\t" - "agr %%r0,%[x_ptr] \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256( %[x_ptr]) \n\t" - "vst %%v24, 0( %[x_ptr]) \n\t" - "vst %%v25, 16( %[x_ptr]) \n\t" - "vst %%v26, 32( %[x_ptr]) \n\t" - "vst %%v27, 48( %[x_ptr]) \n\t" - "vst %%v24, 64( %[x_ptr]) \n\t" - "vst %%v25, 80( %[x_ptr]) \n\t" - "vst %%v26, 96( %[x_ptr]) \n\t" - "vst %%v27,112( %[x_ptr]) \n\t" - - "la %[x_ptr],128(%[x_ptr]) \n\t" - "clgrjl %[x_ptr],%%r0,1b \n\t" - : [mem] "+m" (*(double (*)[2*n])x),[x_ptr] "+&a"(x) - : [n] "r"(n) - :"cc" ,"r0","v24","v25","v26","v27" - ); - +static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) +{ + __asm__ volatile( + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "srlg %%r0,%0,3 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + + "vst %%v24,0(%%r1,%1) \n\t" + "vst %%v25,16(%%r1,%1) \n\t" + "vst %%v26,32(%%r1,%1) \n\t" + "vst %%v27,48(%%r1,%1) \n\t" + "vst %%v24,64(%%r1,%1) \n\t" + "vst %%v25,80(%%r1,%1) \n\t" + "vst %%v26,96(%%r1,%1) \n\t" + "vst %%v27,112(%%r1,%1) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x) + :"memory","cc","r0","r1","v24","v25","v26","v27" + ); } - - - - -static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x) { - +static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) +{ BLASLONG i; BLASLONG inc_x2 = 2 * inc_x; BLASLONG inc_x3 = inc_x2 + inc_x; - FLOAT t0, t1, t2, t3; + FLOAT t0, t1, t2, t3; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; - for (i = 0; i < n; i += 4) { + for (i = 0; i < n; i += 4) + { t0 = da_r * x[0] - da_i * x[1]; t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; @@ -303,17 +244,14 @@ static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x, BLAS x[inc_x3] = t3; x += 4 * inc_x; - } - - } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0, j = 0; FLOAT temp0; FLOAT temp1; - + FLOAT alpha[2] __attribute__ ((aligned(16))); if (inc_x != 1) { inc_x <<= 1; @@ -405,8 +343,10 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, } else { BLASLONG n1 = n & -8; - if (n1 > 0) { - zscal_kernel_inc_8(n1, da_r,da_i, x, inc_x); + if (n1 > 0) { + 
alpha[0] = da_r; + alpha[1] = da_i; + zscal_kernel_inc_8(n1, alpha, x, inc_x); j = n1; i = n1 * inc_x; } @@ -432,17 +372,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, BLASLONG n1 = n & -8; if (n1 > 0) { + alpha[0] = da_r; + alpha[1] = da_i; if (da_r == 0.0) if (da_i == 0) zscal_kernel_8_zero(n1, x); else - zscal_kernel_8_zero_r(n1, da_i, x); + zscal_kernel_8_zero_r(n1, alpha, x); else if (da_i == 0) - zscal_kernel_8_zero_i(n1, da_r, x); + zscal_kernel_8_zero_i(n1, alpha, x); else - zscal_kernel_8(n1, da_r,da_i, x); + zscal_kernel_8(n1, alpha, x); i = n1 << 1; j = n1; @@ -508,5 +450,3 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, return (0); } - - diff --git a/kernel/zarch/zswap.c b/kernel/zarch/zswap.c index 062079002..a16b87cdc 100644 --- a/kernel/zarch/zswap.c +++ b/kernel/zarch/zswap.c @@ -25,220 +25,93 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - #include "common.h" - -#if defined(Z13_SWAP_A) -static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) +static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { - __asm__ volatile( - "pfd 1, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],4 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v16, 0(%%r1,%[ptr_x]) \n\t" - - "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 16(%%r1,%[ptr_x]) \n\t" - - "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 32(%%r1,%[ptr_x]) \n\t" - - "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 48(%%r1,%[ptr_x]) \n\t" - - "vl %%v28, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_x]) \n\t" - - "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_x]) \n\t" - - "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_x]) \n\t" - - "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_x]) \n\t" - - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v16, 128(%%r1,%[ptr_x]) \n\t" - - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 144(%%r1,%[ptr_x]) \n\t" - - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 160(%%r1,%[ptr_x]) \n\t" - - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 176(%%r1,%[ptr_x]) \n\t" - - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 192(%%r1,%[ptr_x]) \n\t" - - "vl %%v29, 
208(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 208(%%r1,%[ptr_x]) \n\t" - - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 224(%%r1,%[ptr_x]) \n\t" - - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 240(%%r1,%[ptr_x]) \n\t" - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[2*n])x), - [mem_y] "+m" (*(double (*)[2*n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23" - ,"v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - + __asm__ volatile( + "srlg %%r0,%0,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 2, 1024(%%r1,%1) \n\t" + "pfd 2, 1024(%%r1,%2) \n\t" + + "vl %%v16, 0(%%r1,%1) \n\t" + "vl %%v17, 16(%%r1,%1) \n\t" + "vl %%v18, 32(%%r1,%1) \n\t" + "vl %%v19, 48(%%r1,%1) \n\t" + "vl %%v20, 64(%%r1,%1) \n\t" + "vl %%v21, 80(%%r1,%1) \n\t" + "vl %%v22, 96(%%r1,%1) \n\t" + "vl %%v23, 112(%%r1,%1) \n\t" + "vl %%v24, 128(%%r1,%1) \n\t" + "vl %%v25, 144(%%r1,%1) \n\t" + "vl %%v26, 160(%%r1,%1) \n\t" + "vl %%v27, 176(%%r1,%1) \n\t" + "vl %%v28, 192(%%r1,%1) \n\t" + "vl %%v29, 208(%%r1,%1) \n\t" + "vl %%v30, 224(%%r1,%1) \n\t" + "vl %%v31, 240(%%r1,%1) \n\t" + + "vl %%v0, 0(%%r1,%2) \n\t" + "vl %%v1, 16(%%r1,%2) \n\t" + "vl %%v2, 32(%%r1,%2) \n\t" + "vl %%v3, 48(%%r1,%2) \n\t" + "vl %%v4, 64(%%r1,%2) \n\t" + "vl %%v5, 80(%%r1,%2) \n\t" + "vl %%v6, 96(%%r1,%2) \n\t" + "vl %%v7, 112(%%r1,%2) \n\t" + "vst %%v0, 0(%%r1,%1) \n\t" + "vst %%v1, 16(%%r1,%1) \n\t" + "vst %%v2, 32(%%r1,%1) \n\t" + "vst %%v3, 48(%%r1,%1) \n\t" + "vst %%v4, 64(%%r1,%1) \n\t" + "vst %%v5, 80(%%r1,%1) \n\t" + "vst %%v6, 96(%%r1,%1) \n\t" + "vst %%v7, 112(%%r1,%1) \n\t" + + "vl %%v0, 128(%%r1,%2) \n\t" + "vl %%v1, 144(%%r1,%2) \n\t" + "vl %%v2, 160(%%r1,%2) \n\t" + "vl %%v3, 176(%%r1,%2) \n\t" + "vl %%v4, 192(%%r1,%2) \n\t" + "vl %%v5, 208(%%r1,%2) \n\t" + "vl %%v6, 224(%%r1,%2) \n\t" + "vl %%v7, 240(%%r1,%2) \n\t" + "vst %%v0, 128(%%r1,%1) \n\t" + "vst %%v1, 144(%%r1,%1) \n\t" + "vst %%v2, 160(%%r1,%1) \n\t" + "vst %%v3, 176(%%r1,%1) \n\t" + "vst %%v4, 192(%%r1,%1) \n\t" + "vst %%v5, 208(%%r1,%1) \n\t" + "vst %%v6, 224(%%r1,%1) \n\t" + "vst %%v7, 240(%%r1,%1) \n\t" + + "vst %%v16, 0(%%r1,%2) \n\t" + "vst %%v17, 16(%%r1,%2) \n\t" + "vst %%v18, 32(%%r1,%2) \n\t" + "vst %%v19, 48(%%r1,%2) \n\t" + "vst %%v20, 64(%%r1,%2) \n\t" + "vst %%v21, 80(%%r1,%2) \n\t" + "vst %%v22, 96(%%r1,%2) \n\t" + "vst %%v23, 112(%%r1,%2) \n\t" + "vst %%v24, 128(%%r1,%2) \n\t" + "vst %%v25, 144(%%r1,%2) \n\t" + "vst %%v26, 160(%%r1,%2) \n\t" + "vst %%v27, 176(%%r1,%2) \n\t" + "vst %%v28, 192(%%r1,%2) \n\t" + "vst %%v29, 208(%%r1,%2) \n\t" + "vst %%v30, 224(%%r1,%2) \n\t" + "vst %%v31, 240(%%r1,%2) \n\t" + + "agfi %%r1,256 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else - -static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "pfd 2, 0(%[ptr_x]) \n\t" - "pfd 2, 0(%[ptr_y]) \n\t" - "srlg %[n_tmp],%[n_tmp],4 \n\t" - "xgr %%r1,%%r1 \n\t" - ".align 16 \n\t" - "1: \n\t" - "pfd 2, 256(%%r1,%[ptr_x]) \n\t" - "pfd 2, 256(%%r1,%[ptr_y]) \n\t" - - "vl %%v16, 0(%%r1,%[ptr_x]) 
\n\t" - "vl %%v17, 16(%%r1,%[ptr_x]) \n\t" - "vl %%v18, 32(%%r1,%[ptr_x]) \n\t" - "vl %%v19, 48(%%r1,%[ptr_x]) \n\t" - "vl %%v20, 64(%%r1,%[ptr_x]) \n\t" - "vl %%v21, 80(%%r1,%[ptr_x]) \n\t" - "vl %%v22, 96(%%r1,%[ptr_x]) \n\t" - "vl %%v23, 112(%%r1,%[ptr_x]) \n\t" - "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" - "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" - "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" - "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" - "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" - "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" - "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" - "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - - - "vl %%v0, 0(%%r1,%[ptr_y]) \n\t" - "vl %%v1, 16(%%r1,%[ptr_y]) \n\t" - "vl %%v2, 32(%%r1,%[ptr_y]) \n\t" - "vl %%v3, 48(%%r1,%[ptr_y]) \n\t" - "vl %%v4, 64(%%r1,%[ptr_y]) \n\t" - "vl %%v5, 80(%%r1,%[ptr_y]) \n\t" - "vl %%v6, 96(%%r1,%[ptr_y]) \n\t" - "vl %%v7, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v0, 0(%%r1,%[ptr_x]) \n\t" - "vst %%v1, 16(%%r1,%[ptr_x]) \n\t" - "vst %%v2, 32(%%r1,%[ptr_x]) \n\t" - "vst %%v3, 48(%%r1,%[ptr_x]) \n\t" - "vst %%v4, 64(%%r1,%[ptr_x]) \n\t" - "vst %%v5, 80(%%r1,%[ptr_x]) \n\t" - "vst %%v6, 96(%%r1,%[ptr_x]) \n\t" - "vst %%v7, 112(%%r1,%[ptr_x]) \n\t" - - "vl %%v0, 128(%%r1,%[ptr_y]) \n\t" - "vl %%v1, 144(%%r1,%[ptr_y]) \n\t" - "vl %%v2, 160(%%r1,%[ptr_y]) \n\t" - "vl %%v3, 176(%%r1,%[ptr_y]) \n\t" - "vl %%v4, 192(%%r1,%[ptr_y]) \n\t" - "vl %%v5, 208(%%r1,%[ptr_y]) \n\t" - "vl %%v6, 224(%%r1,%[ptr_y]) \n\t" - "vl %%v7, 240(%%r1,%[ptr_y]) \n\t" - "vst %%v0, 128(%%r1,%[ptr_x]) \n\t" - "vst %%v1, 144(%%r1,%[ptr_x]) \n\t" - "vst %%v2, 160(%%r1,%[ptr_x]) \n\t" - "vst %%v3, 176(%%r1,%[ptr_x]) \n\t" - "vst %%v4, 192(%%r1,%[ptr_x]) \n\t" - "vst %%v5, 208(%%r1,%[ptr_x]) \n\t" - "vst %%v6, 224(%%r1,%[ptr_x]) \n\t" - "vst %%v7, 240(%%r1,%[ptr_x]) \n\t" - - "vst %%v16, 0(%%r1,%[ptr_y]) \n\t" - "vst %%v17, 16(%%r1,%[ptr_y]) \n\t" - "vst %%v18, 32(%%r1,%[ptr_y]) \n\t" - "vst %%v19, 48(%%r1,%[ptr_y]) \n\t" - "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" - "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" - "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" - "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" - "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" - "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" - "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" - "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" - "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" - "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" - "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" - "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" - - - "la %%r1,256(%%r1) \n\t" - "brctg %[n_tmp],1b" - : [mem_x] "+m" (*(double (*)[2*n])x), - [mem_y] "+m" (*(double (*)[2*n])y), - [n_tmp] "+&r"(n) - : [ptr_x] "a"(x), [ptr_y] "a"(y) - : "cc", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16", - "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - return; - -} - -#endif - - - - - - int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i=0; diff --git a/ztest/Makefile b/ztest/Makefile new file mode 100644 index 000000000..0ff7fe46a --- /dev/null +++ b/ztest/Makefile @@ -0,0 +1,437 @@ +TOPDIR = .. 
+include $(TOPDIR)/Makefile.system + +goto :: sdot.goto ddot.goto cdot.goto zdot.goto dsdot.goto sswap.goto dswap.goto cswap.goto zswap.goto isamax.goto idamax.goto icamax.goto izamax.goto samax.goto damax.goto ismax.goto idmax.goto smax.goto dmax.goto isamin.goto idamin.goto icamin.goto izamin.goto samin.goto damin.goto camin.goto zamin.goto ismin.goto idmin.goto smin.goto dmin.goto sgemv.goto dgemv.goto cgemv.goto zgemv.goto sscal.goto dscal.goto cscal.goto zscal.goto saxpy.goto daxpy.goto caxpy.goto zaxpy.goto srot.goto drot.goto crot.goto zrot.goto sasum.goto dasum.goto casum.goto zasum.goto scopy.goto dcopy.goto ccopy.goto zcopy.goto + +##################################### Sdot #################################################### +sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Ddot #################################################### +ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Cdot #################################################### +cdot.goto : cdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zdot #################################################### +zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dsdot #################################################### +dsdot.goto : dsdot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISAMAX ############################################## +isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDAMAX ############################################## +idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ICAMAX ############################################## +icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IZAMAX ############################################## +izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SAMAX ############################################## +samax.goto : samax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DAMAX ############################################## +damax.goto : damax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISMAX ############################################## +ismax.goto : ismax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDMAX ############################################## +idmax.goto : idmax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SMAX 
############################################## +smax.goto : smax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DMAX ############################################## +dmax.goto : dmax.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISAMIN ############################################## +isamin.goto : isamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDAMIN ############################################## +idamin.goto : idamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ICAMIN ############################################## +icamin.goto : icamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IZAMIN ############################################## +izamin.goto : izamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SAMIN ############################################## +samin.goto : samin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DAMIN ############################################## +damin.goto : damin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## CAMIN ############################################## +camin.goto : camin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ZAMIN ############################################## +zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## ISMIN ############################################## +ismin.goto : ismin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## IDMIN ############################################## +idmin.goto : idmin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## SMIN ############################################## +smin.goto : smin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +############################################## DMIN ############################################## +dmin.goto : dmin.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Sgemv #################################################### +sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dgemv #################################################### +dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Cgemv #################################################### + 
+cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zgemv #################################################### + +zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Sscal #################################################### +sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dscal #################################################### +dscal.goto : dscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Cscal #################################################### + +cscal.goto : cscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zscal #################################################### + +zscal.goto : zscal.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Saxpy #################################################### +saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Daxpy #################################################### +daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Caxpy #################################################### + +caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zaxpy #################################################### + +zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Srot #################################################### +srot.goto : srot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Drot #################################################### +drot.goto : drot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Crot #################################################### +crot.goto : crot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Zrot #################################################### +zrot.goto : zrot.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Sswap #################################################### +sswap.goto : sswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Dswap #################################################### +dswap.goto : dswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm + +##################################### Cswap #################################################### + +cswap.goto : cswap.$(SUFFIX) ../$(LIBNAME) + $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) -lm
+
+##################################### Zswap ####################################################
+
+zswap.goto : zswap.$(SUFFIX) ../$(LIBNAME)
+	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+##################################### Sasum ####################################################
+sasum.goto : sasum.$(SUFFIX) ../$(LIBNAME)
+	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+##################################### Dasum ####################################################
+dasum.goto : dasum.$(SUFFIX) ../$(LIBNAME)
+	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+##################################### Casum ####################################################
+
+casum.goto : casum.$(SUFFIX) ../$(LIBNAME)
+	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+##################################### Zasum ####################################################
+
+zasum.goto : zasum.$(SUFFIX) ../$(LIBNAME)
+	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+##################################### Scopy ####################################################
+scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME)
+	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+##################################### Dcopy ####################################################
+dcopy.goto : dcopy.$(SUFFIX) ../$(LIBNAME)
+	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+##################################### Ccopy ####################################################
+
+ccopy.goto : ccopy.$(SUFFIX) ../$(LIBNAME)
+	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+##################################### Zcopy ####################################################
+
+zcopy.goto : zcopy.$(SUFFIX) ../$(LIBNAME)
+	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+
+###################################################################################################
+
+sdot.$(SUFFIX) : dot.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+ddot.$(SUFFIX) : dot.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+cdot.$(SUFFIX) : dot.c
+	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+
+zdot.$(SUFFIX) : dot.c
+	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+
+dsdot.$(SUFFIX) : dsdot.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+isamax.$(SUFFIX) : iamax.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+idamax.$(SUFFIX) : iamax.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+icamax.$(SUFFIX) : iamax.c
+	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+
+izamax.$(SUFFIX) : iamax.c
+	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+
+samax.$(SUFFIX) : amax.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+damax.$(SUFFIX) : amax.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+ismax.$(SUFFIX) : imax.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+idmax.$(SUFFIX) : imax.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+smax.$(SUFFIX) : max.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+dmax.$(SUFFIX) : max.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+isamin.$(SUFFIX) : iamin.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+idamin.$(SUFFIX) : iamin.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+icamin.$(SUFFIX) : iamin.c
+	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+
+izamin.$(SUFFIX) : iamin.c
+	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+
+samin.$(SUFFIX) : amin.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+damin.$(SUFFIX) : amin.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+camin.$(SUFFIX) : amin.c
+	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+
+zamin.$(SUFFIX) : amin.c
+	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+
+ismin.$(SUFFIX) : imin.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+idmin.$(SUFFIX) : imin.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+smin.$(SUFFIX) : min.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+dmin.$(SUFFIX) : min.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+sgemv.$(SUFFIX) : gemv.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+dgemv.$(SUFFIX) : gemv.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+cgemv.$(SUFFIX) : gemv.c
+	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+
+zgemv.$(SUFFIX) : gemv.c
+	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+
+sscal.$(SUFFIX) : scal.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+dscal.$(SUFFIX) : scal.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+cscal.$(SUFFIX) : scal.c
+	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+
+zscal.$(SUFFIX) : scal.c
+	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+
+saxpy.$(SUFFIX) : axpy.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+daxpy.$(SUFFIX) : axpy.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+caxpy.$(SUFFIX) : axpy.c
+	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+
+zaxpy.$(SUFFIX) : axpy.c
+	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+
+srot.$(SUFFIX) : rot.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+drot.$(SUFFIX) : rot.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+crot.$(SUFFIX) : rot.c
+	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+
+zrot.$(SUFFIX) : rot.c
+	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+
+sswap.$(SUFFIX) : swap.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+dswap.$(SUFFIX) : swap.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^
+
+cswap.$(SUFFIX) : swap.c
+	$(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^
+
+zswap.$(SUFFIX) : swap.c
+	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
+
+sasum.$(SUFFIX) : asum.c
+	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
+
+dasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +casum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zasum.$(SUFFIX) : asum.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +scopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ + +dcopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ + +ccopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ + +zcopy.$(SUFFIX) : copy.c + $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ + +clean :: + @rm -f *.goto + diff --git a/ztest/amax.c b/ztest/amax.c new file mode 100644 index 000000000..f2e3f5411 --- /dev/null +++ b/ztest/amax.c @@ -0,0 +1,235 @@ +/*************************************************************************** +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +FLOAT amax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + maxf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) > maxf ) + { + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(maxf); +} + +#undef AMAX +#ifdef DOUBLE +#define AMAX BLASFUNC(damax) +#else +#define AMAX BLASFUNC(samax) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* 
avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +FLOAT amin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + minf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) < minf ) + { + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(minf); +} + +#undef AMIN +#ifdef DOUBLE +#define AMIN BLASFUNC(damin) +#else +#define AMIN BLASFUNC(samin) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#ifdef COMPLEX +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) +FLOAT zasum_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + 
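+  /*
+   * Plain-C reference for complex asum: BLAS *asum accumulates
+   * |Re(x_k)| + |Im(x_k)| (the CABS1 macro above), not the complex
+   * modulus. The loop below walks the flat FLOAT array with a stride
+   * of 2*inc_x; n is pre-scaled by that stride so `i < n` is the
+   * termination test.
+   */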
BLASLONG i=0; + FLOAT sumf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(sumf); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + sumf += CABS1(x,i); + i += inc_x2; + } + return(sumf); +} +#else +FLOAT asum_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + n *= inc_x; + while(i < n) + { + sumf += ABS(x[i]); + i += inc_x; + } + return(sumf); +} +#endif + +#undef ASUM +#ifdef COMPLEX +#ifdef DOUBLE +#define ASUM BLASFUNC(dzasum) +#else +#define ASUM BLASFUNC(scasum) +#endif +#else +#ifdef DOUBLE +#define ASUM BLASFUNC(dasum) +#else +#define ASUM BLASFUNC(sasum) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zaxpy_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, 
BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix,iy; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + if ( da_r == 0.0 && da_i == 0.0 ) return(0); + + ix = 0; + iy = 0; + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { +#if !defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} +#else +int axpy_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix,iy; + + if ( n < 0 ) return(0); + if ( da == 0.0 ) return(0); + + ix = 0; + iy = 0; + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} +#endif + +#undef AXPY +#ifdef COMPLEX +#ifdef DOUBLE +#define AXPY BLASFUNC(zaxpy) +#else +#define AXPY BLASFUNC(caxpy) +#endif +#else +#ifdef DOUBLE +#define AXPY BLASFUNC(daxpy) +#else +#define AXPY BLASFUNC(saxpy) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y, *y_c;; + FLOAT alpha[2] = { 2.0, 2.0 }; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + argc--;argv++; + + blasint iy; + int test = 1; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of 
Memory!!\n");exit(1); + } + + if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zcopy_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + y[iy] = x[ix] ; + y[iy+1] = x[ix+1] ; + ix += inc_x2; + iy += inc_y2; + i++ ; + + } + return(0); + +} +#else +int copy_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n < 0 ) return(0); + + while(i < n) + { + + y[iy] = x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} +#endif + +#undef COPY +#ifdef COMPLEX +#ifdef DOUBLE +#define COPY BLASFUNC(zcopy) +#else +#define COPY BLASFUNC(ccopy) +#endif +#else +#ifdef DOUBLE +#define COPY BLASFUNC(dcopy) +#else +#define COPY BLASFUNC(scopy) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y, *y_c; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + blasint iy; + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) 
inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +OPENBLAS_COMPLEX_FLOAT zdot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT dot[2]; + OPENBLAS_COMPLEX_FLOAT result; + BLASLONG inc_x2; + BLASLONG inc_y2; + + dot[0]=0.0; + dot[1]=0.0; + + CREAL(result) = 0.0 ; + CIMAG(result) = 0.0 ; + + if ( n < 1 ) return(result); + + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + + while(i < n) + { +#if !defined(CONJ) + dot[0] += ( x[ix] * y[iy] - x[ix+1] * y[iy+1] ) ; + dot[1] += ( x[ix+1] * y[iy] + x[ix] * y[iy+1] ) ; +#else + dot[0] += ( x[ix] * y[iy] + x[ix+1] * y[iy+1] ) ; + dot[1] -= ( x[ix+1] * y[iy] - x[ix] * y[iy+1] ) ; +#endif + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + CREAL(result) = dot[0]; + CIMAG(result) = dot[1]; + return(result); + +} +#else +FLOAT dot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT dot = 0.0 ; + + if ( n < 0 ) return(dot); + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); +} +#endif + +#undef DOT +#ifdef COMPLEX +#ifdef DOUBLE +#define DOT BLASFUNC(zdotu) +#else +#define DOT BLASFUNC(cdotu) +#endif +#else +#ifdef DOUBLE +#define DOT BLASFUNC(ddot) +#else +#define DOT BLASFUNC(sdot) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, 
SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y; +#ifdef COMPLEX + OPENBLAS_COMPLEX_FLOAT result, result_c; +#else + FLOAT result, result_c; +#endif + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +double dsdot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + double dot = 0.0 ; + + if ( n < 0 ) return(dot); + + while(i < n) + { + + dot += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(dot); +} + +#undef DSDOT +#define DSDOT BLASFUNC(dsdot) + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + 
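+/*
+ * The main() below follows the same pattern as the other drivers in
+ * this directory: fill x and y with random data, time the optimized
+ * DSDOT call, evaluate the plain-C dsdot_c() reference, and compare
+ * the two with assert_dbl_near(). A minimal sketch of one timed
+ * iteration (variable names as declared in main below):
+ *
+ *   gettimeofday(&start, (struct timezone *)0);
+ *   result = DSDOT(&m, x, &inc_x, y, &inc_y);
+ *   gettimeofday(&stop, (struct timezone *)0);
+ *   time1 = (double)(stop.tv_sec - start.tv_sec)
+ *         + (double)(stop.tv_usec - start.tv_usec) * 1.e-6;
+ *   result_c = dsdot_c(m, x, inc_x, y, inc_y);
+ *   test &= assert_dbl_near(result_c, result, DOUBLE_EPS);
+ */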
+int main(int argc, char *argv[]){ + + FLOAT *x, *y; + double result, result_c; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zgemv_n_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp_r,temp_i; + BLASLONG inc_x2,inc_y2; + BLASLONG lda2; + BLASLONG i2; + + lda2 = 2*lda; + + ix = 0; + a_ptr = a; + + if ( inc_x == 1 && inc_y == 1 ) + { + + for (j=0; jtv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *a, *x, *y, *y_c; + FLOAT alpha[] = {1.0, 1.0}; + FLOAT beta [] = {1.0, 1.0}; + char trans='N'; + blasint m, i, j; + blasint inc_x=1,inc_y=1; + blasint n=0; + int has_param_n = 0; + int has_param_m = 0; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + blasint iy; + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + + int tomax = to; + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = 
getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; + if ((p = getenv("OPENBLAS_PARAM_N"))) { + n = atoi(p); + if ((n>0)) has_param_n = 1; + if ( n > tomax ) tomax = n; + } + if ( has_param_n == 0 ) + if ((p = getenv("OPENBLAS_PARAM_M"))) { + m = atoi(p); + if ((m>0)) has_param_m = 1; + if ( m > tomax ) tomax = m; + } + + + + fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); + + if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + if (has_param_m == 0) + { + + for(m = from; m <= to; m += step) + { + timeg=0; + timeg_c=0; + if ( has_param_n == 0 ) n = m; + fprintf(stderr, " %6dx%d :", (int)m,(int)n); + for(j = 0; j < m; j++){ + for(i = 0; i < n * COMPSIZE; i++){ + a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; + } + } + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#ifdef COMPLEX +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) +BLASLONG izamax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf; + BLASLONG max=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(max+1); +} +#else +BLASLONG iamax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) > maxf ) + { + max = i; + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(max+1); +} +#endif + +#undef IAMAX +#ifdef COMPLEX +#ifdef DOUBLE +#define IAMAX BLASFUNC(izamax) +#else +#define IAMAX BLASFUNC(icamax) +#endif +#else +#ifdef DOUBLE +#define IAMAX BLASFUNC(idamax) +#else +#define IAMAX BLASFUNC(isamax) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres 
-= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + BLASLONG result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif +#ifdef COMPLEX +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) +BLASLONG izamin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf; + BLASLONG min=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(min); + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(min+1); +} +#else +BLASLONG iamin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n <= 0 || inc_x <= 0) return(min); + + minf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) < minf ) + { + min = i; + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(min+1); +} +#endif + +#undef IAMIN +#ifdef COMPLEX +#ifdef DOUBLE +#define IAMIN BLASFUNC(izamin) +#else +#define IAMIN BLASFUNC(icamin) +#endif +#else +#ifdef DOUBLE +#define IAMIN BLASFUNC(idamin) +#else +#define IAMIN BLASFUNC(isamin) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + 
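+  /*
+   * Win32 gettimeofday() shim: FILETIME counts 100ns ticks since
+   * 1601-01-01, so dividing by 10 yields microseconds and subtracting
+   * DELTA_EPOCH_IN_MICROSECS (the 1601 -> 1970 offset) rebases the
+   * value onto the Unix epoch before it is split into tv_sec/tv_usec.
+   */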
FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + BLASLONG result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +BLASLONG imax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + max = i; + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(max+1); +} + +#undef IMAX +#ifdef DOUBLE +#define IMAX BLASFUNC(idmax) +#else +#define IMAX BLASFUNC(ismax) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = 
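/*
 * Unlike i?amax, the i?max family above compares signed values with no ABS,
 * so the two can disagree: for x = { -5, 2, -7 }, imax_c(3, x, 1) returns 2
 * (x[1] = 2 is the algebraic maximum), while iamax would return 3, since
 * |x[2]| = 7 is the largest magnitude.
 */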
(long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + BLASLONG result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +BLASLONG imin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n <= 0 || inc_x <= 0) return(min); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] < minf ) + { + min = i; + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(min+1); +} + +#undef IMIN +#ifdef DOUBLE +#define IMIN BLASFUNC(idmin) +#else +#define IMIN BLASFUNC(ismin) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) 
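/*
 * The request size is rounded up to a HUGE_PAGESIZE boundary by
 * (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1); with a 2 MiB page, a 3 MiB
 * request becomes 4 MiB. Note the enclosing "#if ... && 0" keeps this
 * huge-page path compiled out, so plain malloc is what actually runs.
 */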
< 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + BLASLONG result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +FLOAT max_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(maxf); +} + +#undef MAX_ +#ifdef DOUBLE +#define MAX_ BLASFUNC(dmax) +#else +#define MAX_ BLASFUNC(smax) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result, result_c; + blasint m, i; + blasint inc_x=1; + int 
loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +FLOAT min_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] < minf ) + { + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(minf); +} + +#undef MIN_ +#ifdef DOUBLE +#define MIN_ BLASFUNC(dmin) +#else +#define MIN_ BLASFUNC(smin) +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x; + FLOAT result, result_c; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = 
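/*
 * All of these drivers share one invocation contract: positional arguments
 * give the size sweep (from, to, step) and OPENBLAS_* variables the rest,
 * e.g. (binary name is illustrative):
 *
 *   OPENBLAS_LOOPS=10 OPENBLAS_INCX=2 ./min 100 2000 100
 *
 * times each size from 100 to 2000 in steps of 100, ten repetitions each,
 * on a stride-2 vector.
 */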
getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zrot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n <= 0 ) return(0); + + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); +} +#else +int rot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n <= 0 ) return(0); + + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); +} +#endif + +#undef ROT +#ifdef COMPLEX +#ifdef DOUBLE +#define ROT BLASFUNC(zdrot) +#else +#define ROT BLASFUNC(csrot) +#endif +#else +#ifdef DOUBLE +#define ROT BLASFUNC(drot) +#else +#define ROT BLASFUNC(srot) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y, *x_c, *y_c; + // FLOAT result; + blasint m, i; + blasint inc_x=1,inc_y=1; + FLOAT c[1] = { 2.0 }; + FLOAT s[1] = { 2.0 }; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + 
int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + blasint ix,iy; + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zscal_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG inc_x2; + BLASLONG ip = 0; + FLOAT temp; + + if ( (n <= 0) || (inc_x <= 0)) + return(0); + + inc_x2 = 2 * inc_x; + for ( i=0; itv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *x_c; + FLOAT alpha[2] = { 2.0, 2.0 }; + blasint m, i; + blasint inc_x=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + blasint ix; + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, 
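/*
 * The zscal_c reference loop (elided in this listing) is the usual in-place
 * complex scaling (a+bi)(c+di) = (ac-bd) + (ad+bc)i, which is why temp
 * buffers the real part before it is overwritten; per element:
 *
 *   temp     = da_r * x[ip]   - da_i * x[ip+1];
 *   x[ip+1]  = da_r * x[ip+1] + da_i * x[ip];
 *   x[ip]    = temp;
 *   ip      += inc_x2;
 */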
step,inc_x,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l +#include +#ifdef __CYGWIN32__ +#include +#endif +#include "common.h" + +#define SINGLE_EPS 1e-04 +#define DOUBLE_EPS 1e-13 + +int assert_dbl_near(double exp, double real, double tol) { + double diff = exp - real; + double absdiff = diff; + /* avoid using fabs and linking with a math lib */ + if(diff < 0) { + absdiff *= -1; + } + if (absdiff > tol) { + return 0; + } + return 1; +} + +#ifdef COMPLEX +int zswap_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); +} +#else +int swap_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n < 0 ) return(0); + + while(i < n) + { + + temp = x[ix] ; + x[ix] = y[iy] ; + y[iy] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); +} +#endif + +#undef SWAP +#ifdef COMPLEX +#ifdef DOUBLE +#define SWAP BLASFUNC(zswap) +#else +#define SWAP BLASFUNC(cswap) +#endif +#else +#ifdef DOUBLE +#define SWAP BLASFUNC(dswap) +#else +#define SWAP BLASFUNC(sswap) +#endif +#endif + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + +#define malloc huge_malloc + +#endif + +int main(int argc, char *argv[]){ + + FLOAT *x, *y, *x_c, *y_c; + blasint m, i; + blasint inc_x=1,inc_y=1; + int loops = 1; + int l; + char *p; + + int from = 1; + int to = 
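/*
 * swap, rot and scal modify their operands in place, so each driver keeps
 * shadow copies (x_c, y_c) for the C reference and compares element-wise
 * afterwards; a sketch of the comparison under the same naming:
 *
 *   for (i = 0; i < m * abs(inc_x) * COMPSIZE; i++)
 *       test = test && assert_dbl_near(x_c[i], x[i], DOUBLE_EPS);
 */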
200; + int step = 1; + + struct timeval start, stop; + double time1,timeg,timeg_c; + + blasint ix,iy; + int test = 1; + + argc--;argv++; + + if (argc > 0) { from = atol(*argv); argc--; argv++;} + if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} + if (argc > 0) { step = atol(*argv); argc--; argv++;} + + if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); + + if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + + if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ + fprintf(stderr,"Out of Memory!!\n");exit(1); + } + +#ifdef linux + srandom(getpid()); +#endif + + fprintf(stderr, " SIZE Flops Time CTime Test\n"); + + for(m = from; m <= to; m += step) + { + + timeg=0; + timeg_c=0; + + fprintf(stderr, " %6d :", (int)m); + + + for (l=0; l Date: Mon, 6 Aug 2018 20:03:49 +0300 Subject: [PATCH 002/133] [ZARCH] Restore detect() function --- cpuid_zarch.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/cpuid_zarch.c b/cpuid_zarch.c index 0ae32f27d..073419fa8 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -45,9 +45,29 @@ static char *cpuname_lower[] = { int detect(void) { - // return CPU_GENERIC; - return CPU_Z14; - + FILE *infile; + char buffer[512], *p; + + p = (char *)NULL; + infile = fopen("/proc/sysinfo", "r"); + while (fgets(buffer, sizeof(buffer), infile)){ + if (!strncmp("Type", buffer, 4)){ + p = strchr(buffer, ':') + 2; +#if 0 + fprintf(stderr, "%s\n", p); +#endif + break; + } + } + + fclose(infile); + + if (strstr(p, "2964")) return CPU_Z13; + if (strstr(p, "2965")) return CPU_Z13; + if (strstr(p, "3906")) return CPU_Z14; + if (strstr(p, "3907")) return CPU_Z14; + + return CPU_GENERIC; } void get_libname(void) From e6c0e39492d49eded5a72c9882b79bed7bff35d0 Mon Sep 17 00:00:00 2001 From: maamountki Date: Mon, 13 Aug 2018 12:23:40 +0300 Subject: [PATCH 003/133] Optimize Zgemv --- cpuid_zarch.c | 8 +- kernel/zarch/KERNEL.Z13 | 4 +- kernel/zarch/KERNEL.Z14 | 8 +- kernel/zarch/camax.c | 46 +- kernel/zarch/camin.c | 46 +- kernel/zarch/caxpy.c | 4 +- kernel/zarch/cgemv_n_4.c | 743 ++++++++++++++++++++ kernel/zarch/cgemv_t_4.c | 671 ++++++++++++++++++ kernel/zarch/icamax.c | 9 +- kernel/zarch/icamin.c | 9 +- kernel/zarch/idamax.c | 11 +- kernel/zarch/idamin.c | 11 +- kernel/zarch/idmax.c | 11 +- kernel/zarch/idmin.c | 11 +- kernel/zarch/isamax.c | 11 +- kernel/zarch/isamin.c | 11 +- kernel/zarch/ismax.c | 11 +- kernel/zarch/ismin.c | 11 +- kernel/zarch/izamax.c | 9 +- kernel/zarch/izamin.c | 9 +- kernel/zarch/zamax.c | 48 +- kernel/zarch/zamin.c | 46 +- kernel/zarch/zaxpy.c | 4 +- kernel/zarch/zgemv_n_4.c | 1401 ++++++++++++++++---------------------- kernel/zarch/zgemv_t_4.c | 1267 +++++++++++++++------------------- ztest/gemv.c | 159 +++-- 26 files changed, 2866 insertions(+), 1713 deletions(-) create mode 100644 kernel/zarch/cgemv_n_4.c create mode 100644 kernel/zarch/cgemv_t_4.c diff --git a/cpuid_zarch.c b/cpuid_zarch.c index 
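The restored detect() keys off the machine type that /proc/sysinfo reports in its "Type:" line: 2964 and 2965 are the z13 and z13s, 3906 and 3907 the z14 and z14 ZR1, and anything else falls back to CPU_GENERIC. Because the match is done with strstr() on the text after the colon, leading blanks in the sysinfo field are harmless.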
073419fa8..8ed40099b 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -27,9 +27,9 @@ #include -#define CPU_GENERIC 0 -#define CPU_Z13 1 -#define CPU_Z14 2 +#define CPU_GENERIC 0 +#define CPU_Z13 1 +#define CPU_Z14 2 static char *cpuname[] = { "ZARCH_GENERIC", @@ -112,7 +112,7 @@ void get_cpuconfig(void) printf("#define Z13\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); break; - case CPU_Z14: + case CPU_Z14: printf("#define Z14\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); break; diff --git a/kernel/zarch/KERNEL.Z13 b/kernel/zarch/KERNEL.Z13 index d39b9d904..e5b974ab4 100644 --- a/kernel/zarch/KERNEL.Z13 +++ b/kernel/zarch/KERNEL.Z13 @@ -74,12 +74,12 @@ ZSWAPKERNEL = zswap.c SGEMVNKERNEL = ../arm/gemv_n.c DGEMVNKERNEL = dgemv_n_4.c CGEMVNKERNEL = ../arm/zgemv_n.c -ZGEMVNKERNEL = ../arm/zgemv_n.c +ZGEMVNKERNEL = zgemv_n_4.c SGEMVTKERNEL = ../arm/gemv_t.c DGEMVTKERNEL = dgemv_t_4.c CGEMVTKERNEL = ../arm/zgemv_t.c -ZGEMVTKERNEL = ../arm/zgemv_t.c +ZGEMVTKERNEL = zgemv_t_4.c STRMMKERNEL = strmm8x4V.S DTRMMKERNEL = trmm8x4V.S diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14 index fa88b6881..80f78f48f 100644 --- a/kernel/zarch/KERNEL.Z14 +++ b/kernel/zarch/KERNEL.Z14 @@ -73,13 +73,13 @@ ZSWAPKERNEL = zswap.c SGEMVNKERNEL = sgemv_n_4.c DGEMVNKERNEL = dgemv_n_4.c -CGEMVNKERNEL = ../arm/zgemv_n.c -ZGEMVNKERNEL = ../arm/zgemv_n.c +CGEMVNKERNEL = cgemv_n_4.c +ZGEMVNKERNEL = zgemv_n_4.c SGEMVTKERNEL = sgemv_t_4.c DGEMVTKERNEL = dgemv_t_4.c -CGEMVTKERNEL = ../arm/zgemv_t.c -ZGEMVTKERNEL = ../arm/zgemv_t.c +CGEMVTKERNEL = cgemv_t_4.c +ZGEMVTKERNEL = zgemv_t_4.c STRMMKERNEL = strmm8x4V.S DTRMMKERNEL = trmm8x4V.S diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index 6394be769..3506c4e9b 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -198,7 +198,7 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; - BLASLONG j = 0; + BLASLONG ix = 0; FLOAT maxf = 0.0; BLASLONG inc_x2; @@ -216,53 +216,55 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { else { maxf=CABS1(x,0); + ix += 2; i++; } while (i < n) { - if (ABS(x[i*2]) > maxf) { - maxf = ABS(x[i*2]); + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); } + ix += 2; i++; } return (maxf); } else { - inc_x2 = 2 * inc_x; maxf=CABS1(x,0); - i += inc_x2; - j++; + inc_x2 = 2 * inc_x; + ix += inc_x2; + i++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while (i < n1) { - if (CABS1(x,i) > maxf) { - maxf = CABS1(x,i); + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); } - if (CABS1(x,i+inc_x2) > maxf) { - maxf = CABS1(x,i+inc_x2); + if (CABS1(x,ix+inc_x2) > maxf) { + maxf = CABS1(x,ix+inc_x2); } - if (CABS1(x,i+inc_x2*2) > maxf) { - maxf = CABS1(x,i+inc_x2*2); + if (CABS1(x,ix+inc_x2*2) > maxf) { + maxf = CABS1(x,ix+inc_x2*2); } - if (CABS1(x,i+inc_x2*3) > maxf) { - maxf = CABS1(x,i+inc_x2*3); + if (CABS1(x,ix+inc_x2*3) > maxf) { + maxf = CABS1(x,ix+inc_x2*3); } - i += inc_x2 * 4; + ix += inc_x2 * 4; - j += 4; + i += 4; } - while (j < n) { - if (CABS1(x,i) > maxf) { - maxf = CABS1(x,i); + while (i < n) { + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); } - i += inc_x2; - j++; + ix += inc_x2; + i++; } return (maxf); } diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index 936c300c8..726747b99 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -198,7 +198,7 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; - BLASLONG j = 0; + BLASLONG ix = 0; FLOAT minf = 0.0; 
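/*
 * Two things change in camax/camin (and in their double-precision twins
 * zamax/zamin below): the unit-stride tail after the vector kernel now
 * ranks elements by CABS1(x,ix) = |re| + |im| where the old code used
 * ABS(x[i*2]), i.e. the real component alone, so an element like (0, 9)
 * could never win; and the strided path renames its counters so that i
 * counts elements while ix carries the FLOAT offset, advancing by
 * inc_x2 = 2*inc_x per element.
 */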
BLASLONG inc_x2; @@ -216,53 +216,55 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { else { minf=CABS1(x,0); + ix += 2; i++; } while (i < n) { - if (ABS(x[i*2]) < minf) { - minf = ABS(x[i*2]); + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); } + ix += 2; i++; } return (minf); } else { - inc_x2 = 2 * inc_x; minf=CABS1(x,0); - i += inc_x2; - j++; + inc_x2 = 2 * inc_x; + ix += inc_x2; + i++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while (i < n1) { - if (CABS1(x,i) < minf) { - minf = CABS1(x,i); + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); } - if (CABS1(x,i+inc_x2) < minf) { - minf = CABS1(x,i+inc_x2); + if (CABS1(x,ix+inc_x2) < minf) { + minf = CABS1(x,ix+inc_x2); } - if (CABS1(x,i+inc_x2*2) < minf) { - minf = CABS1(x,i+inc_x2*2); + if (CABS1(x,ix+inc_x2*2) < minf) { + minf = CABS1(x,ix+inc_x2*2); } - if (CABS1(x,i+inc_x2*3) < minf) { - minf = CABS1(x,i+inc_x2*3); + if (CABS1(x,ix+inc_x2*3) < minf) { + minf = CABS1(x,ix+inc_x2*3); } - i += inc_x2 * 4; + ix += inc_x2 * 4; - j += 4; + i += 4; } - while (j < n) { - if (CABS1(x,i) < minf) { - minf = CABS1(x,i); + while (i < n) { + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); } - i += inc_x2; - j++; + ix += inc_x2; + i++; } return (minf); } diff --git a/kernel/zarch/caxpy.c b/kernel/zarch/caxpy.c index 2176f3dcd..fe5568cc8 100644 --- a/kernel/zarch/caxpy.c +++ b/kernel/zarch/caxpy.c @@ -110,7 +110,7 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "agfi %%r1,128 \n\t" "brctg %%r0,0b " : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"a"(alpha) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha) :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); } @@ -118,7 +118,7 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0; BLASLONG ix = 0, iy = 0; - FLOAT da[2]; + FLOAT da[2] __attribute__ ((aligned(16))); if (n <= 0) return (0); diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c new file mode 100644 index 000000000..4c3253774 --- /dev/null +++ b/kernel/zarch/cgemv_n_4.c @@ -0,0 +1,743 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include +#include +#include "common.h" + +#define NBMAX 1024 + +static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vlrepg %%v16,0(%5) \n\t" + "vlrepg %%v17,8(%5) \n\t" + "vlrepg %%v18,16(%5) \n\t" + "vlrepg %%v19,24(%5) \n\t" +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vlef %%v20,4(%5),0 \n\t" + "vlef %%v20,4(%5),2 \n\t" + "vflcsb %%v20,%%v20 \n\t" + "vlef %%v20,0(%5),1 \n\t" + "vlef %%v20,0(%5),3 \n\t" + + "vlef %%v21,12(%5),0 \n\t" + "vlef %%v21,12(%5),2 \n\t" + "vflcsb %%v21,%%v21 \n\t" + "vlef %%v21,8(%5),1 \n\t" + "vlef %%v21,8(%5),3 \n\t" + + "vlef %%v22,20(%5),0 \n\t" + "vlef %%v22,20(%5),2 \n\t" + "vflcsb %%v22,%%v22 \n\t" + "vlef %%v22,16(%5),1 \n\t" + "vlef %%v22,16(%5),3 \n\t" + + "vlef %%v23,28(%5),0 \n\t" + "vlef %%v23,28(%5),2 \n\t" + "vflcsb %%v23,%%v23 \n\t" + "vlef %%v23,24(%5),1 \n\t" + "vlef %%v23,24(%5),3 \n\t" +#else + "vlef %%v20,0(%5),1 \n\t" + "vlef %%v20,0(%5),3 \n\t" + "vflcsb %%v20,%%v20 \n\t" + "vlef %%v20,4(%5),0 \n\t" + "vlef %%v20,4(%5),2 \n\t" + + "vlef %%v21,8(%5),1 \n\t" + "vlef %%v21,8(%5),3 \n\t" + "vflcsb %%v21,%%v21 \n\t" + "vlef %%v21,12(%5),0 \n\t" + "vlef %%v21,12(%5),2 \n\t" + + "vlef %%v22,16(%5),1 \n\t" + "vlef %%v22,16(%5),3 \n\t" + "vflcsb %%v22,%%v22 \n\t" + "vlef %%v22,20(%5),0 \n\t" + "vlef %%v22,20(%5),2 \n\t" + + "vlef %%v23,24(%5),1 \n\t" + "vlef %%v23,24(%5),3 \n\t" + "vflcsb %%v23,%%v23 \n\t" + "vlef %%v23,28(%5),0 \n\t" + "vlef %%v23,28(%5),2 \n\t" +#endif + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%%r0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 2,1024(%%r1,%6) \n\t" + + "vlef %%v24,0(%%r1,%1),0 \n\t" + "vlef %%v24,0(%%r1,%1),1 \n\t" + "vlef %%v24,8(%%r1,%1),2 \n\t" + "vlef %%v24,8(%%r1,%1),3 \n\t" + "vlef %%v25,4(%%r1,%1),0 \n\t" + "vlef %%v25,4(%%r1,%1),1 \n\t" + "vlef %%v25,12(%%r1,%1),2 \n\t" + "vlef %%v25,12(%%r1,%1),3 \n\t" + "vlef %%v26,0(%%r1,%2),0 \n\t" + "vlef %%v26,0(%%r1,%2),1 \n\t" + "vlef %%v26,8(%%r1,%2),2 \n\t" + "vlef %%v26,8(%%r1,%2),3 \n\t" + "vlef %%v27,4(%%r1,%2),0 \n\t" + "vlef %%v27,4(%%r1,%2),1 \n\t" + "vlef %%v27,12(%%r1,%2),2 \n\t" + "vlef %%v27,12(%%r1,%2),3 \n\t" + + "vl %%v0,0(%%r1,%6) \n\t" + "vfmasb %%v0,%%v24,%%v16,%%v0 \n\t" + "vfmasb %%v0,%%v25,%%v20,%%v0 \n\t" + "vfmasb %%v0,%%v26,%%v17,%%v0 \n\t" + "vfmasb %%v0,%%v27,%%v21,%%v0 \n\t" + + "vlef %%v28,0(%%r1,%1),0 \n\t" + "vlef %%v28,0(%%r1,%1),1 \n\t" + "vlef %%v28,8(%%r1,%1),2 \n\t" + "vlef %%v28,8(%%r1,%1),3 \n\t" + "vlef %%v29,4(%%r1,%1),0 \n\t" + "vlef %%v29,4(%%r1,%1),1 \n\t" + "vlef %%v29,12(%%r1,%1),2 \n\t" + "vlef %%v29,12(%%r1,%1),3 \n\t" + "vlef %%v30,0(%%r1,%2),0 \n\t" + "vlef %%v30,0(%%r1,%2),1 \n\t" + "vlef %%v30,8(%%r1,%2),2 \n\t" + "vlef %%v30,8(%%r1,%2),3 \n\t" + "vlef %%v31,4(%%r1,%2),0 \n\t" + "vlef %%v31,4(%%r1,%2),1 \n\t" + "vlef %%v31,12(%%r1,%2),2 \n\t" + 
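/*
 * Complex multiply-accumulate as two real FMAs: for each x scalar the
 * setup above builds v16 = (xr, xi, xr, xi) with vlrepg and a companion
 * v20 = (-xi, xr, -xi, xr) via vlef element inserts plus a vflcsb sign
 * flip, while the column data is split into duplicated real parts (v24)
 * and duplicated imaginary parts (v25). The accumulation is then
 *
 *   y += ar*(xr, xi) + ai*(-xi, xr)
 *      = (ar*xr - ai*xi, ar*xi + ai*xr)
 *
 * which is exactly (ar + i*ai)*(xr + i*xi). The CONJ/XCONJ variants only
 * move the sign flip to the other component.
 */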
"vlef %%v31,12(%%r1,%2),3 \n\t" + + "vfmasb %%v0,%%v28,%%v18,%%v0 \n\t" + "vfmasb %%v0,%%v29,%%v22,%%v0 \n\t" + "vfmasb %%v0,%%v30,%%v19,%%v0 \n\t" + "vfmasb %%v0,%%v31,%%v23,%%v0 \n\t" + "vst %%v0,0(%%r1,%6) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,0b \n\t" + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZQ"((const FLOAT (*)[8])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vlrepg %%v16,0(%3) \n\t" + "vlrepg %%v17,8(%3) \n\t" +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vlef %%v18,4(%3),0 \n\t" + "vlef %%v18,4(%3),2 \n\t" + "vflcsb %%v18,%%v18 \n\t" + "vlef %%v18,0(%3),1 \n\t" + "vlef %%v18,0(%3),3 \n\t" + + "vlef %%v19,12(%3),0 \n\t" + "vlef %%v19,12(%3),2 \n\t" + "vflcsb %%v19,%%v19 \n\t" + "vlef %%v19,8(%3),1 \n\t" + "vlef %%v19,8(%3),3 \n\t" +#else + "vlef %%v18,0(%3),1 \n\t" + "vlef %%v18,0(%3),3 \n\t" + "vflcsb %%v18,%%v18 \n\t" + "vlef %%v18,4(%3),0 \n\t" + "vlef %%v18,4(%3),2 \n\t" + + "vlef %%v19,8(%3),1 \n\t" + "vlef %%v19,8(%3),3 \n\t" + "vflcsb %%v19,%%v19 \n\t" + "vlef %%v19,12(%3),0 \n\t" + "vlef %%v19,12(%3),2 \n\t" +#endif + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%%r0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%4) \n\t" + + "vlef %%v20,0(%%r1,%1),0 \n\t" + "vlef %%v20,0(%%r1,%1),1 \n\t" + "vlef %%v20,8(%%r1,%1),2 \n\t" + "vlef %%v20,8(%%r1,%1),3 \n\t" + "vlef %%v21,4(%%r1,%1),0 \n\t" + "vlef %%v21,4(%%r1,%1),1 \n\t" + "vlef %%v21,12(%%r1,%1),2 \n\t" + "vlef %%v21,12(%%r1,%1),3 \n\t" + "vlef %%v22,0(%%r1,%2),0 \n\t" + "vlef %%v22,0(%%r1,%2),1 \n\t" + "vlef %%v22,8(%%r1,%2),2 \n\t" + "vlef %%v22,8(%%r1,%2),3 \n\t" + "vlef %%v23,4(%%r1,%2),0 \n\t" + "vlef %%v23,4(%%r1,%2),1 \n\t" + "vlef %%v23,12(%%r1,%2),2 \n\t" + "vlef %%v23,12(%%r1,%2),3 \n\t" + + "vl %%v0,0(%%r1,%4) \n\t" + "vfmasb %%v0,%%v20,%%v16,%%v0 \n\t" + "vfmasb %%v0,%%v21,%%v18,%%v0 \n\t" + "vfmasb %%v0,%%v22,%%v17,%%v0 \n\t" + "vfmasb %%v0,%%v23,%%v19,%%v0 \n\t" + "vst %%v0,0(%%r1,%4) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,0b \n\t" + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZQ"((const FLOAT (*)[4])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); +} + +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vlrepg %%v16,0(%2) \n\t" +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + "vlef %%v17,4(%2),0 \n\t" + "vlef %%v17,4(%2),2 \n\t" + "vflcsb %%v17,%%v17 \n\t" + "vlef %%v17,0(%2),1 \n\t" + "vlef %%v17,0(%2),3 \n\t" +#else + "vlef %%v17,0(%2),1 \n\t" + "vlef %%v17,0(%2),3 \n\t" + "vflcsb %%v17,%%v17 \n\t" + "vlef %%v17,4(%2),0 \n\t" + "vlef %%v17,4(%2),2 \n\t" +#endif + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%%r0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vlef %%v18,0(%%r1,%1),0 \n\t" + "vlef %%v18,0(%%r1,%1),1 \n\t" + "vlef %%v18,8(%%r1,%1),2 \n\t" + "vlef %%v18,8(%%r1,%1),3 \n\t" + "vlef %%v19,4(%%r1,%1),0 \n\t" + "vlef %%v19,4(%%r1,%1),1 \n\t" + "vlef %%v19,12(%%r1,%1),2 \n\t" + "vlef %%v19,12(%%r1,%1),3 \n\t" + + "vl %%v0,0(%%r1,%3) \n\t" + "vfmasb %%v0,%%v18,%%v16,%%v0 
\n\t" + "vfmasb %%v0,%%v19,%%v17,%%v0 \n\t" + "vst %%v0,0(%%r1,%3) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,0b \n\t" + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZQ"((const FLOAT (*)[2])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19" + ); +} + +static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i) +{ + __asm__ volatile ( +#if !defined(XCONJ) + "vlrepf %%v0,%3 \n\t" + "vlef %%v1,%4,0 \n\t" + "vlef %%v1,%4,2 \n\t" + "vflcsb %%v1,%%v1 \n\t" + "vlef %%v1,%4,1 \n\t" + "vlef %%v1,%4,3 \n\t" +#else + "vlef %%v0,%3,1 \n\t" + "vlef %%v0,%3,3 \n\t" + "vflcsb %%v0,%%v0 \n\t" + "vlef %%v0,%3,0 \n\t" + "vlef %%v0,%3,2 \n\t" + "vlrepf %%v1,%4 \n\t" +#endif + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,2 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 2,1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,0(%%r1,%2) \n\t" + "vl %%v19,16(%%r1,%2) \n\t" + "verllg %%v20,%%v16,32 \n\t" + "verllg %%v21,%%v17,32 \n\t" + + "vfmasb %%v22,%%v16,%%v0,%%v18 \n\t" + "vfmasb %%v23,%%v17,%%v0,%%v19 \n\t" + + "vfmasb %%v22,%%v20,%%v1,%%v22 \n\t" + "vfmasb %%v23,%%v21,%%v1,%%v23 \n\t" + + "vst %%v22,0(%%r1,%2) \n\t" + "vst %%v23,16(%%r1,%2) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])src),"ZR"((FLOAT (*)[n * 2])dest),"m"(alpha_r),"m"(alpha_i) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23" + ); +} + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) +{ + BLASLONG i; + + if ( inc_dest != 2 ) + { + + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m - m3; + m2 = (m & (NBMAX-1)) - m3 ; + + alpha[0] = alpha_r; + alpha[1] = alpha_i; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + if ( inc_x != 2 ) + copy_x(NB,x_ptr,xbuffer,inc_x); + else + xbuffer = x_ptr; + + if ( inc_y == 2 ) + { + + for( i = 0; i < n1 ; i++) + { + cgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + y_ptr += 8; + + } + + if ( n2 & 2 ) + { + cgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); + a_ptr += lda * 2; + y_ptr += 4; + + } + + if ( n2 & 1 ) + { + cgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); + /* a_ptr += lda; + y_ptr += 2; */ + + } + + } + else + { + + for( i = 0; i < n1 ; i++) + { + memset(ybuffer,0,sizeof(ybuffer)); + cgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for( i = 0; i < n2 ; i++) + { + memset(ybuffer,0,sizeof(ybuffer)); + cgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + + + if ( m3 == 0 ) return(0); + + x_ptr = x; + j=0; + a_ptr = a; + y_ptr = y; + + if ( m3 == 3 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += 
inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while ( j < n) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; +#endif + +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#else + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return(0); + } + + + if ( m3 == 2 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT temp_r1 ; + FLOAT temp_i1 ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while ( j < ( n & -2 )) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j+=2; + } + + + while ( j < n) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + + return(0); + } + + + if ( m3 == 1 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT temp_r1 ; + FLOAT temp_i1 ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while ( j < 
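/*
 * The conjugation variants in these tails follow the standard complex
 * product: with a = a0 + i*a1 and x = x0 + i*x1,
 *
 *   plain:  temp_r = a0*x0 - a1*x1,  temp_i = a0*x1 + a1*x0
 *   conj:   temp_r = a0*x0 + a1*x1,  temp_i = a0*x1 - a1*x0   // conj(a)*x
 *
 * and XCONJ flips the corresponding signs when alpha is applied to y. The
 * loop below is unrolled by two (the n & -2 bound) to overlap the work of
 * adjacent columns.
 */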
( n & -2 )) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j+=2; + } + + while ( j < n) + { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; +#else + + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; +#endif + +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; +#else + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; +#endif + + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return(0); + } + + return(0); +} diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index e7f096e0d..9b4077c6b 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -281,6 +281,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; } + else + { + maxf = CABS1(x,0); + ix += 2; + i++; + } while(i < n) { @@ -296,9 +302,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - inc_x2 = 2 * inc_x; - maxf = CABS1(x,0); + inc_x2 = 2 * inc_x; ix += inc_x2; i++; diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index b9c1ccd9c..6e952a325 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -281,6 +281,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; } + else + { + minf = CABS1(x,0); + ix += 2; + i++; + } while(i < n) { @@ -296,9 +302,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - inc_x2 = 2 * inc_x; - minf = CABS1(x,0); + inc_x2 = 2 * inc_x; ix += inc_x2; i++; diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index aba880949..d1f135369 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -204,6 +204,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + maxf = ABS(x[0]); + i++; + } while (i < n) { if (ABS(x[i]) > maxf) { @@ -216,7 +221,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + maxf = ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (ABS(x[i]) > maxf) { diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 3213efa4d..679606a8f 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -204,6 +204,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + minf = ABS(x[0]); + i++; + } while (i < n) { if (ABS(x[i]) < minf) { @@ -216,7 +221,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + minf = ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (ABS(x[i]) < minf) { diff --git 
a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 26fff4eb0..5de41ac7b 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -180,6 +180,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + maxf = x[0]; + i++; + } while (i < n) { if (x[i] > maxf) { @@ -192,7 +197,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + maxf = x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (x[i] > maxf) { diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index 570b33a15..7fec111cf 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -180,6 +180,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + minf = x[0]; + i++; + } while (i < n) { if (x[i] < minf) { @@ -192,7 +197,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + minf = x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (x[i] < minf) { diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index 95a665b10..d2686c0cd 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -247,6 +247,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + maxf = ABS(x[0]); + i++; + } while (i < n) { if (ABS(x[i]) > maxf) { @@ -259,7 +264,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + maxf = ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (ABS(x[i]) > maxf) { diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 640fc02c9..768f31a8c 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -247,6 +247,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + minf = ABS(x[0]); + i++; + } while (i < n) { if (ABS(x[i]) < minf) { @@ -259,7 +264,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + minf = ABS(x[0]); + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (ABS(x[i]) < minf) { diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index 0eb350315..8fc32adf6 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -223,6 +223,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + maxf = x[0]; + i++; + } while (i < n) { if (x[i] > maxf) { @@ -235,7 +240,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + maxf = x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (x[i] > maxf) { diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index f050db8cb..415052810 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -223,6 +223,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i = n1; } + else + { + minf = x[0]; + i++; + } while (i < n) { if (x[i] < minf) { @@ -235,7 +240,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - BLASLONG n1 = n & -4; + minf = x[0]; + i += inc_x; + j++; + + BLASLONG n1 = (n - 1) & -4; while (j < n1) { if (x[i] < minf) { diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index bf5f621a7..541464b05 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -202,6 +202,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; } + else + { + maxf = CABS1(x,0); + ix += 2; + i++; + } while(i < n) { @@ -217,9 +223,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - inc_x2 = 2 * inc_x; - maxf = 
CABS1(x,0); + inc_x2 = 2 * inc_x; ix += inc_x2; i++; diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 3636e8fdf..4b5572b80 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -202,6 +202,12 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) i = n1; } + else + { + minf = CABS1(x,0); + ix += 2; + i++; + } while(i < n) { @@ -217,9 +223,8 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - inc_x2 = 2 * inc_x; - minf = CABS1(x,0); + inc_x2 = 2 * inc_x; ix += inc_x2; i++; diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index 6393b099b..937bc9753 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -150,7 +150,7 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; - BLASLONG j = 0; + BLASLONG ix = 0; FLOAT maxf = 0.0; BLASLONG inc_x2; @@ -168,53 +168,55 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { else { maxf=CABS1(x,0); + ix += 2; i++; } while (i < n) { - if (ABS(x[i*2]) > maxf) { - maxf = ABS(x[i*2]); + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); } + ix += 2; i++; } return (maxf); } else { - - inc_x2 = 2 * inc_x; + maxf=CABS1(x,0); - i += inc_x2; - j++; + inc_x2 = 2 * inc_x; + ix += inc_x2; + i++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while (i < n1) { - if (CABS1(x,i) > maxf) { - maxf = CABS1(x,i); + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); } - if (CABS1(x,i+inc_x2) > maxf) { - maxf = CABS1(x,i+inc_x2); + if (CABS1(x,ix+inc_x2) > maxf) { + maxf = CABS1(x,ix+inc_x2); } - if (CABS1(x,i+inc_x2*2) > maxf) { - maxf = CABS1(x,i+inc_x2*2); + if (CABS1(x,ix+inc_x2*2) > maxf) { + maxf = CABS1(x,ix+inc_x2*2); } - if (CABS1(x,i+inc_x2*3) > maxf) { - maxf = CABS1(x,i+inc_x2*3); + if (CABS1(x,ix+inc_x2*3) > maxf) { + maxf = CABS1(x,ix+inc_x2*3); } - i += inc_x2 * 4; + ix += inc_x2 * 4; - j += 4; + i += 4; } - while (j < n) { - if (CABS1(x,i) > maxf) { - maxf = CABS1(x,i); + while (i < n) { + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); } - i += inc_x2; - j++; + ix += inc_x2; + i++; } return (maxf); } diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index b15774bb9..8564edaf4 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -150,7 +150,7 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { BLASLONG i = 0; - BLASLONG j = 0; + BLASLONG ix = 0; FLOAT minf = 0.0; BLASLONG inc_x2; @@ -168,53 +168,55 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { else { minf=CABS1(x,0); + ix += 2; i++; } while (i < n) { - if (ABS(x[i*2]) < minf) { - minf = ABS(x[i*2]); + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); } + ix += 2; i++; } return (minf); } else { - inc_x2 = 2 * inc_x; minf=CABS1(x,0); - i += inc_x2; - j++; + inc_x2 = 2 * inc_x; + ix += inc_x2; + i++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while (i < n1) { - if (CABS1(x,i) < minf) { - minf = CABS1(x,i); + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); } - if (CABS1(x,i+inc_x2) < minf) { - minf = CABS1(x,i+inc_x2); + if (CABS1(x,ix+inc_x2) < minf) { + minf = CABS1(x,ix+inc_x2); } - if (CABS1(x,i+inc_x2*2) < minf) { - minf = CABS1(x,i+inc_x2*2); + if (CABS1(x,ix+inc_x2*2) < minf) { + minf = CABS1(x,ix+inc_x2*2); } - if (CABS1(x,i+inc_x2*3) < minf) { - minf = CABS1(x,i+inc_x2*3); + if (CABS1(x,ix+inc_x2*3) < minf) { + minf = CABS1(x,ix+inc_x2*3); } - i += inc_x2 * 4; + ix += inc_x2 * 4; - j += 4; + i += 4; } - while (j < n) { - if (CABS1(x,i) < minf) { - minf = CABS1(x,i); + while (i < n) { + if 
(CABS1(x,ix) < minf) { + minf = CABS1(x,ix); } - i += inc_x2; - j++; + ix += inc_x2; + i++; } return (minf); } diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index 6ba44a27c..f0e993d2f 100644 --- a/kernel/zarch/zaxpy.c +++ b/kernel/zarch/zaxpy.c @@ -106,7 +106,7 @@ static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "agfi %%r1,128 \n\t" "brctg %%r0,0b " : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"a"(alpha) + :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha) :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); } @@ -114,7 +114,7 @@ static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { BLASLONG i = 0; BLASLONG ix = 0, iy = 0; - FLOAT da[2]; + FLOAT da[2] __attribute__ ((aligned(16))); if (n <= 0) return (0); diff --git a/kernel/zarch/zgemv_n_4.c b/kernel/zarch/zgemv_n_4.c index 484db3073..9472b5d5a 100644 --- a/kernel/zarch/zgemv_n_4.c +++ b/kernel/zarch/zgemv_n_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2018, The OpenBLAS Project +Copyright (c) 2014, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,898 +23,693 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
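(Editor's sketch in C, not part of the patch: the rewritten zgemv kernels below replace the vector-intrinsic loops with z14 inline assembly. The pattern they encode is one complex fused multiply-accumulate per element: the matrix element's real and imaginary parts are broadcast with vlrepg, while x is held once as {re, im} and once as the sign-flipped swap {-im, re} built by vleg plus wflcdb, so y += a*x costs two vfmadb instructions. The helper name cmla and the plain no-conjugate sign convention are illustrative assumptions, not code from the patch.)

#include <stdio.h>

/* Hedged scalar model of one complex y += a*x in the no-conjugate case. */
static void cmla(const double a[2], const double x[2], double y[2])
{
    const double xs[2] = { -x[1], x[0] };   /* swapped, sign-flipped x (vleg + wflcdb) */
    y[0] += a[0] * x[0] + a[1] * xs[0];     /* vfmadb with broadcast re(a)             */
    y[1] += a[0] * x[1] + a[1] * xs[1];     /* vfmadb with broadcast im(a)             */
}

int main(void)
{
    double a[2] = { 1.0, 2.0 }, x[2] = { 3.0, 4.0 }, y[2] = { 0.0, 0.0 };
    cmla(a, x, y);                          /* (1+2i)*(3+4i) = -5+10i */
    printf("%g %g\n", y[0], y[1]);
    return 0;
}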
- *****************************************************************************/ +*****************************************************************************/ #include <stdlib.h> #include <stdio.h> #include "common.h" -#define HAVE_KERNEL_4x4_VEC 1 -#define HAVE_KERNEL_4x2_VEC 1 -#define HAVE_KERNEL_4x1_VEC 1 -#define HAVE_KERNEL_ADDY 1 - -#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) -#include <vecintrin.h> -#endif - -// #define NBMAX 1024 -#ifdef HAVE_KERNEL_4x4_VEC_ASM - -#elif HAVE_KERNEL_4x4_VEC - -static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - +static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vl %%v16,0(%5) \n\t" + "vl %%v17,16(%5) \n\t" + "vl %%v18,32(%5) \n\t" + "vl %%v19,48(%5) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register __vector double vx0_r = {x[0], x[0]}; - register __vector double vx0_i = {-x[1], x[1]}; - register __vector double vx1_r = {x[2], x[2]}; - register __vector double vx1_i = {-x[3], x[3]}; - register __vector double vx2_r = {x[4], x[4]}; - register __vector double vx2_i = {-x[5], x[5]}; - register __vector double vx3_r = {x[6], x[6]}; - register __vector double vx3_i = {-x[7], x[7]}; - + "vleg %%v20,8(%5),0 \n\t" + "wflcdb %%v20,%%v20 \n\t" + "vleg %%v20,0(%5),1 \n\t" + "vleg %%v21,24(%5),0 \n\t" + "wflcdb %%v21,%%v21 \n\t" + "vleg %%v21,16(%5),1 \n\t" + "vleg %%v22,40(%5),0 \n\t" + "wflcdb %%v22,%%v22 \n\t" + "vleg %%v22,32(%5),1 \n\t" + "vleg %%v23,56(%5),0 \n\t" + "wflcdb %%v23,%%v23 \n\t" + "vleg %%v23,48(%5),1 \n\t" #else - register __vector double vx0_r = {x[0], -x[0]}; - register __vector double vx0_i = {x[1], x[1]}; - register __vector double vx1_r = {x[2], -x[2]}; - register __vector double vx1_i = {x[3], x[3]}; - register __vector double vx2_r = {x[4], -x[4]}; - register __vector double vx2_i = {x[5], x[5]}; - register __vector double vx3_r = {x[6], -x[6]}; - register __vector double vx3_i = {x[7], x[7]}; + "vleg %%v20,0(%5),1 \n\t" + "vflcdb %%v20,%%v20 \n\t" + "vleg %%v20,8(%5),0 \n\t" + "vleg %%v21,16(%5),1 \n\t" + "vflcdb %%v21,%%v21 \n\t" + "vleg %%v21,24(%5),0 \n\t" + "vleg %%v22,32(%5),1 \n\t" + "vflcdb %%v22,%%v22 \n\t" + "vleg %%v22,40(%5),0 \n\t" + "vleg %%v23,48(%5),1 \n\t" + "vflcdb %%v23,%%v23 \n\t" + "vleg %%v23,56(%5),0 \n\t" #endif - - register __vector double *vy = (__vector double *) y; - register __vector double *vptr_a0 = (__vector double *) a0; - register __vector double *vptr_a1 = (__vector double *) a1; - register __vector double *vptr_a2 = (__vector double *) a2; - register __vector double *vptr_a3 = (__vector double *) a3; - - for (i = 0; i < n; i += 4) { - - register __vector double vy_0 = vy[i]; - register __vector double vy_1 = vy[i + 1]; - register __vector double vy_2 = vy[i + 2]; - register __vector double vy_3 = vy[i + 3]; - - register __vector double va0 = vptr_a0[i]; - register __vector double va0_1 = vptr_a0[i + 1]; - register __vector double va0_2 = vptr_a0[i + 2]; - register __vector double va0_3 = vptr_a0[i + 3]; - - register __vector double va1 = vptr_a1[i]; - register __vector double va1_1 = vptr_a1[i + 1]; - register __vector double va1_2 = vptr_a1[i + 2]; - register __vector double va1_3 = vptr_a1[i + 3]; - - register __vector double va2 = vptr_a2[i]; - register __vector double va2_1 = vptr_a2[i + 1]; - register __vector double va2_2 = vptr_a2[i + 
2]; - register __vector double va2_3 = vptr_a2[i + 3]; - - register __vector double va3 = vptr_a3[i]; - register __vector double va3_1 = vptr_a3[i + 1]; - register __vector double va3_2 = vptr_a3[i + 2]; - register __vector double va3_3 = vptr_a3[i + 3]; - - vy_0 += va0*vx0_r; - vy_1 += va0_1*vx0_r; - vy_2 += va0_2*vx0_r; - vy_3 += va0_3*vx0_r; - - vy_0 += va1*vx1_r; - vy_1 += va1_1*vx1_r; - vy_2 += va1_2*vx1_r; - vy_3 += va1_3*vx1_r; - - va0 = vec_permi(va0, va0, 2); - va0_1 = vec_permi(va0_1, va0_1, 2); - va0_2 = vec_permi(va0_2, va0_2, 2); - va0_3 = vec_permi(va0_3, va0_3, 2); - - vy_0 += va2*vx2_r; - vy_1 += va2_1*vx2_r; - vy_2 += va2_2*vx2_r; - vy_3 += va2_3*vx2_r; - - va1 = vec_permi(va1, va1, 2); - va1_1 = vec_permi(va1_1, va1_1, 2); - va1_2 = vec_permi(va1_2, va1_2, 2); - va1_3 = vec_permi(va1_3, va1_3, 2); - - vy_0 += va3*vx3_r; - vy_1 += va3_1*vx3_r; - vy_2 += va3_2*vx3_r; - vy_3 += va3_3*vx3_r; - - va2 = vec_permi(va2, va2, 2); - va2_1 = vec_permi(va2_1, va2_1, 2); - va2_2 = vec_permi(va2_2, va2_2, 2); - va2_3 = vec_permi(va2_3, va2_3, 2); - - vy_0 += va0*vx0_i; - vy_1 += va0_1*vx0_i; - vy_2 += va0_2*vx0_i; - vy_3 += va0_3*vx0_i; - - va3 = vec_permi(va3, va3, 2); - va3_1 = vec_permi(va3_1, va3_1, 2); - va3_2 = vec_permi(va3_2, va3_2, 2); - va3_3 = vec_permi(va3_3, va3_3, 2); - - vy_0 += va1*vx1_i; - vy_1 += va1_1*vx1_i; - vy_2 += va1_2*vx1_i; - vy_3 += va1_3*vx1_i; - - vy_0 += va2*vx2_i; - vy_1 += va2_1*vx2_i; - vy_2 += va2_2*vx2_i; - vy_3 += va2_3*vx2_i; - - vy_0 += va3*vx3_i; - vy_1 += va3_1*vx3_i; - vy_2 += va3_2*vx3_i; - vy_3 += va3_3*vx3_i; - - vy[i] = vy_0; - vy[i + 1] = vy_1; - vy[i + 2] = vy_2; - vy[i + 3] = vy_3; - - } + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 2,1024(%%r1,%6) \n\t" + + "vlrepg %%v24,0(%%r1,%1) \n\t" + "vlrepg %%v25,8(%%r1,%1) \n\t" + "vlrepg %%v26,0(%%r1,%2) \n\t" + "vlrepg %%v27,8(%%r1,%2) \n\t" + + "vl %%v0,0(%%r1,%6) \n\t" + "vfmadb %%v0,%%v24,%%v16,%%v0 \n\t" + "vfmadb %%v0,%%v25,%%v20,%%v0 \n\t" + "vfmadb %%v0,%%v26,%%v17,%%v0 \n\t" + "vfmadb %%v0,%%v27,%%v21,%%v0 \n\t" + + "vlrepg %%v28,0(%%r1,%3) \n\t" + "vlrepg %%v29,8(%%r1,%3) \n\t" + "vlrepg %%v30,0(%%r1,%4) \n\t" + "vlrepg %%v31,8(%%r1,%4) \n\t" + + "vfmadb %%v0,%%v28,%%v18,%%v0 \n\t" + "vfmadb %%v0,%%v29,%%v22,%%v0 \n\t" + "vfmadb %%v0,%%v30,%%v19,%%v0 \n\t" + "vfmadb %%v0,%%v31,%%v23,%%v0 \n\t" + "vst %%v0,0(%%r1,%6) \n\t" + + "vlrepg %%v24,16(%%r1,%1) \n\t" + "vlrepg %%v25,24(%%r1,%1) \n\t" + "vlrepg %%v26,16(%%r1,%2) \n\t" + "vlrepg %%v27,24(%%r1,%2) \n\t" + + "vl %%v0,16(%%r1,%6) \n\t" + "vfmadb %%v0,%%v24,%%v16,%%v0 \n\t" + "vfmadb %%v0,%%v25,%%v20,%%v0 \n\t" + "vfmadb %%v0,%%v26,%%v17,%%v0 \n\t" + "vfmadb %%v0,%%v27,%%v21,%%v0 \n\t" + + "vlrepg %%v28,16(%%r1,%3) \n\t" + "vlrepg %%v29,24(%%r1,%3) \n\t" + "vlrepg %%v30,16(%%r1,%4) \n\t" + "vlrepg %%v31,24(%%r1,%4) \n\t" + + "vfmadb %%v0,%%v28,%%v18,%%v0 \n\t" + "vfmadb %%v0,%%v29,%%v22,%%v0 \n\t" + "vfmadb %%v0,%%v30,%%v19,%%v0 \n\t" + "vfmadb %%v0,%%v31,%%v23,%%v0 \n\t" + "vst %%v0,16(%%r1,%6) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZQ"((const FLOAT (*)[8])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#else -static 
void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - - for (i = 0; i < 2 * n; i += 2) { +static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vl %%v16,0(%3) \n\t" + "vl %%v17,16(%3) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; - y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; - y[i] += a1[i] * x[2] - a1[i + 1] * x[3]; - y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2]; - y[i] += a2[i] * x[4] - a2[i + 1] * x[5]; - y[i + 1] += a2[i] * x[5] + a2[i + 1] * x[4]; - y[i] += a3[i] * x[6] - a3[i + 1] * x[7]; - y[i + 1] += a3[i] * x[7] + a3[i + 1] * x[6]; -#else - y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; - y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; - y[i] += a1[i] * x[2] + a1[i + 1] * x[3]; - y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2]; - y[i] += a2[i] * x[4] + a2[i + 1] * x[5]; - y[i + 1] += a2[i] * x[5] - a2[i + 1] * x[4]; - y[i] += a3[i] * x[6] + a3[i + 1] * x[7]; - y[i + 1] += a3[i] * x[7] - a3[i + 1] * x[6]; + "vleg %%v18,8(%3),0 \n\t" + "wflcdb %%v18,%%v18 \n\t" + "vleg %%v18,0(%3),1 \n\t" + "vleg %%v19,24(%3),0 \n\t" + "wflcdb %%v19,%%v19 \n\t" + "vleg %%v19,16(%3),1 \n\t" +#else + "vleg %%v18,0(%3),1 \n\t" + "vflcdb %%v18,%%v18 \n\t" + "vleg %%v18,8(%3),0 \n\t" + "vleg %%v19,16(%3),1 \n\t" + "vflcdb %%v19,%%v19 \n\t" + "vleg %%v19,24(%3),0 \n\t" #endif - } + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%4) \n\t" + + "vlrepg %%v20,0(%%r1,%1) \n\t" + "vlrepg %%v21,8(%%r1,%1) \n\t" + "vlrepg %%v22,0(%%r1,%2) \n\t" + "vlrepg %%v23,8(%%r1,%2) \n\t" + + "vl %%v0,0(%%r1,%4) \n\t" + "vfmadb %%v0,%%v20,%%v16,%%v0 \n\t" + "vfmadb %%v0,%%v21,%%v18,%%v0 \n\t" + "vfmadb %%v0,%%v22,%%v17,%%v0 \n\t" + "vfmadb %%v0,%%v23,%%v19,%%v0 \n\t" + "vst %%v0,0(%%r1,%4) \n\t" + + "vlrepg %%v20,16(%%r1,%1) \n\t" + "vlrepg %%v21,24(%%r1,%1) \n\t" + "vlrepg %%v22,16(%%r1,%2) \n\t" + "vlrepg %%v23,24(%%r1,%2) \n\t" + + "vl %%v0,16(%%r1,%4) \n\t" + "vfmadb %%v0,%%v20,%%v16,%%v0 \n\t" + "vfmadb %%v0,%%v21,%%v18,%%v0 \n\t" + "vfmadb %%v0,%%v22,%%v17,%%v0 \n\t" + "vfmadb %%v0,%%v23,%%v19,%%v0 \n\t" + "vst %%v0,16(%%r1,%4) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZQ"((const FLOAT (*)[4])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" + ); } -#endif - -#ifdef HAVE_KERNEL_4x2_VEC - -static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - - +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vl %%v16,0(%2) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register __vector double vx0_r = {x[0], x[0]}; - register __vector double vx0_i = {-x[1], x[1]}; - register __vector double vx1_r = {x[2], x[2]}; - register __vector double vx1_i = {-x[3], x[3]}; - + "vleg %%v17,8(%2),0 \n\t" + "wflcdb %%v17,%%v17 \n\t" + "vleg %%v17,0(%2),1 \n\t" #else - register __vector double vx0_r = {x[0], -x[0]}; - register __vector double vx0_i = {x[1], x[1]}; - register __vector double vx1_r = {x[2], -x[2]}; - register __vector double vx1_i = {x[3], x[3]}; + "vleg %%v17,0(%2),1 
\n\t" + "vflcdb %%v17,%%v17 \n\t" + "vleg %%v17,8(%2),0 \n\t" #endif - - - register __vector double *vy = (__vector double *) y; - register __vector double *vptr_a0 = (__vector double *) a0; - register __vector double *vptr_a1 = (__vector double *) a1; - - for (i = 0; i < n; i += 4) { - - register __vector double vy_0 = vy[i]; - register __vector double vy_1 = vy[i + 1]; - register __vector double vy_2 = vy[i + 2]; - register __vector double vy_3 = vy[i + 3]; - - register __vector double va0 = vptr_a0[i]; - register __vector double va0_1 = vptr_a0[i + 1]; - register __vector double va0_2 = vptr_a0[i + 2]; - register __vector double va0_3 = vptr_a0[i + 3]; - - register __vector double va1 = vptr_a1[i]; - register __vector double va1_1 = vptr_a1[i + 1]; - register __vector double va1_2 = vptr_a1[i + 2]; - register __vector double va1_3 = vptr_a1[i + 3]; - - vy_0 += va0*vx0_r; - vy_1 += va0_1*vx0_r; - vy_2 += va0_2*vx0_r; - vy_3 += va0_3*vx0_r; - - va0 = vec_permi(va0, va0, 2); - va0_1 = vec_permi(va0_1, va0_1, 2); - va0_2 = vec_permi(va0_2, va0_2, 2); - va0_3 = vec_permi(va0_3, va0_3, 2); - - vy_0 += va1*vx1_r; - vy_1 += va1_1*vx1_r; - vy_2 += va1_2*vx1_r; - vy_3 += va1_3*vx1_r; - - va1 = vec_permi(va1, va1, 2); - va1_1 = vec_permi(va1_1, va1_1, 2); - va1_2 = vec_permi(va1_2, va1_2, 2); - va1_3 = vec_permi(va1_3, va1_3, 2); - - vy_0 += va0*vx0_i; - vy_1 += va0_1*vx0_i; - vy_2 += va0_2*vx0_i; - vy_3 += va0_3*vx0_i; - - vy_0 += va1*vx1_i; - vy_1 += va1_1*vx1_i; - vy_2 += va1_2*vx1_i; - vy_3 += va1_3*vx1_i; - - vy[i] = vy_0; - vy[i + 1] = vy_1; - vy[i + 2] = vy_2; - vy[i + 3] = vy_3; - - } + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vlrepg %%v18,0(%%r1,%1) \n\t" + "vlrepg %%v19,8(%%r1,%1) \n\t" + + "vl %%v0,0(%%r1,%3) \n\t" + "vfmadb %%v0,%%v18,%%v16,%%v0 \n\t" + "vfmadb %%v0,%%v19,%%v17,%%v0 \n\t" + "vst %%v0,0(%%r1,%3) \n\t" + + "vlrepg %%v18,16(%%r1,%1) \n\t" + "vlrepg %%v19,24(%%r1,%1) \n\t" + + "vl %%v0,16(%%r1,%3) \n\t" + "vfmadb %%v0,%%v18,%%v16,%%v0 \n\t" + "vfmadb %%v0,%%v19,%%v17,%%v0 \n\t" + "vst %%v0,16(%%r1,%3) \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZQ"((const FLOAT (*)[2])x),"ZR"((FLOAT (*)[n * 2])y) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19" + ); } -#else - -static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - for (i = 0; i < 2 * n; i += 2) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; - y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; - y[i] += a1[i] * x[2] - a1[i + 1] * x[3]; - y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2]; -#else - y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; - y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; - y[i] += a1[i] * x[2] + a1[i + 1] * x[3]; - y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2]; +static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i) +{ + __asm__ volatile ( +#if !defined(XCONJ) + "vlrepg %%v0,%3 \n\t" + "vleg %%v1,%4,0 \n\t" + "wflcdb %%v1,%%v1 \n\t" + "vleg %%v1,%4,1 \n\t" +#else + "vleg %%v0,%3,1 \n\t" + "vflcdb %%v0,%%v0 \n\t" + "vleg %%v0,%3,0 \n\t" + "vlrepg %%v1,%4 \n\t" #endif - } + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,2 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 2,1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%1) \n\t" + "vl %%v17,16(%%r1,%1) \n\t" + "vl %%v18,32(%%r1,%1) \n\t" + "vl 
%%v19,48(%%r1,%1) \n\t" + "vl %%v20,0(%%r1,%2) \n\t" + "vl %%v21,16(%%r1,%2) \n\t" + "vl %%v22,32(%%r1,%2) \n\t" + "vl %%v23,48(%%r1,%2) \n\t" + "vpdi %%v24,%%v16,%%v16,4 \n\t" + "vpdi %%v25,%%v17,%%v17,4 \n\t" + "vpdi %%v26,%%v18,%%v18,4 \n\t" + "vpdi %%v27,%%v19,%%v19,4 \n\t" + + "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" + "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" + "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" + "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" + + "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" + "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" + "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" + "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" + + "vst %%v28,0(%%r1,%2) \n\t" + "vst %%v29,16(%%r1,%2) \n\t" + "vst %%v30,32(%%r1,%2) \n\t" + "vst %%v31,48(%%r1,%2) \n\t" + + "agfi %%r1,64 \n\t" + "brctg %%r0,0b " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])src),"ZR"((FLOAT (*)[n * 2])dest),"m"(alpha_r),"m"(alpha_i) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#endif - -#ifdef HAVE_KERNEL_4x1_VEC - -static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { - BLASLONG i; - FLOAT *a0; - a0 = ap; - - -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) +{ + BLASLONG i; - register __vector double vx0_r = {x[0], x[0]}; - register __vector double vx0_i = {-x[1], x[1]}; + if ( inc_dest != 2 ) + { + FLOAT temp_r; + FLOAT temp_i; + for ( i=0; i -#endif - -#ifdef HAVE_KERNEL_4x4_VEC_ASM - -#elif HAVE_KERNEL_4x4_VEC - -static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - //p for positive(real*real,image*image) r for image (real*image,image*real) - register __vector double vtemp0_p = {0.0, 0.0}; - register __vector double vtemp0_r = {0.0, 0.0}; - register __vector double vtemp1_p = {0.0, 0.0}; - register __vector double vtemp1_r = {0.0, 0.0}; - register __vector double vtemp2_p = {0.0, 0.0}; - register __vector double vtemp2_r = {0.0, 0.0}; - register __vector double vtemp3_p = {0.0, 0.0}; - register __vector double vtemp3_r = {0.0, 0.0}; - i = 0; - n = n << 1; - while (i < n) { -// __builtin_prefetch(&x[i]); -// __builtin_prefetch(&a0[i]); -// __builtin_prefetch(&a1[i]); -// __builtin_prefetch(&a2[i]); -// __builtin_prefetch(&a3[i]); - register __vector double vx_0 = *(__vector double*) (&x[i]); - register __vector double vx_1 = *(__vector double*) (&x[i + 2]); - register __vector double vx_2 = *(__vector double*) (&x[i + 4]); - register __vector double vx_3 = *(__vector double*) (&x[i + 6]); - - register __vector double va0 = *(__vector double*) (&a0[i]); - register __vector double va0_1 = *(__vector double*) (&a0[i + 2]); - register __vector double va0_2 = *(__vector double*) (&a0[i + 4]); - register __vector double va0_3 = *(__vector double*) (&a0[i + 6]); - - register __vector double va1 = *(__vector double*) (&a1[i]); - register __vector double va1_1 = *(__vector double*) (&a1[i + 2]); - register __vector double va1_2 = *(__vector double*) (&a1[i + 4]); - register __vector double va1_3 = *(__vector double*) (&a1[i + 6]); - - register __vector double va2 = *(__vector double*) (&a2[i]); - register __vector double va2_1 = *(__vector double*) (&a2[i + 2]); - register __vector double va2_2 = *(__vector double*) (&a2[i + 4]); - register __vector double 
va2_3 = *(__vector double*) (&a2[i + 6]); - - register __vector double va3 = *(__vector double*) (&a3[i]); - register __vector double va3_1 = *(__vector double*) (&a3[i + 2]); - register __vector double va3_2 = *(__vector double*) (&a3[i + 4]); - register __vector double va3_3 = *(__vector double*) (&a3[i + 6]); - - register __vector double vxr_0 = vec_permi(vx_0, vx_0, 2); - register __vector double vxr_1 = vec_permi(vx_1, vx_1, 2); - - i += 8; - - vtemp0_p += vx_0*va0; - vtemp0_r += vxr_0*va0; - - vtemp1_p += vx_0*va1; - vtemp1_r += vxr_0*va1; - - vtemp2_p += vx_0*va2; - vtemp2_r += vxr_0*va2; - - vtemp3_p += vx_0*va3; - vtemp3_r += vxr_0*va3; - - vtemp0_p += vx_1*va0_1; - vtemp0_r += vxr_1*va0_1; - - vtemp1_p += vx_1*va1_1; - vtemp1_r += vxr_1*va1_1; - vxr_0 = vec_permi(vx_2, vx_2, 2); - vtemp2_p += vx_1*va2_1; - vtemp2_r += vxr_1*va2_1; - - vtemp3_p += vx_1*va3_1; - vtemp3_r += vxr_1*va3_1; - - vtemp0_p += vx_2*va0_2; - vtemp0_r += vxr_0*va0_2; - vxr_1 = vec_permi(vx_3, vx_3, 2); - - vtemp1_p += vx_2*va1_2; - vtemp1_r += vxr_0*va1_2; - - vtemp2_p += vx_2*va2_2; - vtemp2_r += vxr_0*va2_2; - - vtemp3_p += vx_2*va3_2; - vtemp3_r += vxr_0*va3_2; - - vtemp0_p += vx_3*va0_3; - vtemp0_r += vxr_1*va0_3; - - vtemp1_p += vx_3*va1_3; - vtemp1_r += vxr_1*va1_3; - - vtemp2_p += vx_3*va2_3; - vtemp2_r += vxr_1*va2_3; - - vtemp3_p += vx_3*va3_3; - vtemp3_r += vxr_1*va3_3; - - } +static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vzero %%v16 \n\t" + "vzero %%v17 \n\t" + "vzero %%v18 \n\t" + "vzero %%v19 \n\t" + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 1,1024(%%r1,%5) \n\t" + + "vl %%v20,0(%%r1,%5) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1]; - - register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1]; - register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1]; - - register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1]; - register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1]; - - register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1]; - register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1]; - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1]; - - register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1]; - register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1]; - - register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1]; - register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1]; - - register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1]; - register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; - y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; - y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; - + "vleg %%v21,8(%%r1,%5),0 \n\t" + "wflcdb %%v21,%%v21 \n\t" + "vleg %%v21,0(%%r1,%5),1 \n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 + alpha_i 
* temp_i2; - y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; - y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; - + "vleg %%v21,0(%%r1,%5),1 \n\t" + "vflcdb %%v21,%%v21 \n\t" + "vleg %%v21,8(%%r1,%5),0 \n\t" #endif -} -#else - -static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - - FLOAT temp_r0 = 0.0; - FLOAT temp_r1 = 0.0; - FLOAT temp_r2 = 0.0; - FLOAT temp_r3 = 0.0; - FLOAT temp_i0 = 0.0; - FLOAT temp_i1 = 0.0; - FLOAT temp_i2 = 0.0; - FLOAT temp_i3 = 0.0; - - for (i = 0; i < 2 * n; i += 2) { + "vlrepg %%v24,0(%%r1,%1) \n\t" + "vlrepg %%v25,8(%%r1,%1) \n\t" + "vlrepg %%v26,0(%%r1,%2) \n\t" + "vlrepg %%v27,8(%%r1,%2) \n\t" + + "vfmadb %%v16,%%v24,%%v20,%%v16 \n\t" + "vfmadb %%v16,%%v25,%%v21,%%v16 \n\t" + "vfmadb %%v17,%%v26,%%v20,%%v17 \n\t" + "vfmadb %%v17,%%v27,%%v21,%%v17 \n\t" + + "vlrepg %%v28,0(%%r1,%3) \n\t" + "vlrepg %%v29,8(%%r1,%3) \n\t" + "vlrepg %%v30,0(%%r1,%4) \n\t" + "vlrepg %%v31,8(%%r1,%4) \n\t" + + "vfmadb %%v18,%%v28,%%v20,%%v18 \n\t" + "vfmadb %%v18,%%v29,%%v21,%%v18 \n\t" + "vfmadb %%v19,%%v30,%%v20,%%v19 \n\t" + "vfmadb %%v19,%%v31,%%v21,%%v19 \n\t" + + "vl %%v22,16(%%r1,%5) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i]; - temp_r1 += a1[i] * x[i] - a1[i + 1] * x[i + 1]; - temp_i1 += a1[i] * x[i + 1] + a1[i + 1] * x[i]; - temp_r2 += a2[i] * x[i] - a2[i + 1] * x[i + 1]; - temp_i2 += a2[i] * x[i + 1] + a2[i + 1] * x[i]; - temp_r3 += a3[i] * x[i] - a3[i + 1] * x[i + 1]; - temp_i3 += a3[i] * x[i + 1] + a3[i + 1] * x[i]; + "vleg %%v23,24(%%r1,%5),0 \n\t" + "wflcdb %%v23,%%v23 \n\t" + "vleg %%v23,16(%%r1,%5),1 \n\t" #else - temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i]; - temp_r1 += a1[i] * x[i] + a1[i + 1] * x[i + 1]; - temp_i1 += a1[i] * x[i + 1] - a1[i + 1] * x[i]; - temp_r2 += a2[i] * x[i] + a2[i + 1] * x[i + 1]; - temp_i2 += a2[i] * x[i + 1] - a2[i + 1] * x[i]; - temp_r3 += a3[i] * x[i] + a3[i + 1] * x[i + 1]; - temp_i3 += a3[i] * x[i + 1] - a3[i + 1] * x[i]; + "vleg %%v23,16(%%r1,%5),1 \n\t" + "vflcdb %%v23,%%v23 \n\t" + "vleg %%v23,24(%%r1,%5),0 \n\t" #endif - } + "vlrepg %%v24,16(%%r1,%1) \n\t" + "vlrepg %%v25,24(%%r1,%1) \n\t" + "vlrepg %%v26,16(%%r1,%2) \n\t" + "vlrepg %%v27,24(%%r1,%2) \n\t" + + "vfmadb %%v16,%%v24,%%v22,%%v16 \n\t" + "vfmadb %%v16,%%v25,%%v23,%%v16 \n\t" + "vfmadb %%v17,%%v26,%%v22,%%v17 \n\t" + "vfmadb %%v17,%%v27,%%v23,%%v17 \n\t" + + "vlrepg %%v28,16(%%r1,%3) \n\t" + "vlrepg %%v29,24(%%r1,%3) \n\t" + "vlrepg %%v30,16(%%r1,%4) \n\t" + "vlrepg %%v31,24(%%r1,%4) \n\t" + + "vfmadb %%v18,%%v28,%%v22,%%v18 \n\t" + "vfmadb %%v18,%%v29,%%v23,%%v18 \n\t" + "vfmadb %%v19,%%v30,%%v22,%%v19 \n\t" + "vfmadb %%v19,%%v31,%%v23,%%v19 \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,0b \n\t" + + "vpdi %%v20,%%v16,%%v16,4 \n\t" + "vpdi %%v21,%%v17,%%v17,4 \n\t" + "vpdi %%v22,%%v18,%%v18,4 \n\t" + "vpdi %%v23,%%v19,%%v19,4 \n\t" #if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; - y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; - y[6] += alpha_r * 
temp_r3 - alpha_i * temp_i3; - y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; - + "vlrepg %%v24,0(%7) \n\t" + "vleg %%v25,8(%7),0 \n\t" + "wflcdb %%v25,%%v25 \n\t" + "vleg %%v25,8(%7),1 \n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; - y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; - y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; - + "vleg %%v24,0(%7),1 \n\t" + "vflcdb %%v24,%%v24 \n\t" + "vleg %%v24,0(%7),0 \n\t" + "vlrepg %%v25,8(%7) \n\t" #endif + "vl %%v26,0(%6) \n\t" + "vl %%v27,16(%6) \n\t" + "vl %%v28,32(%6) \n\t" + "vl %%v29,48(%6) \n\t" + "vfmadb %%v26,%%v16,%%v24,%%v26 \n\t" + "vfmadb %%v26,%%v20,%%v25,%%v26 \n\t" + "vfmadb %%v27,%%v17,%%v24,%%v27 \n\t" + "vfmadb %%v27,%%v21,%%v25,%%v27 \n\t" + "vfmadb %%v28,%%v18,%%v24,%%v28 \n\t" + "vfmadb %%v28,%%v22,%%v25,%%v28 \n\t" + "vfmadb %%v29,%%v19,%%v24,%%v29 \n\t" + "vfmadb %%v29,%%v23,%%v25,%%v29 \n\t" + "vst %%v26,0(%6) \n\t" + "vst %%v27,16(%6) \n\t" + "vst %%v28,32(%6) \n\t" + "vst %%v29,48(%6) " + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[8])y),"ZQ"((const FLOAT (*)[2])alpha) + :"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); } -#endif - -#ifdef HAVE_KERNEL_4x2_VEC - -static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - //p for positive(real*real,image*image) r for image (real*image,image*real) - register __vector double vtemp0_p = {0.0, 0.0}; - register __vector double vtemp0_r = {0.0, 0.0}; - register __vector double vtemp1_p = {0.0, 0.0}; - register __vector double vtemp1_r = {0.0, 0.0}; - i = 0; - n = n << 1; - while (i < n) { - - register __vector double vx_0 = *(__vector double*) (&x[i]); - register __vector double vx_1 = *(__vector double*) (&x[i + 2]); - register __vector double vx_2 = *(__vector double*) (&x[i + 4]); - register __vector double vx_3 = *(__vector double*) (&x[i + 6]); - - register __vector double va0 = *(__vector double*) (&a0[i]); - register __vector double va0_1 = *(__vector double*) (&a0[i + 2]); - register __vector double va0_2 = *(__vector double*) (&a0[i + 4]); - register __vector double va0_3 = *(__vector double*) (&a0[i + 6]); - - register __vector double va1 = *(__vector double*) (&a1[i]); - register __vector double va1_1 = *(__vector double*) (&a1[i + 2]); - register __vector double va1_2 = *(__vector double*) (&a1[i + 4]); - register __vector double va1_3 = *(__vector double*) (&a1[i + 6]); - - register __vector double vxr_0 = vec_permi(vx_0, vx_0, 2); - register __vector double vxr_1 = vec_permi(vx_1, vx_1, 2); - - i += 8; - - vtemp0_p += vx_0*va0; - vtemp0_r += vxr_0*va0; - - vtemp1_p += vx_0*va1; - vtemp1_r += vxr_0*va1; - - vxr_0 = vec_permi(vx_2, vx_2, 2); - vtemp0_p += vx_1*va0_1; - vtemp0_r += vxr_1*va0_1; - - vtemp1_p += vx_1*va1_1; - vtemp1_r += vxr_1*va1_1; - vxr_1 = vec_permi(vx_3, vx_3, 2); - - vtemp0_p += vx_2*va0_2; - vtemp0_r += vxr_0*va0_2; - - vtemp1_p += vx_2*va1_2; - vtemp1_r += vxr_0*va1_2; - - vtemp0_p += vx_3*va0_3; - vtemp0_r += vxr_1*va0_3; - - vtemp1_p += 
vx_3*va1_3; - vtemp1_r += vxr_1*va1_3; - - } - +static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vzero %%v16 \n\t" + "vzero %%v17 \n\t" + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + + "vl %%v18,0(%%r1,%3) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1]; - - register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1]; - register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1]; - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1]; - - register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1]; - register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - + "vleg %%v19,8(%%r1,%3),0 \n\t" + "wflcdb %%v19,%%v19 \n\t" + "vleg %%v19,0(%%r1,%3),1 \n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - + "vleg %%v19,0(%%r1,%3),1 \n\t" + "vflcdb %%v19,%%v19 \n\t" + "vleg %%v19,8(%%r1,%3),0 \n\t" #endif -} - -#else - -static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - FLOAT temp_r0 = 0.0; - FLOAT temp_r1 = 0.0; - FLOAT temp_i0 = 0.0; - FLOAT temp_i1 = 0.0; - - for (i = 0; i < 2 * n; i += 2) { + "vlrepg %%v20,0(%%r1,%1) \n\t" + "vlrepg %%v21,8(%%r1,%1) \n\t" + "vlrepg %%v22,0(%%r1,%2) \n\t" + "vlrepg %%v23,8(%%r1,%2) \n\t" + + "vfmadb %%v16,%%v20,%%v18,%%v16 \n\t" + "vfmadb %%v16,%%v21,%%v19,%%v16 \n\t" + "vfmadb %%v17,%%v22,%%v18,%%v17 \n\t" + "vfmadb %%v17,%%v23,%%v19,%%v17 \n\t" + + "vl %%v18,16(%%r1,%3) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i]; - temp_r1 += a1[i] * x[i] - a1[i + 1] * x[i + 1]; - temp_i1 += a1[i] * x[i + 1] + a1[i + 1] * x[i]; + "vleg %%v19,24(%%r1,%3),0 \n\t" + "wflcdb %%v19,%%v19 \n\t" + "vleg %%v19,16(%%r1,%3),1 \n\t" #else - temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i]; - temp_r1 += a1[i] * x[i] + a1[i + 1] * x[i + 1]; - temp_i1 += a1[i] * x[i + 1] - a1[i + 1] * x[i]; + "vleg %%v19,16(%%r1,%3),1 \n\t" + "vflcdb %%v19,%%v19 \n\t" + "vleg %%v19,24(%%r1,%3),0 \n\t" #endif - } + "vlrepg %%v20,16(%%r1,%1) \n\t" + "vlrepg %%v21,24(%%r1,%1) \n\t" + "vlrepg %%v22,16(%%r1,%2) \n\t" + "vlrepg %%v23,24(%%r1,%2) \n\t" + + "vfmadb %%v16,%%v20,%%v18,%%v16 \n\t" + "vfmadb %%v16,%%v21,%%v19,%%v16 \n\t" + "vfmadb %%v17,%%v22,%%v18,%%v17 \n\t" + "vfmadb %%v17,%%v23,%%v19,%%v17 \n\t" + + "agfi %%r1,32 \n\t" + "brctg %%r0,0b \n\t" + + "vpdi %%v18,%%v16,%%v16,4 \n\t" + "vpdi %%v19,%%v17,%%v17,4 \n\t" #if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - + "vlrepg %%v20,0(%5) \n\t" + "vleg 
%%v21,8(%5),0 \n\t" + "wflcdb %%v21,%%v21 \n\t" + "vleg %%v21,8(%5),1 \n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - + "vleg %%v20,0(%5),1 \n\t" + "vflcdb %%v20,%%v20 \n\t" + "vleg %%v20,0(%5),0 \n\t" + "vlrepg %%v21,8(%5) \n\t" #endif + "vl %%v22,0(%4) \n\t" + "vl %%v23,16(%4) \n\t" + "vfmadb %%v22,%%v16,%%v20,%%v22 \n\t" + "vfmadb %%v22,%%v18,%%v21,%%v22 \n\t" + "vfmadb %%v23,%%v17,%%v20,%%v23 \n\t" + "vfmadb %%v23,%%v19,%%v21,%%v23 \n\t" + "vst %%v22,0(%4) \n\t" + "vst %%v23,16(%4) \n\t" + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[4])y),"ZQ"((const FLOAT (*)[2])alpha) + :"memory","cc","r0","r1","v16","v17","v18","v19","v20","v21","v22","v23" + ); } -#endif - -#ifdef HAVE_KERNEL_4x1_VEC - -static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0 ; - a0 = ap; - //p for positive(real*real,image*image) r for image (real*image,image*real) - register __vector double vtemp0_p = {0.0, 0.0}; - register __vector double vtemp0_r = {0.0, 0.0}; - i = 0; - n = n << 1; - while (i < n) { - - register __vector double vx_0 = *(__vector double*) (&x[i]); - register __vector double vx_1 = *(__vector double*) (&x[i + 2]); - register __vector double vx_2 = *(__vector double*) (&x[i + 4]); - register __vector double vx_3 = *(__vector double*) (&x[i + 6]); - - register __vector double va0 = *(__vector double*) (&a0[i]); - register __vector double va0_1 = *(__vector double*) (&a0[i + 2]); - register __vector double va0_2 = *(__vector double*) (&a0[i + 4]); - register __vector double va0_3 = *(__vector double*) (&a0[i + 6]); - - register __vector double vxr_0 = vec_permi(vx_0, vx_0, 2); - register __vector double vxr_1 = vec_permi(vx_1, vx_1, 2); - - i += 8; - - vtemp0_p += vx_0*va0; - vtemp0_r += vxr_0*va0; - - vxr_0 = vec_permi(vx_2, vx_2, 2); - vtemp0_p += vx_1*va0_1; - vtemp0_r += vxr_1*va0_1; - - vxr_1 = vec_permi(vx_3, vx_3, 2); - - vtemp0_p += vx_2*va0_2; - vtemp0_r += vxr_0*va0_2; - - vtemp0_p += vx_3*va0_3; - vtemp0_r += vxr_1*va0_3; - - } - +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) +{ + __asm__ volatile ( + "vzero %%v16 \n\t" + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + + "vl %%v17,0(%%r1,%2) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1]; - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - + "vleg %%v18,8(%%r1,%2),0 \n\t" + "wflcdb %%v18,%%v18 \n\t" + "vleg %%v18,0(%%r1,%2),1 \n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + "vleg %%v18,0(%%r1,%2),1 \n\t" + "vflcdb %%v18,%%v18 \n\t" + "vleg %%v18,8(%%r1,%2),0 \n\t" #endif -} - -#else - -static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0; - a0 = ap; + "vlrepg %%v19,0(%%r1,%1) \n\t" + "vlrepg %%v20,8(%%r1,%1) \n\t" + + "vfmadb 
%%v16,%%v19,%%v17,%%v16 \n\t" + "vfmadb %%v16,%%v20,%%v18,%%v16 \n\t" - FLOAT temp_r0 = 0.0; - FLOAT temp_i0 = 0.0; - - for (i = 0; i < 2 * n; i += 2) { + "vl %%v17,16(%%r1,%2) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i]; + "vleg %%v18,24(%%r1,%2),0 \n\t" + "wflcdb %%v18,%%v18 \n\t" + "vleg %%v18,16(%%r1,%2),1 \n\t" #else - temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1]; - temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i]; + "vleg %%v18,16(%%r1,%2),1 \n\t" + "vflcdb %%v18,%%v18 \n\t" + "vleg %%v18,24(%%r1,%2),0 \n\t" #endif - } -#if !defined(XCONJ) + "vlrepg %%v19,16(%%r1,%1) \n\t" + "vlrepg %%v20,24(%%r1,%1) \n\t" + + "vfmadb %%v16,%%v19,%%v17,%%v16 \n\t" + "vfmadb %%v16,%%v20,%%v18,%%v16 \n\t" - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + "agfi %%r1,32 \n\t" + "brctg %%r0,0b \n\t" + "vpdi %%v17,%%v16,%%v16,4 \n\t" #if !defined(XCONJ) + "vlrepg %%v18,0(%4) \n\t" + "vleg %%v19,8(%4),0 \n\t" + "wflcdb %%v19,%%v19 \n\t" + "vleg %%v19,8(%4),1 \n\t" #else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - + "vleg %%v18,0(%4),1 \n\t" + "vflcdb %%v18,%%v18 \n\t" + "vleg %%v18,0(%4),0 \n\t" + "vlrepg %%v19,8(%4) \n\t" #endif - + "vl %%v20,0(%3) \n\t" + "vfmadb %%v20,%%v16,%%v18,%%v20 \n\t" + "vfmadb %%v20,%%v17,%%v19,%%v20 \n\t" + "vst %%v20,0(%3) \n\t" + : + :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZR"((const FLOAT (*)[n * 2])x),"ZQ"((FLOAT (*)[2])y),"ZQ"((const FLOAT (*)[2])alpha) + :"memory","cc","r0","r1","v16","v17","v18","v19","v20" + ); } -#endif - -static __attribute__((always_inline)) void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest = *src; - *(dest + 1) = *(src + 1); - dest += 2; - src += inc_src; - } +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i<n; i++ ) + { + *dest = *src; + *(dest+1) = *(src+1); + dest += 2; + src += inc_src; + } +} [...] - n1 = n >> 2; - n2 = n & 3; - - m3 = m & 3; - m1 = m - m3; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[8]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4; + FLOAT ybuffer[8],*xbuffer; + FLOAT alpha[2]; + + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + + inc_x <<= 1; + inc_y <<= 1; + lda <<= 1; + lda4 = lda << 2; + + xbuffer = buffer; + + n1 = n >> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m - m3; + m2 = (m & (NBMAX-1)) - m3 ; + + alpha[0] = alpha_r; + alpha[1] = alpha_i; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + if ( inc_x != 2 ) + copy_x(NB,x_ptr,xbuffer,inc_x); + else + xbuffer = x_ptr; + + if ( inc_y == 2 ) + { + + for( i = 0; i < n1 ; i++) + { + zgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + y_ptr += 8; + + } + + if ( n2 & 2 ) + { + zgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); + a_ptr += lda * 2; + y_ptr += 4; + + } + + if ( n2 & 1 ) + { + 
zgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); + /* a_ptr += lda; + y_ptr += 2; */ + + } + + } + else + { + + for( i = 0; i < n1 ; i++) + { + memset(ybuffer,0,sizeof(ybuffer)); + zgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for( i = 0; i < n2 ; i++) + { + memset(ybuffer,0,sizeof(ybuffer)); + zgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + + + if ( m3 == 0 ) return(0); - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; x_ptr = x; + j=0; + a_ptr = a; + y_ptr = y; - if (inc_x != 2) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - if (inc_y == 2) { - - for (i = 0; i < n1; i++) { - zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda << 2; - y_ptr += 8; - - } - - if (n2 & 2) { - zgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda << 1; - y_ptr += 4; - - } - - if (n2 & 1) { - zgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda; - y_ptr += 2; - - } - - } else { - - for (i = 0; i < n1; i++) { - memset(ybuffer, 0, sizeof (ybuffer)); - zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); - - a_ptr += lda << 2; - - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[2]; - y_ptr[1] += ybuffer[3]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[4]; - y_ptr[1] += ybuffer[5]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[6]; - y_ptr[1] += ybuffer[7]; - y_ptr += inc_y; - - } - - for (i = 0; i < n2; i++) { - memset(ybuffer, 0, sizeof (ybuffer)); - zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); - a_ptr += lda; - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - - } - - } - a += 2 * NB; - x += NB * inc_x; - } - - if (m3 == 0) return (0); - - x_ptr = x; - j = 0; - a_ptr = a; - y_ptr = y; - - if (m3 == 3) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x4 = x_ptr[0]; - FLOAT x5 = x_ptr[1]; - while (j < n) { + if ( m3 == 3 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while ( j < n) + { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + 
a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; #endif #if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return (0); - } - - if (m3 == 2) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - - while (j < (n & -2)) { + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return(0); + } + + + if ( m3 == 2 ) + { + + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT temp_r1 ; + FLOAT temp_i1 ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while ( j < ( n & -2 )) + { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; #endif #if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * 
temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } + a_ptr += lda; + y_ptr += inc_y; + j+=2; + } - while (j < n) { + + while ( j < n) + { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; #endif #if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } + a_ptr += lda; + y_ptr += inc_y; + j++; + } - return (0); - } + return(0); + } - if (m3 == 1) { - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; + if ( m3 == 1 ) + { - while (j < (n & -2)) { + FLOAT temp_r ; + FLOAT temp_i ; + FLOAT temp_r1 ; + FLOAT temp_i1 ; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while ( j < ( n & -2 )) + { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; #endif #if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * 
temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } + a_ptr += lda; + y_ptr += inc_y; + j+=2; + } - while (j < n) { + while ( j < n) + { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; #endif #if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return (0); - } - - return (0); + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return(0); + } + return(0); } - diff --git a/ztest/gemv.c b/ztest/gemv.c index f1ee972bc..964afd3ef 100644 --- a/ztest/gemv.c +++ b/ztest/gemv.c @@ -52,67 +52,66 @@ int assert_dbl_near(double exp, double real, double tol) { int zgemv_n_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { BLASLONG i; - BLASLONG ix,iy; + BLASLONG ix, iy; BLASLONG j; FLOAT *a_ptr; - FLOAT temp_r,temp_i; + FLOAT temp_r, temp_i; - BLASLONG inc_x2,inc_y2; + BLASLONG inc_x2, inc_y2; BLASLONG lda2; BLASLONG i2; - lda2 = 2*lda; + lda2 = 2 * lda; ix = 0; a_ptr = a; - if ( inc_x == 1 && inc_y == 1 ) + if (inc_x == 1 && inc_y == 1) { - for (j=0; j<n; j++) [...] From: maamountki Date: Fri, 4 Jan 2019 01:38:18 +0200 Subject: [PATCH 004/133] [ZARCH] fix sgemv_t_4.c --- kernel/zarch/sgemv_t_4.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c index efc06297f..fe99ef5ce 100644 --- a/kernel/zarch/sgemv_t_4.c +++ b/kernel/zarch/sgemv_t_4.c @@ -158,8 +158,6 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "brctg %%r0,2b \n\t" "3: \n\t" - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" "vrepf %%v4,%%v0,1 \n\t" "aebr %%f0,%%f4 \n\t" "vrepf %%v4,%%v0,2 \n\t" @@ -351,6 +349,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) "vl %%v31,112(%%r1,%1) \n\t" "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" "1: \n\t" "lghi %%r0,28 \n\t" From 94cd946b963e9e077cb4a4c5d93b1ce691e1fe63 Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 4 Jan 2019 17:45:56 +0200 Subject: [PATCH 005/133] [ZARCH] fix cgemv_n_4.c --- kernel/zarch/cgemv_n_4.c | 332 +++++++++++++++++++-------------------- 1 file changed, 166 insertions(+), 166 deletions(-) diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index 4c3253774..c939aea9f 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -34,107 +34,107 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
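(Editor's sketch in C, not part of the patch: in the single-precision cgemv kernels that follow, one 16-byte vector register carries two complex floats, so each vfmasb advances two elements of y at once; the vlef/vflcsb sequences assemble the sign-flipped swapped companion lane by lane. The helper name cmla2, the lane layout shown, and the no-conjugate sign convention are illustrative assumptions.)

#include <stdio.h>

/* Hedged scalar model of one register's worth of work: two complex floats,
 * each multiplied by x and accumulated into y with two fused multiply-adds. */
static void cmla2(const float a[4], const float x[2], float y[4])
{
    const float xv[4] = { x[0], x[1], x[0], x[1] };   /* vlrepg: x replicated      */
    const float xs[4] = { -x[1], x[0], -x[1], x[0] }; /* vlef + vflcsb companion   */
    const float ar[4] = { a[0], a[0], a[2], a[2] };   /* duplicated re parts of a  */
    const float ai[4] = { a[1], a[1], a[3], a[3] };   /* duplicated im parts of a  */
    for (int k = 0; k < 4; k++)
        y[k] += ar[k] * xv[k] + ai[k] * xs[k];        /* two vfmasb per register   */
}

int main(void)
{
    float a[4] = { 1.0f, 2.0f, 5.0f, 6.0f }, x[2] = { 3.0f, 4.0f };
    float y[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
    cmla2(a, x, y);   /* (1+2i)(3+4i) = -5+10i, (5+6i)(3+4i) = -9+38i */
    printf("%g %g %g %g\n", y[0], y[1], y[2], y[3]);
    return 0;
}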
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { __asm__ volatile ( - "vlrepg %%v16,0(%5) \n\t" - "vlrepg %%v17,8(%5) \n\t" - "vlrepg %%v18,16(%5) \n\t" - "vlrepg %%v19,24(%5) \n\t" + "vlrepg %%v16,0(%5) \n\t" + "vlrepg %%v17,8(%5) \n\t" + "vlrepg %%v18,16(%5) \n\t" + "vlrepg %%v19,24(%5) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v20,4(%5),0 \n\t" - "vlef %%v20,4(%5),2 \n\t" - "vflcsb %%v20,%%v20 \n\t" - "vlef %%v20,0(%5),1 \n\t" - "vlef %%v20,0(%5),3 \n\t" - - "vlef %%v21,12(%5),0 \n\t" - "vlef %%v21,12(%5),2 \n\t" - "vflcsb %%v21,%%v21 \n\t" - "vlef %%v21,8(%5),1 \n\t" - "vlef %%v21,8(%5),3 \n\t" - - "vlef %%v22,20(%5),0 \n\t" - "vlef %%v22,20(%5),2 \n\t" - "vflcsb %%v22,%%v22 \n\t" - "vlef %%v22,16(%5),1 \n\t" - "vlef %%v22,16(%5),3 \n\t" - - "vlef %%v23,28(%5),0 \n\t" - "vlef %%v23,28(%5),2 \n\t" - "vflcsb %%v23,%%v23 \n\t" - "vlef %%v23,24(%5),1 \n\t" - "vlef %%v23,24(%5),3 \n\t" + "vlef %%v20,4(%5),0 \n\t" + "vlef %%v20,4(%5),2 \n\t" + "vflcsb %%v20,%%v20 \n\t" + "vlef %%v20,0(%5),1 \n\t" + "vlef %%v20,0(%5),3 \n\t" + + "vlef %%v21,12(%5),0 \n\t" + "vlef %%v21,12(%5),2 \n\t" + "vflcsb %%v21,%%v21 \n\t" + "vlef %%v21,8(%5),1 \n\t" + "vlef %%v21,8(%5),3 \n\t" + + "vlef %%v22,20(%5),0 \n\t" + "vlef %%v22,20(%5),2 \n\t" + "vflcsb %%v22,%%v22 \n\t" + "vlef %%v22,16(%5),1 \n\t" + "vlef %%v22,16(%5),3 \n\t" + + "vlef %%v23,28(%5),0 \n\t" + "vlef %%v23,28(%5),2 \n\t" + "vflcsb %%v23,%%v23 \n\t" + "vlef %%v23,24(%5),1 \n\t" + "vlef %%v23,24(%5),3 \n\t" #else - "vlef %%v20,0(%5),1 \n\t" - "vlef %%v20,0(%5),3 \n\t" - "vflcsb %%v20,%%v20 \n\t" - "vlef %%v20,4(%5),0 \n\t" - "vlef %%v20,4(%5),2 \n\t" - - "vlef %%v21,8(%5),1 \n\t" - "vlef %%v21,8(%5),3 \n\t" - "vflcsb %%v21,%%v21 \n\t" - "vlef %%v21,12(%5),0 \n\t" - "vlef %%v21,12(%5),2 \n\t" - - "vlef %%v22,16(%5),1 \n\t" - "vlef %%v22,16(%5),3 \n\t" - "vflcsb %%v22,%%v22 \n\t" - "vlef %%v22,20(%5),0 \n\t" - "vlef %%v22,20(%5),2 \n\t" - - "vlef %%v23,24(%5),1 \n\t" - "vlef %%v23,24(%5),3 \n\t" - "vflcsb %%v23,%%v23 \n\t" - "vlef %%v23,28(%5),0 \n\t" - "vlef %%v23,28(%5),2 \n\t" + "vlef %%v20,0(%5),1 \n\t" + "vlef %%v20,0(%5),3 \n\t" + "vflcsb %%v20,%%v20 \n\t" + "vlef %%v20,4(%5),0 \n\t" + "vlef %%v20,4(%5),2 \n\t" + + "vlef %%v21,8(%5),1 \n\t" + "vlef %%v21,8(%5),3 \n\t" + "vflcsb %%v21,%%v21 \n\t" + "vlef %%v21,12(%5),0 \n\t" + "vlef %%v21,12(%5),2 \n\t" + + "vlef %%v22,16(%5),1 \n\t" + "vlef %%v22,16(%5),3 \n\t" + "vflcsb %%v22,%%v22 \n\t" + "vlef %%v22,20(%5),0 \n\t" + "vlef %%v22,20(%5),2 \n\t" + + "vlef %%v23,24(%5),1 \n\t" + "vlef %%v23,24(%5),3 \n\t" + "vflcsb %%v23,%%v23 \n\t" + "vlef %%v23,28(%5),0 \n\t" + "vlef %%v23,28(%5),2 \n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%%r0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 2,1024(%%r1,%6) \n\t" - - "vlef %%v24,0(%%r1,%1),0 \n\t" - "vlef %%v24,0(%%r1,%1),1 \n\t" - "vlef %%v24,8(%%r1,%1),2 \n\t" - "vlef %%v24,8(%%r1,%1),3 \n\t" - "vlef %%v25,4(%%r1,%1),0 \n\t" - "vlef %%v25,4(%%r1,%1),1 \n\t" - "vlef %%v25,12(%%r1,%1),2 \n\t" - "vlef %%v25,12(%%r1,%1),3 \n\t" - "vlef %%v26,0(%%r1,%2),0 \n\t" - "vlef %%v26,0(%%r1,%2),1 \n\t" - "vlef %%v26,8(%%r1,%2),2 \n\t" - "vlef %%v26,8(%%r1,%2),3 \n\t" - "vlef %%v27,4(%%r1,%2),0 \n\t" - "vlef %%v27,4(%%r1,%2),1 \n\t" - "vlef %%v27,12(%%r1,%2),2 \n\t" - "vlef %%v27,12(%%r1,%2),3 \n\t" - - "vl %%v0,0(%%r1,%6) \n\t" - "vfmasb %%v0,%%v24,%%v16,%%v0 \n\t" - "vfmasb 
%%v0,%%v25,%%v20,%%v0 \n\t" - "vfmasb %%v0,%%v26,%%v17,%%v0 \n\t" - "vfmasb %%v0,%%v27,%%v21,%%v0 \n\t" - - "vlef %%v28,0(%%r1,%1),0 \n\t" - "vlef %%v28,0(%%r1,%1),1 \n\t" - "vlef %%v28,8(%%r1,%1),2 \n\t" - "vlef %%v28,8(%%r1,%1),3 \n\t" - "vlef %%v29,4(%%r1,%1),0 \n\t" - "vlef %%v29,4(%%r1,%1),1 \n\t" - "vlef %%v29,12(%%r1,%1),2 \n\t" - "vlef %%v29,12(%%r1,%1),3 \n\t" - "vlef %%v30,0(%%r1,%2),0 \n\t" - "vlef %%v30,0(%%r1,%2),1 \n\t" - "vlef %%v30,8(%%r1,%2),2 \n\t" - "vlef %%v30,8(%%r1,%2),3 \n\t" - "vlef %%v31,4(%%r1,%2),0 \n\t" - "vlef %%v31,4(%%r1,%2),1 \n\t" - "vlef %%v31,12(%%r1,%2),2 \n\t" - "vlef %%v31,12(%%r1,%2),3 \n\t" + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 2,1024(%%r1,%6) \n\t" + + "vlef %%v24,0(%%r1,%1),0 \n\t" + "vlef %%v24,0(%%r1,%1),1 \n\t" + "vlef %%v24,8(%%r1,%1),2 \n\t" + "vlef %%v24,8(%%r1,%1),3 \n\t" + "vlef %%v25,4(%%r1,%1),0 \n\t" + "vlef %%v25,4(%%r1,%1),1 \n\t" + "vlef %%v25,12(%%r1,%1),2 \n\t" + "vlef %%v25,12(%%r1,%1),3 \n\t" + "vlef %%v26,0(%%r1,%2),0 \n\t" + "vlef %%v26,0(%%r1,%2),1 \n\t" + "vlef %%v26,8(%%r1,%2),2 \n\t" + "vlef %%v26,8(%%r1,%2),3 \n\t" + "vlef %%v27,4(%%r1,%2),0 \n\t" + "vlef %%v27,4(%%r1,%2),1 \n\t" + "vlef %%v27,12(%%r1,%2),2 \n\t" + "vlef %%v27,12(%%r1,%2),3 \n\t" + + "vl %%v0,0(%%r1,%6) \n\t" + "vfmasb %%v0,%%v24,%%v16,%%v0 \n\t" + "vfmasb %%v0,%%v25,%%v20,%%v0 \n\t" + "vfmasb %%v0,%%v26,%%v17,%%v0 \n\t" + "vfmasb %%v0,%%v27,%%v21,%%v0 \n\t" + + "vlef %%v28,0(%%r1,%1),0 \n\t" + "vlef %%v28,0(%%r1,%1),1 \n\t" + "vlef %%v28,8(%%r1,%1),2 \n\t" + "vlef %%v28,8(%%r1,%1),3 \n\t" + "vlef %%v29,4(%%r1,%1),0 \n\t" + "vlef %%v29,4(%%r1,%1),1 \n\t" + "vlef %%v29,12(%%r1,%1),2 \n\t" + "vlef %%v29,12(%%r1,%1),3 \n\t" + "vlef %%v30,0(%%r1,%2),0 \n\t" + "vlef %%v30,0(%%r1,%2),1 \n\t" + "vlef %%v30,8(%%r1,%2),2 \n\t" + "vlef %%v30,8(%%r1,%2),3 \n\t" + "vlef %%v31,4(%%r1,%2),0 \n\t" + "vlef %%v31,4(%%r1,%2),1 \n\t" + "vlef %%v31,12(%%r1,%2),2 \n\t" + "vlef %%v31,12(%%r1,%2),3 \n\t" "vfmasb %%v0,%%v28,%%v18,%%v0 \n\t" "vfmasb %%v0,%%v29,%%v22,%%v0 \n\t" @@ -153,56 +153,56 @@ static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { __asm__ volatile ( - "vlrepg %%v16,0(%3) \n\t" - "vlrepg %%v17,8(%3) \n\t" + "vlrepg %%v16,0(%3) \n\t" + "vlrepg %%v17,8(%3) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v18,4(%3),0 \n\t" - "vlef %%v18,4(%3),2 \n\t" - "vflcsb %%v18,%%v18 \n\t" - "vlef %%v18,0(%3),1 \n\t" - "vlef %%v18,0(%3),3 \n\t" - - "vlef %%v19,12(%3),0 \n\t" - "vlef %%v19,12(%3),2 \n\t" - "vflcsb %%v19,%%v19 \n\t" - "vlef %%v19,8(%3),1 \n\t" - "vlef %%v19,8(%3),3 \n\t" + "vlef %%v18,4(%3),0 \n\t" + "vlef %%v18,4(%3),2 \n\t" + "vflcsb %%v18,%%v18 \n\t" + "vlef %%v18,0(%3),1 \n\t" + "vlef %%v18,0(%3),3 \n\t" + + "vlef %%v19,12(%3),0 \n\t" + "vlef %%v19,12(%3),2 \n\t" + "vflcsb %%v19,%%v19 \n\t" + "vlef %%v19,8(%3),1 \n\t" + "vlef %%v19,8(%3),3 \n\t" #else - "vlef %%v18,0(%3),1 \n\t" - "vlef %%v18,0(%3),3 \n\t" - "vflcsb %%v18,%%v18 \n\t" - "vlef %%v18,4(%3),0 \n\t" - "vlef %%v18,4(%3),2 \n\t" - - "vlef %%v19,8(%3),1 \n\t" - "vlef %%v19,8(%3),3 \n\t" - "vflcsb %%v19,%%v19 \n\t" - "vlef %%v19,12(%3),0 \n\t" - "vlef %%v19,12(%3),2 \n\t" + "vlef %%v18,0(%3),1 \n\t" + "vlef %%v18,0(%3),3 \n\t" + "vflcsb %%v18,%%v18 \n\t" + "vlef %%v18,4(%3),0 \n\t" + "vlef %%v18,4(%3),2 \n\t" + + "vlef 
%%v19,8(%3),1 \n\t" + "vlef %%v19,8(%3),3 \n\t" + "vflcsb %%v19,%%v19 \n\t" + "vlef %%v19,12(%3),0 \n\t" + "vlef %%v19,12(%3),2 \n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%%r0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%4) \n\t" - - "vlef %%v20,0(%%r1,%1),0 \n\t" - "vlef %%v20,0(%%r1,%1),1 \n\t" - "vlef %%v20,8(%%r1,%1),2 \n\t" - "vlef %%v20,8(%%r1,%1),3 \n\t" - "vlef %%v21,4(%%r1,%1),0 \n\t" - "vlef %%v21,4(%%r1,%1),1 \n\t" - "vlef %%v21,12(%%r1,%1),2 \n\t" - "vlef %%v21,12(%%r1,%1),3 \n\t" - "vlef %%v22,0(%%r1,%2),0 \n\t" - "vlef %%v22,0(%%r1,%2),1 \n\t" - "vlef %%v22,8(%%r1,%2),2 \n\t" - "vlef %%v22,8(%%r1,%2),3 \n\t" - "vlef %%v23,4(%%r1,%2),0 \n\t" - "vlef %%v23,4(%%r1,%2),1 \n\t" - "vlef %%v23,12(%%r1,%2),2 \n\t" - "vlef %%v23,12(%%r1,%2),3 \n\t" + "xgr %%r1,%%r1 \n\t" + "srlg %%r0,%0,1 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%4) \n\t" + + "vlef %%v20,0(%%r1,%1),0 \n\t" + "vlef %%v20,0(%%r1,%1),1 \n\t" + "vlef %%v20,8(%%r1,%1),2 \n\t" + "vlef %%v20,8(%%r1,%1),3 \n\t" + "vlef %%v21,4(%%r1,%1),0 \n\t" + "vlef %%v21,4(%%r1,%1),1 \n\t" + "vlef %%v21,12(%%r1,%1),2 \n\t" + "vlef %%v21,12(%%r1,%1),3 \n\t" + "vlef %%v22,0(%%r1,%2),0 \n\t" + "vlef %%v22,0(%%r1,%2),1 \n\t" + "vlef %%v22,8(%%r1,%2),2 \n\t" + "vlef %%v22,8(%%r1,%2),3 \n\t" + "vlef %%v23,4(%%r1,%2),0 \n\t" + "vlef %%v23,4(%%r1,%2),1 \n\t" + "vlef %%v23,12(%%r1,%2),2 \n\t" + "vlef %%v23,12(%%r1,%2),3 \n\t" "vl %%v0,0(%%r1,%4) \n\t" "vfmasb %%v0,%%v20,%%v16,%%v0 \n\t" @@ -222,34 +222,34 @@ static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { __asm__ volatile ( - "vlrepg %%v16,0(%2) \n\t" + "vlrepg %%v16,0(%2) \n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vlef %%v17,4(%2),0 \n\t" - "vlef %%v17,4(%2),2 \n\t" + "vlef %%v17,4(%2),2 \n\t" "vflcsb %%v17,%%v17 \n\t" "vlef %%v17,0(%2),1 \n\t" - "vlef %%v17,0(%2),3 \n\t" + "vlef %%v17,0(%2),3 \n\t" #else "vlef %%v17,0(%2),1 \n\t" - "vlef %%v17,0(%2),3 \n\t" + "vlef %%v17,0(%2),3 \n\t" "vflcsb %%v17,%%v17 \n\t" "vlef %%v17,4(%2),0 \n\t" - "vlef %%v17,4(%2),2 \n\t" + "vlef %%v17,4(%2),2 \n\t" #endif "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%%r0,1 \n\t" + "srlg %%r0,%0,1 \n\t" "0: \n\t" "pfd 1,1024(%%r1,%1) \n\t" "pfd 2,1024(%%r1,%3) \n\t" - "vlef %%v18,0(%%r1,%1),0 \n\t" - "vlef %%v18,0(%%r1,%1),1 \n\t" - "vlef %%v18,8(%%r1,%1),2 \n\t" - "vlef %%v18,8(%%r1,%1),3 \n\t" - "vlef %%v19,4(%%r1,%1),0 \n\t" - "vlef %%v19,4(%%r1,%1),1 \n\t" - "vlef %%v19,12(%%r1,%1),2 \n\t" - "vlef %%v19,12(%%r1,%1),3 \n\t" + "vlef %%v18,0(%%r1,%1),0 \n\t" + "vlef %%v18,0(%%r1,%1),1 \n\t" + "vlef %%v18,8(%%r1,%1),2 \n\t" + "vlef %%v18,8(%%r1,%1),3 \n\t" + "vlef %%v19,4(%%r1,%1),0 \n\t" + "vlef %%v19,4(%%r1,%1),1 \n\t" + "vlef %%v19,12(%%r1,%1),2 \n\t" + "vlef %%v19,12(%%r1,%1),3 \n\t" "vl %%v0,0(%%r1,%3) \n\t" "vfmasb %%v0,%%v18,%%v16,%%v0 \n\t" @@ -268,18 +268,18 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT al { __asm__ volatile ( #if !defined(XCONJ) - "vlrepf %%v0,%3 \n\t" - "vlef %%v1,%4,0 \n\t" - "vlef %%v1,%4,2 \n\t" + "vlrepf %%v0,%3 \n\t" + "vlef %%v1,%4,0 \n\t" + "vlef %%v1,%4,2 \n\t" "vflcsb %%v1,%%v1 \n\t" - "vlef %%v1,%4,1 \n\t" + "vlef %%v1,%4,1 \n\t" "vlef %%v1,%4,3 \n\t" #else "vlef %%v0,%3,1 \n\t" - "vlef %%v0,%3,3 \n\t" + "vlef %%v0,%3,3 \n\t" "vflcsb %%v0,%%v0 \n\t" "vlef %%v0,%3,0 \n\t" - "vlef %%v0,%3,2 \n\t" + "vlef 
%%v0,%3,2 \n\t" "vlrepf %%v1,%4 \n\t" #endif "xgr %%r1,%%r1 \n\t" @@ -292,7 +292,7 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT al "vl %%v17,16(%%r1,%1) \n\t" "vl %%v18,0(%%r1,%2) \n\t" "vl %%v19,16(%%r1,%2) \n\t" - "verllg %%v20,%%v16,32 \n\t" + "verllg %%v20,%%v16,32 \n\t" "verllg %%v21,%%v17,32 \n\t" "vfmasb %%v22,%%v16,%%v0,%%v18 \n\t" From 3eafcfa6507891f7fff781423d9eb6af13501133 Mon Sep 17 00:00:00 2001 From: maamountki Date: Wed, 9 Jan 2019 07:43:45 +0200 Subject: [PATCH 006/133] [ZARCH] fix cgemv_n_4.c --- kernel/zarch/cgemv_n_4.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index c939aea9f..7b5e43497 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -119,22 +119,22 @@ static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vfmasb %%v0,%%v26,%%v17,%%v0 \n\t" "vfmasb %%v0,%%v27,%%v21,%%v0 \n\t" - "vlef %%v28,0(%%r1,%1),0 \n\t" - "vlef %%v28,0(%%r1,%1),1 \n\t" - "vlef %%v28,8(%%r1,%1),2 \n\t" - "vlef %%v28,8(%%r1,%1),3 \n\t" - "vlef %%v29,4(%%r1,%1),0 \n\t" - "vlef %%v29,4(%%r1,%1),1 \n\t" - "vlef %%v29,12(%%r1,%1),2 \n\t" - "vlef %%v29,12(%%r1,%1),3 \n\t" - "vlef %%v30,0(%%r1,%2),0 \n\t" - "vlef %%v30,0(%%r1,%2),1 \n\t" - "vlef %%v30,8(%%r1,%2),2 \n\t" - "vlef %%v30,8(%%r1,%2),3 \n\t" - "vlef %%v31,4(%%r1,%2),0 \n\t" - "vlef %%v31,4(%%r1,%2),1 \n\t" - "vlef %%v31,12(%%r1,%2),2 \n\t" - "vlef %%v31,12(%%r1,%2),3 \n\t" + "vlef %%v28,0(%%r1,%3),0 \n\t" + "vlef %%v28,0(%%r1,%3),1 \n\t" + "vlef %%v28,8(%%r1,%3),2 \n\t" + "vlef %%v28,8(%%r1,%3),3 \n\t" + "vlef %%v29,4(%%r1,%3),0 \n\t" + "vlef %%v29,4(%%r1,%3),1 \n\t" + "vlef %%v29,12(%%r1,%3),2 \n\t" + "vlef %%v29,12(%%r1,%3),3 \n\t" + "vlef %%v30,0(%%r1,%4),0 \n\t" + "vlef %%v30,0(%%r1,%4),1 \n\t" + "vlef %%v30,8(%%r1,%4),2 \n\t" + "vlef %%v30,8(%%r1,%4),3 \n\t" + "vlef %%v31,4(%%r1,%4),0 \n\t" + "vlef %%v31,4(%%r1,%4),1 \n\t" + "vlef %%v31,12(%%r1,%4),2 \n\t" + "vlef %%v31,12(%%r1,%4),3 \n\t" "vfmasb %%v0,%%v28,%%v18,%%v0 \n\t" "vfmasb %%v0,%%v29,%%v22,%%v0 \n\t" From e7455f500c06ecda4085d560ffa20c5bc188416f Mon Sep 17 00:00:00 2001 From: maamountki Date: Wed, 9 Jan 2019 16:33:54 +0200 Subject: [PATCH 007/133] [ZARCH] fix dsdot.c --- kernel/zarch/dsdot.c | 123 ++++++++++++++++++++----------------------- 1 file changed, 56 insertions(+), 67 deletions(-) diff --git a/kernel/zarch/dsdot.c b/kernel/zarch/dsdot.c index 17461a029..800bb0d51 100644 --- a/kernel/zarch/dsdot.c +++ b/kernel/zarch/dsdot.c @@ -27,61 +27,34 @@ USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
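The reworked dsdot kernel below loads pairs of single-precision values into the even word lanes (`vlef ...,0` / `vlef ...,2`), widens them to double with `vflls`, and only then multiply-accumulates with `vfmadb`, so the whole reduction runs in double precision; the blocking factor drops from 32 to 16 elements accordingly, and the caller's mask changes from `n & -32` to `n & -16`. The two follow-up patches make the matching prefetch correction in ddot and sdot: the y operand is only read there, so its hint becomes `pfd 1` (prefetch for fetch) instead of `pfd 2` (prefetch for store). A scalar sketch of what the kernel computes:

    /* DSDOT semantics: single-precision inputs, double-precision
       multiply-accumulate and result. */
    double dot = 0.0;
    for (BLASLONG k = 0; k < n; k++)
        dot += (double) x[k] * (double) y[k];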
#include "common.h" -static double dsdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { double dot; __asm__ volatile ( "vzero %%v0 \n\t" - "srlg %%r0,%1,5 \n\t" + "srlg %%r0,%1,4 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%3) \n\t" - "vfmsb %%v16,%%v16,%%v24 \n\t" - "vl %%v25,16(%%r1,%3) \n\t" - "vfmsb %%v17,%%v17,%%v25 \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vfmsb %%v18,%%v18,%%v26 \n\t" - "vl %%v27,48(%%r1,%3) \n\t" - "vfmsb %%v19,%%v19,%%v27 \n\t" - "vl %%v28,64(%%r1,%3) \n\t" - "vfmsb %%v20,%%v20,%%v28 \n\t" - "vl %%v29,80(%%r1,%3) \n\t" - "vfmsb %%v21,%%v21,%%v29 \n\t" - "vl %%v30,96(%%r1,%3) \n\t" - "vfmsb %%v22,%%v22,%%v30 \n\t" - "vl %%v31,112(%%r1,%3) \n\t" - "vfmsb %%v23,%%v23,%%v31 \n\t" - - "vflls %%v24,%%v16 \n\t" - "vflls %%v25,%%v17 \n\t" - "vflls %%v26,%%v18 \n\t" - "vflls %%v27,%%v19 \n\t" - "vflls %%v28,%%v20 \n\t" - "vflls %%v29,%%v21 \n\t" - "vflls %%v30,%%v22 \n\t" - "vflls %%v31,%%v23 \n\t" - - "veslg %%v16,%%v16,32 \n\t" - "veslg %%v17,%%v17,32 \n\t" - "veslg %%v18,%%v18,32 \n\t" - "veslg %%v19,%%v19,32 \n\t" - "veslg %%v20,%%v20,32 \n\t" - "veslg %%v21,%%v21,32 \n\t" - "veslg %%v22,%%v22,32 \n\t" - "veslg %%v23,%%v23,32 \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + + "vlef %%v16,0(%%r1,%2),0 \n\t" + "vlef %%v16,4(%%r1,%2),2 \n\t" + "vlef %%v17,8(%%r1,%2),0 \n\t" + "vlef %%v17,12(%%r1,%2),2 \n\t" + "vlef %%v18,16(%%r1,%2),0 \n\t" + "vlef %%v18,20(%%r1,%2),2 \n\t" + "vlef %%v19,24(%%r1,%2),0 \n\t" + "vlef %%v19,28(%%r1,%2),2 \n\t" + "vlef %%v20,32(%%r1,%2),0 \n\t" + "vlef %%v20,36(%%r1,%2),2 \n\t" + "vlef %%v21,40(%%r1,%2),0 \n\t" + "vlef %%v21,44(%%r1,%2),2 \n\t" + "vlef %%v22,48(%%r1,%2),0 \n\t" + "vlef %%v22,52(%%r1,%2),2 \n\t" + "vlef %%v23,56(%%r1,%2),0 \n\t" + "vlef %%v23,60(%%r1,%2),2 \n\t" "vflls %%v16,%%v16 \n\t" "vflls %%v17,%%v17 \n\t" @@ -92,24 +65,40 @@ static double dsdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) "vflls %%v22,%%v22 \n\t" "vflls %%v23,%%v23 \n\t" - "vfadb %%v16,%%v16,%%v24 \n\t" - "vfadb %%v17,%%v17,%%v25 \n\t" - "vfadb %%v18,%%v18,%%v26 \n\t" - "vfadb %%v19,%%v19,%%v27 \n\t" - "vfadb %%v20,%%v20,%%v28 \n\t" - "vfadb %%v21,%%v21,%%v29 \n\t" - "vfadb %%v22,%%v22,%%v30 \n\t" - "vfadb %%v23,%%v23,%%v31 \n\t" - "vfadb %%v16,%%v16,%%v20 \n\t" - "vfadb %%v17,%%v17,%%v21 \n\t" - "vfadb %%v18,%%v18,%%v22 \n\t" - "vfadb %%v19,%%v19,%%v23 \n\t" - "vfadb %%v16,%%v16,%%v18 \n\t" - "vfadb %%v17,%%v17,%%v19 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v0,%%v16,%%v0 \n\t" - - "agfi %%r1,128 \n\t" + "vlef %%v24,0(%%r1,%3),0 \n\t" + "vlef %%v24,4(%%r1,%3),2 \n\t" + "vflls %%v24,%%v24 \n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" + "vlef %%v25,8(%%r1,%3),0 \n\t" + "vlef %%v25,12(%%r1,%3),2 \n\t" + "vflls %%v25,%%v25 \n\t" + "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" + "vlef %%v26,16(%%r1,%3),0 \n\t" + "vlef %%v26,20(%%r1,%3),2 \n\t" + "vflls %%v26,%%v26 \n\t" + "vfmadb %%v0,%%v18,%%v26,%%v0 \n\t" + "vlef %%v27,24(%%r1,%3),0 \n\t" + "vlef %%v27,28(%%r1,%3),2 \n\t" + "vflls %%v27,%%v27 \n\t" + "vfmadb %%v0,%%v19,%%v27,%%v0 \n\t" + "vlef %%v28,32(%%r1,%3),0 \n\t" + "vlef %%v28,36(%%r1,%3),2 \n\t" + "vflls %%v28,%%v28 \n\t" + "vfmadb %%v0,%%v20,%%v28,%%v0 \n\t" + "vlef %%v29,40(%%r1,%3),0 \n\t" + "vlef 
%%v29,44(%%r1,%3),2 \n\t" + "vflls %%v29,%%v29 \n\t" + "vfmadb %%v0,%%v21,%%v29,%%v0 \n\t" + "vlef %%v30,48(%%r1,%3),0 \n\t" + "vlef %%v30,52(%%r1,%3),2 \n\t" + "vflls %%v30,%%v30 \n\t" + "vfmadb %%v0,%%v22,%%v30,%%v0 \n\t" + "vlef %%v31,56(%%r1,%3),0 \n\t" + "vlef %%v31,60(%%r1,%3),2 \n\t" + "vflls %%v31,%%v31 \n\t" + "vfmadb %%v0,%%v23,%%v31,%%v0 \n\t" + + "agfi %%r1,64 \n\t" "brctg %%r0,0b \n\t" "vrepg %%v1,%%v0,1 \n\t" "adbr %%f0,%%f1 \n\t" @@ -134,10 +123,10 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) if ( (inc_x == 1) && (inc_y == 1) ) { - BLASLONG n1 = n & -32; + BLASLONG n1 = n & -16; if ( n1 ) - dot = dsdot_kernel_32(n1,x,y); + dot = dsdot_kernel_16(n1,x,y); i = n1; while(i < n) From c2ffef81569624cc530d515bbaac9890d819253b Mon Sep 17 00:00:00 2001 From: maamountki Date: Wed, 9 Jan 2019 16:49:44 +0200 Subject: [PATCH 008/133] [ZARCH] fix data prefetch type in ddot --- kernel/zarch/ddot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/zarch/ddot.c b/kernel/zarch/ddot.c index f34d1e96e..ff4c347a6 100644 --- a/kernel/zarch/ddot.c +++ b/kernel/zarch/ddot.c @@ -37,7 +37,7 @@ static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%2) \n\t" From be66f5d5c21b558dd1ef35dc8f4bda6b544b4f79 Mon Sep 17 00:00:00 2001 From: maamountki Date: Wed, 9 Jan 2019 16:50:07 +0200 Subject: [PATCH 009/133] [ZARCH] fix data prefetch type in sdot --- kernel/zarch/sdot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/zarch/sdot.c b/kernel/zarch/sdot.c index fd8c8e445..5ddbc69bd 100644 --- a/kernel/zarch/sdot.c +++ b/kernel/zarch/sdot.c @@ -37,7 +37,7 @@ static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%2) \n\t" "vl %%v17,16(%%r1,%2) \n\t" From 67432b23c2fe7f8ef29cf85821278dcdf69b4db2 Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 16:44:46 +0200 Subject: [PATCH 010/133] [ZARCH] fix cgemv_n_4.c --- kernel/zarch/cgemv_n_4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index 7b5e43497..a45c3d687 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -396,7 +396,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r,FLOAT alpha_i, ap[3] = ap[2] + lda; x_ptr = x; //zero_y(NB,ybuffer); - memset(ybuffer,0,NB*16); + memset(ybuffer,0,NB*8); if ( inc_x == 2 ) { From 5d89d6b143ea770e4dcb2336319b543f2297c6ba Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 17:08:24 +0200 Subject: [PATCH 011/133] [ZARCH] fix sgemv_n_4.c --- kernel/zarch/sgemv_n_4.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/zarch/sgemv_n_4.c b/kernel/zarch/sgemv_n_4.c index 92019d732..01d8414de 100644 --- a/kernel/zarch/sgemv_n_4.c +++ b/kernel/zarch/sgemv_n_4.c @@ -435,7 +435,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO ap[3] = ap[2] + lda; if ( inc_y != 1 ) - memset(ybuffer,0,NB*8); + memset(ybuffer,0,NB*4); else ybuffer = y_ptr; @@ -465,8 +465,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( n2 & 1 ) { sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - a_ptr += lda; - x_ptr += 1; + /* a_ptr += lda; + x_ptr += 1; 
*/ } From ecc31b743fc93d3b5951e83e6e37148dbdd381c8 Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 17:13:02 +0200 Subject: [PATCH 012/133] Update dgemv_t_4.c --- kernel/zarch/dgemv_t_4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/zarch/dgemv_t_4.c b/kernel/zarch/dgemv_t_4.c index f9c1f966d..2d8fa0d10 100644 --- a/kernel/zarch/dgemv_t_4.c +++ b/kernel/zarch/dgemv_t_4.c @@ -601,9 +601,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO { dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); - a_ptr += lda; + // a_ptr += lda; *y_ptr += ybuffer[0] * alpha; - y_ptr += inc_y; + // y_ptr += inc_y; } a += NB; From b731e8246f9fad13637005be39d8566111bab9fe Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 17:14:04 +0200 Subject: [PATCH 013/133] Update sgemv_t_4.c --- kernel/zarch/sgemv_t_4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c index fe99ef5ce..5515d7bb7 100644 --- a/kernel/zarch/sgemv_t_4.c +++ b/kernel/zarch/sgemv_t_4.c @@ -605,9 +605,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO { sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); - a_ptr += lda; + // a_ptr += lda; *y_ptr += ybuffer[0] * alpha; - y_ptr += inc_y; + // y_ptr += inc_y; } a += NB; From 621dedb37bd1d33c7006c305b4057bb0cc7ea7cd Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 17:37:11 +0200 Subject: [PATCH 014/133] [ZARCH] Update cgemv_t_4.c --- kernel/zarch/cgemv_t_4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/zarch/cgemv_t_4.c b/kernel/zarch/cgemv_t_4.c index 89914fb1f..0dd43057c 100644 --- a/kernel/zarch/cgemv_t_4.c +++ b/kernel/zarch/cgemv_t_4.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#define NBMAX 1024 +#define NBMAX 2048 static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { From 406f835f00fedcfef894742b30a7f48905836eee Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 17:39:17 +0200 Subject: [PATCH 015/133] [ZARCH] update cgemv_n_4.c --- kernel/zarch/cgemv_n_4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index a45c3d687..ed81325e1 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
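NBMAX bounds how many vector elements the gemv drivers stage into their local buffers per pass, so the hunk below simply widens the panels from 1024 to 2048 elements. The pointer advances commented out in the preceding patches are dead code: they follow the last kernel call of a panel, and the pointers are reinitialized before their next use. A structural sketch of the shared blocking loop, with hypothetical names:

    #define NBMAX 2048
    /* Process the long dimension in panels of at most NBMAX elements. */
    for (BLASLONG off = 0; off < m; off += NBMAX) {
        BLASLONG NB = (m - off < NBMAX) ? (m - off) : NBMAX;
        /* ... 4x4 / 4x2 / 4x1 kernel calls over this panel; advancing
           a_ptr or x_ptr after the final call would be dead code ... */
    }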
#include #include "common.h" -#define NBMAX 1024 +#define NBMAX 2048 static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { From 1a7925b3a335114d26bd1d25d6f6fdc2743909b6 Mon Sep 17 00:00:00 2001 From: maamountki Date: Fri, 11 Jan 2019 17:43:11 +0200 Subject: [PATCH 016/133] [ZARCH] Update dgemv_n_4.c --- kernel/zarch/dgemv_n_4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/zarch/dgemv_n_4.c b/kernel/zarch/dgemv_n_4.c index ca6d287bc..ca4fd6170 100644 --- a/kernel/zarch/dgemv_n_4.c +++ b/kernel/zarch/dgemv_n_4.c @@ -488,8 +488,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO if ( n2 & 1 ) { dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - a_ptr += lda; - x_ptr += 1; + /* a_ptr += lda; + x_ptr += 1; */ } From b815a04c87e49a01e66e1c41ce4654f8d7817f83 Mon Sep 17 00:00:00 2001 From: maamountki Date: Tue, 15 Jan 2019 21:04:22 +0200 Subject: [PATCH 017/133] [ZARCH] fix a bug in max/min functions --- kernel/zarch/camax.c | 2 +- kernel/zarch/camin.c | 2 +- kernel/zarch/damax.c | 2 +- kernel/zarch/damin.c | 2 +- kernel/zarch/dmax.c | 2 +- kernel/zarch/dmin.c | 2 +- kernel/zarch/idamax.c | 2 +- kernel/zarch/idamin.c | 2 +- kernel/zarch/idmax.c | 2 +- kernel/zarch/idmin.c | 2 +- kernel/zarch/isamax.c | 2 +- kernel/zarch/isamin.c | 2 +- kernel/zarch/ismax.c | 2 +- kernel/zarch/ismin.c | 2 +- kernel/zarch/samax.c | 2 +- kernel/zarch/samin.c | 2 +- kernel/zarch/smax.c | 2 +- kernel/zarch/smin.c | 2 +- kernel/zarch/zamax.c | 2 +- kernel/zarch/zamin.c | 2 +- 20 files changed, 20 insertions(+), 20 deletions(-) diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index 3506c4e9b..2c913b62e 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -237,7 +237,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i++; BLASLONG n1 = (n - 1) & -4; - while (i < n1) { + while ((i - 1) < n1) { if (CABS1(x,ix) > maxf) { maxf = CABS1(x,ix); diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index 726747b99..733f98fbf 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -237,7 +237,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i++; BLASLONG n1 = (n - 1) & -4; - while (i < n1) { + while ((i - 1) < n1) { if (CABS1(x,ix) < minf) { minf = CABS1(x,ix); diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c index b74af5d37..236d11c72 100644 --- a/kernel/zarch/damax.c +++ b/kernel/zarch/damax.c @@ -172,7 +172,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) > maxf) { maxf = ABS(x[i]); diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c index 4cf5e88b1..c2c63c6c5 100644 --- a/kernel/zarch/damin.c +++ b/kernel/zarch/damin.c @@ -172,7 +172,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) < minf) { minf = ABS(x[i]); diff --git a/kernel/zarch/dmax.c b/kernel/zarch/dmax.c index de38bd21a..469f65735 100644 --- a/kernel/zarch/dmax.c +++ b/kernel/zarch/dmax.c @@ -148,7 +148,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] > maxf) { maxf = x[i]; diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c index d7c86735f..3df504950 100644 --- a/kernel/zarch/dmin.c +++ b/kernel/zarch/dmin.c @@ -148,7 +148,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + 
while ((j - 1) < n1) { if (x[i] < minf) { minf = x[i]; diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index d1f135369..4f7ff6985 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -226,7 +226,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) > maxf) { max = j; diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 679606a8f..3abc7a558 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -226,7 +226,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) < minf) { min = j; diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 5de41ac7b..313a88db4 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -202,7 +202,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] > maxf) { max = j; diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index 7fec111cf..42443215b 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -202,7 +202,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] < minf) { min = j; diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index d2686c0cd..dd2144db2 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -269,7 +269,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) > maxf) { max = j; diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 768f31a8c..d7e44421d 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -269,7 +269,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) < minf) { min = j; diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index 8fc32adf6..1ebc6c8c8 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -245,7 +245,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] > maxf) { max = j; diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index 415052810..a6b9d59de 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -245,7 +245,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] < minf) { min = j; diff --git a/kernel/zarch/samax.c b/kernel/zarch/samax.c index 1025cfcbf..61d50159f 100644 --- a/kernel/zarch/samax.c +++ b/kernel/zarch/samax.c @@ -176,7 +176,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) > maxf) { maxf = ABS(x[i]); diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c index 3b8f03e6a..a585a79ff 100644 --- a/kernel/zarch/samin.c +++ b/kernel/zarch/samin.c @@ -176,7 +176,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (ABS(x[i]) < minf) { minf = ABS(x[i]); diff --git a/kernel/zarch/smax.c b/kernel/zarch/smax.c index 33798eb7c..bcdb473af 100644 --- a/kernel/zarch/smax.c +++ b/kernel/zarch/smax.c @@ -152,7 +152,7 @@ FLOAT CNAME(BLASLONG n, FLOAT 
*x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] > maxf) { maxf = x[i]; diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c index e882b7ff1..91c31d284 100644 --- a/kernel/zarch/smin.c +++ b/kernel/zarch/smin.c @@ -152,7 +152,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { j++; BLASLONG n1 = (n - 1) & -4; - while (j < n1) { + while ((j - 1) < n1) { if (x[i] < minf) { minf = x[i]; diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index 937bc9753..8ef3f42ca 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -189,7 +189,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i++; BLASLONG n1 = (n - 1) & -4; - while (i < n1) { + while ((i - 1) < n1) { if (CABS1(x,ix) > maxf) { maxf = CABS1(x,ix); diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index 8564edaf4..30fd1d030 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -189,7 +189,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { i++; BLASLONG n1 = (n - 1) & -4; - while (i < n1) { + while ((i - 1) < n1) { if (CABS1(x,ix) < minf) { minf = CABS1(x,ix); From b70fd238366c6a822c7f1766ab125f64c67a6b39 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Wed, 16 Jan 2019 11:18:54 +0200 Subject: [PATCH 019/133] disable NaN checks before BLAS calls dsolve.R --- benchmark/scripts/R/dsolve.R | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/benchmark/scripts/R/dsolve.R b/benchmark/scripts/R/dsolve.R index a3fb78da7..6f1b8ef7b 100755 --- a/benchmark/scripts/R/dsolve.R +++ b/benchmark/scripts/R/dsolve.R @@ -2,6 +2,10 @@ argv <- commandArgs(trailingOnly = TRUE) +if (!is.null(options("matprod")[[1]])) { + options(matprod = "blas") +} + nfrom <- 128 nto <- 2048 nstep <- 128 @@ -19,7 +23,6 @@ if (length(argv) > 0) { loops <- as.numeric(argv[z]) } } - } p <- Sys.getenv("OPENBLAS_LOOPS") @@ -27,31 +30,23 @@ if (p != "") { loops <- as.numeric(p) } - -cat(sprintf( - "From %.0f To %.0f Step=%.0f Loops=%.0f\n", - nfrom, - nto, - nstep, - loops -)) +cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) cat(sprintf(" SIZE Flops Time\n")) n <- nfrom while (n <= nto) { - A <- matrix(rnorm(n * n), ncol = n, nrow = n) - B <- matrix(rnorm(n * n), ncol = n, nrow = n) + A <- matrix(rnorm(n * n), nrow = n) + B <- matrix(rnorm(n * n), nrow = n) z <- system.time(for (l in 1:loops) { solve(A, B) }) - mflops <- - (2.0 / 3.0 * n * n * n + 2.0 * n * n * n) * loops / (z[3] * 1.0e6) + mflops <- (2.0/3 * n * n * n + 2 * n * n * n) * loops/ (z[3] * 1e6) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) n <- n + nstep - } + From 2777a7f506308550e37f7ef26ce05f53a0d096ef Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Wed, 16 Jan 2019 11:23:51 +0200 Subject: [PATCH 020/133] disable NaN checks before BLAS calls dsolve.R (shorter config part) --- benchmark/scripts/R/dsolve.R | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/benchmark/scripts/R/dsolve.R b/benchmark/scripts/R/dsolve.R index 6f1b8ef7b..ad2045900 100755 --- a/benchmark/scripts/R/dsolve.R +++ b/benchmark/scripts/R/dsolve.R @@ -2,9 +2,7 @@ argv <- commandArgs(trailingOnly = TRUE) -if (!is.null(options("matprod")[[1]])) { - options(matprod = "blas") -} +if (!is.null(options("matprod")[[1]])) options(matprod = "blas") nfrom <- 128 nto <- 2048 @@ -42,11 +40,10 @@ while (n <= nto) { solve(A, B) }) - 
mflops <- (2.0/3 * n * n * n + 2 * n * n * n) * loops/ (z[3] * 1e6) + mflops <- (2.0/3 * n * n * n + 2 * n * n * n) * loops/ (z[3] * 1e+06) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) n <- n + nstep } - From 7af8b21dbbb523b0e9ab6caff271cb63affaa5f2 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Wed, 16 Jan 2019 11:34:46 +0200 Subject: [PATCH 021/133] disable NaN checks before BLAS calls dsolve.R (shorter formula) --- benchmark/scripts/R/dsolve.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/scripts/R/dsolve.R b/benchmark/scripts/R/dsolve.R index ad2045900..46301570b 100755 --- a/benchmark/scripts/R/dsolve.R +++ b/benchmark/scripts/R/dsolve.R @@ -40,7 +40,7 @@ while (n <= nto) { solve(A, B) }) - mflops <- (2.0/3 * n * n * n + 2 * n * n * n) * loops/ (z[3] * 1e+06) + mflops <- (8.0 / 3 * n * n * n) * loops / (z[3] * 1e+06) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) From 3afceb6c2a220ff61878c9a328846cc723de42ed Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Wed, 16 Jan 2019 11:38:14 +0200 Subject: [PATCH 022/133] disable NaN checks before BLAS calls deig.R --- benchmark/scripts/R/deig.R | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/benchmark/scripts/R/deig.R b/benchmark/scripts/R/deig.R index ece727fb3..32716471b 100755 --- a/benchmark/scripts/R/deig.R +++ b/benchmark/scripts/R/deig.R @@ -2,6 +2,8 @@ argv <- commandArgs(trailingOnly = TRUE) +if (!is.null(options("matprod")[[1]])) options(matprod = "blas") + nfrom <- 128 nto <- 2048 nstep <- 128 @@ -19,7 +21,6 @@ if (length(argv) > 0) { loops <- as.numeric(argv[z]) } } - } p <- Sys.getenv("OPENBLAS_LOOPS") @@ -27,14 +28,7 @@ if (p != "") { loops <- as.numeric(p) } - -cat(sprintf( - "From %.0f To %.0f Step=%.0f Loops=%.0f\n", - nfrom, - nto, - nstep, - loops -)) +cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) cat(sprintf(" SIZE Flops Time\n")) n <- nfrom @@ -45,11 +39,10 @@ while (n <= nto) { ev <- eigen(A) }) - mflops <- (26.66 * n * n * n) * loops / (z[3] * 1.0e6) + mflops <- (26.66 * n * n * n) * loops / (z[3] * 1e+06) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) n <- n + nstep - } From 478d3c4569cd4957bbef779423ee7e51686b5c0a Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Wed, 16 Jan 2019 11:41:46 +0200 Subject: [PATCH 023/133] disable NaN checks before BLAS calls deig.R (shorten matrix def) --- benchmark/scripts/R/deig.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmark/scripts/R/deig.R b/benchmark/scripts/R/deig.R index 32716471b..c6d541dcf 100755 --- a/benchmark/scripts/R/deig.R +++ b/benchmark/scripts/R/deig.R @@ -33,7 +33,7 @@ cat(sprintf(" SIZE Flops Time\n")) n <- nfrom while (n <= nto) { - A <- matrix(rnorm(n * n), ncol = n, nrow = n) + A <- matrix(rnorm(n * n), nrow = n) ev <- 0 z <- system.time(for (l in 1:loops) { ev <- eigen(A) From 3e601bd4195b24568eb4f7db2402ba3258fd82cc Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Wed, 16 Jan 2019 11:54:22 +0200 Subject: [PATCH 024/133] disable NaN checks before BLAS calls dgemm.R --- benchmark/scripts/R/dgemm.R | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/benchmark/scripts/R/dgemm.R 
b/benchmark/scripts/R/dgemm.R index 75297dfb8..d7c3e8108 100755 --- a/benchmark/scripts/R/dgemm.R +++ b/benchmark/scripts/R/dgemm.R @@ -2,6 +2,8 @@ argv <- commandArgs(trailingOnly = TRUE) +if (!is.null(options("matprod")[[1]])) options(matprod = "blas") + nfrom <- 128 nto <- 2048 nstep <- 128 @@ -19,7 +21,6 @@ if (length(argv) > 0) { loops <- as.numeric(argv[z]) } } - } p <- Sys.getenv("OPENBLAS_LOOPS") @@ -27,26 +28,13 @@ if (p != "") { loops <- as.numeric(p) } - -cat(sprintf( - "From %.0f To %.0f Step=%.0f Loops=%.0f\n", - nfrom, - nto, - nstep, - loops -)) +cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) cat(sprintf(" SIZE Flops Time\n")) n <- nfrom while (n <= nto) { - A <- matrix(runif(n * n), - ncol = n, - nrow = n, - byrow = TRUE) - B <- matrix(runif(n * n), - ncol = n, - nrow = n, - byrow = TRUE) + A <- matrix(runif(n * n), nrow = n) + B <- matrix(runif(n * n), nrow = n) C <- 1 z <- system.time(for (l in 1:loops) { @@ -54,11 +42,10 @@ while (n <= nto) { l <- l + 1 }) - mflops <- (2.0 * n * n * n) * loops / (z[3] * 1.0e6) + mflops <- (2.0 * n * n * n) * loops / (z[3] * 1e+06) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) n <- n + nstep - } From 3e9fd6359dabb1c9c8ce3fa5e980e94a3536d2c0 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Jan 2019 16:19:03 +0100 Subject: [PATCH 025/133] Bump xcode version to 10.1 to make sure it handles AVX512 --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 51679af62..ec5dc8a9b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -149,7 +149,7 @@ matrix: - &test-macos os: osx - osx_image: xcode8.3 + osx_image: xcode10.1 before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" - brew update From d5e6940253b2ee638509de283b8b1d7695fefbbf Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 17 Jan 2019 23:20:32 +0100 Subject: [PATCH 026/133] Fix declaration of input arguments in the x86_64 microkernels for DOT and AXPY (#1965) * Tag operands 0 and 1 as both input and output For #1964 (basically a continuation of coding problems first seen in #1292) --- kernel/x86_64/caxpy_microk_bulldozer-2.c | 14 +++++++------- kernel/x86_64/caxpy_microk_haswell-2.c | 6 +++--- kernel/x86_64/caxpy_microk_sandy-2.c | 8 ++++---- kernel/x86_64/caxpy_microk_steamroller-2.c | 14 +++++++------- kernel/x86_64/cdot_microk_bulldozer-2.c | 14 +++++++------- kernel/x86_64/cdot_microk_haswell-2.c | 6 +++--- kernel/x86_64/cdot_microk_sandy-2.c | 8 ++++---- kernel/x86_64/cdot_microk_steamroller-2.c | 14 +++++++------- kernel/x86_64/daxpy_microk_bulldozer-2.c | 6 +++--- kernel/x86_64/daxpy_microk_haswell-2.c | 8 ++++---- kernel/x86_64/daxpy_microk_nehalem-2.c | 6 +++--- kernel/x86_64/daxpy_microk_piledriver-2.c | 16 ++++++++-------- kernel/x86_64/daxpy_microk_sandy-2.c | 8 ++++---- kernel/x86_64/daxpy_microk_steamroller-2.c | 16 ++++++++-------- kernel/x86_64/ddot_microk_bulldozer-2.c | 8 ++++---- kernel/x86_64/ddot_microk_haswell-2.c | 6 +++--- kernel/x86_64/ddot_microk_nehalem-2.c | 8 ++++---- kernel/x86_64/ddot_microk_piledriver-2.c | 16 ++++++++-------- kernel/x86_64/ddot_microk_sandy-2.c | 8 ++++---- kernel/x86_64/ddot_microk_steamroller-2.c | 8 ++++---- kernel/x86_64/saxpy_microk_haswell-2.c | 8 ++++---- kernel/x86_64/saxpy_microk_nehalem-2.c | 6 +++--- kernel/x86_64/saxpy_microk_piledriver-2.c | 16 ++++++++-------- kernel/x86_64/saxpy_microk_sandy-2.c | 8 ++++---- 
kernel/x86_64/sdot_microk_bulldozer-2.c | 8 ++++---- kernel/x86_64/sdot_microk_haswell-2.c | 8 ++++---- kernel/x86_64/sdot_microk_nehalem-2.c | 8 ++++---- kernel/x86_64/sdot_microk_sandy-2.c | 8 ++++---- kernel/x86_64/sdot_microk_steamroller-2.c | 16 ++++++++-------- kernel/x86_64/zaxpy_microk_bulldozer-2.c | 16 ++++++++-------- kernel/x86_64/zaxpy_microk_haswell-2.c | 8 ++++---- kernel/x86_64/zaxpy_microk_sandy-2.c | 16 ++++++++-------- kernel/x86_64/zaxpy_microk_steamroller-2.c | 16 ++++++++-------- kernel/x86_64/zdot_microk_bulldozer-2.c | 16 ++++++++-------- kernel/x86_64/zdot_microk_haswell-2.c | 16 ++++++++-------- kernel/x86_64/zdot_microk_sandy-2.c | 16 ++++++++-------- kernel/x86_64/zdot_microk_steamroller-2.c | 16 ++++++++-------- 37 files changed, 202 insertions(+), 202 deletions(-) diff --git a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c index 33bda0943..ca2209340 100644 --- a/kernel/x86_64/caxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c @@ -114,9 +114,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 @@ -180,10 +180,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/caxpy_microk_haswell-2.c b/kernel/x86_64/caxpy_microk_haswell-2.c index 00e2e6a42..b605ea34c 100644 --- a/kernel/x86_64/caxpy_microk_haswell-2.c +++ b/kernel/x86_64/caxpy_microk_haswell-2.c @@ -112,9 +112,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/caxpy_microk_sandy-2.c b/kernel/x86_64/caxpy_microk_sandy-2.c index a798fd977..72d37afed 100644 --- a/kernel/x86_64/caxpy_microk_sandy-2.c +++ b/kernel/x86_64/caxpy_microk_sandy-2.c @@ -95,10 +95,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/caxpy_microk_steamroller-2.c b/kernel/x86_64/caxpy_microk_steamroller-2.c index 87370b032..7ca7af070 100644 --- a/kernel/x86_64/caxpy_microk_steamroller-2.c +++ b/kernel/x86_64/caxpy_microk_steamroller-2.c @@ -113,10 +113,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 @@ -181,9 +181,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/cdot_microk_bulldozer-2.c b/kernel/x86_64/cdot_microk_bulldozer-2.c index f587aa036..118655913 100644 --- a/kernel/x86_64/cdot_microk_bulldozer-2.c +++ b/kernel/x86_64/cdot_microk_bulldozer-2.c @@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : 
"r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -175,10 +175,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/cdot_microk_haswell-2.c b/kernel/x86_64/cdot_microk_haswell-2.c index fe195a63b..8b9d6d104 100644 --- a/kernel/x86_64/cdot_microk_haswell-2.c +++ b/kernel/x86_64/cdot_microk_haswell-2.c @@ -98,9 +98,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/cdot_microk_sandy-2.c b/kernel/x86_64/cdot_microk_sandy-2.c index 01816917d..fe142c38f 100644 --- a/kernel/x86_64/cdot_microk_sandy-2.c +++ b/kernel/x86_64/cdot_microk_sandy-2.c @@ -105,10 +105,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/cdot_microk_steamroller-2.c b/kernel/x86_64/cdot_microk_steamroller-2.c index 76a3aa0eb..7350b21c9 100644 --- a/kernel/x86_64/cdot_microk_steamroller-2.c +++ b/kernel/x86_64/cdot_microk_steamroller-2.c @@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -175,10 +175,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/daxpy_microk_bulldozer-2.c b/kernel/x86_64/daxpy_microk_bulldozer-2.c index 8c520dcf1..9c1305b97 100644 --- a/kernel/x86_64/daxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/daxpy_microk_bulldozer-2.c @@ -64,9 +64,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/daxpy_microk_haswell-2.c b/kernel/x86_64/daxpy_microk_haswell-2.c index bbe8b9550..f3682e6d7 100644 --- a/kernel/x86_64/daxpy_microk_haswell-2.c +++ b/kernel/x86_64/daxpy_microk_haswell-2.c @@ -59,10 +59,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/daxpy_microk_nehalem-2.c b/kernel/x86_64/daxpy_microk_nehalem-2.c index 943d893af..8feb9f26c 100644 --- a/kernel/x86_64/daxpy_microk_nehalem-2.c +++ b/kernel/x86_64/daxpy_microk_nehalem-2.c @@ -73,9 +73,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/daxpy_microk_piledriver-2.c b/kernel/x86_64/daxpy_microk_piledriver-2.c index 95eb953b4..4b83124c7 100644 --- a/kernel/x86_64/daxpy_microk_piledriver-2.c +++ b/kernel/x86_64/daxpy_microk_piledriver-2.c @@ -78,10 +78,10 @@ 
static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "subq $16, %1 \n\t" "jnz 1b \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 @@ -140,10 +140,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "subq $16, %1 \n\t" "jnz 1b \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/daxpy_microk_sandy-2.c b/kernel/x86_64/daxpy_microk_sandy-2.c index 85e038cef..db9a45de8 100644 --- a/kernel/x86_64/daxpy_microk_sandy-2.c +++ b/kernel/x86_64/daxpy_microk_sandy-2.c @@ -99,10 +99,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/daxpy_microk_steamroller-2.c b/kernel/x86_64/daxpy_microk_steamroller-2.c index e40009037..8e63fcc1d 100644 --- a/kernel/x86_64/daxpy_microk_steamroller-2.c +++ b/kernel/x86_64/daxpy_microk_steamroller-2.c @@ -78,10 +78,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "subq $16, %1 \n\t" "jnz 1b \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 @@ -140,10 +140,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "subq $16, %1 \n\t" "jnz 1b \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/ddot_microk_bulldozer-2.c b/kernel/x86_64/ddot_microk_bulldozer-2.c index 9756ee46a..5590c5b17 100644 --- a/kernel/x86_64/ddot_microk_bulldozer-2.c +++ b/kernel/x86_64/ddot_microk_bulldozer-2.c @@ -65,10 +65,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovsd %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/ddot_microk_haswell-2.c b/kernel/x86_64/ddot_microk_haswell-2.c index 365737363..dbb5487f7 100644 --- a/kernel/x86_64/ddot_microk_haswell-2.c +++ b/kernel/x86_64/ddot_microk_haswell-2.c @@ -77,9 +77,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/ddot_microk_nehalem-2.c b/kernel/x86_64/ddot_microk_nehalem-2.c index fb5ec9bca..e5e234e22 100644 --- a/kernel/x86_64/ddot_microk_nehalem-2.c +++ b/kernel/x86_64/ddot_microk_nehalem-2.c @@ -75,10 +75,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "movsd %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/ddot_microk_piledriver-2.c b/kernel/x86_64/ddot_microk_piledriver-2.c index ac950885c..cc4bcd90a 100644 --- a/kernel/x86_64/ddot_microk_piledriver-2.c +++ b/kernel/x86_64/ddot_microk_piledriver-2.c @@ -81,10 +81,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovsd %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -145,10 +145,10 @@ static void 
ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovsd %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/ddot_microk_sandy-2.c b/kernel/x86_64/ddot_microk_sandy-2.c index 160f95604..84493ec27 100644 --- a/kernel/x86_64/ddot_microk_sandy-2.c +++ b/kernel/x86_64/ddot_microk_sandy-2.c @@ -81,10 +81,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovsd %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/ddot_microk_steamroller-2.c b/kernel/x86_64/ddot_microk_steamroller-2.c index 5ce20b5de..27d5244ce 100644 --- a/kernel/x86_64/ddot_microk_steamroller-2.c +++ b/kernel/x86_64/ddot_microk_steamroller-2.c @@ -78,10 +78,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovsd %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/saxpy_microk_haswell-2.c b/kernel/x86_64/saxpy_microk_haswell-2.c index 3a743d64c..7099ba4c6 100644 --- a/kernel/x86_64/saxpy_microk_haswell-2.c +++ b/kernel/x86_64/saxpy_microk_haswell-2.c @@ -59,10 +59,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/saxpy_microk_nehalem-2.c b/kernel/x86_64/saxpy_microk_nehalem-2.c index 68f68ea3a..88bbb695d 100644 --- a/kernel/x86_64/saxpy_microk_nehalem-2.c +++ b/kernel/x86_64/saxpy_microk_nehalem-2.c @@ -73,9 +73,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/saxpy_microk_piledriver-2.c b/kernel/x86_64/saxpy_microk_piledriver-2.c index 204cf8bac..5feea7f24 100644 --- a/kernel/x86_64/saxpy_microk_piledriver-2.c +++ b/kernel/x86_64/saxpy_microk_piledriver-2.c @@ -78,10 +78,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 @@ -139,10 +139,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/saxpy_microk_sandy-2.c b/kernel/x86_64/saxpy_microk_sandy-2.c index 0a6bef046..0d448d5f8 100644 --- a/kernel/x86_64/saxpy_microk_sandy-2.c +++ b/kernel/x86_64/saxpy_microk_sandy-2.c @@ -99,10 +99,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/sdot_microk_bulldozer-2.c b/kernel/x86_64/sdot_microk_bulldozer-2.c index 36e61b077..8958a33dc 100644 --- a/kernel/x86_64/sdot_microk_bulldozer-2.c +++ b/kernel/x86_64/sdot_microk_bulldozer-2.c @@ -66,10 +66,10 @@ static void sdot_kernel_16( 
BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovss %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/sdot_microk_haswell-2.c b/kernel/x86_64/sdot_microk_haswell-2.c index df367b61f..91dc928d3 100644 --- a/kernel/x86_64/sdot_microk_haswell-2.c +++ b/kernel/x86_64/sdot_microk_haswell-2.c @@ -79,10 +79,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovss %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/sdot_microk_nehalem-2.c b/kernel/x86_64/sdot_microk_nehalem-2.c index 1a27177f5..5a715d008 100644 --- a/kernel/x86_64/sdot_microk_nehalem-2.c +++ b/kernel/x86_64/sdot_microk_nehalem-2.c @@ -75,10 +75,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "movss %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/sdot_microk_sandy-2.c b/kernel/x86_64/sdot_microk_sandy-2.c index ca13536f2..ae25d5a50 100644 --- a/kernel/x86_64/sdot_microk_sandy-2.c +++ b/kernel/x86_64/sdot_microk_sandy-2.c @@ -82,10 +82,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovss %%xmm4, (%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/sdot_microk_steamroller-2.c b/kernel/x86_64/sdot_microk_steamroller-2.c index 6b8b2566b..bf6a5f287 100644 --- a/kernel/x86_64/sdot_microk_steamroller-2.c +++ b/kernel/x86_64/sdot_microk_steamroller-2.c @@ -80,10 +80,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovss %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -143,10 +143,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovss %%xmm4, (%4) \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c index 0e15761f7..15d367971 100644 --- a/kernel/x86_64/zaxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c @@ -113,10 +113,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 @@ -180,10 +180,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/zaxpy_microk_haswell-2.c b/kernel/x86_64/zaxpy_microk_haswell-2.c index 30e8b1955..89d23daf3 100644 --- a/kernel/x86_64/zaxpy_microk_haswell-2.c +++ b/kernel/x86_64/zaxpy_microk_haswell-2.c @@ -111,10 +111,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git 
a/kernel/x86_64/zaxpy_microk_sandy-2.c b/kernel/x86_64/zaxpy_microk_sandy-2.c index 233af143a..17b8b24f7 100644 --- a/kernel/x86_64/zaxpy_microk_sandy-2.c +++ b/kernel/x86_64/zaxpy_microk_sandy-2.c @@ -99,10 +99,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 @@ -176,10 +176,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/zaxpy_microk_steamroller-2.c b/kernel/x86_64/zaxpy_microk_steamroller-2.c index 728d09213..907b1ae00 100644 --- a/kernel/x86_64/zaxpy_microk_steamroller-2.c +++ b/kernel/x86_64/zaxpy_microk_steamroller-2.c @@ -113,10 +113,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 @@ -180,10 +180,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "jnz 1b \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha), // 4 diff --git a/kernel/x86_64/zdot_microk_bulldozer-2.c b/kernel/x86_64/zdot_microk_bulldozer-2.c index 30a9552d6..db9a48cce 100644 --- a/kernel/x86_64/zdot_microk_bulldozer-2.c +++ b/kernel/x86_64/zdot_microk_bulldozer-2.c @@ -96,10 +96,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -175,10 +175,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c index 11056a3c1..9f2fc2c1d 100644 --- a/kernel/x86_64/zdot_microk_haswell-2.c +++ b/kernel/x86_64/zdot_microk_haswell-2.c @@ -101,10 +101,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -186,10 +186,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 diff --git a/kernel/x86_64/zdot_microk_sandy-2.c b/kernel/x86_64/zdot_microk_sandy-2.c index 87c5b0340..33415e26e 100644 --- a/kernel/x86_64/zdot_microk_sandy-2.c +++ b/kernel/x86_64/zdot_microk_sandy-2.c @@ -107,10 +107,10 @@ if ( n < 1280 ) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -199,10 +199,10 @@ if ( n < 1280 ) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" 
(dot) // 4 diff --git a/kernel/x86_64/zdot_microk_steamroller-2.c b/kernel/x86_64/zdot_microk_steamroller-2.c index 325f74ae3..87138fe9a 100644 --- a/kernel/x86_64/zdot_microk_steamroller-2.c +++ b/kernel/x86_64/zdot_microk_steamroller-2.c @@ -95,10 +95,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 @@ -172,10 +172,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) "vmovups %%xmm4, 16(%4) \n\t" "vzeroupper \n\t" - : - : - "r" (i), // 0 - "r" (n), // 1 + : + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (dot) // 4 From b495e54310a99049c50c20425269f4b026b47dbb Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 18 Jan 2019 08:11:07 +0100 Subject: [PATCH 027/133] Fix declaration of input arguments in the x86_64 SCAL microkernels (#1966) * Tag arguments 0 and 1 as both input and output (see #1964) --- kernel/x86_64/cscal_microk_bulldozer-2.c | 32 +++++++++++----------- kernel/x86_64/cscal_microk_haswell-2.c | 30 ++++++++++---------- kernel/x86_64/cscal_microk_steamroller-2.c | 32 +++++++++++----------- kernel/x86_64/dscal_microk_bulldozer-2.c | 12 ++++---- kernel/x86_64/dscal_microk_haswell-2.c | 12 ++++---- kernel/x86_64/dscal_microk_sandy-2.c | 12 ++++---- kernel/x86_64/zscal_microk_bulldozer-2.c | 28 +++++++++---------- kernel/x86_64/zscal_microk_haswell-2.c | 32 +++++++++++----------- kernel/x86_64/zscal_microk_steamroller-2.c | 32 +++++++++++----------- 9 files changed, 111 insertions(+), 111 deletions(-) diff --git a/kernel/x86_64/cscal_microk_bulldozer-2.c b/kernel/x86_64/cscal_microk_bulldozer-2.c index 3abffc4cf..31451aa6c 100644 --- a/kernel/x86_64/cscal_microk_bulldozer-2.c +++ b/kernel/x86_64/cscal_microk_bulldozer-2.c @@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -208,11 +208,11 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -285,11 +285,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -330,11 +330,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", diff --git a/kernel/x86_64/cscal_microk_haswell-2.c b/kernel/x86_64/cscal_microk_haswell-2.c index 0a4eb683c..a04a4c4ab 100644 --- a/kernel/x86_64/cscal_microk_haswell-2.c +++ b/kernel/x86_64/cscal_microk_haswell-2.c @@ -116,11 +116,11 @@ static void 
cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"0", "1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -208,9 +208,9 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 : "cc", // "0", "1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", @@ -285,9 +285,9 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", @@ -329,12 +329,12 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" - : - : - "r" (n), // 0 - "r" (x), // 1 + : + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"0", "1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", diff --git a/kernel/x86_64/cscal_microk_steamroller-2.c b/kernel/x86_64/cscal_microk_steamroller-2.c index 8346e1748..e8073d485 100644 --- a/kernel/x86_64/cscal_microk_steamroller-2.c +++ b/kernel/x86_64/cscal_microk_steamroller-2.c @@ -117,11 +117,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"0", "1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -208,12 +208,12 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" + : + "+r" (n), // 0 + "+r" (x) // 1 : - : - "r" (n), // 0 - "r" (x), // 1 "r" (alpha) // 2 - : "cc", //"0", "1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -286,11 +286,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -331,11 +331,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"0", "1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", diff --git a/kernel/x86_64/dscal_microk_bulldozer-2.c b/kernel/x86_64/dscal_microk_bulldozer-2.c index de53b0bc4..096662781 100644 --- a/kernel/x86_64/dscal_microk_bulldozer-2.c +++ b/kernel/x86_64/dscal_microk_bulldozer-2.c @@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n1), // 0 - "r" (x), // 1 + "+r" (n1), // 0 + "+r" (x) // 1 + : "r" (alpha), // 2 "r" (n2) // 3 : "cc", @@ -188,9 +188,9 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n1), // 0 - "r" (x), // 1 + "+r" (n1), // 0 + "+r" (x) // 1 + : "r" (alpha), // 2 "r" (n2) // 3 : "cc", diff --git a/kernel/x86_64/dscal_microk_haswell-2.c b/kernel/x86_64/dscal_microk_haswell-2.c index 
e732a2718..77ed59a4e 100644 --- a/kernel/x86_64/dscal_microk_haswell-2.c +++ b/kernel/x86_64/dscal_microk_haswell-2.c @@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n1), // 0 - "r" (x), // 1 + "+r" (n1), // 0 + "+r" (x) // 1 + : "r" (alpha), // 2 "r" (n2) // 3 : "cc", @@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" + : + "+r" (n1), // 0 + "+r" (x) // 1 : - : - "r" (n1), // 0 - "r" (x), // 1 "r" (alpha), // 2 "r" (n2) // 3 : "cc", diff --git a/kernel/x86_64/dscal_microk_sandy-2.c b/kernel/x86_64/dscal_microk_sandy-2.c index 8d855072b..9982b8e58 100644 --- a/kernel/x86_64/dscal_microk_sandy-2.c +++ b/kernel/x86_64/dscal_microk_sandy-2.c @@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n1), // 0 - "r" (x), // 1 + "+r" (n1), // 0 + "+r" (x) // 1 + : "r" (alpha), // 2 "r" (n2) // 3 : "cc", @@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" + : + "+r" (n1), // 0 + "+r" (x) // 1 : - : - "r" (n1), // 0 - "r" (x), // 1 "r" (alpha), // 2 "r" (n2) // 3 : "cc", diff --git a/kernel/x86_64/zscal_microk_bulldozer-2.c b/kernel/x86_64/zscal_microk_bulldozer-2.c index 03882d6b6..5e733ffda 100644 --- a/kernel/x86_64/zscal_microk_bulldozer-2.c +++ b/kernel/x86_64/zscal_microk_bulldozer-2.c @@ -116,11 +116,11 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -208,11 +208,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -285,9 +285,9 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", @@ -329,10 +329,10 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" + : + "+r" (n), // 0 + "+r" (x) // 1 : - : - "r" (n), // 0 - "r" (x), // 1 "r" (alpha) // 2 : "cc", //"%0", "%1", "%xmm0", "%xmm1", "%xmm2", "%xmm3", diff --git a/kernel/x86_64/zscal_microk_haswell-2.c b/kernel/x86_64/zscal_microk_haswell-2.c index d9253c1ed..8c8f5b75c 100644 --- a/kernel/x86_64/zscal_microk_haswell-2.c +++ b/kernel/x86_64/zscal_microk_haswell-2.c @@ -116,11 +116,11 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -208,11 +208,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ 
-285,11 +285,11 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -330,11 +330,11 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", diff --git a/kernel/x86_64/zscal_microk_steamroller-2.c b/kernel/x86_64/zscal_microk_steamroller-2.c index 97b07add6..c9267ee0c 100644 --- a/kernel/x86_64/zscal_microk_steamroller-2.c +++ b/kernel/x86_64/zscal_microk_steamroller-2.c @@ -116,12 +116,12 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" + : + "+r" (n), // 0 + "+r" (x) // 1 : - : - "r" (n), // 0 - "r" (x), // 1 "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -209,11 +209,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -286,11 +286,11 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", @@ -331,11 +331,11 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) "vzeroupper \n\t" : - : - "r" (n), // 0 - "r" (x), // 1 + "+r" (n), // 0 + "+r" (x) // 1 + : "r" (alpha) // 2 - : "cc", //"%0", "%1", + : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%xmm11", From 32b0f1168ec5eb93e146245d732c5a2fa9d73282 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 18 Jan 2019 08:11:39 +0100 Subject: [PATCH 028/133] Fix declaration of input arguments in the Sandybridge GER microkernels (#1967) * Tag arguments 0 and 1 as both input and output --- kernel/x86_64/dger_microk_sandy-2.c | 6 +++--- kernel/x86_64/sger_microk_sandy-2.c | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/x86_64/dger_microk_sandy-2.c b/kernel/x86_64/dger_microk_sandy-2.c index 2bf966a5f..e8494500f 100644 --- a/kernel/x86_64/dger_microk_sandy-2.c +++ b/kernel/x86_64/dger_microk_sandy-2.c @@ -105,9 +105,9 @@ static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 diff --git a/kernel/x86_64/sger_microk_sandy-2.c b/kernel/x86_64/sger_microk_sandy-2.c index 79180b991..14f13475b 100644 --- a/kernel/x86_64/sger_microk_sandy-2.c +++ b/kernel/x86_64/sger_microk_sandy-2.c @@ -105,9 +105,9 @@ static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 
1 + : "r" (x), // 2 "r" (y), // 3 "r" (alpha) // 4 From cda81cfae0e3dc18b1c2e9d05d6e0f8e1bec3917 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Jan 2019 00:10:01 +0100 Subject: [PATCH 029/133] Shift transition to multithreading towards larger matrix sizes See #1886 and JuliaRobotics issue 500. trsm benchmarks on Haswell and Zen showed that with these values performance is roughly doubled for matrix sizes between 8x8 and 14x14, and still 10 to 20 percent better near the new cutoff at 32x32. --- interface/trsm.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/interface/trsm.c b/interface/trsm.c index 5c2750e79..faec03ac2 100644 --- a/interface/trsm.c +++ b/interface/trsm.c @@ -81,6 +81,12 @@ #endif #endif +#ifndef COMPLEX +#define SMP_FACTOR 8 +#else +#define SMP_FACTOR 4 +#endif + static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { #ifndef TRMM TRSM_LNUU, TRSM_LNUN, TRSM_LNLU, TRSM_LNLN, @@ -366,10 +372,10 @@ void CNAME(enum CBLAS_ORDER order, mode |= (trans << BLAS_TRANSA_SHIFT); mode |= (side << BLAS_RSIDE_SHIFT); - if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD ) + if ( args.m < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD ) args.nthreads = 1; else - if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD ) + if ( args.n < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD ) args.nthreads = 1; else args.nthreads = num_cpu_avail(3); From bbfdd6c0fe1e7d90099fe14f1e1f2fd775a47a36 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 19 Jan 2019 23:01:31 +0100 Subject: [PATCH 030/133] Increase Zen SWITCH_RATIO to 16 following GEMM benchmarks on Ryzen2700X. For #1464 --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index fa6730208..15ea663a8 100644 --- a/param.h +++ b/param.h @@ -605,7 +605,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 8 -#define SWITCH_RATIO 4 +#define SWITCH_RATIO 16 #ifdef ARCH_X86 From 83b5c6b92dc6f66becae1418beef60042eb92c6d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 20 Jan 2019 12:18:53 +0100 Subject: [PATCH 031/133] Fix compilation with NO_AVX=1 set fixes #1974 --- cpuid_x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 726014033..c45ddd968 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -228,7 +228,7 @@ int support_avx2(){ } int support_avx512(){ -#ifndef NO_AVX512 +#if !defined(NO_AVX) && !defined(NO_AVX512) int eax, ebx, ecx, edx; int ret=0; From b111829226874550c524b36882ff84c90008f494 Mon Sep 17 00:00:00 2001 From: maamountki Date: Mon, 21 Jan 2019 15:56:04 +0200 Subject: [PATCH 032/133] [ZARCH] Update max/min functions --- kernel/zarch/camax.c | 162 +++++++++++++++++-------------------- kernel/zarch/camin.c | 180 +++++++++++++++++++----------------------- kernel/zarch/damax.c | 108 ++++++++----------------- kernel/zarch/damin.c | 110 ++++++++------------------ kernel/zarch/dmax.c | 89 ++++++++------------- kernel/zarch/dmin.c | 89 ++++++++------------- kernel/zarch/icamax.c | 33 ++++---- kernel/zarch/icamin.c | 31 ++++---- kernel/zarch/idamax.c | 51 ++++++------ kernel/zarch/idamin.c | 51 ++++++------ kernel/zarch/idmax.c | 51 ++++++------ kernel/zarch/idmin.c | 51 ++++++------ kernel/zarch/isamax.c | 55 +++++++------ kernel/zarch/isamin.c | 55 +++++++------ kernel/zarch/ismax.c | 55 +++++++------ kernel/zarch/ismin.c | 55 +++++++------ kernel/zarch/izamax.c | 27 ++++--- kernel/zarch/izamin.c | 27 ++++--- kernel/zarch/samax.c | 111 ++++++++------------------ kernel/zarch/samin.c | 111 ++++++++------------------ kernel/zarch/smax.c | 92 ++++++++------------- kernel/zarch/smin.c | 92 ++++++++------------- kernel/zarch/zamax.c | 118 +++++++++++++-------------- kernel/zarch/zamin.c | 118 +++++++++++++-------------- 24 files changed, 805 insertions(+), 1117 deletions(-) diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index 2c913b62e..66d250896 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -55,7 +55,7 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" - "pfd 1, 1024(%2) \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" "vlef %%v16,0(%%r1,%2),0 \n\t" "vlef %%v17,4(%%r1,%2),0 \n\t" @@ -93,100 +93,88 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) "vlef %%v22,120(%%r1,%2),3 \n\t" "vlef %%v23,124(%%r1,%2),3 \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" - - "vfchsb %%v24,%%v16,%%v17 \n\t" - "vfchsb %%v25,%%v18,%%v19 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchsb %%v26,%%v24,%%v25 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchsb %%v27,%%v26,%%v0 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "vlef %%v16,128(%%r1,%2),0 \n\t" - "vlef %%v17,132(%%r1,%2),0 \n\t" - "vlef %%v16,136(%%r1,%2),1 \n\t" - "vlef %%v17,140(%%r1,%2),1 \n\t" - "vlef %%v16,144(%%r1,%2),2 \n\t" - "vlef %%v17,148(%%r1,%2),2 \n\t" - "vlef %%v16,152(%%r1,%2),3 \n\t" - "vlef %%v17,156(%%r1,%2),3 \n\t" - - "vlef %%v18,160(%%r1,%2),0 \n\t" - "vlef %%v19,164(%%r1,%2),0 \n\t" - "vlef %%v18,168(%%r1,%2),1 \n\t" - "vlef 
%%v19,172(%%r1,%2),1 \n\t" - "vlef %%v18,176(%%r1,%2),2 \n\t" - "vlef %%v19,180(%%r1,%2),2 \n\t" - "vlef %%v18,184(%%r1,%2),3 \n\t" - "vlef %%v19,188(%%r1,%2),3 \n\t" - - "vlef %%v20,192(%%r1,%2),0 \n\t" - "vlef %%v21,196(%%r1,%2),0 \n\t" - "vlef %%v20,200(%%r1,%2),1 \n\t" - "vlef %%v21,204(%%r1,%2),1 \n\t" - "vlef %%v20,208(%%r1,%2),2 \n\t" - "vlef %%v21,212(%%r1,%2),2 \n\t" - "vlef %%v20,216(%%r1,%2),3 \n\t" - "vlef %%v21,220(%%r1,%2),3 \n\t" - - "vlef %%v22,224(%%r1,%2),0 \n\t" - "vlef %%v23,228(%%r1,%2),0 \n\t" - "vlef %%v22,232(%%r1,%2),1 \n\t" - "vlef %%v23,236(%%r1,%2),1 \n\t" - "vlef %%v22,240(%%r1,%2),2 \n\t" - "vlef %%v23,244(%%r1,%2),2 \n\t" - "vlef %%v22,248(%%r1,%2),3 \n\t" - "vlef %%v23,252(%%r1,%2),3 \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" + "vlef %%v24,128(%%r1,%2),0 \n\t" + "vlef %%v25,132(%%r1,%2),0 \n\t" + "vlef %%v24,136(%%r1,%2),1 \n\t" + "vlef %%v25,140(%%r1,%2),1 \n\t" + "vlef %%v24,144(%%r1,%2),2 \n\t" + "vlef %%v25,148(%%r1,%2),2 \n\t" + "vlef %%v24,152(%%r1,%2),3 \n\t" + "vlef %%v25,156(%%r1,%2),3 \n\t" + + "vlef %%v26,160(%%r1,%2),0 \n\t" + "vlef %%v27,164(%%r1,%2),0 \n\t" + "vlef %%v26,168(%%r1,%2),1 \n\t" + "vlef %%v27,172(%%r1,%2),1 \n\t" + "vlef %%v26,176(%%r1,%2),2 \n\t" + "vlef %%v27,180(%%r1,%2),2 \n\t" + "vlef %%v26,184(%%r1,%2),3 \n\t" + "vlef %%v27,188(%%r1,%2),3 \n\t" + + "vlef %%v28,192(%%r1,%2),0 \n\t" + "vlef %%v29,196(%%r1,%2),0 \n\t" + "vlef %%v28,200(%%r1,%2),1 \n\t" + "vlef %%v29,204(%%r1,%2),1 \n\t" + "vlef %%v28,208(%%r1,%2),2 \n\t" + "vlef %%v29,212(%%r1,%2),2 \n\t" + "vlef %%v28,216(%%r1,%2),3 \n\t" + "vlef %%v29,220(%%r1,%2),3 \n\t" + + "vlef %%v30,224(%%r1,%2),0 \n\t" + "vlef %%v31,228(%%r1,%2),0 \n\t" + "vlef %%v30,232(%%r1,%2),1 \n\t" + "vlef %%v31,236(%%r1,%2),1 \n\t" + "vlef %%v30,240(%%r1,%2),2 \n\t" + "vlef %%v31,244(%%r1,%2),2 \n\t" + "vlef %%v30,248(%%r1,%2),3 \n\t" + "vlef %%v31,252(%%r1,%2),3 \n\t" + + "vflpsb %%v16,%%v16 \n\t" + "vflpsb %%v17,%%v17 \n\t" + "vflpsb %%v18,%%v18 \n\t" + "vflpsb %%v19,%%v19 \n\t" + "vflpsb %%v20,%%v20 \n\t" + "vflpsb %%v21,%%v21 \n\t" + "vflpsb %%v22,%%v22 \n\t" + "vflpsb %%v23,%%v23 \n\t" + "vflpsb %%v24,%%v24 \n\t" + "vflpsb %%v25,%%v25 \n\t" + "vflpsb %%v26,%%v26 \n\t" + "vflpsb %%v27,%%v27 \n\t" + "vflpsb %%v28,%%v28 \n\t" + "vflpsb %%v29,%%v29 \n\t" + "vflpsb %%v30,%%v30 \n\t" + "vflpsb %%v31,%%v31 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" + "vfasb %%v18,%%v18,%%v19 \n\t" + "vfasb %%v20,%%v20,%%v21 \n\t" + "vfasb %%v22,%%v22,%%v23 \n\t" + "vfasb %%v24,%%v24,%%v25 \n\t" + "vfasb %%v26,%%v26,%%v27 \n\t" + "vfasb %%v28,%%v28,%%v29 \n\t" + "vfasb %%v30,%%v30,%%v31 \n\t" - "vfchsb %%v24,%%v16,%%v17 \n\t" - "vfchsb %%v25,%%v18,%%v19 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vfmaxsb %%v16,%%v16,%%v24,0 \n\t" + "vfmaxsb %%v18,%%v18,%%v26,0 \n\t" + "vfmaxsb %%v20,%%v20,%%v28,0 \n\t" + "vfmaxsb %%v22,%%v22,%%v30,0 \n\t" + + "vfmaxsb %%v16,%%v16,%%v20,0 \n\t" + "vfmaxsb %%v18,%%v18,%%v22,0 \n\t" - "vfchsb %%v26,%%v24,%%v25 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + "vfmaxsb %%v16,%%v16,%%v18,0 \n\t" - "vfchsb %%v27,%%v26,%%v0 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "veslg %%v16,%%v0,32 
\n\t" - "vfchsb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" "vrepf %%v16,%%v0,2 \n\t" - "wfchsb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "wfmaxsb %%v0,%%v0,%%v16,0 \n\t" "ler %0,%%f0 " :"=f"(amax) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -233,11 +221,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { maxf=CABS1(x,0); inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; - BLASLONG n1 = (n - 1) & -4; - while ((i - 1) < n1) { + BLASLONG n1 = n & -4; + while (i < n1) { if (CABS1(x,ix) > maxf) { maxf = CABS1(x,ix); diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index 733f98fbf..5abc685b2 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -43,8 +43,8 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) __asm__ volatile ( "vlef %%v0,0(%2),0 \n\t" "vlef %%v16,4(%2),0 \n\t" - "vlef %%v0,8(%2),0 \n\t" - "vlef %%v16,12(%2),0 \n\t" + "vlef %%v0,8(%2),1 \n\t" + "vlef %%v16,12(%2),1 \n\t" "vlef %%v0,16(%2),2 \n\t" "vlef %%v16,20(%2),2 \n\t" "vlef %%v0,24(%2),3 \n\t" @@ -59,8 +59,8 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) "vlef %%v16,0(%%r1,%2),0 \n\t" "vlef %%v17,4(%%r1,%2),0 \n\t" - "vlef %%v16,8(%%r1,%2),0 \n\t" - "vlef %%v17,12(%%r1,%2),0 \n\t" + "vlef %%v16,8(%%r1,%2),1 \n\t" + "vlef %%v17,12(%%r1,%2),1 \n\t" "vlef %%v16,16(%%r1,%2),2 \n\t" "vlef %%v17,20(%%r1,%2),2 \n\t" "vlef %%v16,24(%%r1,%2),3 \n\t" @@ -68,8 +68,8 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) "vlef %%v18,32(%%r1,%2),0 \n\t" "vlef %%v19,36(%%r1,%2),0 \n\t" - "vlef %%v18,40(%%r1,%2),0 \n\t" - "vlef %%v19,44(%%r1,%2),0 \n\t" + "vlef %%v18,40(%%r1,%2),1 \n\t" + "vlef %%v19,44(%%r1,%2),1 \n\t" "vlef %%v18,48(%%r1,%2),2 \n\t" "vlef %%v19,52(%%r1,%2),2 \n\t" "vlef %%v18,56(%%r1,%2),3 \n\t" @@ -77,8 +77,8 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) "vlef %%v20,64(%%r1,%2),0 \n\t" "vlef %%v21,68(%%r1,%2),0 \n\t" - "vlef %%v20,72(%%r1,%2),0 \n\t" - "vlef %%v21,76(%%r1,%2),0 \n\t" + "vlef %%v20,72(%%r1,%2),1 \n\t" + "vlef %%v21,76(%%r1,%2),1 \n\t" "vlef %%v20,80(%%r1,%2),2 \n\t" "vlef %%v21,84(%%r1,%2),2 \n\t" "vlef %%v20,88(%%r1,%2),3 \n\t" @@ -86,107 +86,95 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) "vlef %%v22,96(%%r1,%2),0 \n\t" "vlef %%v23,100(%%r1,%2),0 \n\t" - "vlef %%v22,104(%%r1,%2),0 \n\t" - "vlef %%v23,108(%%r1,%2),0 \n\t" + "vlef %%v22,104(%%r1,%2),1 \n\t" + "vlef %%v23,108(%%r1,%2),1 \n\t" "vlef %%v22,112(%%r1,%2),2 \n\t" "vlef %%v23,116(%%r1,%2),2 \n\t" "vlef %%v22,120(%%r1,%2),3 \n\t" "vlef %%v23,124(%%r1,%2),3 \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" - - "vfchsb %%v24,%%v17,%%v16 \n\t" - "vfchsb %%v25,%%v19,%%v18 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchsb %%v26,%%v25,%%v24 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchsb %%v27,%%v0,%%v26 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "vlef %%v16,128(%%r1,%2),0 \n\t" - "vlef %%v17,132(%%r1,%2),0 \n\t" - "vlef %%v16,136(%%r1,%2),0 \n\t" - "vlef %%v17,140(%%r1,%2),0 \n\t" - "vlef %%v16,144(%%r1,%2),2 \n\t" - "vlef %%v17,148(%%r1,%2),2 \n\t" - "vlef %%v16,152(%%r1,%2),3 \n\t" - "vlef %%v17,156(%%r1,%2),3 \n\t" - - "vlef %%v18,160(%%r1,%2),0 \n\t" - "vlef 
%%v19,164(%%r1,%2),0 \n\t" - "vlef %%v18,168(%%r1,%2),0 \n\t" - "vlef %%v19,172(%%r1,%2),0 \n\t" - "vlef %%v18,176(%%r1,%2),2 \n\t" - "vlef %%v19,180(%%r1,%2),2 \n\t" - "vlef %%v18,184(%%r1,%2),3 \n\t" - "vlef %%v19,188(%%r1,%2),3 \n\t" - - "vlef %%v20,192(%%r1,%2),0 \n\t" - "vlef %%v21,196(%%r1,%2),0 \n\t" - "vlef %%v20,200(%%r1,%2),0 \n\t" - "vlef %%v21,204(%%r1,%2),0 \n\t" - "vlef %%v20,208(%%r1,%2),2 \n\t" - "vlef %%v21,212(%%r1,%2),2 \n\t" - "vlef %%v20,216(%%r1,%2),3 \n\t" - "vlef %%v21,220(%%r1,%2),3 \n\t" - - "vlef %%v22,224(%%r1,%2),0 \n\t" - "vlef %%v23,228(%%r1,%2),0 \n\t" - "vlef %%v22,232(%%r1,%2),0 \n\t" - "vlef %%v23,236(%%r1,%2),0 \n\t" - "vlef %%v22,240(%%r1,%2),2 \n\t" - "vlef %%v23,244(%%r1,%2),2 \n\t" - "vlef %%v22,248(%%r1,%2),3 \n\t" - "vlef %%v23,252(%%r1,%2),3 \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" + "vlef %%v24,128(%%r1,%2),0 \n\t" + "vlef %%v25,132(%%r1,%2),0 \n\t" + "vlef %%v24,136(%%r1,%2),1 \n\t" + "vlef %%v25,140(%%r1,%2),1 \n\t" + "vlef %%v24,144(%%r1,%2),2 \n\t" + "vlef %%v25,148(%%r1,%2),2 \n\t" + "vlef %%v24,152(%%r1,%2),3 \n\t" + "vlef %%v25,156(%%r1,%2),3 \n\t" + + "vlef %%v26,160(%%r1,%2),0 \n\t" + "vlef %%v27,164(%%r1,%2),0 \n\t" + "vlef %%v26,168(%%r1,%2),1 \n\t" + "vlef %%v27,172(%%r1,%2),1 \n\t" + "vlef %%v26,176(%%r1,%2),2 \n\t" + "vlef %%v27,180(%%r1,%2),2 \n\t" + "vlef %%v26,184(%%r1,%2),3 \n\t" + "vlef %%v27,188(%%r1,%2),3 \n\t" + + "vlef %%v28,192(%%r1,%2),0 \n\t" + "vlef %%v29,196(%%r1,%2),0 \n\t" + "vlef %%v28,200(%%r1,%2),1 \n\t" + "vlef %%v29,204(%%r1,%2),1 \n\t" + "vlef %%v28,208(%%r1,%2),2 \n\t" + "vlef %%v29,212(%%r1,%2),2 \n\t" + "vlef %%v28,216(%%r1,%2),3 \n\t" + "vlef %%v29,220(%%r1,%2),3 \n\t" + + "vlef %%v30,224(%%r1,%2),0 \n\t" + "vlef %%v31,228(%%r1,%2),0 \n\t" + "vlef %%v30,232(%%r1,%2),1 \n\t" + "vlef %%v31,236(%%r1,%2),1 \n\t" + "vlef %%v30,240(%%r1,%2),2 \n\t" + "vlef %%v31,244(%%r1,%2),2 \n\t" + "vlef %%v30,248(%%r1,%2),3 \n\t" + "vlef %%v31,252(%%r1,%2),3 \n\t" + + "vflpsb %%v16,%%v16 \n\t" + "vflpsb %%v17,%%v17 \n\t" + "vflpsb %%v18,%%v18 \n\t" + "vflpsb %%v19,%%v19 \n\t" + "vflpsb %%v20,%%v20 \n\t" + "vflpsb %%v21,%%v21 \n\t" + "vflpsb %%v22,%%v22 \n\t" + "vflpsb %%v23,%%v23 \n\t" + "vflpsb %%v24,%%v24 \n\t" + "vflpsb %%v25,%%v25 \n\t" + "vflpsb %%v26,%%v26 \n\t" + "vflpsb %%v27,%%v27 \n\t" + "vflpsb %%v28,%%v28 \n\t" + "vflpsb %%v29,%%v29 \n\t" + "vflpsb %%v30,%%v30 \n\t" + "vflpsb %%v31,%%v31 \n\t" + "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" + "vfasb %%v18,%%v18,%%v19 \n\t" + "vfasb %%v20,%%v20,%%v21 \n\t" + "vfasb %%v22,%%v22,%%v23 \n\t" + "vfasb %%v24,%%v24,%%v25 \n\t" + "vfasb %%v26,%%v26,%%v27 \n\t" + "vfasb %%v28,%%v28,%%v29 \n\t" + "vfasb %%v30,%%v30,%%v31 \n\t" - "vfchsb %%v24,%%v17,%%v16 \n\t" - "vfchsb %%v25,%%v19,%%v18 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vfminsb %%v16,%%v16,%%v24,0 \n\t" + "vfminsb %%v18,%%v18,%%v26,0 \n\t" + "vfminsb %%v20,%%v20,%%v28,0 \n\t" + "vfminsb %%v22,%%v22,%%v30,0 \n\t" + + "vfminsb %%v16,%%v16,%%v20,0 \n\t" + "vfminsb %%v18,%%v18,%%v22,0 \n\t" - "vfchsb %%v26,%%v25,%%v24 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + "vfminsb %%v16,%%v16,%%v18,0 \n\t" - "vfchsb %%v27,%%v0,%%v26 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + "vfminsb %%v0,%%v0,%%v16,0 \n\t" 
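The pattern running through all of the kernels in this patch: the z13 code built every maximum/minimum step from a compare ("vfchsb"/"vfchdb", compare high) followed by a "vsel" picking the winning elements, while z14 provides single-instruction element-wise maximum and minimum ("vfmaxsb"/"vfminsb" here, "vfmaxdb"/"vfmindb" in the double kernels), roughly halving the instruction count of each reduction. A scalar C sketch of the reduction shape, illustrative only and not the kernel code:

    #include <stdio.h>

    /* One reduction step: a compare (vfchdb) plus a select (vsel) on
       z13, a single vfmaxdb on z14. */
    static double max_step(double a, double b)
    {
        return (a > b) ? a : b;
    }

    int main(void)
    {
        double v[8] = {1, 7, 3, 5, 2, 8, 6, 4};
        int w, i;
        /* pairwise tree, mirroring the vfmaxdb sequences in these hunks:
           8 partial results -> 4 -> 2 -> 1 */
        for (w = 4; w >= 1; w /= 2)
            for (i = 0; i < w; i++)
                v[i] = max_step(v[i], v[i + w]);
        printf("%g\n", v[0]);   /* prints 8 */
        return 0;
    }

In the damax/damin hunks that follow, the separate "vflpdb" absolute-value pass disappears as well: the mode-8 form of vfmaxdb/vfmindb evidently compares magnitudes directly, with a final "lpdr" taking the absolute value of the surviving element.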
"agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "veslg %%v16,%%v0,32 \n\t" - "vfchsb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vfminsb %%v0,%%v0,%%v16,0 \n\t" "vrepf %%v16,%%v0,2 \n\t" - "wfchsb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "wfminsb %%v0,%%v0,%%v16,0 \n\t" "ler %0,%%f0 " :"=f"(amin) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -233,11 +221,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { minf=CABS1(x,0); inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; - BLASLONG n1 = (n - 1) & -4; - while ((i - 1) < n1) { + BLASLONG n1 = n & -4; + while (i < n1) { if (CABS1(x,ix) < minf) { minf = CABS1(x,ix); diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c index 236d11c72..a3d63fe53 100644 --- a/kernel/zarch/damax.c +++ b/kernel/zarch/damax.c @@ -39,8 +39,7 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) FLOAT amax; __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "vflpdb %%v0,%%v0 \n\t" + "vl %%v0,0(%2) \n\t" "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" @@ -54,79 +53,42 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmaxdb %%v16,%%v16,%%v24,8 \n\t" + "vfmaxdb %%v17,%%v17,%%v25,8 \n\t" + "vfmaxdb %%v18,%%v18,%%v26,8 \n\t" + "vfmaxdb %%v19,%%v19,%%v27,8 \n\t" + "vfmaxdb %%v20,%%v20,%%v28,8 \n\t" + "vfmaxdb %%v21,%%v21,%%v29,8 \n\t" + "vfmaxdb %%v22,%%v22,%%v30,8 \n\t" + 
"vfmaxdb %%v23,%%v23,%%v31,8 \n\t" + + "vfmaxdb %%v16,%%v16,%%v20,8 \n\t" + "vfmaxdb %%v17,%%v17,%%v21,8 \n\t" + "vfmaxdb %%v18,%%v18,%%v22,8 \n\t" + "vfmaxdb %%v19,%%v19,%%v23,8 \n\t" + + "vfmaxdb %%v16,%%v16,%%v18,8 \n\t" + "vfmaxdb %%v17,%%v17,%%v19,8 \n\t" + + "vfmaxdb %%v16,%%v16,%%v17,8 \n\t" + + "vfmaxdb %%v0,%%v0,%%16,8 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" - "ldr %0,%%f0 " + "vrepg %%v16,%%v0,1 \n\t" + "wfmaxdb %%v0,%%v0,%%v16,8 \n\t" + "lpdr %0,%%f0 " :"=f"(amax) :"r"(n),"ZR"((const FLOAT (*)[n])x) :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" @@ -168,11 +130,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { maxf=ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) > maxf) { maxf = ABS(x[i]); diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c index c2c63c6c5..738ed8710 100644 --- a/kernel/zarch/damin.c +++ b/kernel/zarch/damin.c @@ -39,11 +39,10 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) FLOAT amin; __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "vflpdb %%v0,%%v0 \n\t" + "vl %%v0,0(%2) \n\t" "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" "vl %%v16,0(%%r1,%2) \n\t" @@ -54,79 +53,42 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel 
%%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmindb %%v16,%%v16,%%v24,8 \n\t" + "vfmindb %%v17,%%v17,%%v25,8 \n\t" + "vfmindb %%v18,%%v18,%%v26,8 \n\t" + "vfmindb %%v19,%%v19,%%v27,8 \n\t" + "vfmindb %%v20,%%v20,%%v28,8 \n\t" + "vfmindb %%v21,%%v21,%%v29,8 \n\t" + "vfmindb %%v22,%%v22,%%v30,8 \n\t" + "vfmindb %%v23,%%v23,%%v31,8 \n\t" + + "vfmindb %%v16,%%v16,%%v20,8 \n\t" + "vfmindb %%v17,%%v17,%%v21,8 \n\t" + "vfmindb %%v18,%%v18,%%v22,8 \n\t" + "vfmindb %%v19,%%v19,%%v23,8 \n\t" + + "vfmindb %%v16,%%v16,%%v18,8 \n\t" + "vfmindb %%v17,%%v17,%%v19,8 \n\t" + + "vfmindb %%v16,%%v16,%%v17,8 \n\t" + + "vfmindb %%v0,%%v0,%%v16,8 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" - "ldr %0,%%f0 " + "vrepg %%v16,%%v0,1 \n\t" + "wfmindb %%v0,%%v0,%%v16,8 \n\t" + "lpdr %0,%%f0 " :"=f"(amin) :"r"(n),"ZR"((const FLOAT (*)[n])x) :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" @@ -168,11 +130,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { minf=ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) < minf) { minf = ABS(x[i]); diff --git a/kernel/zarch/dmax.c b/kernel/zarch/dmax.c index 
\n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmaxdb %%v16,%%v16,%%v24,0 \n\t" + "vfmaxdb %%v17,%%v17,%%v25,0 \n\t" + "vfmaxdb %%v18,%%v18,%%v26,0 \n\t" + "vfmaxdb %%v19,%%v19,%%v27,0 \n\t" + "vfmaxdb %%v20,%%v20,%%v28,0 \n\t" + "vfmaxdb %%v21,%%v21,%%v29,0 \n\t" + "vfmaxdb %%v22,%%v22,%%v30,0 \n\t" + "vfmaxdb %%v23,%%v23,%%v31,0 \n\t" + + "vfmaxdb %%v16,%%v16,%%v20,0 \n\t" + "vfmaxdb %%v17,%%v17,%%v21,0 \n\t" + "vfmaxdb %%v18,%%v18,%%v22,0 \n\t" + "vfmaxdb %%v19,%%v19,%%v23,0 \n\t" + + "vfmaxdb %%v16,%%v16,%%v18,0 \n\t" + "vfmaxdb %%v17,%%v17,%%v19,0 \n\t" + + "vfmaxdb %%v16,%%v16,%%v17,0 \n\t" + + "vfmaxdb %%v0,%%v0,%%16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vrepg %%v16,%%v0,1 \n\t" + "wfmaxdb %%v0,%%v0,%%v16,0 \n\t" "ldr %0,%%f0 " :"=f"(max) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -144,11 +123,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { maxf=x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] > maxf) { maxf = x[i]; diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c index 3df504950..8ae5fe868 100644 --- a/kernel/zarch/dmin.c +++ b/kernel/zarch/dmin.c @@ -32,7 +32,7 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) FLOAT min; __asm__ volatile ( - "vl %%v0,0(%2) \n\t" + "vl %%v0,0(%2) \n\t" "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" @@ -46,62 +46,41 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmindb %%v16,%%v16,%%v24,0 \n\t" + "vfmindb %%v17,%%v17,%%v25,0 \n\t" + "vfmindb %%v18,%%v18,%%v26,0 \n\t" + "vfmindb %%v19,%%v19,%%v27,0 \n\t" + "vfmindb %%v20,%%v20,%%v28,0 \n\t" + "vfmindb %%v21,%%v21,%%v29,0 \n\t" + "vfmindb %%v22,%%v22,%%v30,0 
\n\t" + "vfmindb %%v23,%%v23,%%v31,0 \n\t" + + "vfmindb %%v16,%%v16,%%v20,0 \n\t" + "vfmindb %%v17,%%v17,%%v21,0 \n\t" + "vfmindb %%v18,%%v18,%%v22,0 \n\t" + "vfmindb %%v19,%%v19,%%v23,0 \n\t" + + "vfmindb %%v16,%%v16,%%v18,0 \n\t" + "vfmindb %%v17,%%v17,%%v19,0 \n\t" + + "vfmindb %%v16,%%v16,%%v17,0 \n\t" + + "vfmindb %%v0,%%v0,%%16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vrepg %%v16,%%v0,1 \n\t" + "wfmindb %%v0,%%v0,%%v16,0 \n\t" "ldr %0,%%f0 " :"=f"(min) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -144,11 +123,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { minf=x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] < minf) { minf = x[i]; diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 9b4077c6b..27f969eee 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -76,7 +76,7 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "srlg %%r0,%2,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" - "pfd 1, 1024(%3) \n\t" + "pfd 1, 1024(%%r1,%3) \n\t" "vlef %%v16,0(%%r1,%3),0 \n\t" "vlef %%v17,4(%%r1,%3),0 \n\t" @@ -127,14 +127,14 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vfasb %%v18,%%v20,%%v21 \n\t" "vfasb %%v19,%%v22,%%v23 \n\t" - "vfchsb %%v5,%%v16,%%v17 \n\t" - "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchesb %%v5,%%v16,%%v17 \n\t" + "vfchesb %%v6,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vfchsb %%v18,%%v16,%%v17 \n\t" + "vfchesb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -142,13 +142,13 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "vlef %%v16,128(%%r1,%3),0 \n\t" @@ -200,14 +200,14 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vfasb %%v18,%%v20,%%v21 \n\t" "vfasb %%v19,%%v22,%%v23 \n\t" - "vfchsb %%v5,%%v16,%%v17 \n\t" - "vfchsb %%v6,%%v18,%%v19 \n\t" + "vfchesb %%v5,%%v16,%%v17 \n\t" + "vfchesb %%v6,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vfchsb %%v18,%%v16,%%v17 \n\t" + "vfchesb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -215,13 +215,13 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "agfi %%r1, 256 \n\t" @@ -250,8 +250,8 @@ static BLASLONG 
icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "wfchsb %%v4,%%v2,%%v0 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "ste %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamax),"=m"(*amax) @@ -302,6 +302,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + max = 0; maxf = CABS1(x,0); inc_x2 = 2 * inc_x; ix += inc_x2; diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index 6e952a325..ae7b37b4f 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -127,14 +127,14 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vfasb %%v18,%%v20,%%v21 \n\t" "vfasb %%v19,%%v22,%%v23 \n\t" - "vfchsb %%v5,%%v17,%%v16 \n\t" - "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchesb %%v5,%%v17,%%v16 \n\t" + "vfchesb %%v6,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vfchsb %%v18,%%v17,%%v16 \n\t" + "vfchesb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -142,13 +142,13 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "vlef %%v16,128(%%r1,%3),0 \n\t" @@ -200,14 +200,14 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vfasb %%v18,%%v20,%%v21 \n\t" "vfasb %%v19,%%v22,%%v23 \n\t" - "vfchsb %%v5,%%v17,%%v16 \n\t" - "vfchsb %%v6,%%v19,%%v18 \n\t" + "vfchesb %%v5,%%v17,%%v16 \n\t" + "vfchesb %%v6,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vfchsb %%v18,%%v17,%%v16 \n\t" + "vfchesb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -215,13 +215,13 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "agfi %%r1, 256 \n\t" @@ -250,8 +250,8 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "wfchsb %%v4,%%v0,%%v2 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "ste %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamin),"=m"(*amin) @@ -302,6 +302,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + min = 0; minf = CABS1(x,0); inc_x2 = 2 * inc_x; ix += inc_x2; diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index 4f7ff6985..e5a1d3a7c 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -63,7 +63,7 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vleig %%v31,15,1 \n\t" "srlg %%r0,%2,5 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + 
"0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -83,10 +83,10 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vflpdb %%v22, %%v22 \n\t" "vflpdb %%v23, %%v23 \n\t" - "vfchdb %%v4,%%v16,%%v17 \n\t" - "vfchdb %%v5,%%v18,%%v19 \n\t" - "vfchdb %%v6,%%v20,%%v21 \n\t" - "vfchdb %%v7,%%v22,%%v23 \n\t" + "vfchedb %%v4,%%v16,%%v17 \n\t" + "vfchedb %%v5,%%v18,%%v19 \n\t" + "vfchedb %%v6,%%v20,%%v21 \n\t" + "vfchedb %%v7,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -96,21 +96,21 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v16,%%v17 \n\t" - "vfchdb %%v21,%%v18,%%v19 \n\t" + "vfchedb %%v20,%%v16,%%v17 \n\t" + "vfchedb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v16,%%v17 \n\t" + "vfchedb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -130,10 +130,10 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vflpdb %%v22, %%v22 \n\t" "vflpdb %%v23, %%v23 \n\t" - "vfchdb %%v4,%%v16,%%v17 \n\t" - "vfchdb %%v5,%%v18,%%v19 \n\t" - "vfchdb %%v6,%%v20,%%v21 \n\t" - "vfchdb %%v7,%%v22,%%v23 \n\t" + "vfchedb %%v4,%%v16,%%v17 \n\t" + "vfchedb %%v5,%%v18,%%v19 \n\t" + "vfchedb %%v6,%%v20,%%v21 \n\t" + "vfchedb %%v7,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -143,21 +143,21 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v16,%%v17 \n\t" - "vfchdb %%v21,%%v18,%%v19 \n\t" + "vfchedb %%v20,%%v16,%%v17 \n\t" + "vfchedb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v16,%%v17 \n\t" + "vfchedb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "agfi %%r1, 256 \n\t" @@ -175,8 +175,8 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "wfchdb %%v4,%%v2,%%v0 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "std %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamax),"=m"(*amax) @@ -221,12 +221,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) > maxf) { max = j; diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 3abc7a558..a68f7282f 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -63,7 +63,7 @@ static BLASLONG 
idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vleig %%v31,15,1 \n\t" "srlg %%r0,%2,5 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -83,10 +83,10 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vflpdb %%v22, %%v22 \n\t" "vflpdb %%v23, %%v23 \n\t" - "vfchdb %%v4,%%v17,%%v16 \n\t" - "vfchdb %%v5,%%v19,%%v18 \n\t" - "vfchdb %%v6,%%v21,%%v20 \n\t" - "vfchdb %%v7,%%v23,%%v22 \n\t" + "vfchedb %%v4,%%v17,%%v16 \n\t" + "vfchedb %%v5,%%v19,%%v18 \n\t" + "vfchedb %%v6,%%v21,%%v20 \n\t" + "vfchedb %%v7,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -96,21 +96,21 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v17,%%v16 \n\t" - "vfchdb %%v21,%%v19,%%v18 \n\t" + "vfchedb %%v20,%%v17,%%v16 \n\t" + "vfchedb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v17,%%v16 \n\t" + "vfchedb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -130,10 +130,10 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vflpdb %%v22, %%v22 \n\t" "vflpdb %%v23, %%v23 \n\t" - "vfchdb %%v4,%%v17,%%v16 \n\t" - "vfchdb %%v5,%%v19,%%v18 \n\t" - "vfchdb %%v6,%%v21,%%v20 \n\t" - "vfchdb %%v7,%%v23,%%v22 \n\t" + "vfchedb %%v4,%%v17,%%v16 \n\t" + "vfchedb %%v5,%%v19,%%v18 \n\t" + "vfchedb %%v6,%%v21,%%v20 \n\t" + "vfchedb %%v7,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -143,21 +143,21 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v17,%%v16 \n\t" - "vfchdb %%v21,%%v19,%%v18 \n\t" + "vfchedb %%v20,%%v17,%%v16 \n\t" + "vfchedb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v17,%%v16 \n\t" + "vfchedb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "agfi %%r1, 256 \n\t" @@ -175,8 +175,8 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "wfchdb %%v4,%%v0,%%v2 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "std %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamin),"=m"(*amin) @@ -221,12 +221,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) < minf) { min = j; diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 
313a88db4..4c3040779 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -55,7 +55,7 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) "vleig %%v31,15,1 \n\t" "srlg %%r0,%2,5 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -67,10 +67,10 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) "vl %%v22,96(%%r1,%3) \n\t" "vl %%v23,112(%%r1,%3) \n\t" - "vfchdb %%v4,%%v16,%%v17 \n\t" - "vfchdb %%v5,%%v18,%%v19 \n\t" - "vfchdb %%v6,%%v20,%%v21 \n\t" - "vfchdb %%v7,%%v22,%%v23 \n\t" + "vfchedb %%v4,%%v16,%%v17 \n\t" + "vfchedb %%v5,%%v18,%%v19 \n\t" + "vfchedb %%v6,%%v20,%%v21 \n\t" + "vfchedb %%v7,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -80,21 +80,21 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v16,%%v17 \n\t" - "vfchdb %%v21,%%v18,%%v19 \n\t" + "vfchedb %%v20,%%v16,%%v17 \n\t" + "vfchedb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v16,%%v17 \n\t" + "vfchedb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -106,10 +106,10 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) "vl %%v22,224(%%r1,%3) \n\t" "vl %%v23,240(%%r1,%3) \n\t" - "vfchdb %%v4,%%v16,%%v17 \n\t" - "vfchdb %%v5,%%v18,%%v19 \n\t" - "vfchdb %%v6,%%v20,%%v21 \n\t" - "vfchdb %%v7,%%v22,%%v23 \n\t" + "vfchedb %%v4,%%v16,%%v17 \n\t" + "vfchedb %%v5,%%v18,%%v19 \n\t" + "vfchedb %%v6,%%v20,%%v21 \n\t" + "vfchedb %%v7,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -119,21 +119,21 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v16,%%v17 \n\t" - "vfchdb %%v21,%%v18,%%v19 \n\t" + "vfchedb %%v20,%%v16,%%v17 \n\t" + "vfchedb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v16,%%v17 \n\t" + "vfchedb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "agfi %%r1, 256 \n\t" @@ -151,8 +151,8 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) "wfchdb %%v4,%%v2,%%v0 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "std %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(imax),"=m"(*max) @@ -197,12 +197,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j 
< n1) { if (x[i] > maxf) { max = j; diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index 42443215b..ba1776a49 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -55,7 +55,7 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) "vleig %%v31,15,1 \n\t" "srlg %%r0,%2,5 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -67,10 +67,10 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) "vl %%v22,96(%%r1,%3) \n\t" "vl %%v23,112(%%r1,%3) \n\t" - "vfchdb %%v4,%%v17,%%v16 \n\t" - "vfchdb %%v5,%%v19,%%v18 \n\t" - "vfchdb %%v6,%%v21,%%v20 \n\t" - "vfchdb %%v7,%%v23,%%v22 \n\t" + "vfchedb %%v4,%%v17,%%v16 \n\t" + "vfchedb %%v5,%%v19,%%v18 \n\t" + "vfchedb %%v6,%%v21,%%v20 \n\t" + "vfchedb %%v7,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -80,21 +80,21 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v17,%%v16 \n\t" - "vfchdb %%v21,%%v19,%%v18 \n\t" + "vfchedb %%v20,%%v17,%%v16 \n\t" + "vfchedb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v17,%%v16 \n\t" + "vfchedb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -106,10 +106,10 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) "vl %%v22,224(%%r1,%3) \n\t" "vl %%v23,240(%%r1,%3) \n\t" - "vfchdb %%v4,%%v17,%%v16 \n\t" - "vfchdb %%v5,%%v19,%%v18 \n\t" - "vfchdb %%v6,%%v21,%%v20 \n\t" - "vfchdb %%v7,%%v23,%%v22 \n\t" + "vfchedb %%v4,%%v17,%%v16 \n\t" + "vfchedb %%v5,%%v19,%%v18 \n\t" + "vfchedb %%v6,%%v21,%%v20 \n\t" + "vfchedb %%v7,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" @@ -119,21 +119,21 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) "vsel %%v19,%%v22,%%v23,%%v7 \n\t" "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - "vfchdb %%v20,%%v17,%%v16 \n\t" - "vfchdb %%v21,%%v19,%%v18 \n\t" + "vfchedb %%v20,%%v17,%%v16 \n\t" + "vfchedb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v4,%%v4,%%v5,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - "vfchdb %%v18,%%v17,%%v16 \n\t" + "vfchedb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "agfi %%r1, 256 \n\t" @@ -151,8 +151,8 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) "wfchdb %%v4,%%v0,%%v2 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "std %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(imin),"=m"(*min) @@ -197,12 +197,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = x[0]; - i += 
inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] < minf) { min = j; diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index dd2144db2..2f5c1c867 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -81,7 +81,7 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vleif %%v31,31,3 \n\t" "srlg %%r0,%2,6 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -101,10 +101,10 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vflpsb %%v22, %%v22 \n\t" "vflpsb %%v23, %%v23 \n\t" - "vfchsb %%v5,%%v16,%%v17 \n\t" - "vfchsb %%v6,%%v18,%%v19 \n\t" - "vfchsb %%v7,%%v20,%%v21 \n\t" - "vfchsb %%v8,%%v22,%%v23 \n\t" + "vfchesb %%v5,%%v16,%%v17 \n\t" + "vfchesb %%v6,%%v18,%%v19 \n\t" + "vfchesb %%v7,%%v20,%%v21 \n\t" + "vfchesb %%v8,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -114,14 +114,14 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v16,%%v17 \n\t" - "vfchsb %%v21,%%v18,%%v19 \n\t" + "vfchesb %%v20,%%v16,%%v17 \n\t" + "vfchesb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v16,%%v17 \n\t" + "vfchesb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -129,13 +129,13 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -155,10 +155,10 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vflpsb %%v22, %%v22 \n\t" "vflpsb %%v23, %%v23 \n\t" - "vfchsb %%v5,%%v16,%%v17 \n\t" - "vfchsb %%v6,%%v18,%%v19 \n\t" - "vfchsb %%v7,%%v20,%%v21 \n\t" - "vfchsb %%v8,%%v22,%%v23 \n\t" + "vfchesb %%v5,%%v16,%%v17 \n\t" + "vfchesb %%v6,%%v18,%%v19 \n\t" + "vfchesb %%v7,%%v20,%%v21 \n\t" + "vfchesb %%v8,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -168,14 +168,14 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v16,%%v17 \n\t" - "vfchsb %%v21,%%v18,%%v19 \n\t" + "vfchesb %%v20,%%v16,%%v17 \n\t" + "vfchesb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v16,%%v17 \n\t" + "vfchesb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -183,13 +183,13 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v0,%%v16 \n\t" + "vsel 
%%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "agfi %%r1, 256 \n\t" @@ -218,8 +218,8 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "wfchsb %%v4,%%v2,%%v0 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "ste %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamax),"=m"(*amax) @@ -264,12 +264,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) > maxf) { max = j; diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index d7e44421d..04e05aad9 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -81,7 +81,7 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vleif %%v31,31,3 \n\t" "srlg %%r0,%2,6 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -101,10 +101,10 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vflpsb %%v22, %%v22 \n\t" "vflpsb %%v23, %%v23 \n\t" - "vfchsb %%v5,%%v17,%%v16 \n\t" - "vfchsb %%v6,%%v19,%%v18 \n\t" - "vfchsb %%v7,%%v21,%%v20 \n\t" - "vfchsb %%v8,%%v23,%%v22 \n\t" + "vfchesb %%v5,%%v17,%%v16 \n\t" + "vfchesb %%v6,%%v19,%%v18 \n\t" + "vfchesb %%v7,%%v21,%%v20 \n\t" + "vfchesb %%v8,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -114,14 +114,14 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v17,%%v16 \n\t" - "vfchsb %%v21,%%v19,%%v18 \n\t" + "vfchesb %%v20,%%v17,%%v16 \n\t" + "vfchesb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v17,%%v16 \n\t" + "vfchesb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -129,13 +129,13 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -155,10 +155,10 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vflpsb %%v22, %%v22 \n\t" "vflpsb %%v23, %%v23 \n\t" - "vfchsb %%v5,%%v17,%%v16 \n\t" - "vfchsb %%v6,%%v19,%%v18 \n\t" - "vfchsb %%v7,%%v21,%%v20 \n\t" - "vfchsb %%v8,%%v23,%%v22 \n\t" + "vfchesb %%v5,%%v17,%%v16 \n\t" + "vfchesb %%v6,%%v19,%%v18 \n\t" + "vfchesb %%v7,%%v21,%%v20 \n\t" + "vfchesb %%v8,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -168,14 +168,14 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - 
"vfchsb %%v20,%%v17,%%v16 \n\t" - "vfchsb %%v21,%%v19,%%v18 \n\t" + "vfchesb %%v20,%%v17,%%v16 \n\t" + "vfchesb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v17,%%v16 \n\t" + "vfchesb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -183,13 +183,13 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "agfi %%r1, 256 \n\t" @@ -218,8 +218,8 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "wfchsb %%v4,%%v0,%%v2 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "ste %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamin),"=m"(*amin) @@ -264,12 +264,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) < minf) { min = j; diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index 1ebc6c8c8..084b4ce94 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -73,7 +73,7 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vleif %%v31,31,3 \n\t" "srlg %%r0,%2,6 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -85,10 +85,10 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vl %%v22,96(%%r1,%3) \n\t" "vl %%v23,112(%%r1,%3) \n\t" - "vfchsb %%v5,%%v16,%%v17 \n\t" - "vfchsb %%v6,%%v18,%%v19 \n\t" - "vfchsb %%v7,%%v20,%%v21 \n\t" - "vfchsb %%v8,%%v22,%%v23 \n\t" + "vfchesb %%v5,%%v16,%%v17 \n\t" + "vfchesb %%v6,%%v18,%%v19 \n\t" + "vfchesb %%v7,%%v20,%%v21 \n\t" + "vfchesb %%v8,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -98,14 +98,14 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v16,%%v17 \n\t" - "vfchsb %%v21,%%v18,%%v19 \n\t" + "vfchesb %%v20,%%v16,%%v17 \n\t" + "vfchesb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v16,%%v17 \n\t" + "vfchesb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -113,13 +113,13 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ 
-131,10 +131,10 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vl %%v22,224(%%r1,%3) \n\t" "vl %%v23,240(%%r1,%3) \n\t" - "vfchsb %%v5,%%v16,%%v17 \n\t" - "vfchsb %%v6,%%v18,%%v19 \n\t" - "vfchsb %%v7,%%v20,%%v21 \n\t" - "vfchsb %%v8,%%v22,%%v23 \n\t" + "vfchesb %%v5,%%v16,%%v17 \n\t" + "vfchesb %%v6,%%v18,%%v19 \n\t" + "vfchesb %%v7,%%v20,%%v21 \n\t" + "vfchesb %%v8,%%v22,%%v23 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -144,14 +144,14 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v16,%%v17 \n\t" - "vfchsb %%v21,%%v18,%%v19 \n\t" + "vfchesb %%v20,%%v16,%%v17 \n\t" + "vfchesb %%v21,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v16,%%v17 \n\t" + "vfchesb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -159,13 +159,13 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "agfi %%r1, 256 \n\t" @@ -194,8 +194,8 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "wfchsb %%v4,%%v2,%%v0 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "ste %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(imax),"=m"(*max) @@ -240,12 +240,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] > maxf) { max = j; diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index a6b9d59de..4e85816a3 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -73,7 +73,7 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vleif %%v31,31,3 \n\t" "srlg %%r0,%2,6 \n\t" "xgr %%r1,%%r1 \n\t" - "0: \n\t" + "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" "vl %%v16,0(%%r1,%3) \n\t" @@ -85,10 +85,10 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vl %%v22,96(%%r1,%3) \n\t" "vl %%v23,112(%%r1,%3) \n\t" - "vfchsb %%v5,%%v17,%%v16 \n\t" - "vfchsb %%v6,%%v19,%%v18 \n\t" - "vfchsb %%v7,%%v21,%%v20 \n\t" - "vfchsb %%v8,%%v23,%%v22 \n\t" + "vfchesb %%v5,%%v17,%%v16 \n\t" + "vfchesb %%v6,%%v19,%%v18 \n\t" + "vfchesb %%v7,%%v21,%%v20 \n\t" + "vfchesb %%v8,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -98,14 +98,14 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v17,%%v16 \n\t" - "vfchsb %%v21,%%v19,%%v18 \n\t" + "vfchesb %%v20,%%v17,%%v16 \n\t" + "vfchesb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v17,%%v16 \n\t" + "vfchesb 
%%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -113,13 +113,13 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "vl %%v16,128(%%r1,%3) \n\t" @@ -131,10 +131,10 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vl %%v22,224(%%r1,%3) \n\t" "vl %%v23,240(%%r1,%3) \n\t" - "vfchsb %%v5,%%v17,%%v16 \n\t" - "vfchsb %%v6,%%v19,%%v18 \n\t" - "vfchsb %%v7,%%v21,%%v20 \n\t" - "vfchsb %%v8,%%v23,%%v22 \n\t" + "vfchesb %%v5,%%v17,%%v16 \n\t" + "vfchesb %%v6,%%v19,%%v18 \n\t" + "vfchesb %%v7,%%v21,%%v20 \n\t" + "vfchesb %%v8,%%v23,%%v22 \n\t" "vsel %%v16,%%v16,%%v17,%%v5 \n\t" "vsel %%v5,%%v24,%%v25,%%v5 \n\t" "vsel %%v17,%%v18,%%v19,%%v6 \n\t" @@ -144,14 +144,14 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vsel %%v19,%%v22,%%v23,%%v8 \n\t" "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - "vfchsb %%v20,%%v17,%%v16 \n\t" - "vfchsb %%v21,%%v19,%%v18 \n\t" + "vfchesb %%v20,%%v17,%%v16 \n\t" + "vfchesb %%v21,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v20 \n\t" "vsel %%v5,%%v5,%%v6,%%v20 \n\t" "vsel %%v17,%%v18,%%v19,%%v21 \n\t" "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - "vfchsb %%v18,%%v17,%%v16 \n\t" + "vfchesb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v5,%%v5,%%v6,%%v18 \n\t" "vsegf %%v6,%%v5 \n\t" @@ -159,13 +159,13 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "vag %%v5,%%v5,%%v4 \n\t" "vag %%v6,%%v6,%%v4 \n\t" - "vfchsb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v7 \n\t" + "vfchesb %%v7,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v7 \n\t" "vsegf %%v8,%%v7 \n\t" "vesrlg %%v7,%%v7,32 \n\t" "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v5,%%v1,%%v7 \n\t" - "vsel %%v2,%%v6,%%v2,%%v8 \n\t" + "vsel %%v1,%%v1,%%v5,%%v7 \n\t" + "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" "agfi %%r1, 256 \n\t" @@ -194,8 +194,8 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "wfchsb %%v4,%%v0,%%v2 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "ste %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(imin),"=m"(*min) @@ -240,12 +240,11 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] < minf) { min = j; diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 541464b05..2ffad2570 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -93,21 +93,21 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) "vfadb %%v18,%%v20,%%v21 \n\t" "vfadb %%v19,%%v22,%%v23 \n\t" - "vfchdb %%v4,%%v16,%%v17 \n\t" - "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchedb %%v4,%%v16,%%v17 \n\t" + "vfchedb %%v5,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vfchdb %%v18,%%v16,%%v17 \n\t" + "vfchedb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" 
"vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "vleg %%v16,128(%%r1,%3),0 \n\t" @@ -139,21 +139,21 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) "vfadb %%v18,%%v20,%%v21 \n\t" "vfadb %%v19,%%v22,%%v23 \n\t" - "vfchdb %%v4,%%v16,%%v17 \n\t" - "vfchdb %%v5,%%v18,%%v19 \n\t" + "vfchedb %%v4,%%v16,%%v17 \n\t" + "vfchedb %%v5,%%v18,%%v19 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vfchdb %%v18,%%v16,%%v17 \n\t" + "vfchedb %%v18,%%v16,%%v17 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "agfi %%r1, 256 \n\t" @@ -171,8 +171,8 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) "wfchdb %%v4,%%v2,%%v0 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "std %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamax),"=m"(*amax) @@ -223,6 +223,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + max = 0; maxf = CABS1(x,0); inc_x2 = 2 * inc_x; ix += inc_x2; diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 4b5572b80..1e037c0c7 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -93,21 +93,21 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) "vfadb %%v18,%%v20,%%v21 \n\t" "vfadb %%v19,%%v22,%%v23 \n\t" - "vfchdb %%v4,%%v17,%%v16 \n\t" - "vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchedb %%v4,%%v17,%%v16 \n\t" + "vfchedb %%v5,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vfchdb %%v18,%%v17,%%v16 \n\t" + "vfchedb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "vleg %%v16,128(%%r1,%3),0 \n\t" @@ -139,21 +139,21 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) "vfadb %%v18,%%v20,%%v21 \n\t" "vfadb %%v19,%%v22,%%v23 \n\t" - "vfchdb %%v4,%%v17,%%v16 \n\t" - "vfchdb %%v5,%%v19,%%v18 \n\t" + "vfchedb %%v4,%%v17,%%v16 \n\t" + "vfchedb %%v5,%%v19,%%v18 \n\t" "vsel %%v16,%%v16,%%v17,%%v4 \n\t" "vsel %%v4,%%v24,%%v25,%%v4 \n\t" "vsel %%v17,%%v18,%%v19,%%v5 \n\t" "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vfchdb %%v18,%%v17,%%v16 \n\t" + "vfchedb %%v18,%%v17,%%v16 \n\t" "vsel %%v16,%%v16,%%v17,%%v18 \n\t" "vsel %%v4,%%v4,%%v5,%%v18 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vfchdb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v5 \n\t" - "vsel %%v1,%%v4,%%v1,%%v5 \n\t" + "vfchedb %%v5,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v5 \n\t" + "vsel %%v1,%%v1,%%v4,%%v5 \n\t" "vag %%v3,%%v3,%%v2 \n\t" "agfi %%r1, 256 \n\t" @@ -171,8 +171,8 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) "wfchdb %%v4,%%v0,%%v2 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" 
"vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "vlgvg %0,%%v1,0 \n\t" "std %%f0,%1 \n\t" + "vlgvg %0,%%v1,0 \n\t" "2: \n\t" "nop " :"=r"(iamin),"=m"(*amin) @@ -223,6 +223,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + min = 0; minf = CABS1(x,0); inc_x2 = 2 * inc_x; ix += inc_x2; diff --git a/kernel/zarch/samax.c b/kernel/zarch/samax.c index 61d50159f..c8d831d06 100644 --- a/kernel/zarch/samax.c +++ b/kernel/zarch/samax.c @@ -40,8 +40,7 @@ static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) __asm__ volatile ( "vl %%v0,0(%2) \n\t" - "vflpsb %%v0,%%v0 \n\t" - "srlg %%r0,%1,6 \n\t" + "srlg %%r0,%1,6 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" @@ -54,83 +53,45 @@ static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchsb %%v24,%%v16,%%v17 \n\t" - "vfchsb %%v25,%%v18,%%v19 \n\t" - "vfchsb %%v26,%%v20,%%v21 \n\t" - "vfchsb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchsb %%v28,%%v24,%%v25 \n\t" - "vfchsb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchsb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchsb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfchsb %%v24,%%v16,%%v17 \n\t" - "vfchsb %%v25,%%v18,%%v19 \n\t" - "vfchsb %%v26,%%v20,%%v21 \n\t" - "vfchsb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchsb %%v28,%%v24,%%v25 \n\t" - "vfchsb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchsb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchsb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmaxsb %%v16,%%v16,%%v24,8 \n\t" + "vfmaxsb %%v17,%%v17,%%v25,8 \n\t" + "vfmaxsb %%v18,%%v18,%%v26,8 \n\t" + "vfmaxsb %%v19,%%v19,%%v27,8 \n\t" + "vfmaxsb %%v20,%%v20,%%v28,8 \n\t" + "vfmaxsb %%v21,%%v21,%%v29,8 \n\t" + "vfmaxsb %%v22,%%v22,%%v30,8 \n\t" + "vfmaxsb %%v23,%%v23,%%v31,8 \n\t" + + "vfmaxsb %%v16,%%v16,%%v20,8 \n\t" + "vfmaxsb %%v17,%%v17,%%v21,8 \n\t" + "vfmaxsb %%v18,%%v18,%%v22,8 \n\t" + "vfmaxsb %%v19,%%v19,%%v23,8 \n\t" + + "vfmaxsb %%v16,%%v16,%%v18,8 \n\t" + "vfmaxsb %%v17,%%v17,%%v19,8 \n\t" + + "vfmaxsb %%v16,%%v16,%%v17,8 \n\t" + + "vfmaxsb %%v0,%%v0,%%16,8 \n\t" 
"agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "veslg %%v16,%%v0,32 \n\t" - "vfchsb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vfmaxsb %%v0,%%v0,%%v16,8 \n\t" - "vrepf %%v16,%%v0,2 \n\t" - "wfchsb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" - "ler %0,%%f0 " + "vrepf %%v16,%%v0,2 \n\t" + "wfmaxsb %%v0,%%v0,%%v16,8 \n\t" + "lper %0,%%f0 " :"=f"(amax) :"r"(n),"ZR"((const FLOAT (*)[n])x) :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" @@ -172,11 +133,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { maxf=ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) > maxf) { maxf = ABS(x[i]); diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c index a585a79ff..dd24c74d7 100644 --- a/kernel/zarch/samin.c +++ b/kernel/zarch/samin.c @@ -40,8 +40,7 @@ static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) __asm__ volatile ( "vl %%v0,0(%2) \n\t" - "vflpsb %%v0,%%v0 \n\t" - "srlg %%r0,%1,6 \n\t" + "srlg %%r0,%1,6 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" @@ -54,83 +53,45 @@ static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchsb %%v24,%%v17,%%v16 \n\t" - "vfchsb %%v25,%%v19,%%v18 \n\t" - "vfchsb %%v26,%%v21,%%v20 \n\t" - "vfchsb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchsb %%v28,%%v25,%%v24 \n\t" - "vfchsb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchsb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchsb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfchsb %%v24,%%v17,%%v16 \n\t" - "vfchsb %%v25,%%v19,%%v18 \n\t" - "vfchsb %%v26,%%v21,%%v20 \n\t" - "vfchsb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchsb %%v28,%%v25,%%v24 \n\t" - "vfchsb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchsb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchsb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfminsb %%v16,%%v16,%%v24,8 \n\t" + "vfminsb %%v17,%%v17,%%v25,8 \n\t" + "vfminsb 
%%v18,%%v18,%%v26,8 \n\t" + "vfminsb %%v19,%%v19,%%v27,8 \n\t" + "vfminsb %%v20,%%v20,%%v28,8 \n\t" + "vfminsb %%v21,%%v21,%%v29,8 \n\t" + "vfminsb %%v22,%%v22,%%v30,8 \n\t" + "vfminsb %%v23,%%v23,%%v31,8 \n\t" + + "vfminsb %%v16,%%v16,%%v20,8 \n\t" + "vfminsb %%v17,%%v17,%%v21,8 \n\t" + "vfminsb %%v18,%%v18,%%v22,8 \n\t" + "vfminsb %%v19,%%v19,%%v23,8 \n\t" + + "vfminsb %%v16,%%v16,%%v18,8 \n\t" + "vfminsb %%v17,%%v17,%%v19,8 \n\t" + + "vfminsb %%v16,%%v16,%%v17,8 \n\t" + + "vfminsb %%v0,%%v0,%%v16,8 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "veslg %%v16,%%v0,32 \n\t" - "vfchsb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vfminsb %%v0,%%v0,%%v16,8 \n\t" - "vrepf %%v16,%%v0,2 \n\t" - "wfchsb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" - "ler %0,%%f0 " + "vrepf %%v16,%%v0,2 \n\t" + "wfminsb %%v0,%%v0,%%v16,8 \n\t" + "lper %0,%%f0 " :"=f"(amin) :"r"(n),"ZR"((const FLOAT (*)[n])x) :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" @@ -172,11 +133,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { minf=ABS(x[0]); - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (ABS(x[i]) < minf) { minf = ABS(x[i]); diff --git a/kernel/zarch/smax.c b/kernel/zarch/smax.c index bcdb473af..8a2b86dc1 100644 --- a/kernel/zarch/smax.c +++ b/kernel/zarch/smax.c @@ -33,7 +33,7 @@ static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x) __asm__ volatile ( "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,6 \n\t" + "srlg %%r0,%1,6 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" @@ -46,66 +46,44 @@ static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchsb %%v24,%%v16,%%v17 \n\t" - "vfchsb %%v25,%%v18,%%v19 \n\t" - "vfchsb %%v26,%%v20,%%v21 \n\t" - "vfchsb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchsb %%v28,%%v24,%%v25 \n\t" - "vfchsb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchsb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchsb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - - "vfchsb %%v24,%%v16,%%v17 \n\t" - "vfchsb %%v25,%%v18,%%v19 \n\t" - "vfchsb %%v26,%%v20,%%v21 \n\t" - "vfchsb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchsb %%v28,%%v24,%%v25 \n\t" - "vfchsb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchsb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchsb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfmaxsb %%v16,%%v16,%%v24,0 \n\t" + "vfmaxsb %%v17,%%v17,%%v25,0
\n\t" + "vfmaxsb %%v18,%%v18,%%v26,0 \n\t" + "vfmaxsb %%v19,%%v19,%%v27,0 \n\t" + "vfmaxsb %%v20,%%v20,%%v28,0 \n\t" + "vfmaxsb %%v21,%%v21,%%v29,0 \n\t" + "vfmaxsb %%v22,%%v22,%%v30,0 \n\t" + "vfmaxsb %%v23,%%v23,%%v31,0 \n\t" + + "vfmaxsb %%v16,%%v16,%%v20,0 \n\t" + "vfmaxsb %%v17,%%v17,%%v21,0 \n\t" + "vfmaxsb %%v18,%%v18,%%v22,0 \n\t" + "vfmaxsb %%v19,%%v19,%%v23,0 \n\t" + + "vfmaxsb %%v16,%%v16,%%v18,0 \n\t" + "vfmaxsb %%v17,%%v17,%%v19,0 \n\t" + + "vfmaxsb %%v16,%%v16,%%v17,0 \n\t" + + "vfmaxsb %%v0,%%v0,%%16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "veslg %%v16,%%v0,32 \n\t" - "vfchsb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" - "vrepf %%v16,%%v0,2 \n\t" - "wfchsb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vrepf %%v16,%%v0,2 \n\t" + "wfmaxsb %%v0,%%v0,%%v16,0 \n\t" "ler %0,%%f0 " :"=f"(max) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -148,11 +126,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { maxf=x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] > maxf) { maxf = x[i]; diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c index 91c31d284..b87ec0fe8 100644 --- a/kernel/zarch/smin.c +++ b/kernel/zarch/smin.c @@ -33,7 +33,7 @@ static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) __asm__ volatile ( "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,6 \n\t" + "srlg %%r0,%1,6 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" @@ -46,66 +46,44 @@ static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) "vl %%v21,80(%%r1,%2) \n\t" "vl %%v22,96(%%r1,%2) \n\t" "vl %%v23,112(%%r1,%2) \n\t" + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v25,144(%%r1,%2) \n\t" + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v27,176(%%r1,%2) \n\t" + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v29,208(%%r1,%2) \n\t" + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v31,240(%%r1,%2) \n\t" - "vfchsb %%v24,%%v17,%%v16 \n\t" - "vfchsb %%v25,%%v19,%%v18 \n\t" - "vfchsb %%v26,%%v21,%%v20 \n\t" - "vfchsb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchsb %%v28,%%v25,%%v24 \n\t" - "vfchsb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchsb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchsb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - - "vfchsb %%v24,%%v17,%%v16 \n\t" - "vfchsb %%v25,%%v19,%%v18 \n\t" - "vfchsb %%v26,%%v21,%%v20 \n\t" - "vfchsb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchsb %%v28,%%v25,%%v24 \n\t" - "vfchsb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchsb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchsb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + "vfminsb %%v16,%%v16,%%v24,0 \n\t" + "vfminsb %%v17,%%v17,%%v25,0 \n\t" + "vfminsb %%v18,%%v18,%%v26,0 \n\t" + "vfminsb %%v19,%%v19,%%v27,0 \n\t" + "vfminsb %%v20,%%v20,%%v28,0 \n\t" + "vfminsb %%v21,%%v21,%%v29,0 
\n\t" + "vfminsb %%v22,%%v22,%%v30,0 \n\t" + "vfminsb %%v23,%%v23,%%v31,0 \n\t" + + "vfminsb %%v16,%%v16,%%v20,0 \n\t" + "vfminsb %%v17,%%v17,%%v21,0 \n\t" + "vfminsb %%v18,%%v18,%%v22,0 \n\t" + "vfminsb %%v19,%%v19,%%v23,0 \n\t" + + "vfminsb %%v16,%%v16,%%v18,0 \n\t" + "vfminsb %%v17,%%v17,%%v19,0 \n\t" + + "vfminsb %%v16,%%v16,%%v17,0 \n\t" + + "vfminsb %%v0,%%v0,%%16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "veslg %%v16,%%v0,32 \n\t" - "vfchsb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vfminsb %%v0,%%v0,%%v16,0 \n\t" - "vrepf %%v16,%%v0,2 \n\t" - "wfchsb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "vrepf %%v16,%%v0,2 \n\t" + "wfminsb %%v0,%%v0,%%v16,0 \n\t" "ler %0,%%f0 " :"=f"(min) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -148,11 +126,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { minf=x[0]; - i += inc_x; - j++; - BLASLONG n1 = (n - 1) & -4; - while ((j - 1) < n1) { + BLASLONG n1 = n & -4; + while (j < n1) { if (x[i] < minf) { minf = x[i]; diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index 8ef3f42ca..8175874c0 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -69,76 +69,66 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) "vleg %%v23,104(%%r1,%2),0 \n\t" "vleg %%v22,112(%%r1,%2),1 \n\t" "vleg %%v23,120(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchdb %%v26,%%v24,%%v25 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchdb %%v27,%%v26,%%v0 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "vleg %%v16,128(%%r1,%2),0 \n\t" - "vleg %%v17,136(%%r1,%2),0 \n\t" - "vleg %%v16,144(%%r1,%2),1 \n\t" - "vleg %%v17,152(%%r1,%2),1 \n\t" - "vleg %%v18,160(%%r1,%2),0 \n\t" - "vleg %%v19,168(%%r1,%2),0 \n\t" - "vleg %%v18,176(%%r1,%2),1 \n\t" - "vleg %%v19,184(%%r1,%2),1 \n\t" - "vleg %%v20,192(%%r1,%2),0 \n\t" - "vleg %%v21,200(%%r1,%2),0 \n\t" - "vleg %%v20,208(%%r1,%2),1 \n\t" - "vleg %%v21,216(%%r1,%2),1 \n\t" - "vleg %%v22,224(%%r1,%2),0 \n\t" - "vleg %%v23,232(%%r1,%2),0 \n\t" - "vleg %%v22,240(%%r1,%2),1 \n\t" - "vleg %%v23,248(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" + "vleg %%v24,128(%%r1,%2),0 \n\t" + "vleg %%v25,136(%%r1,%2),0 \n\t" + "vleg %%v24,144(%%r1,%2),1 \n\t" + "vleg %%v25,152(%%r1,%2),1 \n\t" + "vleg %%v26,160(%%r1,%2),0 \n\t" + "vleg %%v27,168(%%r1,%2),0 \n\t" + "vleg %%v26,176(%%r1,%2),1 \n\t" + "vleg %%v27,184(%%r1,%2),1 \n\t" + "vleg %%v28,192(%%r1,%2),0 \n\t" + "vleg %%v29,200(%%r1,%2),0 \n\t" + "vleg %%v28,208(%%r1,%2),1 \n\t" + "vleg %%v29,216(%%r1,%2),1 \n\t" + "vleg %%v30,224(%%r1,%2),0 \n\t" + "vleg %%v31,232(%%r1,%2),0 \n\t" + "vleg %%v30,240(%%r1,%2),1 \n\t" + "vleg %%v31,248(%%r1,%2),1 \n\t" + + "vflpdb %%v16,%%v16 \n\t" + "vflpdb %%v17,%%v17 \n\t" + "vflpdb %%v18,%%v18 \n\t" + "vflpdb %%v19,%%v19 \n\t" + "vflpdb %%v20,%%v20 \n\t" + "vflpdb %%v21,%%v21 \n\t" + "vflpdb %%v22,%%v22 \n\t" + "vflpdb 
%%v23,%%v23 \n\t" + "vflpdb %%v24,%%v24 \n\t" + "vflpdb %%v25,%%v25 \n\t" + "vflpdb %%v26,%%v26 \n\t" + "vflpdb %%v27,%%v27 \n\t" + "vflpdb %%v28,%%v28 \n\t" + "vflpdb %%v29,%%v29 \n\t" + "vflpdb %%v30,%%v30 \n\t" + "vflpdb %%v31,%%v31 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" + "vfadb %%v18,%%v18,%%v19 \n\t" + "vfadb %%v20,%%v20,%%v21 \n\t" + "vfadb %%v22,%%v22,%%v23 \n\t" + "vfadb %%v24,%%v24,%%v25 \n\t" + "vfadb %%v26,%%v26,%%v27 \n\t" + "vfadb %%v28,%%v28,%%v29 \n\t" + "vfadb %%v30,%%v30,%%v31 \n\t" - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vfmaxdb %%v16,%%v16,%%v24,0 \n\t" + "vfmaxdb %%v18,%%v18,%%v26,0 \n\t" + "vfmaxdb %%v20,%%v20,%%v28,0 \n\t" + "vfmaxdb %%v22,%%v22,%%v30,0 \n\t" + + "vfmaxdb %%v16,%%v16,%%v20,0 \n\t" + "vfmaxdb %%v18,%%v18,%%v22,0 \n\t" - "vfchdb %%v26,%%v24,%%v25 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + "vfmaxdb %%v16,%%v16,%%v18,0 \n\t" - "vfchdb %%v27,%%v26,%%v0 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + "vfmaxdb %%v0,%%v0,%%v16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "wfmaxdb %%v0,%%v0,%%v16,0 \n\t" "ldr %0,%%f0 " :"=f"(amax) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -185,11 +175,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { maxf=CABS1(x,0); inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; - BLASLONG n1 = (n - 1) & -4; - while ((i - 1) < n1) { + BLASLONG n1 = n & -4; + while (i < n1) { if (CABS1(x,ix) > maxf) { maxf = CABS1(x,ix); diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index 30fd1d030..5d57ff12e 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -69,76 +69,66 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) "vleg %%v23,104(%%r1,%2),0 \n\t" "vleg %%v22,112(%%r1,%2),1 \n\t" "vleg %%v23,120(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchdb %%v26,%%v25,%%v24 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchdb %%v27,%%v0,%%v26 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "vleg %%v16,128(%%r1,%2),0 \n\t" - "vleg %%v17,136(%%r1,%2),0 \n\t" - "vleg %%v16,144(%%r1,%2),1 \n\t" - "vleg %%v17,152(%%r1,%2),1 \n\t" - "vleg %%v18,160(%%r1,%2),0 \n\t" - "vleg %%v19,168(%%r1,%2),0 \n\t" - "vleg %%v18,176(%%r1,%2),1 \n\t" - "vleg %%v19,184(%%r1,%2),1 \n\t" - "vleg %%v20,192(%%r1,%2),0 \n\t" - "vleg %%v21,200(%%r1,%2),0 \n\t" - "vleg %%v20,208(%%r1,%2),1 \n\t" - "vleg %%v21,216(%%r1,%2),1 \n\t" - "vleg %%v22,224(%%r1,%2),0 \n\t" - "vleg %%v23,232(%%r1,%2),0 \n\t" - "vleg %%v22,240(%%r1,%2),1 \n\t" - "vleg %%v23,248(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" + "vleg %%v24,128(%%r1,%2),0 \n\t" + "vleg %%v25,136(%%r1,%2),0 \n\t" + "vleg 
%%v24,144(%%r1,%2),1 \n\t" + "vleg %%v25,152(%%r1,%2),1 \n\t" + "vleg %%v26,160(%%r1,%2),0 \n\t" + "vleg %%v27,168(%%r1,%2),0 \n\t" + "vleg %%v26,176(%%r1,%2),1 \n\t" + "vleg %%v27,184(%%r1,%2),1 \n\t" + "vleg %%v28,192(%%r1,%2),0 \n\t" + "vleg %%v29,200(%%r1,%2),0 \n\t" + "vleg %%v28,208(%%r1,%2),1 \n\t" + "vleg %%v29,216(%%r1,%2),1 \n\t" + "vleg %%v30,224(%%r1,%2),0 \n\t" + "vleg %%v31,232(%%r1,%2),0 \n\t" + "vleg %%v30,240(%%r1,%2),1 \n\t" + "vleg %%v31,248(%%r1,%2),1 \n\t" + + "vflpdb %%v16,%%v16 \n\t" + "vflpdb %%v17,%%v17 \n\t" + "vflpdb %%v18,%%v18 \n\t" + "vflpdb %%v19,%%v19 \n\t" + "vflpdb %%v20,%%v20 \n\t" + "vflpdb %%v21,%%v21 \n\t" + "vflpdb %%v22,%%v22 \n\t" + "vflpdb %%v23,%%v23 \n\t" + "vflpdb %%v24,%%v24 \n\t" + "vflpdb %%v25,%%v25 \n\t" + "vflpdb %%v26,%%v26 \n\t" + "vflpdb %%v27,%%v27 \n\t" + "vflpdb %%v28,%%v28 \n\t" + "vflpdb %%v29,%%v29 \n\t" + "vflpdb %%v30,%%v30 \n\t" + "vflpdb %%v31,%%v31 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" + "vfadb %%v18,%%v18,%%v19 \n\t" + "vfadb %%v20,%%v20,%%v21 \n\t" + "vfadb %%v22,%%v22,%%v23 \n\t" + "vfadb %%v24,%%v24,%%v25 \n\t" + "vfadb %%v26,%%v26,%%v27 \n\t" + "vfadb %%v28,%%v28,%%v29 \n\t" + "vfadb %%v30,%%v30,%%v31 \n\t" - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vfmindb %%v16,%%v16,%%v24,0 \n\t" + "vfmindb %%v18,%%v18,%%v26,0 \n\t" + "vfmindb %%v20,%%v20,%%v28,0 \n\t" + "vfmindb %%v22,%%v22,%%v30,0 \n\t" + + "vfmindb %%v16,%%v16,%%v20,0 \n\t" + "vfmindb %%v18,%%v18,%%v22,0 \n\t" - "vfchdb %%v26,%%v25,%%v24 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + "vfmindb %%v16,%%v16,%%v18,0 \n\t" - "vfchdb %%v27,%%v0,%%v26 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + "vfmindb %%v0,%%v0,%%v16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v16,%%v0,%%v17 \n\t" + "wfmindb %%v0,%%v0,%%v16,0 \n\t" "ldr %0,%%f0 " :"=f"(amin) :"r"(n),"ZR"((const FLOAT (*)[n])x) @@ -185,11 +175,9 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { minf=CABS1(x,0); inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; - BLASLONG n1 = (n - 1) & -4; - while ((i - 1) < n1) { + BLASLONG n1 = n & -4; + while (i < n1) { if (CABS1(x,ix) < minf) { minf = CABS1(x,ix); From 63bbd7b0d79d41da2a7cc81139a62b81fa247640 Mon Sep 17 00:00:00 2001 From: Daniel Cohen Gindi Date: Mon, 21 Jan 2019 08:35:23 +0200 Subject: [PATCH 033/133] Better support for MSVC/Windows in CMake --- CMakeLists.txt | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 812e6bf6f..8f3abe4b8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,6 +42,19 @@ endif() ####### +if(MSVC AND MSVC_STATIC_CRT) + set(CompilerFlags + CMAKE_CXX_FLAGS + CMAKE_CXX_FLAGS_DEBUG + CMAKE_CXX_FLAGS_RELEASE + CMAKE_C_FLAGS + CMAKE_C_FLAGS_DEBUG + CMAKE_C_FLAGS_RELEASE + ) + foreach(CompilerFlag ${CompilerFlags}) + string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") + endforeach() +endif() message(WARNING "CMake support is experimental. 
It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") @@ -149,12 +162,6 @@ if (${DYNAMIC_ARCH}) endforeach() endif () -# Only build shared libs for MSVC -if (MSVC) - set(BUILD_SHARED_LIBS ON) -endif() - - # add objects to the openblas lib add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $) @@ -314,7 +321,7 @@ install (FILES ${OPENBLAS_CONFIG_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) if(NOT NOFORTRAN) message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}") - set(F77BLAS_H ${CMAKE_BINARY_DIR}/f77blas.h) + set(F77BLAS_H ${CMAKE_BINARY_DIR}/generated/f77blas.h) file(WRITE ${F77BLAS_H} "#ifndef OPENBLAS_F77BLAS_H\n") file(APPEND ${F77BLAS_H} "#define OPENBLAS_F77BLAS_H\n") file(APPEND ${F77BLAS_H} "#include \"openblas_config.h\"\n") @@ -327,10 +334,11 @@ endif() if(NOT NO_CBLAS) message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") + set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") - file(WRITE ${CMAKE_BINARY_DIR}/cblas.tmp "${CBLAS_H_CONTENTS_NEW}") - install (FILES ${CMAKE_BINARY_DIR}/cblas.tmp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} RENAME cblas.h) + file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") + install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) endif() if(NOT NO_LAPACKE) From f0d834b824fd5723c5cd8df01ed1aaa7a78548c3 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Jan 2019 12:32:24 +0100 Subject: [PATCH 034/133] Use VERSION_LESS for comparisons involving software version numbers --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8f3abe4b8..afd9d2cf2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -147,7 +147,7 @@ endif () # Only generate .def for dll on MSVC and always produce pdb files for debug and release if(MSVC) - if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4) + if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4) set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def") endif() set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi") @@ -173,7 +173,7 @@ endif() # Handle MSVC exports if(MSVC AND BUILD_SHARED_LIBS) - if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4) + if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4) include("${PROJECT_SOURCE_DIR}/cmake/export.cmake") else() # Creates verbose .def file (51KB vs 18KB) From 24288803b3cde043bc4c10d82080509989680efb Mon Sep 17 00:00:00 2001 From: Daniel Cohen Gindi Date: Tue, 22 Jan 2019 14:38:01 +0200 Subject: [PATCH 035/133] Adjust test script for correct deployment --- appveyor.yml | 2 +- utest/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/appveyor.yml b/appveyor.yml index 141d3a130..95f6cf7c5 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -53,7 +53,7 @@ before_build: - ps: if (-Not (Test-Path .\build)) { mkdir build } - cd build - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. - - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl .. + - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT .. 
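
Patch 034 above replaces LESS with VERSION_LESS because dotted release numbers are not ordinary numbers: compared numerically or lexically, "3.10" sorts before "3.4", so the old test misfires on newer CMake releases. A small C illustration of the failure mode (illustrative only; CMake's VERSION_LESS compares component-wise):

    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        /* Parsed as a floating-point value, 3.10 is just 3.1, so it
           compares below 3.4 -- wrong for versions, where 3.10 > 3.4. */
        printf("numeric: 3.10 < 3.4 -> %d\n", 3.10 < 3.4);
        /* Lexicographic comparison fails the same way ('1' < '4'). */
        printf("string : \"3.10\" < \"3.4\" -> %d\n",
               strcmp("3.10", "3.4") < 0);
        return 0;
    }
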
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON .. diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 1b426afe7..dc306501f 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -61,7 +61,7 @@ foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) set_target_properties( ${OpenBLAS_utest_bin} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${CMAKE_CURRENT_BINARY_DIR}) endforeach() -if (MSVC) +if (MSVC AND BUILD_SHARED_LIBS) add_custom_command(TARGET ${OpenBLAS_utest_bin} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/lib/${CMAKE_CFG_INTDIR}/${OpenBLAS_LIBNAME}.dll ${CMAKE_CURRENT_BINARY_DIR}/. From 21eda8b5774aa92aecb9babba0b3eda0a992ddb9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Jan 2019 18:47:12 +0100 Subject: [PATCH 036/133] Report SkylakeX as Haswell if compiler does not support AVX512 ... or make was invoked with NO_AVX512=1 --- getarch.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/getarch.c b/getarch.c index 78ba0fefd..d03ce6e98 100644 --- a/getarch.c +++ b/getarch.c @@ -91,6 +91,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif +#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) +#else +#define NO_AVX512 +#endif /* #define FORCE_P2 */ /* #define FORCE_KATMAI */ /* #define FORCE_COPPERMINE */ @@ -327,6 +331,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef FORCE_SKYLAKEX +#ifdef NO_AVX512 +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "HASWELL" +#define ARCHCONFIG "-DHASWELL " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ + "-DFMA3" +#define LIBNAME "haswell" +#define CORENAME "HASWELL" +#else #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" @@ -340,6 +358,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define LIBNAME "skylakex" #define CORENAME "SKYLAKEX" #endif +#endif #ifdef FORCE_ATOM #define FORCE From b56b34a75cf3ae253cf8904416c6716406aad1fd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 22 Jan 2019 18:55:43 +0100 Subject: [PATCH 037/133] Syntax fix --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 95f6cf7c5..741c66291 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -53,7 +53,7 @@ before_build: - ps: if (-Not (Test-Path .\build)) { mkdir build } - cd build - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. - - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT .. + - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON .. 
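
The getarch.c guard in patch 036 above is a pure compile-time probe: if the toolchain cannot be assumed to emit AVX-512 (gcc newer than 6 built with AVX2 enabled, or clang 6+), a SKYLAKEX build quietly falls back to the HASWELL kernel set. A self-contained sketch of the same probe (the HAVE_AVX512_COMPILER macro name is mine, not OpenBLAS's):

    #include <stdio.h>

    /* Same condition as the getarch.c hunk above. */
    #if ((defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || \
         (defined(__clang__) && __clang_major__ >= 6))
    #define HAVE_AVX512_COMPILER 1
    #else
    #define HAVE_AVX512_COMPILER 0
    #endif

    int main(void)
    {
        puts(HAVE_AVX512_COMPILER ? "SKYLAKEX kernels available"
                                  : "falling back to HASWELL kernels");
        return 0;
    }
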
From 8533aca96470d361cc5cc81da329190811951df1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 23 Jan 2019 10:03:00 +0100 Subject: [PATCH 038/133] Avoid penalizing tall skinny matrices --- interface/trsm.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/interface/trsm.c b/interface/trsm.c index faec03ac2..f2da285de 100644 --- a/interface/trsm.c +++ b/interface/trsm.c @@ -82,9 +82,9 @@ #endif #ifndef COMPLEX -#define SMP_FACTOR 8 +#define SMP_FACTOR 256 #else -#define SMP_FACTOR 4 +#define SMP_FACTOR 128 #endif static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { @@ -372,11 +372,15 @@ void CNAME(enum CBLAS_ORDER order, mode |= (trans << BLAS_TRANSA_SHIFT); mode |= (side << BLAS_RSIDE_SHIFT); - if ( args.m < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD ) +/* + if ( args.m < 2 * GEMM_MULTITHREAD_THRESHOLD ) args.nthreads = 1; else - if ( args.n < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD ) + if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD ) args.nthreads = 1; +*/ + if ( args.m * args.n < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD) + args.nthreads = 1; else args.nthreads = num_cpu_avail(3); From e908ac2a5145ac1a0d43e6baf39df14ade061d57 Mon Sep 17 00:00:00 2001 From: Edison Gustavo Muenz Date: Wed, 23 Jan 2019 15:09:13 +0100 Subject: [PATCH 039/133] Fix include directory of exported targets --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 812e6bf6f..d3a9a2797 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -157,7 +157,7 @@ endif() # add objects to the openblas lib add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) -target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>) +target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>) # Android needs to explicitly link against libm if(ANDROID) From e882b239aa75090c7871d5848a0ead7d37bafb6f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Jan 2019 00:45:45 +0100 Subject: [PATCH 040/133] Correct naming of getrf_parallel object fixes #1984 --- lapack/CMakeLists.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index c0a7543ca..d48a270ab 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -63,7 +63,6 @@ if (USE_THREAD) # these do not have 'z' versions set(PARALLEL_SOURCES - ${GETRF_SRC} lauum/lauum_U_parallel.c lauum/lauum_L_parallel.c potrf/potrf_U_parallel.c @@ -81,6 +80,10 @@ if (USE_THREAD) trtri/trtri_L_parallel.c ) + foreach (float_type ${FLOAT_TYPES}) + GenerateNamedObjects("${GETRF_SRC}" "" "getrf_parallel" false "" "" false ${float_type}) + endforeach() + GenerateNamedObjects("${PARALLEL_SOURCES}") endif () From 36b844af889374934a4c5af19cf371cf29731d2e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Jan 2019 17:47:22 +0100 Subject: [PATCH 041/133] Change ARMV8 target to ARMV7 when BINARY32 is set fixes #1961 --- Makefile.system | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile.system b/Makefile.system index 20d4f6492..67c8cd197 100644 --- a/Makefile.system +++ b/Makefile.system @@ -95,6 +95,9 @@ endif ifeq ($(TARGET), ZEN) GETARCH_FLAGS := -DFORCE_BARCELONA endif +ifeq ($(TARGET), ARMV8) +GETARCH_FLAGS := -DFORCE_ARMV7 +endif endif From 58dd7e4501ad55ca03ae1da783de72cc36345f61 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 26 Jan 2019 17:52:33 +0100 Subject: [PATCH 042/133] Change ARMV8 target to ARMV7 for
BINARY=32 --- cmake/system.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/system.cmake b/cmake/system.cmake index a060d98cb..4cee7bd18 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -39,6 +39,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") set(TARGET "BARCELONA") endif () + if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53") + set(TARGET "ARMV7") + endif () endif () if (DEFINED TARGET) From 0f24b39ebf8945ddbe5d1516123e98b62853f5b4 Mon Sep 17 00:00:00 2001 From: TiborGY Date: Sun, 27 Jan 2019 15:33:00 +0100 Subject: [PATCH 043/133] Reword/expand comments in Makefile.rule Lots of small changes in the wording of the comments, plus an expansion of the NUM_THREADS and NO_AFFINITY sections. --- Makefile.rule | 48 +++++++++++++++++++++++++++++------------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index 7c128fb49..1d5dcacaa 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -48,6 +48,8 @@ VERSION = 0.3.6.dev # HOSTCC = gcc # If you need 32bit binary, define BINARY=32, otherwise define BINARY=64 +# Please note that AVX is not available on 32-bit. +# Setting BINARY=32 disables AVX/AVX2/AVX-512. # BINARY=64 # About threaded BLAS. It will be automatically detected if you don't @@ -57,7 +59,7 @@ VERSION = 0.3.6.dev # USE_THREAD = 0 # If you're going to use this library with OpenMP, please comment it in. -# This flag is always set for POWER8. Don't modify the flag +# This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8. # USE_OPENMP = 1 # The OpenMP scheduler to use - by default this is "static" and you @@ -68,36 +70,39 @@ VERSION = 0.3.6.dev # allow you to select the scheduler from the environment variable OMP_SCHEDULE # CCOMMON_OPT += -DOMP_SCHED=dynamic -# You can define maximum number of threads. Basically it should be -# less than actual number of cores. If you don't specify one, it's +# You can define the maximum number of threads. Basically it should be less +# than or equal to the number of CPU threads. If you don't specify one, it's # automatically detected by the the script. +# If SMT (aka. HT) is enabled on the system, it may or may not be beneficial to +# restrict NUM_THREADS to the number of physical cores. By default, the automatic +# detection includes logical CPUs, thus allowing the use of SMT. # NUM_THREADS = 24 # If you have enabled USE_OPENMP and your application would call -# OpenBLAS's calculation API from multi threads, please comment it in. -# This flag defines how many instances of OpenBLAS's calculation API can -# actually run in parallel. If more threads call OpenBLAS's calculation API, +# OpenBLAS's calculation API from multiple threads, please comment this in. +# This flag defines how many instances of OpenBLAS's calculation API can actually +# run in parallel. If more than NUM_PARALLEL threads call OpenBLAS's calculation API, # they need to wait for the preceding API calls to finish or risk data corruption. # NUM_PARALLEL = 2 -# if you don't need to install the static library, please comment it in. +# If you don't need to generate the static library, please comment this in. # NO_STATIC = 1 -# if you don't need generate the shared library, please comment it in. +# If you don't need to generate the shared library, please comment this in. 
# NO_SHARED = 1 -# If you don't need CBLAS interface, please comment it in. +# If you don't need the CBLAS interface, please comment this in. # NO_CBLAS = 1 -# If you only want CBLAS interface without installing Fortran compiler, -# please comment it in. +# If you only want the CBLAS interface without installing a Fortran compiler, +# please comment this in. # ONLY_CBLAS = 1 -# If you don't need LAPACK, please comment it in. -# If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1. +# If you don't need LAPACK, please comment this in. +# If you set NO_LAPACK=1, the build system automatically sets NO_LAPACKE=1. # NO_LAPACK = 1 -# If you don't need LAPACKE (C Interface to LAPACK), please comment it in. +# If you don't need LAPACKE (C Interface to LAPACK), please comment this in. # NO_LAPACKE = 1 # Build LAPACK Deprecated functions since LAPACK 3.6.0 @@ -106,7 +111,7 @@ BUILD_LAPACK_DEPRECATED = 1 # Build RecursiveLAPACK on top of LAPACK # BUILD_RELAPACK = 1 -# If you want to use legacy threaded Level 3 implementation. +# If you want to use the legacy threaded Level 3 implementation. # USE_SIMPLE_THREADED_LEVEL3 = 1 # If you want to use the new, still somewhat experimental code that uses @@ -116,8 +121,8 @@ BUILD_LAPACK_DEPRECATED = 1 # USE_TLS = 1 # If you want to drive whole 64bit region by BLAS. Not all Fortran -# compiler supports this. It's safe to keep comment it out if you -# are not sure(equivalent to "-i8" option). +# compilers support this. It's safe to keep this commented out if you +# are not sure. (This is equivalent to the "-i8" ifort option). # INTERFACE64 = 1 # Unfortunately most of kernel won't give us high quality buffer. @@ -125,10 +130,15 @@ BUILD_LAPACK_DEPRECATED = 1 # but it will consume time. If you don't like it, you can disable one. NO_WARMUP = 1 -# If you want to disable CPU/Memory affinity on Linux. +# Comment this in if you want to disable OpenBLAS's CPU/Memory affinity handling. +# This feature is only implemented on Linux, and is always disabled on other platforms. +# Enabling affinity handling may improve performance, especially on NUMA systems, but +# it may conflict with certain applications that also try to manage affinity. +# For this reason, affinity handling is disabled by default. Can be safely enabled if nothing +# else modifies affinity settings. NO_AFFINITY = 1 -# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus +# If you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus # BIGNUMA = 1 # Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers From ea1716ce2aaa4edf09e837796026ecd6cae9116b Mon Sep 17 00:00:00 2001 From: TiborGY Date: Sun, 27 Jan 2019 17:22:26 +0100 Subject: [PATCH 044/133] Update Makefile.rule Revert generate to install, explain the nature of the affinity conflict --- Makefile.rule | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 1d5dcacaa..faf34c0a1 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -85,7 +85,7 @@ VERSION = 0.3.6.dev # they need to wait for the preceding API calls to finish or risk data corruption. # NUM_PARALLEL = 2 -# If you don't need to generate the static library, please comment this in. +# If you don't need to install the static library, please comment this in. # NO_STATIC = 1 # If you don't need to generate the shared library, please comment this in. 
@@ -134,6 +134,8 @@ NO_WARMUP = 1 # This feature is only implemented on Linux, and is always disabled on other platforms. # Enabling affinity handling may improve performance, especially on NUMA systems, but # it may conflict with certain applications that also try to manage affinity. +# This conflict can result in threads of the application calling OpenBLAS ending up locked +# to the same core(s) as OpenBLAS, possibly binding all threads to a single core. # For this reason, affinity handling is disabled by default. Can be safely enabled if nothing # else modifies affinity settings. NO_AFFINITY = 1 From c8ef9fb22064dc6cb1c7515ad8d7e25c7adf9a8a Mon Sep 17 00:00:00 2001 From: maamountki Date: Mon, 28 Jan 2019 17:16:18 +0200 Subject: [PATCH 045/133] [ZARCH] Fix bug in iamax/iamin/imax/imin --- kernel/zarch/icamax.c | 1 + kernel/zarch/icamin.c | 1 + kernel/zarch/idamax.c | 1 + kernel/zarch/idamin.c | 1 + kernel/zarch/idmax.c | 1 + kernel/zarch/idmin.c | 1 + kernel/zarch/isamax.c | 1 + kernel/zarch/isamin.c | 1 + kernel/zarch/ismax.c | 1 + kernel/zarch/ismin.c | 1 + kernel/zarch/izamax.c | 1 + kernel/zarch/izamin.c | 1 + 12 files changed, 12 insertions(+) diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 27f969eee..96cb37a1d 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -283,6 +283,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + max = 0; maxf = CABS1(x,0); ix += 2; i++; diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index ae7b37b4f..73bd9e8de 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -283,6 +283,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + min = 0; minf = CABS1(x,0); ix += 2; i++; diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index e5a1d3a7c..4a0114242 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -206,6 +206,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = ABS(x[0]); i++; } diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index a68f7282f..503f92ff7 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -206,6 +206,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = ABS(x[0]); i++; } diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 4c3040779..871c896e6 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -182,6 +182,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = x[0]; i++; } diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index ba1776a49..dd14ec92c 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -182,6 +182,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = x[0]; i++; } diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index 2f5c1c867..1a9ac3cd8 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -249,6 +249,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; maxf = ABS(x[0]); i++; } diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 04e05aad9..5a7e669eb 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -249,6 +249,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = ABS(x[0]); i++; } diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index 084b4ce94..0b144c200 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -225,6 +225,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + max = 0; 
maxf = x[0]; i++; } diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index 4e85816a3..7fda9dffc 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -225,6 +225,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { + min = 0; minf = x[0]; i++; } diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 2ffad2570..7db64181c 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -204,6 +204,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + max = 0; maxf = CABS1(x,0); ix += 2; i++; diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 1e037c0c7..707d702d3 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -204,6 +204,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { + min = 0; minf = CABS1(x,0); ix += 2; i++; From 04873bb174d45a9cac478d7db7fd6f2618df2e81 Mon Sep 17 00:00:00 2001 From: maamountki Date: Mon, 28 Jan 2019 17:32:24 +0200 Subject: [PATCH 046/133] [ZARCH] Undo the last commit --- kernel/zarch/icamax.c | 1 - kernel/zarch/icamin.c | 1 - kernel/zarch/idamax.c | 1 - kernel/zarch/idamin.c | 1 - kernel/zarch/idmax.c | 1 - kernel/zarch/idmin.c | 1 - kernel/zarch/isamax.c | 1 - kernel/zarch/isamin.c | 1 - kernel/zarch/ismax.c | 1 - kernel/zarch/ismin.c | 1 - kernel/zarch/izamax.c | 1 - kernel/zarch/izamin.c | 1 - 12 files changed, 12 deletions(-) diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 96cb37a1d..27f969eee 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -283,7 +283,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - max = 0; maxf = CABS1(x,0); ix += 2; i++; diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index 73bd9e8de..ae7b37b4f 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -283,7 +283,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - min = 0; minf = CABS1(x,0); ix += 2; i++; diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index 4a0114242..e5a1d3a7c 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -206,7 +206,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - max = 0; maxf = ABS(x[0]); i++; } diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 503f92ff7..a68f7282f 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -206,7 +206,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - min = 0; minf = ABS(x[0]); i++; } diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 871c896e6..4c3040779 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -182,7 +182,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - max = 0; maxf = x[0]; i++; } diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index dd14ec92c..ba1776a49 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -182,7 +182,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - min = 0; minf = x[0]; i++; } diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index 1a9ac3cd8..2f5c1c867 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -249,7 +249,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - max = 0; maxf = ABS(x[0]); i++; } diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 5a7e669eb..04e05aad9 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -249,7 +249,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - min = 0; minf = ABS(x[0]); i++; } diff --git 
a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index 0b144c200..084b4ce94 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -225,7 +225,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - max = 0; maxf = x[0]; i++; } diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index 7fda9dffc..4e85816a3 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -225,7 +225,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { } else { - min = 0; minf = x[0]; i++; } diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 7db64181c..2ffad2570 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -204,7 +204,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - max = 0; maxf = CABS1(x,0); ix += 2; i++; diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 707d702d3..1e037c0c7 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -204,7 +204,6 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) } else { - min = 0; minf = CABS1(x,0); ix += 2; i++; From c7143c1019d7a35f94454e2ac811cd948a41d22e Mon Sep 17 00:00:00 2001 From: maamountki Date: Mon, 28 Jan 2019 17:52:23 +0200 Subject: [PATCH 047/133] [ZARCH] Fix iamax/imax single precision --- kernel/zarch/icamax.c | 2 ++ kernel/zarch/icamin.c | 2 ++ kernel/zarch/isamax.c | 2 ++ kernel/zarch/isamin.c | 2 ++ kernel/zarch/ismax.c | 2 ++ kernel/zarch/ismin.c | 2 ++ 6 files changed, 12 insertions(+) diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 27f969eee..2d1442ad9 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -248,6 +248,8 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "j 2f \n\t" "1: \n\t" "wfchsb %%v4,%%v2,%%v0 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" "ste %%f0,%1 \n\t" diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index ae7b37b4f..79aa6d341 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -248,6 +248,8 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "j 2f \n\t" "1: \n\t" "wfchsb %%v4,%%v0,%%v2 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" "ste %%f0,%1 \n\t" diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index 2f5c1c867..6e0aaa162 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -216,6 +216,8 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) "j 2f \n\t" "1: \n\t" "wfchsb %%v4,%%v2,%%v0 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" "ste %%f0,%1 \n\t" diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 04e05aad9..266c48f7f 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -216,6 +216,8 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) "j 2f \n\t" "1: \n\t" "wfchsb %%v4,%%v0,%%v2 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" "ste %%f0,%1 \n\t" diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index 084b4ce94..c968ce6fa 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -192,6 +192,8 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) "j 2f \n\t" "1: \n\t" "wfchsb %%v4,%%v2,%%v0 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 
\n\t" "ste %%f0,%1 \n\t" diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index 4e85816a3..0145b31b3 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -192,6 +192,8 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) "j 2f \n\t" "1: \n\t" "wfchsb %%v4,%%v0,%%v2 \n\t" + "vesrlg %%v4,%%v4,32 \n\t" + "vsegf %%v4,%%v4 \n\t" "vsel %%v1,%%v3,%%v1,%%v4 \n\t" "vsel %%v0,%%v2,%%v0,%%v4 \n\t" "ste %%f0,%1 \n\t" From dc4d3bccd5ee7de7bb823aa0bb7008a04bcc21d4 Mon Sep 17 00:00:00 2001 From: maamountki Date: Tue, 29 Jan 2019 03:47:49 +0200 Subject: [PATCH 048/133] [ZARCH] Fix icamax/icamin --- kernel/zarch/icamax.c | 2 +- kernel/zarch/icamin.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 2d1442ad9..113c0cef5 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -94,7 +94,7 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vlef %%v18,48(%%r1,%3),2 \n\t" "vlef %%v19,52(%%r1,%3),2 \n\t" "vlef %%v18,56(%%r1,%3),3 \n\t" - "vlef %%v19,30(%%r1,%3),3 \n\t" + "vlef %%v19,60(%%r1,%3),3 \n\t" "vlef %%v20,64(%%r1,%3),0 \n\t" "vlef %%v21,68(%%r1,%3),0 \n\t" diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index 79aa6d341..5096b641b 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -94,7 +94,7 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vlef %%v18,48(%%r1,%3),2 \n\t" "vlef %%v19,52(%%r1,%3),2 \n\t" "vlef %%v18,56(%%r1,%3),3 \n\t" - "vlef %%v19,30(%%r1,%3),3 \n\t" + "vlef %%v19,60(%%r1,%3),3 \n\t" "vlef %%v20,64(%%r1,%3),0 \n\t" "vlef %%v21,68(%%r1,%3),0 \n\t" From fcd814a8d292b7712a4230d9b9a20f0f2ce0fe52 Mon Sep 17 00:00:00 2001 From: maamountki Date: Tue, 29 Jan 2019 17:59:38 +0200 Subject: [PATCH 049/133] [ZARCH] Fix bug in max/min functions --- kernel/zarch/camax.c | 2 +- kernel/zarch/camin.c | 2 +- kernel/zarch/icamax.c | 2 +- kernel/zarch/icamin.c | 2 +- kernel/zarch/izamax.c | 2 +- kernel/zarch/izamin.c | 2 +- kernel/zarch/zamax.c | 2 +- kernel/zarch/zamin.c | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index 66d250896..f6fa772ac 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -198,7 +198,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (n1 > 0) { maxf = camax_kernel_32(n1, x); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index 5abc685b2..4bd6ca17d 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -198,7 +198,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (n1 > 0) { minf = camin_kernel_32(n1, x); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 113c0cef5..a9e7f91fc 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -280,7 +280,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n1 > 0) { max = icamax_kernel_32(n1, x, &maxf); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index 5096b641b..faf5f9c65 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -280,7 +280,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n1 > 0) { min = icamin_kernel_32(n1, x, &minf); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 2ffad2570..2d1cc2365 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -199,7 +199,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if 
(n1 > 0) { max = izamax_kernel_16(n1, x, &maxf); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 1e037c0c7..676fd7c6d 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -199,7 +199,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if (n1 > 0) { min = izamin_kernel_16(n1, x, &minf); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index 8175874c0..b7214783f 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -152,7 +152,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (n1 > 0) { maxf = zamax_kernel_16(n1, x); - + ix = n1 * 2; i = n1; } else diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index 5d57ff12e..d53fdb6b8 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -152,7 +152,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { if (n1 > 0) { minf = zamin_kernel_16(n1, x); - + ix = n1 * 2; i = n1; } else From eaf20f0e7ac8c2ab53deeb78f959bebb2a49cddd Mon Sep 17 00:00:00 2001 From: maamountki Date: Thu, 31 Jan 2019 09:26:50 +0200 Subject: [PATCH 050/133] Remove ztest --- ztest/Makefile | 437 ---------------------------------- ztest/amax.c | 235 ------------------ ztest/amin.c | 235 ------------------ ztest/asum.c | 263 -------------------- ztest/axpy.c | 303 ----------------------- ztest/copy.c | 291 ----------------------- ztest/dot.c | 296 ----------------------- ztest/dsdot.c | 229 ------------------ ztest/gemv.c | 633 ------------------------------------------------- ztest/iamax.c | 284 ---------------------- ztest/iamin.c | 284 ---------------------- ztest/imax.c | 231 ------------------ ztest/imin.c | 231 ------------------ ztest/max.c | 229 ------------------ ztest/min.c | 229 ------------------ ztest/rot.c | 303 ----------------------- ztest/scal.c | 308 ------------------------ ztest/swap.c | 306 ------------------------ 18 files changed, 5327 deletions(-) delete mode 100644 ztest/Makefile delete mode 100644 ztest/amax.c delete mode 100644 ztest/amin.c delete mode 100644 ztest/asum.c delete mode 100644 ztest/axpy.c delete mode 100644 ztest/copy.c delete mode 100644 ztest/dot.c delete mode 100644 ztest/dsdot.c delete mode 100644 ztest/gemv.c delete mode 100644 ztest/iamax.c delete mode 100644 ztest/iamin.c delete mode 100644 ztest/imax.c delete mode 100644 ztest/imin.c delete mode 100644 ztest/max.c delete mode 100644 ztest/min.c delete mode 100644 ztest/rot.c delete mode 100644 ztest/scal.c delete mode 100644 ztest/swap.c diff --git a/ztest/Makefile b/ztest/Makefile deleted file mode 100644 index 0ff7fe46a..000000000 --- a/ztest/Makefile +++ /dev/null @@ -1,437 +0,0 @@ -TOPDIR = .. 
-include $(TOPDIR)/Makefile.system - -goto :: sdot.goto ddot.goto cdot.goto zdot.goto dsdot.goto sswap.goto dswap.goto cswap.goto zswap.goto isamax.goto idamax.goto icamax.goto izamax.goto samax.goto damax.goto ismax.goto idmax.goto smax.goto dmax.goto isamin.goto idamin.goto icamin.goto izamin.goto samin.goto damin.goto camin.goto zamin.goto ismin.goto idmin.goto smin.goto dmin.goto sgemv.goto dgemv.goto cgemv.goto zgemv.goto sscal.goto dscal.goto cscal.goto zscal.goto saxpy.goto daxpy.goto caxpy.goto zaxpy.goto srot.goto drot.goto crot.goto zrot.goto sasum.goto dasum.goto casum.goto zasum.goto scopy.goto dcopy.goto ccopy.goto zcopy.goto - -##################################### Sdot #################################################### -sdot.goto : sdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Ddot #################################################### -ddot.goto : ddot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Cdot #################################################### -cdot.goto : cdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zdot #################################################### -zdot.goto : zdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Dsdot #################################################### -dsdot.goto : dsdot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ISAMAX ############################################## -isamax.goto : isamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDAMAX ############################################## -idamax.goto : idamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ICAMAX ############################################## -icamax.goto : icamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IZAMAX ############################################## -izamax.goto : izamax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SAMAX ############################################## -samax.goto : samax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DAMAX ############################################## -damax.goto : damax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ISMAX ############################################## -ismax.goto : ismax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDMAX ############################################## -idmax.goto : idmax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SMAX 
############################################## -smax.goto : smax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DMAX ############################################## -dmax.goto : dmax.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ISAMIN ############################################## -isamin.goto : isamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDAMIN ############################################## -idamin.goto : idamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ICAMIN ############################################## -icamin.goto : icamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IZAMIN ############################################## -izamin.goto : izamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SAMIN ############################################## -samin.goto : samin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DAMIN ############################################## -damin.goto : damin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## CAMIN ############################################## -camin.goto : camin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ZAMIN ############################################## -zamin.goto : zamin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## ISMIN ############################################## -ismin.goto : ismin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## IDMIN ############################################## -idmin.goto : idmin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## SMIN ############################################## -smin.goto : smin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -############################################## DMIN ############################################## -dmin.goto : dmin.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Sgemv #################################################### -sgemv.goto : sgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Dgemv #################################################### -dgemv.goto : dgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Cgemv #################################################### - 
-cgemv.goto : cgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zgemv #################################################### - -zgemv.goto : zgemv.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Sscal #################################################### -sscal.goto : sscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Dscal #################################################### -dscal.goto : dscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Cscal #################################################### - -cscal.goto : cscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zscal #################################################### - -zscal.goto : zscal.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Saxpy #################################################### -saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Daxpy #################################################### -daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Caxpy #################################################### - -caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zaxpy #################################################### - -zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Srot #################################################### -srot.goto : srot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Drot #################################################### -drot.goto : drot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Crot #################################################### -crot.goto : crot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zrot #################################################### -zrot.goto : zrot.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Sswap #################################################### -sswap.goto : sswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Dswap #################################################### -dswap.goto : dswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Cswap #################################################### - -cswap.goto : cswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) 
$(FEXTRALIB) -lm - -##################################### Zswap #################################################### - -zswap.goto : zswap.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Saxpy #################################################### -saxpy.goto : saxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Daxpy #################################################### -daxpy.goto : daxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Caxpy #################################################### - -caxpy.goto : caxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zaxpy #################################################### - -zaxpy.goto : zaxpy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Sasum #################################################### -sasum.goto : sasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Dasum #################################################### -dasum.goto : dasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Casum #################################################### - -casum.goto : casum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zasum #################################################### - -zasum.goto : zasum.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Scopy #################################################### -scopy.goto : scopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Dcopy #################################################### -dcopy.goto : dcopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Ccopy #################################################### - -ccopy.goto : ccopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -##################################### Zcopy #################################################### - -zcopy.goto : zcopy.$(SUFFIX) ../$(LIBNAME) - $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm - -################################################################################################### - -sdot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -ddot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cdot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zdot.$(SUFFIX) : dot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -dsdot.$(SUFFIX) : dsdot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -isamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -idamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -icamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c 
-DCOMPLEX -UDOUBLE -o $(@F) $^ - -izamax.$(SUFFIX) : iamax.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -samax.$(SUFFIX) : amax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -damax.$(SUFFIX) : amax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ismax.$(SUFFIX) : imax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -idmax.$(SUFFIX) : imax.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -smax.$(SUFFIX) : max.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dmax.$(SUFFIX) : max.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -isamin.$(SUFFIX) : iamin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -idamin.$(SUFFIX) : iamin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -icamin.$(SUFFIX) : iamin.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -izamin.$(SUFFIX) : iamin.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -samin.$(SUFFIX) : amin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -damin.$(SUFFIX) : amin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -camin.$(SUFFIX) : amin.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zamin.$(SUFFIX) : amin.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -ismin.$(SUFFIX) : imin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -idmin.$(SUFFIX) : imin.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -smin.$(SUFFIX) : min.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dmin.$(SUFFIX) : min.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -sgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zgemv.$(SUFFIX) : gemv.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zscal.$(SUFFIX) : scal.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -saxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -daxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -caxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zaxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -srot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -drot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -crot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zrot.$(SUFFIX) : rot.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -cswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zswap.$(SUFFIX) : swap.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -saxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -daxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -caxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zaxpy.$(SUFFIX) : axpy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -sasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - 
-dasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -casum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zasum.$(SUFFIX) : asum.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -scopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ - -dcopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -UCOMPLEX -DDOUBLE -o $(@F) $^ - -ccopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -UDOUBLE -o $(@F) $^ - -zcopy.$(SUFFIX) : copy.c - $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ - -clean :: - @rm -f *.goto - diff --git a/ztest/amax.c b/ztest/amax.c deleted file mode 100644 index f2e3f5411..000000000 --- a/ztest/amax.c +++ /dev/null @@ -1,235 +0,0 @@ -/*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*****************************************************************************/ - -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -FLOAT amax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT maxf=0.0; - - if (n <= 0 || inc_x <= 0) return(maxf); - - maxf=ABS(x[0]); - ix += inc_x; - i++; - - while(i < n) - { - if( ABS(x[ix]) > maxf ) - { - maxf = ABS(x[ix]); - } - ix += inc_x; - i++; - } - return(maxf); -} - -#undef AMAX -#ifdef DOUBLE -#define AMAX BLASFUNC(damax) -#else -#define AMAX BLASFUNC(samax) -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - FLOAT result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* 
avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -FLOAT amin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT minf=0.0; - - if (n <= 0 || inc_x <= 0) return(minf); - - minf=ABS(x[0]); - ix += inc_x; - i++; - - while(i < n) - { - if( ABS(x[ix]) < minf ) - { - minf = ABS(x[ix]); - } - ix += inc_x; - i++; - } - return(minf); -} - -#undef AMIN -#ifdef DOUBLE -#define AMIN BLASFUNC(damin) -#else -#define AMIN BLASFUNC(samin) -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - FLOAT result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#ifdef COMPLEX -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) -FLOAT zasum_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - 
BLASLONG i=0; - FLOAT sumf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return(sumf); - - inc_x2 = 2 * inc_x; - - n *= inc_x2; - while(i < n) - { - sumf += CABS1(x,i); - i += inc_x2; - } - return(sumf); -} -#else -FLOAT asum_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - FLOAT sumf = 0.0; - if (n <= 0 || inc_x <= 0) return(sumf); - - n *= inc_x; - while(i < n) - { - sumf += ABS(x[i]); - i += inc_x; - } - return(sumf); -} -#endif - -#undef ASUM -#ifdef COMPLEX -#ifdef DOUBLE -#define ASUM BLASFUNC(dzasum) -#else -#define ASUM BLASFUNC(scasum) -#endif -#else -#ifdef DOUBLE -#define ASUM BLASFUNC(dasum) -#else -#define ASUM BLASFUNC(sasum) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - FLOAT result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -int zaxpy_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, 
BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix,iy; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n < 0 ) return(0); - if ( da_r == 0.0 && da_i == 0.0 ) return(0); - - ix = 0; - iy = 0; - - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; - - while(i < n) - { -#if !defined(CONJ) - y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; - y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; -#else - y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; - y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; -#endif - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - - } - return(0); - -} -#else -int axpy_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix,iy; - - if ( n < 0 ) return(0); - if ( da == 0.0 ) return(0); - - ix = 0; - iy = 0; - - while(i < n) - { - - y[iy] += da * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(0); - -} -#endif - -#undef AXPY -#ifdef COMPLEX -#ifdef DOUBLE -#define AXPY BLASFUNC(zaxpy) -#else -#define AXPY BLASFUNC(caxpy) -#endif -#else -#ifdef DOUBLE -#define AXPY BLASFUNC(daxpy) -#else -#define AXPY BLASFUNC(saxpy) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x, *y, *y_c;; - FLOAT alpha[2] = { 2.0, 2.0 }; - blasint m, i; - blasint inc_x=1,inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - argc--;argv++; - - blasint iy; - int test = 1; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of 
Memory!!\n");exit(1); - } - - if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -int zcopy_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n < 0 ) return(0); - - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; - - while(i < n) - { - - y[iy] = x[ix] ; - y[iy+1] = x[ix+1] ; - ix += inc_x2; - iy += inc_y2; - i++ ; - - } - return(0); - -} -#else -int copy_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - - if ( n < 0 ) return(0); - - while(i < n) - { - - y[iy] = x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(0); - -} -#endif - -#undef COPY -#ifdef COMPLEX -#ifdef DOUBLE -#define COPY BLASFUNC(zcopy) -#else -#define COPY BLASFUNC(ccopy) -#endif -#else -#ifdef DOUBLE -#define COPY BLASFUNC(dcopy) -#else -#define COPY BLASFUNC(scopy) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x, *y, *y_c; - blasint m, i; - blasint inc_x=1,inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - blasint iy; - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) 
inc_y = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -OPENBLAS_COMPLEX_FLOAT zdot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT dot[2]; - OPENBLAS_COMPLEX_FLOAT result; - BLASLONG inc_x2; - BLASLONG inc_y2; - - dot[0]=0.0; - dot[1]=0.0; - - CREAL(result) = 0.0 ; - CIMAG(result) = 0.0 ; - - if ( n < 1 ) return(result); - - inc_x2 = 2 * inc_x ; - inc_y2 = 2 * inc_y ; - - while(i < n) - { -#if !defined(CONJ) - dot[0] += ( x[ix] * y[iy] - x[ix+1] * y[iy+1] ) ; - dot[1] += ( x[ix+1] * y[iy] + x[ix] * y[iy+1] ) ; -#else - dot[0] += ( x[ix] * y[iy] + x[ix+1] * y[iy+1] ) ; - dot[1] -= ( x[ix+1] * y[iy] - x[ix] * y[iy+1] ) ; -#endif - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - - } - CREAL(result) = dot[0]; - CIMAG(result) = dot[1]; - return(result); - -} -#else -FLOAT dot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT dot = 0.0 ; - - if ( n < 0 ) return(dot); - - while(i < n) - { - - dot += y[iy] * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(dot); -} -#endif - -#undef DOT -#ifdef COMPLEX -#ifdef DOUBLE -#define DOT BLASFUNC(zdotu) -#else -#define DOT BLASFUNC(cdotu) -#endif -#else -#ifdef DOUBLE -#define DOT BLASFUNC(ddot) -#else -#define DOT BLASFUNC(sdot) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, 
SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x, *y; -#ifdef COMPLEX - OPENBLAS_COMPLEX_FLOAT result, result_c; -#else - FLOAT result, result_c; -#endif - blasint m, i; - blasint inc_x=1,inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -double dsdot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - double dot = 0.0 ; - - if ( n < 0 ) return(dot); - - while(i < n) - { - - dot += y[iy] * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(dot); -} - -#undef DSDOT -#define DSDOT BLASFUNC(dsdot) - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - 
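/* Context for the dsdot_c reference above: DSDOT is the BLAS routine that
   takes single-precision inputs but multiplies and accumulates in double
   precision, which is why the reference returns double while x and y are
   FLOAT (float in this file). A minimal standalone sketch of that contract
   (the function name is illustrative, not part of the test harness): */

static double dsdot_sketch(BLASLONG n, const float *x, const float *y)
{
    BLASLONG i;
    double dot = 0.0;
    for (i = 0; i < n; i++)
        dot += (double)x[i] * (double)y[i]; /* widen before the multiply */
    return dot;
}

/* Widening before the multiply keeps each product exact for float inputs
   and confines rounding to the double-precision accumulation. */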
-int main(int argc, char *argv[]){ - - FLOAT *x, *y; - double result, result_c; - blasint m, i; - blasint inc_x=1,inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -int zgemv_n_c(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i; - BLASLONG ix, iy; - BLASLONG j; - FLOAT *a_ptr; - FLOAT temp_r, temp_i; - BLASLONG inc_x2, inc_y2; - BLASLONG lda2; - BLASLONG i2; - - lda2 = 2 * lda; - - ix = 0; - a_ptr = a; - - if (inc_x == 1 && inc_y == 1) - { - - for (j = 0; jtv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *a, *x, *y, *y_c; - FLOAT alpha[] = {1.0, 1.0}; - FLOAT beta [] = {1.0, 0.0}; - char trans='N'; - blasint m, i, j; - blasint inc_x=1,inc_y=1; - blasint n=0; - int has_param_n = 0; - int has_param_m = 0; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - blasint y_size; - blasint iy; - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - - int tomax = to; - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = 
atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; - if ((p = getenv("OPENBLAS_PARAM_N"))) { - n = atoi(p); - if ((n>0)) has_param_n = 1; - if ( n > tomax ) tomax = n; - } - if ( has_param_n == 0 ) - if ((p = getenv("OPENBLAS_PARAM_M"))) { - m = atoi(p); - if ((m>0)) has_param_m = 1; - if ( m > tomax ) tomax = m; - } - - - - fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); - - if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - if (has_param_m == 0) - { - - for(m = from; m <= to; m += step) - { - timeg=0; - timeg_c=0; - if ( has_param_n == 0 ) n = m; - fprintf(stderr, " %6dx%d :", (int)m,(int)n); - for(j = 0; j < m; j++){ - for(i = 0; i < n * COMPSIZE; i++){ - a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; - } - } - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#ifdef COMPLEX -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) -BLASLONG izamax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT maxf; - BLASLONG max=0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return(max); - - inc_x2 = 2 * inc_x; - - maxf = CABS1(x,0); - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return(max+1); -} -#else -BLASLONG iamax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT maxf=0.0; - BLASLONG max=0; - - if (n <= 0 || inc_x <= 0) return(max); - - maxf=ABS(x[0]); - ix += inc_x; - i++; - - while(i < n) - { - if( ABS(x[ix]) > maxf ) - { - max = i; - maxf = ABS(x[ix]); - } - ix += inc_x; - i++; - } - return(max+1); -} -#endif - -#undef IAMAX -#ifdef COMPLEX -#ifdef DOUBLE -#define IAMAX BLASFUNC(izamax) -#else -#define IAMAX BLASFUNC(icamax) -#endif -#else -#ifdef DOUBLE -#define IAMAX BLASFUNC(idamax) -#else -#define IAMAX BLASFUNC(isamax) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into 
microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - BLASLONG result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#ifdef COMPLEX -#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) -BLASLONG izamin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT minf; - BLASLONG min=0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return(min); - - inc_x2 = 2 * inc_x; - - minf = CABS1(x,0); - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return(min+1); -} -#else -BLASLONG iamin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT minf=0.0; - BLASLONG min=0; - - if (n <= 0 || inc_x <= 0) return(min); - - minf=ABS(x[0]); - ix += inc_x; - i++; - - while(i < n) - { - if( ABS(x[ix]) < minf ) - { - min = i; - minf = ABS(x[ix]); - } - ix += inc_x; - i++; - } - return(min+1); -} -#endif - -#undef IAMIN -#ifdef COMPLEX -#ifdef DOUBLE -#define IAMIN BLASFUNC(izamin) -#else -#define IAMIN BLASFUNC(icamin) -#endif -#else -#ifdef DOUBLE -#define IAMIN BLASFUNC(idamin) -#else -#define IAMIN BLASFUNC(isamin) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval 
*tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - BLASLONG result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -BLASLONG imax_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT maxf=0.0; - BLASLONG max=0; - - if (n <= 0 || inc_x <= 0) return(max); - - maxf=x[0]; - ix += inc_x; - i++; - - while(i < n) - { - if( x[ix] > maxf ) - { - max = i; - maxf = x[ix]; - } - ix += inc_x; - i++; - } - return(max+1); -} - -#undef IMAX -#ifdef DOUBLE -#define IMAX BLASFUNC(idmax) -#else -#define IMAX BLASFUNC(ismax) -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); 
- tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - BLASLONG result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -BLASLONG imin_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT minf=0.0; - BLASLONG min=0; - - if (n <= 0 || inc_x <= 0) return(min); - - minf=x[0]; - ix += inc_x; - i++; - - while(i < n) - { - if( x[ix] < minf ) - { - min = i; - minf = x[ix]; - } - ix += inc_x; - i++; - } - return(min+1); -} - -#undef IMIN -#ifdef DOUBLE -#define IMIN BLASFUNC(idmin) -#else -#define IMIN BLASFUNC(ismin) -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | 
IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - BLASLONG result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -FLOAT max_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT maxf=0.0; - - if (n <= 0 || inc_x <= 0) return(maxf); - - maxf=x[0]; - ix += inc_x; - i++; - - while(i < n) - { - if( x[ix] > maxf ) - { - maxf = x[ix]; - } - ix += inc_x; - i++; - } - return(maxf); -} - -#undef MAX_ -#ifdef DOUBLE -#define MAX_ BLASFUNC(dmax) -#else -#define MAX_ BLASFUNC(smax) -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - FLOAT result, result_c; - blasint m, i; - 
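/* Note on max_c above: the BLAS extensions ?MAX/?MIN (and the I?MAX/I?MIN
   index variants) compare signed values directly; only the ?AMAX/?AMIN
   family takes absolute values, which is why this reference, unlike the
   one in amax.c, never wraps x[ix] in ABS(). */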
blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -FLOAT min_c(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ix=0; - FLOAT minf=0.0; - - if (n <= 0 || inc_x <= 0) return(minf); - - minf=x[0]; - ix += inc_x; - i++; - - while(i < n) - { - if( x[ix] < minf ) - { - minf = x[ix]; - } - ix += inc_x; - i++; - } - return(minf); -} - -#undef MIN_ -#ifdef DOUBLE -#define MIN_ BLASFUNC(dmin) -#else -#define MIN_ BLASFUNC(smin) -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x; - FLOAT result, result_c; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = 
atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -int zrot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n <= 0 ) return(0); - - inc_x2 = 2 * inc_x ; - inc_y2 = 2 * inc_y ; - - while(i < n) - { - temp[0] = c*x[ix] + s*y[iy] ; - temp[1] = c*x[ix+1] + s*y[iy+1] ; - y[iy] = c*y[iy] - s*x[ix] ; - y[iy+1] = c*y[iy+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - - } - return(0); -} -#else -int rot_c(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp; - - if ( n <= 0 ) return(0); - - while(i < n) - { - temp = c*x[ix] + s*y[iy] ; - y[iy] = c*y[iy] - s*x[ix] ; - x[ix] = temp ; - - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(0); -} -#endif - -#undef ROT -#ifdef COMPLEX -#ifdef DOUBLE -#define ROT BLASFUNC(zdrot) -#else -#define ROT BLASFUNC(csrot) -#endif -#else -#ifdef DOUBLE -#define ROT BLASFUNC(drot) -#else -#define ROT BLASFUNC(srot) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x, *y, *x_c, *y_c; - // FLOAT result; - blasint m, i; - blasint inc_x=1,inc_y=1; - FLOAT c[1] = { 2.0 }; - FLOAT s[1] = { 2.0 }; - int loops = 1; - int l; - char *p; - - int from = 1; 
- int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - blasint ix,iy; - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -int zscal_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG inc_x2; - BLASLONG ip = 0; - FLOAT temp; - - if ( (n <= 0) || (inc_x <= 0)) - return(0); - - inc_x2 = 2 * inc_x; - for ( i=0; itv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x, *x_c; - FLOAT alpha[2] = { 2.0, 2.0 }; - blasint m, i; - blasint inc_x=1; - int loops = 1; - int l; - char *p; - - int from = 1; - int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - blasint ix; - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", 
from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - -#define SINGLE_EPS 1e-04 -#define DOUBLE_EPS 1e-13 - -int assert_dbl_near(double exp, double real, double tol) { - double diff = exp - real; - double absdiff = diff; - /* avoid using fabs and linking with a math lib */ - if(diff < 0) { - absdiff *= -1; - } - if (absdiff > tol) { - return 0; - } - return 1; -} - -#ifdef COMPLEX -int zswap_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n < 0 ) return(0); - - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; - - while(i < n) - { - - temp[0] = x[ix] ; - temp[1] = x[ix+1] ; - x[ix] = y[iy] ; - x[ix+1] = y[iy+1] ; - y[iy] = temp[0] ; - y[iy+1] = temp[1] ; - - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - - } - return(0); -} -#else -int swap_c(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp; - - if ( n < 0 ) return(0); - - while(i < n) - { - - temp = x[ix] ; - x[ix] = y[iy] ; - y[iy] = temp ; - - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(0); -} -#endif - -#undef SWAP -#ifdef COMPLEX -#ifdef DOUBLE -#define SWAP BLASFUNC(zswap) -#else -#define SWAP BLASFUNC(cswap) -#endif -#else -#ifdef DOUBLE -#define SWAP BLASFUNC(dswap) -#else -#define SWAP BLASFUNC(sswap) -#endif -#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ - - FLOAT *x, *y, *x_c, *y_c; - blasint m, i; - blasint inc_x=1,inc_y=1; - int loops = 1; - int l; - char *p; - - int from = 1; - 
int to = 200; - int step = 1; - - struct timeval start, stop; - double time1,timeg,timeg_c; - - blasint ix,iy; - int test = 1; - - argc--;argv++; - - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,inc_x,inc_y,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( x_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - - if (( y_c = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); - } - -#ifdef linux - srandom(getpid()); -#endif - - fprintf(stderr, " SIZE Flops Time CTime Test\n"); - - for(m = from; m <= to; m += step) - { - - timeg=0; - timeg_c=0; - - fprintf(stderr, " %6d :", (int)m); - - - for (l=0; l Date: Thu, 31 Jan 2019 15:25:15 +0100 Subject: [PATCH 051/133] Fix wrong comparison that made IMIN identical to IMAX as suggested in #1990 --- kernel/arm/imin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/arm/imin.c b/kernel/arm/imin.c index 598cba387..ffc65226e 100644 --- a/kernel/arm/imin.c +++ b/kernel/arm/imin.c @@ -53,7 +53,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( x[ix] > minf ) + if( x[ix] < minf ) { min = i; minf = x[ix]; From 86a824c97f1f4ccfe8b24678dc0fdaf4846a7055 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 15:27:21 +0100 Subject: [PATCH 052/133] Fix wrong comparison that made IMIN identical to IMAX as reported by aarnez in #1990 --- kernel/mips/imin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/mips/imin.c b/kernel/mips/imin.c index d9b283d2d..bf130613b 100644 --- a/kernel/mips/imin.c +++ b/kernel/mips/imin.c @@ -45,7 +45,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( x[ix] > minf ) + if( x[ix] < minf ) { min = i; minf = x[ix]; From 48b9b94f7f7d1856babac7f20f7e9d90fa8750d0 Mon Sep 17 00:00:00 2001 From: maamountki Date: Thu, 31 Jan 2019 18:52:11 +0200 Subject: [PATCH 053/133] [ZARCH] Improve loading performance for camax/icamax --- kernel/zarch/camax.c | 128 ++++++++++++++++++------------------------ kernel/zarch/camin.c | 128 ++++++++++++++++++------------------------ kernel/zarch/icamax.c | 114 ++++++++++++++++--------------------- kernel/zarch/icamin.c | 114 ++++++++++++++++--------------------- kernel/zarch/zamax.c | 2 +- kernel/zarch/zamin.c | 2 +- 6 files changed, 212 insertions(+), 276 deletions(-) diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index f6fa772ac..2e9648640 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -52,82 +52,66 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) "vflpsb %%v0,%%v0 \n\t" "vflpsb %%v16,%%v16 \n\t" "vfasb %%v0,%%v0,%%v16 \n\t" + "vleib %%v1,0,0 \n\t" + "vleib %%v1,1,1 \n\t" + "vleib %%v1,2,2 \n\t" + "vleib %%v1,3,3 \n\t" + "vleib %%v1,8,4 \n\t" + "vleib %%v1,9,5 \n\t" + "vleib %%v1,10,6 \n\t" + "vleib 
%%v1,11,7 \n\t" + "vleib %%v1,16,8 \n\t" + "vleib %%v1,17,9 \n\t" + "vleib %%v1,18,10 \n\t" + "vleib %%v1,19,11 \n\t" + "vleib %%v1,24,12 \n\t" + "vleib %%v1,25,13 \n\t" + "vleib %%v1,26,14 \n\t" + "vleib %%v1,27,15 \n\t" "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" - "vlef %%v16,0(%%r1,%2),0 \n\t" - "vlef %%v17,4(%%r1,%2),0 \n\t" - "vlef %%v16,8(%%r1,%2),1 \n\t" - "vlef %%v17,12(%%r1,%2),1 \n\t" - "vlef %%v16,16(%%r1,%2),2 \n\t" - "vlef %%v17,20(%%r1,%2),2 \n\t" - "vlef %%v16,24(%%r1,%2),3 \n\t" - "vlef %%v17,28(%%r1,%2),3 \n\t" - - "vlef %%v18,32(%%r1,%2),0 \n\t" - "vlef %%v19,36(%%r1,%2),0 \n\t" - "vlef %%v18,40(%%r1,%2),1 \n\t" - "vlef %%v19,44(%%r1,%2),1 \n\t" - "vlef %%v18,48(%%r1,%2),2 \n\t" - "vlef %%v19,52(%%r1,%2),2 \n\t" - "vlef %%v18,56(%%r1,%2),3 \n\t" - "vlef %%v19,30(%%r1,%2),3 \n\t" - - "vlef %%v20,64(%%r1,%2),0 \n\t" - "vlef %%v21,68(%%r1,%2),0 \n\t" - "vlef %%v20,72(%%r1,%2),1 \n\t" - "vlef %%v21,76(%%r1,%2),1 \n\t" - "vlef %%v20,80(%%r1,%2),2 \n\t" - "vlef %%v21,84(%%r1,%2),2 \n\t" - "vlef %%v20,88(%%r1,%2),3 \n\t" - "vlef %%v21,92(%%r1,%2),3 \n\t" - - "vlef %%v22,96(%%r1,%2),0 \n\t" - "vlef %%v23,100(%%r1,%2),0 \n\t" - "vlef %%v22,104(%%r1,%2),1 \n\t" - "vlef %%v23,108(%%r1,%2),1 \n\t" - "vlef %%v22,112(%%r1,%2),2 \n\t" - "vlef %%v23,116(%%r1,%2),2 \n\t" - "vlef %%v22,120(%%r1,%2),3 \n\t" - "vlef %%v23,124(%%r1,%2),3 \n\t" - - "vlef %%v24,128(%%r1,%2),0 \n\t" - "vlef %%v25,132(%%r1,%2),0 \n\t" - "vlef %%v24,136(%%r1,%2),1 \n\t" - "vlef %%v25,140(%%r1,%2),1 \n\t" - "vlef %%v24,144(%%r1,%2),2 \n\t" - "vlef %%v25,148(%%r1,%2),2 \n\t" - "vlef %%v24,152(%%r1,%2),3 \n\t" - "vlef %%v25,156(%%r1,%2),3 \n\t" - - "vlef %%v26,160(%%r1,%2),0 \n\t" - "vlef %%v27,164(%%r1,%2),0 \n\t" - "vlef %%v26,168(%%r1,%2),1 \n\t" - "vlef %%v27,172(%%r1,%2),1 \n\t" - "vlef %%v26,176(%%r1,%2),2 \n\t" - "vlef %%v27,180(%%r1,%2),2 \n\t" - "vlef %%v26,184(%%r1,%2),3 \n\t" - "vlef %%v27,188(%%r1,%2),3 \n\t" - - "vlef %%v28,192(%%r1,%2),0 \n\t" - "vlef %%v29,196(%%r1,%2),0 \n\t" - "vlef %%v28,200(%%r1,%2),1 \n\t" - "vlef %%v29,204(%%r1,%2),1 \n\t" - "vlef %%v28,208(%%r1,%2),2 \n\t" - "vlef %%v29,212(%%r1,%2),2 \n\t" - "vlef %%v28,216(%%r1,%2),3 \n\t" - "vlef %%v29,220(%%r1,%2),3 \n\t" - - "vlef %%v30,224(%%r1,%2),0 \n\t" - "vlef %%v31,228(%%r1,%2),0 \n\t" - "vlef %%v30,232(%%r1,%2),1 \n\t" - "vlef %%v31,236(%%r1,%2),1 \n\t" - "vlef %%v30,240(%%r1,%2),2 \n\t" - "vlef %%v31,244(%%r1,%2),2 \n\t" - "vlef %%v30,248(%%r1,%2),3 \n\t" - "vlef %%v31,252(%%r1,%2),3 \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v2,16(%%r1,%2) \n\t" + "vpkg %%v17,%%v16,%%v2 \n\t" + "vperm %%v16,%%v16,%%v2,%%v1 \n\t" + + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v2,48(%%r1,%2) \n\t" + "vpkg %%v19,%%v18,%%v2 \n\t" + "vperm %%v18,%%v18,%%v2,%%v1 \n\t" + + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v2,80(%%r1,%2) \n\t" + "vpkg %%v21,%%v20,%%v2 \n\t" + "vperm %%v20,%%v20,%%v2,%%v1 \n\t" + + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v2,112(%%r1,%2) \n\t" + "vpkg %%v23,%%v22,%%v2 \n\t" + "vperm %%v22,%%v22,%%v2,%%v1 \n\t" + + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v2,144(%%r1,%2) \n\t" + "vpkg %%v25,%%v24,%%v2 \n\t" + "vperm %%v24,%%v24,%%v2,%%v1 \n\t" + + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v2,176(%%r1,%2) \n\t" + "vpkg %%v27,%%v26,%%v2 \n\t" + "vperm %%v26,%%v26,%%v2,%%v1 \n\t" + + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v2,208(%%r1,%2) \n\t" + "vpkg %%v29,%%v28,%%v2 \n\t" + "vperm %%v28,%%v28,%%v2,%%v1 \n\t" + + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v2,240(%%r1,%2) \n\t" + "vpkg %%v31,%%v30,%%v2 \n\t" + "vperm 
%%v30,%%v30,%%v2,%%v1 \n\t" "vflpsb %%v16,%%v16 \n\t" "vflpsb %%v17,%%v17 \n\t" @@ -178,7 +162,7 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) "ler %0,%%f0 " :"=f"(amax) :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return amax; diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index 4bd6ca17d..aec59058e 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -52,82 +52,66 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) "vflpsb %%v0,%%v0 \n\t" "vflpsb %%v16,%%v16 \n\t" "vfasb %%v0,%%v0,%%v16 \n\t" + "vleib %%v1,0,0 \n\t" + "vleib %%v1,1,1 \n\t" + "vleib %%v1,2,2 \n\t" + "vleib %%v1,3,3 \n\t" + "vleib %%v1,8,4 \n\t" + "vleib %%v1,9,5 \n\t" + "vleib %%v1,10,6 \n\t" + "vleib %%v1,11,7 \n\t" + "vleib %%v1,16,8 \n\t" + "vleib %%v1,17,9 \n\t" + "vleib %%v1,18,10 \n\t" + "vleib %%v1,19,11 \n\t" + "vleib %%v1,24,12 \n\t" + "vleib %%v1,25,13 \n\t" + "vleib %%v1,26,14 \n\t" + "vleib %%v1,27,15 \n\t" "srlg %%r0,%1,5 \n\t" "xgr %%r1,%%r1 \n\t" "0: \n\t" "pfd 1, 1024(%%r1,%2) \n\t" - "vlef %%v16,0(%%r1,%2),0 \n\t" - "vlef %%v17,4(%%r1,%2),0 \n\t" - "vlef %%v16,8(%%r1,%2),1 \n\t" - "vlef %%v17,12(%%r1,%2),1 \n\t" - "vlef %%v16,16(%%r1,%2),2 \n\t" - "vlef %%v17,20(%%r1,%2),2 \n\t" - "vlef %%v16,24(%%r1,%2),3 \n\t" - "vlef %%v17,28(%%r1,%2),3 \n\t" - - "vlef %%v18,32(%%r1,%2),0 \n\t" - "vlef %%v19,36(%%r1,%2),0 \n\t" - "vlef %%v18,40(%%r1,%2),1 \n\t" - "vlef %%v19,44(%%r1,%2),1 \n\t" - "vlef %%v18,48(%%r1,%2),2 \n\t" - "vlef %%v19,52(%%r1,%2),2 \n\t" - "vlef %%v18,56(%%r1,%2),3 \n\t" - "vlef %%v19,30(%%r1,%2),3 \n\t" - - "vlef %%v20,64(%%r1,%2),0 \n\t" - "vlef %%v21,68(%%r1,%2),0 \n\t" - "vlef %%v20,72(%%r1,%2),1 \n\t" - "vlef %%v21,76(%%r1,%2),1 \n\t" - "vlef %%v20,80(%%r1,%2),2 \n\t" - "vlef %%v21,84(%%r1,%2),2 \n\t" - "vlef %%v20,88(%%r1,%2),3 \n\t" - "vlef %%v21,92(%%r1,%2),3 \n\t" - - "vlef %%v22,96(%%r1,%2),0 \n\t" - "vlef %%v23,100(%%r1,%2),0 \n\t" - "vlef %%v22,104(%%r1,%2),1 \n\t" - "vlef %%v23,108(%%r1,%2),1 \n\t" - "vlef %%v22,112(%%r1,%2),2 \n\t" - "vlef %%v23,116(%%r1,%2),2 \n\t" - "vlef %%v22,120(%%r1,%2),3 \n\t" - "vlef %%v23,124(%%r1,%2),3 \n\t" - - "vlef %%v24,128(%%r1,%2),0 \n\t" - "vlef %%v25,132(%%r1,%2),0 \n\t" - "vlef %%v24,136(%%r1,%2),1 \n\t" - "vlef %%v25,140(%%r1,%2),1 \n\t" - "vlef %%v24,144(%%r1,%2),2 \n\t" - "vlef %%v25,148(%%r1,%2),2 \n\t" - "vlef %%v24,152(%%r1,%2),3 \n\t" - "vlef %%v25,156(%%r1,%2),3 \n\t" - - "vlef %%v26,160(%%r1,%2),0 \n\t" - "vlef %%v27,164(%%r1,%2),0 \n\t" - "vlef %%v26,168(%%r1,%2),1 \n\t" - "vlef %%v27,172(%%r1,%2),1 \n\t" - "vlef %%v26,176(%%r1,%2),2 \n\t" - "vlef %%v27,180(%%r1,%2),2 \n\t" - "vlef %%v26,184(%%r1,%2),3 \n\t" - "vlef %%v27,188(%%r1,%2),3 \n\t" - - "vlef %%v28,192(%%r1,%2),0 \n\t" - "vlef %%v29,196(%%r1,%2),0 \n\t" - "vlef %%v28,200(%%r1,%2),1 \n\t" - "vlef %%v29,204(%%r1,%2),1 \n\t" - "vlef %%v28,208(%%r1,%2),2 \n\t" - "vlef %%v29,212(%%r1,%2),2 \n\t" - "vlef %%v28,216(%%r1,%2),3 \n\t" - "vlef %%v29,220(%%r1,%2),3 \n\t" - - "vlef %%v30,224(%%r1,%2),0 \n\t" - "vlef %%v31,228(%%r1,%2),0 \n\t" - "vlef %%v30,232(%%r1,%2),1 \n\t" - "vlef %%v31,236(%%r1,%2),1 \n\t" - "vlef %%v30,240(%%r1,%2),2 \n\t" - "vlef %%v31,244(%%r1,%2),2 \n\t" - "vlef %%v30,248(%%r1,%2),3 \n\t" - "vlef %%v31,252(%%r1,%2),3 \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v2,16(%%r1,%2) \n\t" + "vpkg 
%%v17,%%v16,%%v2 \n\t" + "vperm %%v16,%%v16,%%v2,%%v1 \n\t" + + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v2,48(%%r1,%2) \n\t" + "vpkg %%v19,%%v18,%%v2 \n\t" + "vperm %%v18,%%v18,%%v2,%%v1 \n\t" + + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v2,80(%%r1,%2) \n\t" + "vpkg %%v21,%%v20,%%v2 \n\t" + "vperm %%v20,%%v20,%%v2,%%v1 \n\t" + + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v2,112(%%r1,%2) \n\t" + "vpkg %%v23,%%v22,%%v2 \n\t" + "vperm %%v22,%%v22,%%v2,%%v1 \n\t" + + "vl %%v24,128(%%r1,%2) \n\t" + "vl %%v2,144(%%r1,%2) \n\t" + "vpkg %%v25,%%v24,%%v2 \n\t" + "vperm %%v24,%%v24,%%v2,%%v1 \n\t" + + "vl %%v26,160(%%r1,%2) \n\t" + "vl %%v2,176(%%r1,%2) \n\t" + "vpkg %%v27,%%v26,%%v2 \n\t" + "vperm %%v26,%%v26,%%v2,%%v1 \n\t" + + "vl %%v28,192(%%r1,%2) \n\t" + "vl %%v2,208(%%r1,%2) \n\t" + "vpkg %%v29,%%v28,%%v2 \n\t" + "vperm %%v28,%%v28,%%v2,%%v1 \n\t" + + "vl %%v30,224(%%r1,%2) \n\t" + "vl %%v2,240(%%r1,%2) \n\t" + "vpkg %%v31,%%v30,%%v2 \n\t" + "vperm %%v30,%%v30,%%v2,%%v1 \n\t" "vflpsb %%v16,%%v16 \n\t" "vflpsb %%v17,%%v17 \n\t" @@ -178,7 +162,7 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) "ler %0,%%f0 " :"=f"(amin) :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return amin; diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index a9e7f91fc..5129ca6ee 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -57,6 +57,22 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vleig %%v2,3,1 \n\t" "vrepig %%v3,16 \n\t" "vzero %%v4 \n\t" + "vleib %%v9,0,0 \n\t" + "vleib %%v9,1,1 \n\t" + "vleib %%v9,2,2 \n\t" + "vleib %%v9,3,3 \n\t" + "vleib %%v9,8,4 \n\t" + "vleib %%v9,9,5 \n\t" + "vleib %%v9,10,6 \n\t" + "vleib %%v9,11,7 \n\t" + "vleib %%v9,16,8 \n\t" + "vleib %%v9,17,9 \n\t" + "vleib %%v9,18,10 \n\t" + "vleib %%v9,19,11 \n\t" + "vleib %%v9,24,12 \n\t" + "vleib %%v9,25,13 \n\t" + "vleib %%v9,26,14 \n\t" + "vleib %%v9,27,15 \n\t" "vleif %%v24,0,0 \n\t" "vleif %%v24,1,1 \n\t" "vleif %%v24,2,2 \n\t" @@ -78,41 +94,25 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" - "vlef %%v16,0(%%r1,%3),0 \n\t" - "vlef %%v17,4(%%r1,%3),0 \n\t" - "vlef %%v16,8(%%r1,%3),1 \n\t" - "vlef %%v17,12(%%r1,%3),1 \n\t" - "vlef %%v16,16(%%r1,%3),2 \n\t" - "vlef %%v17,20(%%r1,%3),2 \n\t" - "vlef %%v16,24(%%r1,%3),3 \n\t" - "vlef %%v17,28(%%r1,%3),3 \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v28,16(%%r1,%3) \n\t" + "vpkg %%v17,%%v16,%%v28 \n\t" + "vperm %%v16,%%v16,%%v28,%%v9 \n\t" - "vlef %%v18,32(%%r1,%3),0 \n\t" - "vlef %%v19,36(%%r1,%3),0 \n\t" - "vlef %%v18,40(%%r1,%3),1 \n\t" - "vlef %%v19,44(%%r1,%3),1 \n\t" - "vlef %%v18,48(%%r1,%3),2 \n\t" - "vlef %%v19,52(%%r1,%3),2 \n\t" - "vlef %%v18,56(%%r1,%3),3 \n\t" - "vlef %%v19,60(%%r1,%3),3 \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v29,48(%%r1,%3) \n\t" + "vpkg %%v19,%%v18,%%v29 \n\t" + "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - "vlef %%v20,64(%%r1,%3),0 \n\t" - "vlef %%v21,68(%%r1,%3),0 \n\t" - "vlef %%v20,72(%%r1,%3),1 \n\t" - "vlef %%v21,76(%%r1,%3),1 \n\t" - "vlef %%v20,80(%%r1,%3),2 \n\t" - "vlef %%v21,84(%%r1,%3),2 \n\t" - "vlef %%v20,88(%%r1,%3),3 \n\t" - "vlef %%v21,92(%%r1,%3),3 \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v30,80(%%r1,%3) \n\t" + "vpkg %%v21,%%v20,%%v30 \n\t" + "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - "vlef %%v22,96(%%r1,%3),0 \n\t" - "vlef 
%%v23,100(%%r1,%3),0 \n\t" - "vlef %%v22,104(%%r1,%3),1 \n\t" - "vlef %%v23,108(%%r1,%3),1 \n\t" - "vlef %%v22,112(%%r1,%3),2 \n\t" - "vlef %%v23,116(%%r1,%3),2 \n\t" - "vlef %%v22,120(%%r1,%3),3 \n\t" - "vlef %%v23,124(%%r1,%3),3 \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v31,112(%%r1,%3) \n\t" + "vpkg %%v23,%%v22,%%v31 \n\t" + "vperm %%v22,%%v22,%%v31,%%v9 \n\t" "vflpsb %%v16, %%v16 \n\t" "vflpsb %%v17, %%v17 \n\t" @@ -151,41 +151,25 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vlef %%v16,128(%%r1,%3),0 \n\t" - "vlef %%v17,132(%%r1,%3),0 \n\t" - "vlef %%v16,136(%%r1,%3),1 \n\t" - "vlef %%v17,140(%%r1,%3),1 \n\t" - "vlef %%v16,144(%%r1,%3),2 \n\t" - "vlef %%v17,148(%%r1,%3),2 \n\t" - "vlef %%v16,152(%%r1,%3),3 \n\t" - "vlef %%v17,156(%%r1,%3),3 \n\t" + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v28,144(%%r1,%3) \n\t" + "vpkg %%v17,%%v16,%%v28 \n\t" + "vperm %%v16,%%v16,%%v28,%%v9 \n\t" - "vlef %%v18,160(%%r1,%3),0 \n\t" - "vlef %%v19,164(%%r1,%3),0 \n\t" - "vlef %%v18,168(%%r1,%3),1 \n\t" - "vlef %%v19,172(%%r1,%3),1 \n\t" - "vlef %%v18,176(%%r1,%3),2 \n\t" - "vlef %%v19,180(%%r1,%3),2 \n\t" - "vlef %%v18,184(%%r1,%3),3 \n\t" - "vlef %%v19,188(%%r1,%3),3 \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v29,176(%%r1,%3) \n\t" + "vpkg %%v19,%%v18,%%v29 \n\t" + "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - "vlef %%v20,192(%%r1,%3),0 \n\t" - "vlef %%v21,196(%%r1,%3),0 \n\t" - "vlef %%v20,200(%%r1,%3),1 \n\t" - "vlef %%v21,204(%%r1,%3),1 \n\t" - "vlef %%v20,208(%%r1,%3),2 \n\t" - "vlef %%v21,212(%%r1,%3),2 \n\t" - "vlef %%v20,216(%%r1,%3),3 \n\t" - "vlef %%v21,220(%%r1,%3),3 \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v30,208(%%r1,%3) \n\t" + "vpkg %%v21,%%v20,%%v30 \n\t" + "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - "vlef %%v22,224(%%r1,%3),0 \n\t" - "vlef %%v23,228(%%r1,%3),0 \n\t" - "vlef %%v22,232(%%r1,%3),1 \n\t" - "vlef %%v23,236(%%r1,%3),1 \n\t" - "vlef %%v22,240(%%r1,%3),2 \n\t" - "vlef %%v23,244(%%r1,%3),2 \n\t" - "vlef %%v22,248(%%r1,%3),3 \n\t" - "vlef %%v23,252(%%r1,%3),3 \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v31,240(%%r1,%3) \n\t" + "vpkg %%v23,%%v22,%%v31 \n\t" + "vperm %%v22,%%v22,%%v31,%%v9 \n\t" "vflpsb %%v16, %%v16 \n\t" "vflpsb %%v17, %%v17 \n\t" @@ -258,7 +242,7 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) "nop " :"=r"(iamax),"=m"(*amax) :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return iamax; diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index faf5f9c65..05068b212 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -57,6 +57,22 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vleig %%v2,3,1 \n\t" "vrepig %%v3,16 \n\t" "vzero %%v4 \n\t" + "vleib %%v9,0,0 \n\t" + "vleib %%v9,1,1 \n\t" + "vleib %%v9,2,2 \n\t" + "vleib %%v9,3,3 \n\t" + "vleib %%v9,8,4 \n\t" + "vleib %%v9,9,5 \n\t" + "vleib %%v9,10,6 \n\t" + "vleib %%v9,11,7 \n\t" + "vleib %%v9,16,8 \n\t" + "vleib %%v9,17,9 \n\t" + "vleib %%v9,18,10 \n\t" + "vleib %%v9,19,11 \n\t" + "vleib %%v9,24,12 \n\t" + "vleib %%v9,25,13 \n\t" + "vleib %%v9,26,14 \n\t" + "vleib %%v9,27,15 \n\t" "vleif %%v24,0,0 \n\t" "vleif %%v24,1,1 \n\t" "vleif %%v24,2,2 \n\t" @@ -78,41 +94,25 @@ 
static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "0: \n\t" "pfd 1, 1024(%%r1,%3) \n\t" - "vlef %%v16,0(%%r1,%3),0 \n\t" - "vlef %%v17,4(%%r1,%3),0 \n\t" - "vlef %%v16,8(%%r1,%3),1 \n\t" - "vlef %%v17,12(%%r1,%3),1 \n\t" - "vlef %%v16,16(%%r1,%3),2 \n\t" - "vlef %%v17,20(%%r1,%3),2 \n\t" - "vlef %%v16,24(%%r1,%3),3 \n\t" - "vlef %%v17,28(%%r1,%3),3 \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v28,16(%%r1,%3) \n\t" + "vpkg %%v17,%%v16,%%v28 \n\t" + "vperm %%v16,%%v16,%%v28,%%v9 \n\t" - "vlef %%v18,32(%%r1,%3),0 \n\t" - "vlef %%v19,36(%%r1,%3),0 \n\t" - "vlef %%v18,40(%%r1,%3),1 \n\t" - "vlef %%v19,44(%%r1,%3),1 \n\t" - "vlef %%v18,48(%%r1,%3),2 \n\t" - "vlef %%v19,52(%%r1,%3),2 \n\t" - "vlef %%v18,56(%%r1,%3),3 \n\t" - "vlef %%v19,60(%%r1,%3),3 \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v29,48(%%r1,%3) \n\t" + "vpkg %%v19,%%v18,%%v29 \n\t" + "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - "vlef %%v20,64(%%r1,%3),0 \n\t" - "vlef %%v21,68(%%r1,%3),0 \n\t" - "vlef %%v20,72(%%r1,%3),1 \n\t" - "vlef %%v21,76(%%r1,%3),1 \n\t" - "vlef %%v20,80(%%r1,%3),2 \n\t" - "vlef %%v21,84(%%r1,%3),2 \n\t" - "vlef %%v20,88(%%r1,%3),3 \n\t" - "vlef %%v21,92(%%r1,%3),3 \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v30,80(%%r1,%3) \n\t" + "vpkg %%v21,%%v20,%%v30 \n\t" + "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - "vlef %%v22,96(%%r1,%3),0 \n\t" - "vlef %%v23,100(%%r1,%3),0 \n\t" - "vlef %%v22,104(%%r1,%3),1 \n\t" - "vlef %%v23,108(%%r1,%3),1 \n\t" - "vlef %%v22,112(%%r1,%3),2 \n\t" - "vlef %%v23,116(%%r1,%3),2 \n\t" - "vlef %%v22,120(%%r1,%3),3 \n\t" - "vlef %%v23,124(%%r1,%3),3 \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v31,112(%%r1,%3) \n\t" + "vpkg %%v23,%%v22,%%v31 \n\t" + "vperm %%v22,%%v22,%%v31,%%v9 \n\t" "vflpsb %%v16, %%v16 \n\t" "vflpsb %%v17, %%v17 \n\t" @@ -151,41 +151,25 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "vsel %%v2,%%v2,%%v6,%%v8 \n\t" "vag %%v4,%%v4,%%v3 \n\t" - "vlef %%v16,128(%%r1,%3),0 \n\t" - "vlef %%v17,132(%%r1,%3),0 \n\t" - "vlef %%v16,136(%%r1,%3),1 \n\t" - "vlef %%v17,140(%%r1,%3),1 \n\t" - "vlef %%v16,144(%%r1,%3),2 \n\t" - "vlef %%v17,148(%%r1,%3),2 \n\t" - "vlef %%v16,152(%%r1,%3),3 \n\t" - "vlef %%v17,156(%%r1,%3),3 \n\t" + "vl %%v16,128(%%r1,%3) \n\t" + "vl %%v28,144(%%r1,%3) \n\t" + "vpkg %%v17,%%v16,%%v28 \n\t" + "vperm %%v16,%%v16,%%v28,%%v9 \n\t" - "vlef %%v18,160(%%r1,%3),0 \n\t" - "vlef %%v19,164(%%r1,%3),0 \n\t" - "vlef %%v18,168(%%r1,%3),1 \n\t" - "vlef %%v19,172(%%r1,%3),1 \n\t" - "vlef %%v18,176(%%r1,%3),2 \n\t" - "vlef %%v19,180(%%r1,%3),2 \n\t" - "vlef %%v18,184(%%r1,%3),3 \n\t" - "vlef %%v19,188(%%r1,%3),3 \n\t" + "vl %%v18,160(%%r1,%3) \n\t" + "vl %%v29,176(%%r1,%3) \n\t" + "vpkg %%v19,%%v18,%%v29 \n\t" + "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - "vlef %%v20,192(%%r1,%3),0 \n\t" - "vlef %%v21,196(%%r1,%3),0 \n\t" - "vlef %%v20,200(%%r1,%3),1 \n\t" - "vlef %%v21,204(%%r1,%3),1 \n\t" - "vlef %%v20,208(%%r1,%3),2 \n\t" - "vlef %%v21,212(%%r1,%3),2 \n\t" - "vlef %%v20,216(%%r1,%3),3 \n\t" - "vlef %%v21,220(%%r1,%3),3 \n\t" + "vl %%v20,192(%%r1,%3) \n\t" + "vl %%v30,208(%%r1,%3) \n\t" + "vpkg %%v21,%%v20,%%v30 \n\t" + "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - "vlef %%v22,224(%%r1,%3),0 \n\t" - "vlef %%v23,228(%%r1,%3),0 \n\t" - "vlef %%v22,232(%%r1,%3),1 \n\t" - "vlef %%v23,236(%%r1,%3),1 \n\t" - "vlef %%v22,240(%%r1,%3),2 \n\t" - "vlef %%v23,244(%%r1,%3),2 \n\t" - "vlef %%v22,248(%%r1,%3),3 \n\t" - "vlef %%v23,252(%%r1,%3),3 \n\t" + "vl %%v22,224(%%r1,%3) \n\t" + "vl %%v31,240(%%r1,%3) \n\t" + "vpkg %%v23,%%v22,%%v31 \n\t" + "vperm 
%%v22,%%v22,%%v31,%%v9 \n\t" "vflpsb %%v16, %%v16 \n\t" "vflpsb %%v17, %%v17 \n\t" @@ -258,7 +242,7 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) "nop " :"=r"(iamin),"=m"(*amin) :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return iamin; diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index b7214783f..cc6347127 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -132,7 +132,7 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) "ldr %0,%%f0 " :"=f"(amax) :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return amax; diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index d53fdb6b8..18610daea 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -132,7 +132,7 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) "ldr %0,%%f0 " :"=f"(amin) :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" ); return amin; From 29416cb5a37b990052d019f66736af5263a81809 Mon Sep 17 00:00:00 2001 From: maamountki Date: Thu, 31 Jan 2019 19:11:11 +0200 Subject: [PATCH 054/133] [ZARCH] Add Z13 version for max/min functions --- kernel/zarch/KERNEL.Z13 | 12 +-- kernel/zarch/damax_z13.c | 204 ++++++++++++++++++++++++++++++++++++ kernel/zarch/damin_z13.c | 204 ++++++++++++++++++++++++++++++++++++ kernel/zarch/dmax_z13.c | 180 +++++++++++++++++++++++++++++++ kernel/zarch/dmin_z13.c | 180 +++++++++++++++++++++++++++++++ kernel/zarch/zamax_z13.c | 221 +++++++++++++++++++++++++++++++++++++++ kernel/zarch/zamin_z13.c | 221 +++++++++++++++++++++++++++++++++++++++ 7 files changed, 1216 insertions(+), 6 deletions(-) create mode 100644 kernel/zarch/damax_z13.c create mode 100644 kernel/zarch/damin_z13.c create mode 100644 kernel/zarch/dmax_z13.c create mode 100644 kernel/zarch/dmin_z13.c create mode 100644 kernel/zarch/zamax_z13.c create mode 100644 kernel/zarch/zamin_z13.c diff --git a/kernel/zarch/KERNEL.Z13 b/kernel/zarch/KERNEL.Z13 index e5b974ab4..22c7e9703 100644 --- a/kernel/zarch/KERNEL.Z13 +++ b/kernel/zarch/KERNEL.Z13 @@ -1,18 +1,18 @@ SAMAXKERNEL = ../arm/amax.c -DAMAXKERNEL = damax.c +DAMAXKERNEL = damax_z13.c CAMAXKERNEL = ../arm/zamax.c -ZAMAXKERNEL = zamax.c +ZAMAXKERNEL = zamax_z13.c SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = damin.c +DAMINKERNEL = damin_z13.c CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = zamin.c +ZAMINKERNEL = zamin_z13.c SMAXKERNEL = ../arm/max.c -DMAXKERNEL = dmax.c +DMAXKERNEL = dmax_z13.c SMINKERNEL = ../arm/min.c -DMINKERNEL = dmin.c +DMINKERNEL = dmin_z13.c ISAMAXKERNEL = ../arm/iamax.c IDAMAXKERNEL = idamax.c diff --git a/kernel/zarch/damax_z13.c b/kernel/zarch/damax_z13.c new file mode 100644 index 000000000..95b94ee4a --- /dev/null +++ b/kernel/zarch/damax_z13.c @@ -0,0 +1,204 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The 
OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT amax; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "vflpdb %%v0,%%v0 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vfchdb %%v26,%%v20,%%v21 \n\t" + "vfchdb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v24,%%v25 \n\t" + "vfchdb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vfchdb %%v26,%%v20,%%v21 \n\t" +
"vfchdb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v24,%%v25 \n\t" + "vfchdb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return amax; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + maxf = damax_kernel_32(n1, x); + + i = n1; + } + else + { + maxf=ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i++; + } + return (maxf); + + } else { + + maxf=ABS(x[0]); + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/damin_z13.c b/kernel/zarch/damin_z13.c new file mode 100644 index 000000000..538690ee5 --- /dev/null +++ b/kernel/zarch/damin_z13.c @@ -0,0 +1,204 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT amin; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "vflpdb %%v0,%%v0 \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return amin; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + minf = damin_kernel_32(n1, x); + + i = n1; + } + else + { + minf=ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i++; + } + return (minf); + + } else { + + minf=ABS(x[0]); + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { +
minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/dmax_z13.c b/kernel/zarch/dmax_z13.c new file mode 100644 index 000000000..83e7b02a8 --- /dev/null +++ b/kernel/zarch/dmax_z13.c @@ -0,0 +1,180 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
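Every CNAME wrapper in this patch splits the unit-stride case the same way: mask n down to a multiple of the kernel's unroll factor, hand that prefix to the assembly kernel, and finish the remainder in plain C. For reference, the masking arithmetic behaves as sketched below; round_down_32 is a hypothetical helper (BLASLONG comes from common.h, and two's-complement integers, as OpenBLAS assumes, are required).

/* Rounds n down to a multiple of 32 by clearing the five low bits:
   e.g. 70 -> 64 (6-element scalar tail), 31 -> 0 (kernel skipped). */
static BLASLONG round_down_32(BLASLONG n)
{
    return n & -32;
}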
+*****************************************************************************/ + +#include "common.h" + +static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT max; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vfchdb %%v26,%%v20,%%v21 \n\t" + "vfchdb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v24,%%v25 \n\t" + "vfchdb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vfchdb %%v26,%%v20,%%v21 \n\t" + "vfchdb %%v27,%%v22,%%v23 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v24,%%v25 \n\t" + "vfchdb %%v29,%%v26,%%v27 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v28,%%v29 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v30,%%v0 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(max) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return max; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + + if (n <= 0 || inc_x <= 0) return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + maxf = dmax_kernel_32(n1, x); + + i = n1; + } + else + { + maxf=x[0]; + i++; + } + + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { + + maxf=x[0]; + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; + } + return (maxf); + } +} diff --git a/kernel/zarch/dmin_z13.c b/kernel/zarch/dmin_z13.c new file mode 100644 index 000000000..e64f90ee3 --- /dev/null +++ b/kernel/zarch/dmin_z13.c @@ -0,0 +1,180 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) +{ + FLOAT min; + + __asm__ volatile ( + "vl %%v0,0(%2) \n\t" + "srlg %%r0,%1,5 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "vl %%v16,128(%%r1,%2) \n\t" + "vl %%v17,144(%%r1,%2) \n\t" + "vl %%v18,160(%%r1,%2) \n\t" + "vl %%v19,176(%%r1,%2) \n\t" + "vl %%v20,192(%%r1,%2) \n\t" + "vl %%v21,208(%%r1,%2) \n\t" + "vl %%v22,224(%%r1,%2) \n\t" + "vl %%v23,240(%%r1,%2) \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vfchdb %%v26,%%v21,%%v20 \n\t" + "vfchdb %%v27,%%v23,%%v22 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + "vsel %%v26,%%v20,%%v21,%%v26 \n\t" + "vsel %%v27,%%v22,%%v23,%%v27 \n\t" + + "vfchdb %%v28,%%v25,%%v24 \n\t" + "vfchdb %%v29,%%v27,%%v26 \n\t" + "vsel %%v28,%%v24,%%v25,%%v28 \n\t" + "vsel %%v29,%%v26,%%v27,%%v29 \n\t" + + "vfchdb %%v30,%%v29,%%v28 \n\t" + "vsel %%v30,%%v28,%%v29,%%v30 \n\t" + + "vfchdb %%v31,%%v0,%%v30 \n\t" + "vsel %%v0,%%v30,%%v0,%%v31 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v17 
\n\t" + "ldr %0,%%f0 " + :"=f"(min) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return min; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + minf = dmin_kernel_32(n1, x); + + i = n1; + } + else + { + minf=x[0]; + i++; + } + + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { + + minf=x[0]; + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; + } + return (minf); + } +} diff --git a/kernel/zarch/zamax_z13.c b/kernel/zarch/zamax_z13.c new file mode 100644 index 000000000..ae711c173 --- /dev/null +++ b/kernel/zarch/zamax_z13.c @@ -0,0 +1,221 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) +{ + FLOAT amax; + + __asm__ volatile ( + "vleg %%v0,0(%2),0 \n\t" + "vleg %%v16,8(%2),0 \n\t" + "vleg %%v0,16(%2),1 \n\t" + "vleg %%v16,24(%2),1 \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vflpdb %%v16,%%v16 \n\t" + "vfadb %%v0,%%v0,%%v16 \n\t" + "srlg %%r0,%1,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vleg %%v16,0(%%r1,%2),0 \n\t" + "vleg %%v17,8(%%r1,%2),0 \n\t" + "vleg %%v16,16(%%r1,%2),1 \n\t" + "vleg %%v17,24(%%r1,%2),1 \n\t" + "vleg %%v18,32(%%r1,%2),0 \n\t" + "vleg %%v19,40(%%r1,%2),0 \n\t" + "vleg %%v18,48(%%r1,%2),1 \n\t" + "vleg %%v19,56(%%r1,%2),1 \n\t" + "vleg %%v20,64(%%r1,%2),0 \n\t" + "vleg %%v21,72(%%r1,%2),0 \n\t" + "vleg %%v20,80(%%r1,%2),1 \n\t" + "vleg %%v21,88(%%r1,%2),1 \n\t" + "vleg %%v22,96(%%r1,%2),0 \n\t" + "vleg %%v23,104(%%r1,%2),0 \n\t" + "vleg %%v22,112(%%r1,%2),1 \n\t" + "vleg %%v23,120(%%r1,%2),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchdb %%v26,%%v24,%%v25 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchdb %%v27,%%v26,%%v0 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "vleg %%v16,128(%%r1,%2),0 \n\t" + "vleg %%v17,136(%%r1,%2),0 \n\t" + "vleg %%v16,144(%%r1,%2),1 \n\t" + "vleg %%v17,152(%%r1,%2),1 \n\t" + "vleg %%v18,160(%%r1,%2),0 \n\t" + "vleg %%v19,168(%%r1,%2),0 \n\t" + "vleg %%v18,176(%%r1,%2),1 \n\t" + "vleg %%v19,184(%%r1,%2),1 \n\t" + "vleg %%v20,192(%%r1,%2),0 \n\t" + "vleg %%v21,200(%%r1,%2),0 \n\t" + "vleg %%v20,208(%%r1,%2),1 \n\t" + "vleg %%v21,216(%%r1,%2),1 \n\t" + "vleg %%v22,224(%%r1,%2),0 \n\t" + "vleg %%v23,232(%%r1,%2),0 \n\t" + "vleg %%v22,240(%%r1,%2),1 \n\t" + "vleg %%v23,248(%%r1,%2),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v24,%%v16,%%v17 \n\t" + "vfchdb %%v25,%%v18,%%v19 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchdb %%v26,%%v24,%%v25 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchdb %%v27,%%v26,%%v0 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v0,%%v16 \n\t" + "vsel %%v0,%%v0,%%v16,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(amax) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return amax; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return
(maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + + maxf = zamax_kernel_16(n1, x); + ix = n1 * 2; + i = n1; + } + else + { + maxf=CABS1(x,0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (maxf); + + } else { + + maxf=CABS1(x,0); + inc_x2 = 2 * inc_x; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); + } + if (CABS1(x,ix+inc_x2) > maxf) { + maxf = CABS1(x,ix+inc_x2); + } + if (CABS1(x,ix+inc_x2*2) > maxf) { + maxf = CABS1(x,ix+inc_x2*2); + } + if (CABS1(x,ix+inc_x2*3) > maxf) { + maxf = CABS1(x,ix+inc_x2*3); + } + + ix += inc_x2 * 4; + + i += 4; + + } + + + while (i < n) { + if (CABS1(x,ix) > maxf) { + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (maxf); + } +} diff --git a/kernel/zarch/zamin_z13.c b/kernel/zarch/zamin_z13.c new file mode 100644 index 000000000..f82c57e81 --- /dev/null +++ b/kernel/zarch/zamin_z13.c @@ -0,0 +1,221 @@ +/*************************************************************************** +Copyright (c) 2013-2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
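The zamin kernel below gathers its operands exactly as zamax above: vleg element loads pull the interleaved re/im pairs apart so that one register holds two real parts and its partner two imaginary parts, and a single vflpdb + vfadb per pair then yields CABS1 for two complex elements at once. Lane by lane, the first vleg group computes the following (a sketch, assuming the big-endian z/Architecture element order; lane0 and lane1 are illustrative names):

/* v16 = { x[0], x[2] } (real parts), v17 = { x[1], x[3] } (imaginary parts);
   after vflpdb and vfadb, each lane holds CABS1 of one complex element: */
double lane0 = fabs(x[0]) + fabs(x[1]);   /* CABS1(x,0) */
double lane1 = fabs(x[2]) + fabs(x[3]);   /* CABS1(x,2) */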
+*****************************************************************************/ + +#include "common.h" +#include <math.h> + +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + +#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) + +static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) +{ + FLOAT amin; + + __asm__ volatile ( + "vleg %%v0,0(%2),0 \n\t" + "vleg %%v16,8(%2),0 \n\t" + "vleg %%v0,16(%2),1 \n\t" + "vleg %%v16,24(%2),1 \n\t" + "vflpdb %%v0,%%v0 \n\t" + "vflpdb %%v16,%%v16 \n\t" + "vfadb %%v0,%%v0,%%v16 \n\t" + "srlg %%r0,%1,4 \n\t" + "xgr %%r1,%%r1 \n\t" + "0: \n\t" + "pfd 1, 1024(%%r1,%2) \n\t" + + "vleg %%v16,0(%%r1,%2),0 \n\t" + "vleg %%v17,8(%%r1,%2),0 \n\t" + "vleg %%v16,16(%%r1,%2),1 \n\t" + "vleg %%v17,24(%%r1,%2),1 \n\t" + "vleg %%v18,32(%%r1,%2),0 \n\t" + "vleg %%v19,40(%%r1,%2),0 \n\t" + "vleg %%v18,48(%%r1,%2),1 \n\t" + "vleg %%v19,56(%%r1,%2),1 \n\t" + "vleg %%v20,64(%%r1,%2),0 \n\t" + "vleg %%v21,72(%%r1,%2),0 \n\t" + "vleg %%v20,80(%%r1,%2),1 \n\t" + "vleg %%v21,88(%%r1,%2),1 \n\t" + "vleg %%v22,96(%%r1,%2),0 \n\t" + "vleg %%v23,104(%%r1,%2),0 \n\t" + "vleg %%v22,112(%%r1,%2),1 \n\t" + "vleg %%v23,120(%%r1,%2),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchdb %%v26,%%v25,%%v24 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchdb %%v27,%%v0,%%v26 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "vleg %%v16,128(%%r1,%2),0 \n\t" + "vleg %%v17,136(%%r1,%2),0 \n\t" + "vleg %%v16,144(%%r1,%2),1 \n\t" + "vleg %%v17,152(%%r1,%2),1 \n\t" + "vleg %%v18,160(%%r1,%2),0 \n\t" + "vleg %%v19,168(%%r1,%2),0 \n\t" + "vleg %%v18,176(%%r1,%2),1 \n\t" + "vleg %%v19,184(%%r1,%2),1 \n\t" + "vleg %%v20,192(%%r1,%2),0 \n\t" + "vleg %%v21,200(%%r1,%2),0 \n\t" + "vleg %%v20,208(%%r1,%2),1 \n\t" + "vleg %%v21,216(%%r1,%2),1 \n\t" + "vleg %%v22,224(%%r1,%2),0 \n\t" + "vleg %%v23,232(%%r1,%2),0 \n\t" + "vleg %%v22,240(%%r1,%2),1 \n\t" + "vleg %%v23,248(%%r1,%2),1 \n\t" + "vflpdb %%v16, %%v16 \n\t" + "vflpdb %%v17, %%v17 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vflpdb %%v19, %%v19 \n\t" + "vflpdb %%v20, %%v20 \n\t" + "vflpdb %%v21, %%v21 \n\t" + "vflpdb %%v22, %%v22 \n\t" + "vflpdb %%v23, %%v23 \n\t" + "vfadb %%v16,%%v16,%%v17 \n\t" + "vfadb %%v17,%%v18,%%v19 \n\t" + "vfadb %%v18,%%v20,%%v21 \n\t" + "vfadb %%v19,%%v22,%%v23 \n\t" + + "vfchdb %%v24,%%v17,%%v16 \n\t" + "vfchdb %%v25,%%v19,%%v18 \n\t" + "vsel %%v24,%%v16,%%v17,%%v24 \n\t" + "vsel %%v25,%%v18,%%v19,%%v25 \n\t" + + "vfchdb %%v26,%%v25,%%v24 \n\t" + "vsel %%v26,%%v24,%%v25,%%v26 \n\t" + + "vfchdb %%v27,%%v0,%%v26 \n\t" + "vsel %%v0,%%v26,%%v0,%%v27 \n\t" + + "agfi %%r1, 256 \n\t" + "brctg %%r0, 0b \n\t" + + "vrepg %%v16,%%v0,1 \n\t" + "wfchdb %%v17,%%v16,%%v0 \n\t" + "vsel %%v0,%%v0,%%v16,%%v17 \n\t" + "ldr %0,%%f0 " + :"=f"(amin) + :"r"(n),"ZR"((const FLOAT (*)[n])x) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" + ); + + return amin; +} + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return
(minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + + minf = zamin_kernel_16(n1, x); + ix = n1 * 2; + i = n1; + } + else + { + minf=CABS1(x,0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (minf); + + } else { + + minf=CABS1(x,0); + inc_x2 = 2 * inc_x; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); + } + if (CABS1(x,ix+inc_x2) < minf) { + minf = CABS1(x,ix+inc_x2); + } + if (CABS1(x,ix+inc_x2*2) < minf) { + minf = CABS1(x,ix+inc_x2*2); + } + if (CABS1(x,ix+inc_x2*3) < minf) { + minf = CABS1(x,ix+inc_x2*3); + } + + ix += inc_x2 * 4; + + i += 4; + + } + + + while (i < n) { + if (CABS1(x,ix) < minf) { + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (minf); + } +} From 1249ee1fd0e62f5386b8b5dbce7b3d5fac785006 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 21:13:46 +0100 Subject: [PATCH 055/133] Add Z14 target from patch provided by aarnez in #991 --- TargetList.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/TargetList.txt b/TargetList.txt index 3d04a57cf..3a5a32234 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -94,3 +94,4 @@ THUNDERX2T99 9.System Z: ZARCH_GENERIC Z13 +Z14 From bdc73a49e0e3fe375fe2a015abebc962e29d72af Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 21:14:37 +0100 Subject: [PATCH 056/133] Add parameters for Z14 from patch provided by aarnez in #991 --- param.h | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/param.h b/param.h index 15ea663a8..3cc400b54 100644 --- a/param.h +++ b/param.h @@ -2915,6 +2915,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#if defined(Z14) +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 4 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 + +#define SGEMM_DEFAULT_P 456 +#define DGEMM_DEFAULT_P 320 +#define CGEMM_DEFAULT_P 480 +#define ZGEMM_DEFAULT_P 224 + +#define SGEMM_DEFAULT_Q 488 +#define DGEMM_DEFAULT_Q 384 +#define CGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 352 + +#define SGEMM_DEFAULT_R 8192 +#define DGEMM_DEFAULT_R 4096 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 2048 + + +#define SYMV_P 16 +#endif + + #ifdef GENERIC From 72d3e7c9b49af5c13ff1e26d13fc3b35ffd92076 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 21:15:50 +0100 Subject: [PATCH 057/133] Add FORCE Z14 from patch provided by aarnez in #991 --- getarch.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/getarch.c b/getarch.c index d03ce6e98..242d08004 100644 --- a/getarch.c +++ b/getarch.c @@ -1085,6 +1085,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "Z13" #endif +#ifdef FORCE_Z14 +#define FORCE +#define ARCHITECTURE "ZARCH" +#define SUBARCHITECTURE "Z14" +#define ARCHCONFIG "-DZ14 " \ + "-DDTB_DEFAULT_ENTRIES=64" +#define LIBNAME "z14" +#define CORENAME "Z14" +#endif + #ifndef FORCE #ifdef USER_TARGET From 4b512f84dd2b5861e6c860f68d05e56484efe7ce Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 21:16:44 +0100 Subject: [PATCH 058/133] Add cache sizes for Z14 from patch provided by aarnez in #991 --- cpuid_zarch.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cpuid_zarch.c b/cpuid_zarch.c index 8ed40099b..896ed94f5 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -114,7 +114,14 @@ void get_cpuconfig(void) break; case CPU_Z14: printf("#define Z14\n"); + printf("#define L1_DATA_SIZE 131072\n"); + printf("#define L1_DATA_LINESIZE 256\n"); + printf("#define L1_DATA_ASSOCIATIVE 8\n"); + printf("#define L2_SIZE 4194304\n"); + printf("#define L2_LINESIZE 256\n"); + printf("#define L2_ASSOCIATIVE 8\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); break; } } From 885a3c435092f5356ee4665b03d3709ce58a22f1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 21:18:09 +0100 Subject: [PATCH 059/133] USE_TRMM on Z14 from patch provided by aarnez in #991 --- kernel/Makefile.L3 | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 9258f216d..eafcfb1b4 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -48,6 +48,10 @@ ifeq ($(ARCH), zarch) USE_TRMM = 1 endif +ifeq ($(CORE), Z14) +USE_TRMM = 1 +endif + From 265142edd5dc4c8d7e5e9f781468ac9c5bddb3ba Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 21:21:40 +0100 Subject: [PATCH 060/133] Fix typo in the zarch min/max kernels from patch provided by aarnez in #991 --- kernel/zarch/damax.c | 2 +- kernel/zarch/damin.c | 2 +- kernel/zarch/dmax.c | 2 +- kernel/zarch/dmin.c | 2 +- kernel/zarch/samax.c | 2 +- kernel/zarch/samin.c | 2 +- kernel/zarch/smax.c | 2 +- kernel/zarch/smin.c | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c index a3d63fe53..827467189 100644 --- a/kernel/zarch/damax.c +++ b/kernel/zarch/damax.c @@ -81,7 +81,7 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) "vfmaxdb %%v16,%%v16,%%v17,8 \n\t" - "vfmaxdb %%v0,%%v0,%%16,8 \n\t" + "vfmaxdb %%v0,%%v0,%%v16,8 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c index 738ed8710..821f9eccc 100644 --- a/kernel/zarch/damin.c +++ b/kernel/zarch/damin.c @@ -81,7 +81,7 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) "vfmindb %%v16,%%v16,%%v17,8 \n\t" - "vfmindb %%v0,%%v0,%%16,8 \n\t" + "vfmindb %%v0,%%v0,%%v16,8 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" diff --git a/kernel/zarch/dmax.c b/kernel/zarch/dmax.c index aa8b932f9..5ec54c7bf 100644 --- a/kernel/zarch/dmax.c +++ b/kernel/zarch/dmax.c @@ -74,7 +74,7 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) "vfmaxdb %%v16,%%v16,%%v17,0 \n\t" - "vfmaxdb %%v0,%%v0,%%16,0 \n\t" + "vfmaxdb %%v0,%%v0,%%v16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c index 8ae5fe868..073289186 100644 --- a/kernel/zarch/dmin.c +++ b/kernel/zarch/dmin.c @@ -74,7 +74,7 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) "vfmindb %%v16,%%v16,%%v17,0 \n\t" - "vfmindb %%v0,%%v0,%%16,0 \n\t" + "vfmindb %%v0,%%v0,%%v16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b 
\n\t" diff --git a/kernel/zarch/samax.c b/kernel/zarch/samax.c index c8d831d06..b629d64c0 100644 --- a/kernel/zarch/samax.c +++ b/kernel/zarch/samax.c @@ -81,7 +81,7 @@ static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) "vfmaxsb %%v16,%%v16,%%v17,8 \n\t" - "vfmaxsb %%v0,%%v0,%%16,8 \n\t" + "vfmaxsb %%v0,%%v0,%%v16,8 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c index dd24c74d7..7ce6ee657 100644 --- a/kernel/zarch/samin.c +++ b/kernel/zarch/samin.c @@ -81,7 +81,7 @@ static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) "vfminsb %%v16,%%v16,%%v17,8 \n\t" - "vfminsb %%v0,%%v0,%%16,8 \n\t" + "vfminsb %%v0,%%v0,%%v16,8 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" diff --git a/kernel/zarch/smax.c b/kernel/zarch/smax.c index 8a2b86dc1..e492d739c 100644 --- a/kernel/zarch/smax.c +++ b/kernel/zarch/smax.c @@ -74,7 +74,7 @@ static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x) "vfmaxsb %%v16,%%v16,%%v17,0 \n\t" - "vfmaxsb %%v0,%%v0,%%16,0 \n\t" + "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c index b87ec0fe8..e7d83441b 100644 --- a/kernel/zarch/smin.c +++ b/kernel/zarch/smin.c @@ -74,7 +74,7 @@ static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) "vfminsb %%v16,%%v16,%%v17,0 \n\t" - "vfminsb %%v0,%%v0,%%16,0 \n\t" + "vfminsb %%v0,%%v0,%%v16,0 \n\t" "agfi %%r1, 256 \n\t" "brctg %%r0, 0b \n\t" From 877023e1e194faf5e42e2bb2d0771b52b52fed94 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 21:22:26 +0100 Subject: [PATCH 061/133] Fix precision of zarch DSDOT from patch provided by aarnez in #991 --- kernel/zarch/dsdot.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/zarch/dsdot.c b/kernel/zarch/dsdot.c index 800bb0d51..72950c9f4 100644 --- a/kernel/zarch/dsdot.c +++ b/kernel/zarch/dsdot.c @@ -132,7 +132,7 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) while(i < n) { - dot += y[i] * x[i] ; + dot += (double) y[i] * (double) x[i] ; i++ ; } @@ -146,7 +146,8 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) while(i < n1) { - dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; + dot += (double) y[iy] * (double) x[ix]; + dot += (double) y[iy+inc_y] * (double) x[ix+inc_x]; ix += inc_x*2 ; iy += inc_y*2 ; i+=2 ; @@ -156,7 +157,7 @@ double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) while(i < n) { - dot += y[iy] * x[ix] ; + dot += (double) y[iy] * (double) x[ix] ; ix += inc_x ; iy += inc_y ; i++ ; From cce574c3e0763af7a5017f20fa36959c896fc4fa Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 31 Jan 2019 21:24:55 +0100 Subject: [PATCH 062/133] Improve the z14 SGEMVT kernel from patch provided by aarnez in #991 --- sgemv_t_4.c | 811 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 811 insertions(+) create mode 100644 sgemv_t_4.c diff --git a/sgemv_t_4.c b/sgemv_t_4.c new file mode 100644 index 000000000..a3136723a --- /dev/null +++ b/sgemv_t_4.c @@ -0,0 +1,811 @@ +/*************************************************************************** +Copyright (c) 2017, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + +#define NBMAX 2048 + +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + "pfd 1,1024(%%r1,%4) \n\t" + "pfd 1,1024(%%r1,%5) \n\t" + + "vl %%v16,0(%%r1,%5) \n\t" + "vl %%v17,16(%%r1,%5) \n\t" + "vl %%v18,32(%%r1,%5) \n\t" + "vl %%v19,48(%%r1,%5) \n\t" + "vl %%v20,64(%%r1,%5) \n\t" + "vl %%v21,80(%%r1,%5) \n\t" + "vl %%v22,96(%%r1,%5) \n\t" + "vl %%v23,112(%%r1,%5) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + "vl %%v26,0(%%r1,%3) \n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" + "vl %%v27,0(%%r1,%4) \n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" + + "vl %%v28,16(%%r1,%1) \n\t" + "vfmasb %%v0,%%v17,%%v28,%%v0 \n\t" + "vl %%v29,16(%%r1,%2) \n\t" + "vfmasb %%v1,%%v17,%%v29,%%v1 \n\t" + "vl %%v30,16(%%r1,%3) \n\t" + "vfmasb %%v2,%%v17,%%v30,%%v2 \n\t" + "vl %%v31,16(%%r1,%4) \n\t" + "vfmasb %%v3,%%v17,%%v31,%%v3 \n\t" + + "vl %%v24,32(%%r1,%1) \n\t" + "vfmasb %%v0,%%v18,%%v24,%%v0 \n\t" + "vl %%v25,32(%%r1,%2) \n\t" + "vfmasb %%v1,%%v18,%%v25,%%v1 \n\t" + "vl %%v26,32(%%r1,%3) \n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2 \n\t" + "vl %%v27,32(%%r1,%4) \n\t" + "vfmasb %%v3,%%v18,%%v27,%%v3 \n\t" + + "vl %%v28,48(%%r1,%1) \n\t" + "vfmasb %%v0,%%v19,%%v28,%%v0 \n\t" + "vl %%v29,48(%%r1,%2) \n\t" + "vfmasb %%v1,%%v19,%%v29,%%v1 \n\t" + "vl %%v30,48(%%r1,%3) \n\t" + "vfmasb %%v2,%%v19,%%v30,%%v2 \n\t" + "vl %%v31,48(%%r1,%4) \n\t" + "vfmasb %%v3,%%v19,%%v31,%%v3 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" + "vl %%v26,64(%%r1,%3) \n\t" + "vfmasb %%v2,%%v20,%%v26,%%v2 \n\t" + "vl %%v27,64(%%r1,%4) \n\t" + "vfmasb %%v3,%%v20,%%v27,%%v3 \n\t" + + "vl %%v28,80(%%r1,%1) \n\t" + "vfmasb %%v0,%%v21,%%v28,%%v0 \n\t" + "vl %%v29,80(%%r1,%2) \n\t" + "vfmasb %%v1,%%v21,%%v29,%%v1 \n\t" + "vl %%v30,80(%%r1,%3) \n\t" + "vfmasb 
%%v2,%%v21,%%v30,%%v2 \n\t" + "vl %%v31,80(%%r1,%4) \n\t" + "vfmasb %%v3,%%v21,%%v31,%%v3 \n\t" + + "vl %%v24,96(%%r1,%1) \n\t" + "vfmasb %%v0,%%v22,%%v24,%%v0 \n\t" + "vl %%v25,96(%%r1,%2) \n\t" + "vfmasb %%v1,%%v22,%%v25,%%v1 \n\t" + "vl %%v26,96(%%r1,%3) \n\t" + "vfmasb %%v2,%%v22,%%v26,%%v2 \n\t" + "vl %%v27,96(%%r1,%4) \n\t" + "vfmasb %%v3,%%v22,%%v27,%%v3 \n\t" + + "vl %%v28,112(%%r1,%1) \n\t" + "vfmasb %%v0,%%v23,%%v28,%%v0 \n\t" + "vl %%v29,112(%%r1,%2) \n\t" + "vfmasb %%v1,%%v23,%%v29,%%v1 \n\t" + "vl %%v30,112(%%r1,%3) \n\t" + "vfmasb %%v2,%%v23,%%v30,%%v2 \n\t" + "vl %%v31,112(%%r1,%4) \n\t" + "vfmasb %%v3,%%v23,%%v31,%%v3 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%5) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + "vl %%v26,0(%%r1,%3) \n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" + "vl %%v27,0(%%r1,%4) \n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "veslg %%v4,%%v0,32 \n\t" + "vfasb %%v0,%%v0,%%v4 \n\t" + "vrepg %%v4,%%v0,1 \n\t" + "aebr %%f0,%%f4 \n\t" + "ste %%f0,0(%6) \n\t" + "veslg %%v4,%%v1,32 \n\t" + "vfasb %%v1,%%v1,%%v4 \n\t" + "vrepg %%v4,%%v1,1 \n\t" + "aebr %%f1,%%f4 \n\t" + "ste %%f1,4(%6) \n\t" + "veslg %%v4,%%v2,32 \n\t" + "vfasb %%v2,%%v2,%%v4 \n\t" + "vrepg %%v4,%%v2,1 \n\t" + "aebr %%f2,%%f4 \n\t" + "ste %%f2,8(%6) \n\t" + "veslg %%v4,%%v3,32 \n\t" + "vfasb %%v3,%%v3,%%v4 \n\t" + "vrepg %%v4,%%v3,1 \n\t" + "aebr %%f3,%%f4 \n\t" + "ste %%f3,12(%6) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y) + :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 1,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%3) \n\t" + "vl %%v17,16(%%r1,%3) \n\t" + "vl %%v18,32(%%r1,%3) \n\t" + "vl %%v19,48(%%r1,%3) \n\t" + "vl %%v20,64(%%r1,%3) \n\t" + "vl %%v21,80(%%r1,%3) \n\t" + "vl %%v22,96(%%r1,%3) \n\t" + "vl %%v23,112(%%r1,%3) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + + "vl %%v26,16(%%r1,%1) \n\t" + "vfmasb %%v0,%%v17,%%v26,%%v0 \n\t" + "vl %%v27,16(%%r1,%2) \n\t" + "vfmasb %%v1,%%v17,%%v27,%%v1 \n\t" + + "vl %%v28,32(%%r1,%1) \n\t" + "vfmasb %%v0,%%v18,%%v28,%%v0 \n\t" + "vl %%v29,32(%%r1,%2) \n\t" + "vfmasb %%v1,%%v18,%%v29,%%v1 \n\t" + + "vl %%v30,48(%%r1,%1) \n\t" + "vfmasb %%v0,%%v19,%%v30,%%v0 \n\t" + "vl %%v31,48(%%r1,%2) \n\t" + "vfmasb %%v1,%%v19,%%v31,%%v1 \n\t" + + "vl %%v24,64(%%r1,%1) \n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" + "vl %%v25,64(%%r1,%2) \n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" + + "vl %%v26,80(%%r1,%1) \n\t" + "vfmasb %%v0,%%v21,%%v26,%%v0 \n\t" + "vl %%v27,80(%%r1,%2) \n\t" + "vfmasb %%v1,%%v21,%%v27,%%v1 \n\t" + + "vl %%v28,96(%%r1,%1) \n\t" + "vfmasb 
%%v0,%%v22,%%v28,%%v0 \n\t" + "vl %%v29,96(%%r1,%2) \n\t" + "vfmasb %%v1,%%v22,%%v29,%%v1 \n\t" + + "vl %%v30,112(%%r1,%1) \n\t" + "vfmasb %%v0,%%v23,%%v30,%%v0 \n\t" + "vl %%v31,112(%%r1,%2) \n\t" + "vfmasb %%v1,%%v23,%%v31,%%v1 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%3) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + "vl %%v25,0(%%r1,%2) \n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "veslg %%v2,%%v0,32 \n\t" + "vfasb %%v0,%%v0,%%v2 \n\t" + "vrepg %%v2,%%v0,1 \n\t" + "aebr %%f0,%%f2 \n\t" + "ste %%f0,0(%4) \n\t" + "veslg %%v2,%%v1,32 \n\t" + "vfasb %%v1,%%v1,%%v2 \n\t" + "vrepg %%v2,%%v1,1 \n\t" + "aebr %%f1,%%f2 \n\t" + "ste %%f1,4(%4) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y) + :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) +{ + __asm__ volatile ( + "vzero %%v0 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%1) \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl %%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + + "vl %%v25,16(%%r1,%1) \n\t" + "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t" + + "vl %%v26,32(%%r1,%1) \n\t" + "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t" + + "vl %%v27,48(%%r1,%1) \n\t" + "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t" + + "vl %%v28,64(%%r1,%1) \n\t" + "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t" + + "vl %%v29,80(%%r1,%1) \n\t" + "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t" + + "vl %%v30,96(%%r1,%1) \n\t" + "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t" + + "vl %%v31,112(%%r1,%1) \n\t" + "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + + "vl %%v24,0(%%r1,%1) \n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "veslg %%v1,%%v0,32 \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vrepg %%v1,%%v0,1 \n\t" + "aebr %%f0,%%f1 \n\t" + "ste %%f0,0(%3) " + : + :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y) + :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for (i = 0; i < n; i++) + { + dest[i] = *src; + src += inc_src; + } +} + +static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) +{ + __asm__ volatile ( + "vlrepf %%v0,%1 \n\t" + "xgr %%r1,%%r1 \n\t" + + "lghi %%r0,-32 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 1f \n\t" + + "srlg %%r0,%%r0,5 \n\t" + "0: \n\t" + "pfd 1,1024(%%r1,%2) \n\t" + "pfd 2,1024(%%r1,%3) \n\t" + + "vl %%v16,0(%%r1,%2) \n\t" + "vl 
%%v17,16(%%r1,%2) \n\t" + "vl %%v18,32(%%r1,%2) \n\t" + "vl %%v19,48(%%r1,%2) \n\t" + "vl %%v20,64(%%r1,%2) \n\t" + "vl %%v21,80(%%r1,%2) \n\t" + "vl %%v22,96(%%r1,%2) \n\t" + "vl %%v23,112(%%r1,%2) \n\t" + + "vl %%v24, 0(%%r1,%3) \n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" + "vst %%v24, 0(%%r1,%3) \n\t" + "vl %%v25, 16(%%r1,%3) \n\t" + "vfmasb %%v25,%%v17,%%v0,%%v25 \n\t" + "vst %%v25, 16(%%r1,%3) \n\t" + "vl %%v26, 32(%%r1,%3) \n\t" + "vfmasb %%v26,%%v18,%%v0,%%v26 \n\t" + "vst %%v26, 32(%%r1,%3) \n\t" + "vl %%v27, 48(%%r1,%3) \n\t" + "vfmasb %%v27,%%v19,%%v0,%%v27 \n\t" + "vst %%v27, 48(%%r1,%3) \n\t" + "vl %%v28, 64(%%r1,%3) \n\t" + "vfmasb %%v28,%%v20,%%v0,%%v28 \n\t" + "vst %%v28, 64(%%r1,%3) \n\t" + "vl %%v29, 80(%%r1,%3) \n\t" + "vfmasb %%v29,%%v21,%%v0,%%v29 \n\t" + "vst %%v29, 80(%%r1,%3) \n\t" + "vl %%v30, 96(%%r1,%3) \n\t" + "vfmasb %%v30,%%v22,%%v0,%%v30 \n\t" + "vst %%v30, 96(%%r1,%3) \n\t" + "vl %%v31, 112(%%r1,%3) \n\t" + "vfmasb %%v31,%%v23,%%v0,%%v31 \n\t" + "vst %%v31, 112(%%r1,%3) \n\t" + + "agfi %%r1,128 \n\t" + "brctg %%r0,0b \n\t" + + "1: \n\t" + "lghi %%r0,28 \n\t" + "ngr %%r0,%0 \n\t" + "ltgr %%r0,%%r0 \n\t" + "jz 3f \n\t" + + "srlg %%r0,%%r0,2 \n\t" + "2: \n\t" + "vl %%v16,0(%%r1,%2) \n\t" + + "vl %%v24, 0(%%r1,%3) \n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" + "vst %%v24, 0(%%r1,%3) \n\t" + + "agfi %%r1,16 \n\t" + "brctg %%r0,2b \n\t" + + "3: \n\t" + "nop " + : + :"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest) + :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); +} +static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + if (inc_dest == 1) + add_y_kernel_4(n, da, src, dest); + else + { + BLASLONG i; + for (i = 0; i < n; i++) + { + *dest += src[i] * da; + dest += inc_dest; + } + } +} + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG register i; + BLASLONG register j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + BLASLONG n0; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + FLOAT ybuffer[2] __attribute__ ((aligned(16))); + FLOAT *xbuffer; + FLOAT *ytemp; + + if ( m < 1 ) return(0); + if ( n < 1 ) return(0); + + xbuffer = buffer; + ytemp = buffer + (m < NBMAX ? 
m : NBMAX); + + n0 = n / NBMAX; + n1 = (n % NBMAX) >> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if ( inc_x == 1 ) + xbuffer = x_ptr; + else + copy_x(NB,x_ptr,xbuffer,inc_x); + + + FLOAT *ap[4]; + FLOAT *yp; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( n0 > 0 ) + { + BLASLONG nb1 = NBMAX / 4; + for( j=0; j 0 ) + { + add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4 ; + } + + if ( n2 & 2 ) + { + + sgemv_kernel_4x2(NB,ap,xbuffer,ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; + + } + + if ( n2 & 1 ) + { + + sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); + // a_ptr += lda; + *y_ptr += ybuffer[0] * alpha; + // y_ptr += inc_y; + + } + a += NB; + x += NB * inc_x; + } + + if ( m3 == 0 ) return(0); + + x_ptr = x; + a_ptr = a; + if ( m3 == 3 ) + { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if ( lda == 3 && inc_y == 1 ) + { + + for ( j=0; j< ( n & -4) ; j+=4 ) + { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for ( ; j Date: Fri, 1 Feb 2019 12:57:01 +0100 Subject: [PATCH 063/133] Delete misplaced file sgemv_t_4.c from #1993 , file should have gone into kernel/zarch --- sgemv_t_4.c | 811 ---------------------------------------------------- 1 file changed, 811 deletions(-) delete mode 100644 sgemv_t_4.c diff --git a/sgemv_t_4.c b/sgemv_t_4.c deleted file mode 100644 index a3136723a..000000000 --- a/sgemv_t_4.c +++ /dev/null @@ -1,811 +0,0 @@ -/*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#define NBMAX 2048 - -static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 1,1024(%%r1,%5) \n\t" - - "vl %%v16,0(%%r1,%5) \n\t" - "vl %%v17,16(%%r1,%5) \n\t" - "vl %%v18,32(%%r1,%5) \n\t" - "vl %%v19,48(%%r1,%5) \n\t" - "vl %%v20,64(%%r1,%5) \n\t" - "vl %%v21,80(%%r1,%5) \n\t" - "vl %%v22,96(%%r1,%5) \n\t" - "vl %%v23,112(%%r1,%5) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - "vl %%v26,0(%%r1,%3) \n\t" - "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" - "vl %%v27,0(%%r1,%4) \n\t" - "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" - - "vl %%v28,16(%%r1,%1) \n\t" - "vfmasb %%v0,%%v17,%%v28,%%v0 \n\t" - "vl %%v29,16(%%r1,%2) \n\t" - "vfmasb %%v1,%%v17,%%v29,%%v1 \n\t" - "vl %%v30,16(%%r1,%3) \n\t" - "vfmasb %%v2,%%v17,%%v30,%%v2 \n\t" - "vl %%v31,16(%%r1,%4) \n\t" - "vfmasb %%v3,%%v17,%%v31,%%v3 \n\t" - - "vl %%v24,32(%%r1,%1) \n\t" - "vfmasb %%v0,%%v18,%%v24,%%v0 \n\t" - "vl %%v25,32(%%r1,%2) \n\t" - "vfmasb %%v1,%%v18,%%v25,%%v1 \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vfmasb %%v2,%%v18,%%v26,%%v2 \n\t" - "vl %%v27,32(%%r1,%4) \n\t" - "vfmasb %%v3,%%v18,%%v27,%%v3 \n\t" - - "vl %%v28,48(%%r1,%1) \n\t" - "vfmasb %%v0,%%v19,%%v28,%%v0 \n\t" - "vl %%v29,48(%%r1,%2) \n\t" - "vfmasb %%v1,%%v19,%%v29,%%v1 \n\t" - "vl %%v30,48(%%r1,%3) \n\t" - "vfmasb %%v2,%%v19,%%v30,%%v2 \n\t" - "vl %%v31,48(%%r1,%4) \n\t" - "vfmasb %%v3,%%v19,%%v31,%%v3 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" - "vl %%v26,64(%%r1,%3) \n\t" - "vfmasb %%v2,%%v20,%%v26,%%v2 \n\t" - "vl %%v27,64(%%r1,%4) \n\t" - "vfmasb %%v3,%%v20,%%v27,%%v3 \n\t" - - "vl %%v28,80(%%r1,%1) \n\t" - "vfmasb %%v0,%%v21,%%v28,%%v0 \n\t" - "vl %%v29,80(%%r1,%2) \n\t" - "vfmasb %%v1,%%v21,%%v29,%%v1 \n\t" - "vl %%v30,80(%%r1,%3) \n\t" - "vfmasb %%v2,%%v21,%%v30,%%v2 \n\t" - "vl %%v31,80(%%r1,%4) \n\t" - "vfmasb %%v3,%%v21,%%v31,%%v3 \n\t" - - "vl %%v24,96(%%r1,%1) \n\t" - "vfmasb %%v0,%%v22,%%v24,%%v0 \n\t" - "vl %%v25,96(%%r1,%2) \n\t" - "vfmasb %%v1,%%v22,%%v25,%%v1 \n\t" - "vl %%v26,96(%%r1,%3) \n\t" - "vfmasb %%v2,%%v22,%%v26,%%v2 \n\t" - "vl %%v27,96(%%r1,%4) \n\t" - "vfmasb %%v3,%%v22,%%v27,%%v3 \n\t" - - "vl %%v28,112(%%r1,%1) \n\t" - "vfmasb %%v0,%%v23,%%v28,%%v0 \n\t" - "vl %%v29,112(%%r1,%2) \n\t" - "vfmasb %%v1,%%v23,%%v29,%%v1 \n\t" - "vl %%v30,112(%%r1,%3) \n\t" - "vfmasb %%v2,%%v23,%%v30,%%v2 \n\t" - "vl %%v31,112(%%r1,%4) \n\t" - "vfmasb %%v3,%%v23,%%v31,%%v3 \n\t" - 
- "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%5) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - "vl %%v26,0(%%r1,%3) \n\t" - "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" - "vl %%v27,0(%%r1,%4) \n\t" - "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "veslg %%v4,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v4 \n\t" - "vrepg %%v4,%%v0,1 \n\t" - "aebr %%f0,%%f4 \n\t" - "ste %%f0,0(%6) \n\t" - "veslg %%v4,%%v1,32 \n\t" - "vfasb %%v1,%%v1,%%v4 \n\t" - "vrepg %%v4,%%v1,1 \n\t" - "aebr %%f1,%%f4 \n\t" - "ste %%f1,4(%6) \n\t" - "veslg %%v4,%%v2,32 \n\t" - "vfasb %%v2,%%v2,%%v4 \n\t" - "vrepg %%v4,%%v2,1 \n\t" - "aebr %%f2,%%f4 \n\t" - "ste %%f2,8(%6) \n\t" - "veslg %%v4,%%v3,32 \n\t" - "vfasb %%v3,%%v3,%%v4 \n\t" - "vrepg %%v4,%%v3,1 \n\t" - "aebr %%f3,%%f4 \n\t" - "ste %%f3,12(%6) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); -} - -static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - - "vl %%v26,16(%%r1,%1) \n\t" - "vfmasb %%v0,%%v17,%%v26,%%v0 \n\t" - "vl %%v27,16(%%r1,%2) \n\t" - "vfmasb %%v1,%%v17,%%v27,%%v1 \n\t" - - "vl %%v28,32(%%r1,%1) \n\t" - "vfmasb %%v0,%%v18,%%v28,%%v0 \n\t" - "vl %%v29,32(%%r1,%2) \n\t" - "vfmasb %%v1,%%v18,%%v29,%%v1 \n\t" - - "vl %%v30,48(%%r1,%1) \n\t" - "vfmasb %%v0,%%v19,%%v30,%%v0 \n\t" - "vl %%v31,48(%%r1,%2) \n\t" - "vfmasb %%v1,%%v19,%%v31,%%v1 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" - - "vl %%v26,80(%%r1,%1) \n\t" - "vfmasb %%v0,%%v21,%%v26,%%v0 \n\t" - "vl %%v27,80(%%r1,%2) \n\t" - "vfmasb %%v1,%%v21,%%v27,%%v1 \n\t" - - "vl %%v28,96(%%r1,%1) \n\t" - "vfmasb %%v0,%%v22,%%v28,%%v0 \n\t" - "vl %%v29,96(%%r1,%2) \n\t" - "vfmasb %%v1,%%v22,%%v29,%%v1 \n\t" - - "vl %%v30,112(%%r1,%1) \n\t" - "vfmasb %%v0,%%v23,%%v30,%%v0 \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - "vfmasb %%v1,%%v23,%%v31,%%v1 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%3) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - 
"veslg %%v2,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v2 \n\t" - "vrepg %%v2,%%v0,1 \n\t" - "aebr %%f0,%%f2 \n\t" - "ste %%f0,0(%4) \n\t" - "veslg %%v2,%%v1,32 \n\t" - "vfasb %%v1,%%v1,%%v2 \n\t" - "vrepg %%v2,%%v1,1 \n\t" - "aebr %%f1,%%f2 \n\t" - "ste %%f1,4(%4) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); -} - -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - - "vl %%v25,16(%%r1,%1) \n\t" - "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t" - - "vl %%v26,32(%%r1,%1) \n\t" - "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t" - - "vl %%v27,48(%%r1,%1) \n\t" - "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t" - - "vl %%v28,64(%%r1,%1) \n\t" - "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t" - - "vl %%v29,80(%%r1,%1) \n\t" - "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t" - - "vl %%v30,96(%%r1,%1) \n\t" - "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t" - - "vl %%v31,112(%%r1,%1) \n\t" - "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "veslg %%v1,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "aebr %%f0,%%f1 \n\t" - "ste %%f0,0(%3) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); -} - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) -{ - BLASLONG i; - for (i = 0; i < n; i++) - { - dest[i] = *src; - src += inc_src; - } -} - -static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) -{ - __asm__ volatile ( - "vlrepf %%v0,%1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24, 0(%%r1,%3) \n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" - "vst %%v24, 0(%%r1,%3) \n\t" - "vl %%v25, 16(%%r1,%3) \n\t" - "vfmasb %%v25,%%v17,%%v0,%%v25 \n\t" - "vst %%v25, 16(%%r1,%3) \n\t" - "vl %%v26, 32(%%r1,%3) \n\t" - "vfmasb %%v26,%%v18,%%v0,%%v26 \n\t" - "vst %%v26, 32(%%r1,%3) \n\t" - "vl %%v27, 48(%%r1,%3) \n\t" - "vfmasb %%v27,%%v19,%%v0,%%v27 \n\t" - "vst %%v27, 48(%%r1,%3) \n\t" - "vl %%v28, 64(%%r1,%3) \n\t" - 
"vfmasb %%v28,%%v20,%%v0,%%v28 \n\t" - "vst %%v28, 64(%%r1,%3) \n\t" - "vl %%v29, 80(%%r1,%3) \n\t" - "vfmasb %%v29,%%v21,%%v0,%%v29 \n\t" - "vst %%v29, 80(%%r1,%3) \n\t" - "vl %%v30, 96(%%r1,%3) \n\t" - "vfmasb %%v30,%%v22,%%v0,%%v30 \n\t" - "vst %%v30, 96(%%r1,%3) \n\t" - "vl %%v31, 112(%%r1,%3) \n\t" - "vfmasb %%v31,%%v23,%%v0,%%v31 \n\t" - "vst %%v31, 112(%%r1,%3) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%2) \n\t" - - "vl %%v24, 0(%%r1,%3) \n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" - "vst %%v24, 0(%%r1,%3) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); -} -static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - if (inc_dest == 1) - add_y_kernel_4(n, da, src, dest); - else - { - BLASLONG i; - for (i = 0; i < n; i++) - { - *dest += src[i] * da; - dest += inc_dest; - } - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) -{ - BLASLONG register i; - BLASLONG register j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - BLASLONG n0; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - FLOAT ybuffer[2] __attribute__ ((aligned(16))); - FLOAT *xbuffer; - FLOAT *ytemp; - - if ( m < 1 ) return(0); - if ( n < 1 ) return(0); - - xbuffer = buffer; - ytemp = buffer + (m < NBMAX ? m : NBMAX); - - n0 = n / NBMAX; - n1 = (n % NBMAX) >> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if ( inc_x == 1 ) - xbuffer = x_ptr; - else - copy_x(NB,x_ptr,xbuffer,inc_x); - - - FLOAT *ap[4]; - FLOAT *yp; - BLASLONG register lda4 = 4 * lda; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( n0 > 0 ) - { - BLASLONG nb1 = NBMAX / 4; - for( j=0; j 0 ) - { - add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); - y_ptr += n1 * inc_y * 4; - a_ptr += n1 * lda4 ; - } - - if ( n2 & 2 ) - { - - sgemv_kernel_4x2(NB,ap,xbuffer,ybuffer); - a_ptr += lda * 2; - *y_ptr += ybuffer[0] * alpha; - y_ptr += inc_y; - *y_ptr += ybuffer[1] * alpha; - y_ptr += inc_y; - - } - - if ( n2 & 1 ) - { - - sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); - // a_ptr += lda; - *y_ptr += ybuffer[0] * alpha; - // y_ptr += inc_y; - - } - a += NB; - x += NB * inc_x; - } - - if ( m3 == 0 ) return(0); - - x_ptr = x; - a_ptr = a; - if ( m3 == 3 ) - { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if ( lda == 3 && inc_y == 1 ) - { - - for ( j=0; j< ( n & -4) ; j+=4 ) - { - - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; - y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; - y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; - aj += 12; - } - - for ( ; j Date: Fri, 1 Feb 2019 12:58:59 +0100 Subject: [PATCH 064/133] Fix incorrect 
sgemv results for IBM z14 part of PR #1993 that was inadvertently misplaced into the toplevel directory --- kernel/zarch/sgemv_t_4.c | 60 +++++++++++++++------------------------- 1 file changed, 22 insertions(+), 38 deletions(-) diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c index 5515d7bb7..a3136723a 100644 --- a/kernel/zarch/sgemv_t_4.c +++ b/kernel/zarch/sgemv_t_4.c @@ -158,32 +158,24 @@ static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "brctg %%r0,2b \n\t" "3: \n\t" - "vrepf %%v4,%%v0,1 \n\t" - "aebr %%f0,%%f4 \n\t" - "vrepf %%v4,%%v0,2 \n\t" - "aebr %%f0,%%f4 \n\t" - "vrepf %%v4,%%v0,3 \n\t" + "veslg %%v4,%%v0,32 \n\t" + "vfasb %%v0,%%v0,%%v4 \n\t" + "vrepg %%v4,%%v0,1 \n\t" "aebr %%f0,%%f4 \n\t" "ste %%f0,0(%6) \n\t" - "vrepf %%v4,%%v1,1 \n\t" - "aebr %%f1,%%f4 \n\t" - "vrepf %%v4,%%v1,2 \n\t" - "aebr %%f1,%%f4 \n\t" - "vrepf %%v4,%%v1,3 \n\t" + "veslg %%v4,%%v1,32 \n\t" + "vfasb %%v1,%%v1,%%v4 \n\t" + "vrepg %%v4,%%v1,1 \n\t" "aebr %%f1,%%f4 \n\t" "ste %%f1,4(%6) \n\t" - "vrepf %%v4,%%v2,1 \n\t" - "aebr %%f2,%%f4 \n\t" - "vrepf %%v4,%%v2,2 \n\t" - "aebr %%f2,%%f4 \n\t" - "vrepf %%v4,%%v2,3 \n\t" + "veslg %%v4,%%v2,32 \n\t" + "vfasb %%v2,%%v2,%%v4 \n\t" + "vrepg %%v4,%%v2,1 \n\t" "aebr %%f2,%%f4 \n\t" "ste %%f2,8(%6) \n\t" - "vrepf %%v4,%%v3,1 \n\t" - "aebr %%f3,%%f4 \n\t" - "vrepf %%v4,%%v3,2 \n\t" - "aebr %%f3,%%f4 \n\t" - "vrepf %%v4,%%v3,3 \n\t" + "veslg %%v4,%%v3,32 \n\t" + "vfasb %%v3,%%v3,%%v4 \n\t" + "vrepg %%v4,%%v3,1 \n\t" "aebr %%f3,%%f4 \n\t" "ste %%f3,12(%6) " : @@ -281,18 +273,14 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "brctg %%r0,2b \n\t" "3: \n\t" - "vrepf %%v2,%%v0,1 \n\t" - "aebr %%f0,%%f2 \n\t" - "vrepf %%v2,%%v0,2 \n\t" - "aebr %%f0,%%f2 \n\t" - "vrepf %%v2,%%v0,3 \n\t" + "veslg %%v2,%%v0,32 \n\t" + "vfasb %%v0,%%v0,%%v2 \n\t" + "vrepg %%v2,%%v0,1 \n\t" "aebr %%f0,%%f2 \n\t" "ste %%f0,0(%4) \n\t" - "vrepf %%v2,%%v1,1 \n\t" - "aebr %%f1,%%f2 \n\t" - "vrepf %%v2,%%v1,2 \n\t" - "aebr %%f1,%%f2 \n\t" - "vrepf %%v2,%%v1,3 \n\t" + "veslg %%v2,%%v1,32 \n\t" + "vfasb %%v1,%%v1,%%v2 \n\t" + "vrepg %%v2,%%v1,1 \n\t" "aebr %%f1,%%f2 \n\t" "ste %%f1,4(%4) " : @@ -349,7 +337,7 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) "vl %%v31,112(%%r1,%1) \n\t" "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" - + "agfi %%r1,128 \n\t" "brctg %%r0,0b \n\t" @@ -370,11 +358,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) "brctg %%r0,2b \n\t" "3: \n\t" - "vrepf %%v1,%%v0,1 \n\t" - "aebr %%f0,%%f1 \n\t" - "vrepf %%v1,%%v0,2 \n\t" - "aebr %%f0,%%f1 \n\t" - "vrepf %%v1,%%v0,3 \n\t" + "veslg %%v1,%%v0,32 \n\t" + "vfasb %%v0,%%v0,%%v1 \n\t" + "vrepg %%v1,%%v0,1 \n\t" "aebr %%f0,%%f1 \n\t" "ste %%f0,0(%3) " : @@ -823,5 +809,3 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO return(0); } - - From 81daf6bc380c22bcc7ce228952e5435bc79bb0ce Mon Sep 17 00:00:00 2001 From: maamountki Date: Tue, 5 Feb 2019 07:30:38 +0200 Subject: [PATCH 065/133] [ZARCH] Format source code, Fix constraints --- kernel/zarch/camax.c | 370 +++++----- kernel/zarch/camin.c | 370 +++++----- kernel/zarch/casum.c | 236 +++---- kernel/zarch/caxpy.c | 232 +++---- kernel/zarch/ccopy.c | 102 ++- kernel/zarch/cdot.c | 254 ++++--- kernel/zarch/cgemv_n_4.c | 1263 +++++++++++++++++----------------- kernel/zarch/cgemv_t_4.c | 1179 ++++++++++++++++---------------- kernel/zarch/crot.c | 413 ++++++----- kernel/zarch/cscal.c | 684 +++++++++---------- kernel/zarch/cswap.c | 263 ++++--- 
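[Note on the corrected sgemv_t_4.c epilogue above: the four packed partial sums are now reduced pairwise — veslg shifts each doubleword left by 32 bits so that vfasb adds neighbouring floats in one step, and vrepg/aebr then combine the two doubleword results. A C model of that reduction, illustrative only, not the kernel itself:

/* C model of the veslg/vfasb/vrepg/aebr sequence: horizontal sum of the
 * four packed float partial sums, reduced pairwise instead of serially. */
static float hsum4(const float v[4])
{
    float s01 = v[0] + v[1];  /* veslg by 32 + vfasb, doubleword 0 */
    float s23 = v[2] + v[3];  /* same step handles doubleword 1    */
    return s01 + s23;         /* vrepg + aebr                      */
}
]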
kernel/zarch/damax.c | 220 +++--- kernel/zarch/damax_z13.c | 292 ++++---- kernel/zarch/damin.c | 220 +++--- kernel/zarch/damin_z13.c | 292 ++++---- kernel/zarch/dasum.c | 248 ++++--- kernel/zarch/daxpy.c | 253 ++++--- kernel/zarch/dcopy.c | 76 +-- kernel/zarch/ddot.c | 196 +++--- kernel/zarch/dgemv_n_4.c | 1200 +++++++++++++++----------------- kernel/zarch/dgemv_t_4.c | 1397 ++++++++++++++++++-------------------- kernel/zarch/dmax.c | 214 +++--- kernel/zarch/dmax_z13.c | 252 ++++--- kernel/zarch/dmin.c | 214 +++--- kernel/zarch/dmin_z13.c | 252 ++++--- kernel/zarch/drot.c | 381 +++++------ kernel/zarch/dscal.c | 278 ++++---- kernel/zarch/dsdot.c | 246 +++---- kernel/zarch/dswap.c | 228 +++---- kernel/zarch/icamax.c | 515 +++++++------- kernel/zarch/icamin.c | 515 +++++++------- kernel/zarch/idamax.c | 411 ++++++----- kernel/zarch/idamin.c | 411 ++++++----- kernel/zarch/idmax.c | 385 +++++------ kernel/zarch/idmin.c | 385 +++++------ kernel/zarch/isamax.c | 496 +++++++------- kernel/zarch/isamin.c | 496 +++++++------- kernel/zarch/ismax.c | 458 ++++++------- kernel/zarch/ismin.c | 458 ++++++------- kernel/zarch/izamax.c | 409 ++++++----- kernel/zarch/izamin.c | 409 ++++++----- kernel/zarch/samax.c | 225 +++--- kernel/zarch/samin.c | 225 +++--- kernel/zarch/sasum.c | 252 ++++--- kernel/zarch/saxpy.c | 253 ++++--- kernel/zarch/scopy.c | 76 +-- kernel/zarch/sdot.c | 188 ++--- kernel/zarch/sgemv_n_4.c | 1157 +++++++++++++++---------------- kernel/zarch/sgemv_t_4.c | 1380 ++++++++++++++++++------------------- kernel/zarch/smax.c | 219 +++--- kernel/zarch/smin.c | 219 +++--- kernel/zarch/srot.c | 381 +++++------ kernel/zarch/sscal.c | 268 ++++---- kernel/zarch/sswap.c | 230 +++---- kernel/zarch/zamax.c | 333 +++++---- kernel/zarch/zamax_z13.c | 352 +++++----- kernel/zarch/zamin.c | 317 ++++----- kernel/zarch/zamin_z13.c | 336 +++++---- kernel/zarch/zasum.c | 232 +++---- kernel/zarch/zaxpy.c | 232 +++---- kernel/zarch/zcopy.c | 102 ++- kernel/zarch/zdot.c | 246 ++++--- kernel/zarch/zgemv_n_4.c | 1147 +++++++++++++++---------------- kernel/zarch/zgemv_t_4.c | 1099 +++++++++++++++--------------- kernel/zarch/zrot.c | 413 ++++++----- kernel/zarch/zscal.c | 676 +++++++++--------- kernel/zarch/zswap.c | 263 ++++--- 67 files changed, 13393 insertions(+), 14601 deletions(-) diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index 2e9648640..40a9903e9 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,214 +28,188 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT amax; - - __asm__ volatile ( - "vlef %%v0,0(%2),0 \n\t" - "vlef %%v16,4(%2),0 \n\t" - "vlef %%v0,8(%2),1 \n\t" - "vlef %%v16,12(%2),1 \n\t" - "vlef %%v0,16(%2),2 \n\t" - "vlef %%v16,20(%2),2 \n\t" - "vlef %%v0,24(%2),3 \n\t" - "vlef %%v16,28(%2),3 \n\t" - "vflpsb %%v0,%%v0 \n\t" - "vflpsb %%v16,%%v16 \n\t" - "vfasb %%v0,%%v0,%%v16 \n\t" - "vleib %%v1,0,0 \n\t" - "vleib %%v1,1,1 \n\t" - "vleib %%v1,2,2 \n\t" - "vleib %%v1,3,3 \n\t" - "vleib %%v1,8,4 \n\t" - "vleib %%v1,9,5 \n\t" - "vleib %%v1,10,6 \n\t" - "vleib %%v1,11,7 \n\t" - "vleib %%v1,16,8 \n\t" - "vleib %%v1,17,9 \n\t" - "vleib %%v1,18,10 \n\t" - "vleib %%v1,19,11 \n\t" - "vleib %%v1,24,12 \n\t" - "vleib %%v1,25,13 \n\t" - "vleib %%v1,26,14 \n\t" - "vleib %%v1,27,15 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v2,16(%%r1,%2) \n\t" - "vpkg %%v17,%%v16,%%v2 \n\t" - "vperm %%v16,%%v16,%%v2,%%v1 \n\t" - - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v2,48(%%r1,%2) \n\t" - "vpkg %%v19,%%v18,%%v2 \n\t" - "vperm %%v18,%%v18,%%v2,%%v1 \n\t" - - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v2,80(%%r1,%2) \n\t" - "vpkg %%v21,%%v20,%%v2 \n\t" - "vperm %%v20,%%v20,%%v2,%%v1 \n\t" - - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v2,112(%%r1,%2) \n\t" - "vpkg %%v23,%%v22,%%v2 \n\t" - "vperm %%v22,%%v22,%%v2,%%v1 \n\t" - - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v2,144(%%r1,%2) \n\t" - "vpkg %%v25,%%v24,%%v2 \n\t" - "vperm %%v24,%%v24,%%v2,%%v1 \n\t" - - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v2,176(%%r1,%2) \n\t" - "vpkg %%v27,%%v26,%%v2 \n\t" - "vperm %%v26,%%v26,%%v2,%%v1 \n\t" - - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v2,208(%%r1,%2) \n\t" - "vpkg %%v29,%%v28,%%v2 \n\t" - "vperm %%v28,%%v28,%%v2,%%v1 \n\t" - - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v2,240(%%r1,%2) \n\t" - "vpkg %%v31,%%v30,%%v2 \n\t" - "vperm %%v30,%%v30,%%v2,%%v1 \n\t" - - "vflpsb %%v16,%%v16 \n\t" - "vflpsb %%v17,%%v17 \n\t" - "vflpsb %%v18,%%v18 \n\t" - "vflpsb %%v19,%%v19 \n\t" - "vflpsb %%v20,%%v20 \n\t" - "vflpsb %%v21,%%v21 \n\t" - "vflpsb %%v22,%%v22 \n\t" - "vflpsb %%v23,%%v23 \n\t" - "vflpsb %%v24,%%v24 \n\t" - "vflpsb %%v25,%%v25 \n\t" - "vflpsb %%v26,%%v26 \n\t" - "vflpsb %%v27,%%v27 \n\t" - "vflpsb %%v28,%%v28 \n\t" - "vflpsb %%v29,%%v29 \n\t" - "vflpsb %%v30,%%v30 \n\t" - "vflpsb %%v31,%%v31 \n\t" - - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v18,%%v18,%%v19 \n\t" - "vfasb %%v20,%%v20,%%v21 \n\t" - "vfasb %%v22,%%v22,%%v23 \n\t" - "vfasb %%v24,%%v24,%%v25 \n\t" - "vfasb %%v26,%%v26,%%v27 \n\t" - "vfasb %%v28,%%v28,%%v29 \n\t" - "vfasb %%v30,%%v30,%%v31 \n\t" - - "vfmaxsb %%v16,%%v16,%%v24,0 \n\t" - "vfmaxsb %%v18,%%v18,%%v26,0 \n\t" - "vfmaxsb %%v20,%%v20,%%v28,0 \n\t" - "vfmaxsb %%v22,%%v22,%%v30,0 \n\t" - - "vfmaxsb %%v16,%%v16,%%v20,0 \n\t" - "vfmaxsb %%v18,%%v18,%%v22,0 \n\t" - - "vfmaxsb %%v16,%%v16,%%v18,0 \n\t" - - "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v16,%%v0,32 \n\t" - "vfmaxsb %%v0,%%v0,%%v16,0 \n\t" - - "vrepf %%v16,%%v0,2 \n\t" - "wfmaxsb %%v0,%%v0,%%v16,0 \n\t" - "ler %0,%%f0 " - :"=f"(amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amax; +#define CABS1(x,i) (fabsf(x[i]) + 
fabsf(x[i + 1])) + +static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vlef %%v0,0(%[x]),0\n\t" + "vlef %%v16,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v16,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v16,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v16,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v16,%%v16\n\t" + "vfasb %%v0,%%v0,%%v16\n\t" + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,8,4\n\t" + "vleib %%v1,9,5\n\t" + "vleib %%v1,10,6\n\t" + "vleib %%v1,11,7\n\t" + "vleib %%v1,16,8\n\t" + "vleib %%v1,17,9\n\t" + "vleib %%v1,18,10\n\t" + "vleib %%v1,19,11\n\t" + "vleib %%v1,24,12\n\t" + "vleib %%v1,25,13\n\t" + "vleib %%v1,26,14\n\t" + "vleib %%v1,27,15\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v2,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v2\n\t" + "vperm %%v16,%%v16,%%v2,%%v1\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v2,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v2\n\t" + "vperm %%v18,%%v18,%%v2,%%v1\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v2,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v2\n\t" + "vperm %%v20,%%v20,%%v2,%%v1\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v2,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v2\n\t" + "vperm %%v22,%%v22,%%v2,%%v1\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v2,144(%%r1,%[x])\n\t" + "vpkg %%v25,%%v24,%%v2\n\t" + "vperm %%v24,%%v24,%%v2,%%v1\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v2,176(%%r1,%[x])\n\t" + "vpkg %%v27,%%v26,%%v2\n\t" + "vperm %%v26,%%v26,%%v2,%%v1\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v2,208(%%r1,%[x])\n\t" + "vpkg %%v29,%%v28,%%v2\n\t" + "vperm %%v28,%%v28,%%v2,%%v1\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v2,240(%%r1,%[x])\n\t" + "vpkg %%v31,%%v30,%%v2\n\t" + "vperm %%v30,%%v30,%%v2,%%v1\n\t" + "vflpsb %%v16,%%v16\n\t" + "vflpsb %%v17,%%v17\n\t" + "vflpsb %%v18,%%v18\n\t" + "vflpsb %%v19,%%v19\n\t" + "vflpsb %%v20,%%v20\n\t" + "vflpsb %%v21,%%v21\n\t" + "vflpsb %%v22,%%v22\n\t" + "vflpsb %%v23,%%v23\n\t" + "vflpsb %%v24,%%v24\n\t" + "vflpsb %%v25,%%v25\n\t" + "vflpsb %%v26,%%v26\n\t" + "vflpsb %%v27,%%v27\n\t" + "vflpsb %%v28,%%v28\n\t" + "vflpsb %%v29,%%v29\n\t" + "vflpsb %%v30,%%v30\n\t" + "vflpsb %%v31,%%v31\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v18,%%v18,%%v19\n\t" + "vfasb %%v20,%%v20,%%v21\n\t" + "vfasb %%v22,%%v22,%%v23\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v26,%%v26,%%v27\n\t" + "vfasb %%v28,%%v28,%%v29\n\t" + "vfasb %%v30,%%v30,%%v31\n\t" + "vfmaxsb %%v16,%%v16,%%v24,0\n\t" + "vfmaxsb %%v18,%%v18,%%v26,0\n\t" + "vfmaxsb %%v20,%%v20,%%v28,0\n\t" + "vfmaxsb %%v22,%%v22,%%v30,0\n\t" + "vfmaxsb %%v16,%%v16,%%v20,0\n\t" + "vfmaxsb %%v18,%%v18,%%v22,0\n\t" + "vfmaxsb %%v16,%%v16,%%v18,0\n\t" + "vfmaxsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfmaxsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfmaxsb %%v0,%%v0,%%v16,0\n\t" + "ler %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); + + return amax; } - + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return (maxf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - 
if (n1 > 0) { - - maxf = camax_kernel_32(n1, x); - ix = n1 * 2; - i = n1; - } - else - { - maxf=CABS1(x,0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (maxf); + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + maxf = camax_kernel_32(n1, x); + ix = n1 * 2; + i = n1; } else { + maxf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (maxf); - maxf=CABS1(x,0); - inc_x2 = 2 * inc_x; + } else { - BLASLONG n1 = n & -4; - while (i < n1) { + maxf = CABS1(x, 0); + inc_x2 = 2 * inc_x; - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - if (CABS1(x,ix+inc_x2) > maxf) { - maxf = CABS1(x,ix+inc_x2); - } - if (CABS1(x,ix+inc_x2*2) > maxf) { - maxf = CABS1(x,ix+inc_x2*2); - } - if (CABS1(x,ix+inc_x2*3) > maxf) { - maxf = CABS1(x,ix+inc_x2*3); - } + BLASLONG n1 = n & -4; + while (i < n1) { - ix += inc_x2 * 4; + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) > maxf) { + maxf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + inc_x2 * 2) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 2); + } + if (CABS1(x, ix + inc_x2 * 3) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 3); + } - i += 4; + ix += inc_x2 * 4; - } + i += 4; + } - while (i < n) { - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (maxf); + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (maxf); + } } diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index aec59058e..842635afc 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,214 +28,188 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
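[Note on the constraint rewrite shown in camax above, which this patch applies uniformly: positional %0/%1 operands and the machine-specific "ZR"/"ZQ" constraints give way to named operands, a dummy "m" operand that describes the whole array to the compiler, and an "a" (address-register) operand for the pointer itself; n becomes "+&r" because srlg now consumes it as the loop counter. A hedged sketch of the pattern on a hypothetical kernel, not code from the patch:

/* The "m" operand tells GCC the asm reads the whole array, [x] "a"
 * places the pointer in an address register, and [n] "+&r" marks n as
 * an early-clobbered input/output since the asm shifts it in place. */
static float load_first(long n, const float *x)
{
    float r;
    __asm__("srlg %[n],%[n],5\n\t"   /* n is consumed, as in the kernels */
            "le %[r],0(%[x])"
            : [r] "=f"(r), [n] "+&r"(n)
            : "m"(*(const float (*)[n]) x), [x] "a"(x)
            : "cc");
    return r;
}
]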
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT amin; - - __asm__ volatile ( - "vlef %%v0,0(%2),0 \n\t" - "vlef %%v16,4(%2),0 \n\t" - "vlef %%v0,8(%2),1 \n\t" - "vlef %%v16,12(%2),1 \n\t" - "vlef %%v0,16(%2),2 \n\t" - "vlef %%v16,20(%2),2 \n\t" - "vlef %%v0,24(%2),3 \n\t" - "vlef %%v16,28(%2),3 \n\t" - "vflpsb %%v0,%%v0 \n\t" - "vflpsb %%v16,%%v16 \n\t" - "vfasb %%v0,%%v0,%%v16 \n\t" - "vleib %%v1,0,0 \n\t" - "vleib %%v1,1,1 \n\t" - "vleib %%v1,2,2 \n\t" - "vleib %%v1,3,3 \n\t" - "vleib %%v1,8,4 \n\t" - "vleib %%v1,9,5 \n\t" - "vleib %%v1,10,6 \n\t" - "vleib %%v1,11,7 \n\t" - "vleib %%v1,16,8 \n\t" - "vleib %%v1,17,9 \n\t" - "vleib %%v1,18,10 \n\t" - "vleib %%v1,19,11 \n\t" - "vleib %%v1,24,12 \n\t" - "vleib %%v1,25,13 \n\t" - "vleib %%v1,26,14 \n\t" - "vleib %%v1,27,15 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v2,16(%%r1,%2) \n\t" - "vpkg %%v17,%%v16,%%v2 \n\t" - "vperm %%v16,%%v16,%%v2,%%v1 \n\t" - - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v2,48(%%r1,%2) \n\t" - "vpkg %%v19,%%v18,%%v2 \n\t" - "vperm %%v18,%%v18,%%v2,%%v1 \n\t" - - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v2,80(%%r1,%2) \n\t" - "vpkg %%v21,%%v20,%%v2 \n\t" - "vperm %%v20,%%v20,%%v2,%%v1 \n\t" - - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v2,112(%%r1,%2) \n\t" - "vpkg %%v23,%%v22,%%v2 \n\t" - "vperm %%v22,%%v22,%%v2,%%v1 \n\t" - - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v2,144(%%r1,%2) \n\t" - "vpkg %%v25,%%v24,%%v2 \n\t" - "vperm %%v24,%%v24,%%v2,%%v1 \n\t" - - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v2,176(%%r1,%2) \n\t" - "vpkg %%v27,%%v26,%%v2 \n\t" - "vperm %%v26,%%v26,%%v2,%%v1 \n\t" - - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v2,208(%%r1,%2) \n\t" - "vpkg %%v29,%%v28,%%v2 \n\t" - "vperm %%v28,%%v28,%%v2,%%v1 \n\t" - - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v2,240(%%r1,%2) \n\t" - "vpkg %%v31,%%v30,%%v2 \n\t" - "vperm %%v30,%%v30,%%v2,%%v1 \n\t" - - "vflpsb %%v16,%%v16 \n\t" - "vflpsb %%v17,%%v17 \n\t" - "vflpsb %%v18,%%v18 \n\t" - "vflpsb %%v19,%%v19 \n\t" - "vflpsb %%v20,%%v20 \n\t" - "vflpsb %%v21,%%v21 \n\t" - "vflpsb %%v22,%%v22 \n\t" - "vflpsb %%v23,%%v23 \n\t" - "vflpsb %%v24,%%v24 \n\t" - "vflpsb %%v25,%%v25 \n\t" - "vflpsb %%v26,%%v26 \n\t" - "vflpsb %%v27,%%v27 \n\t" - "vflpsb %%v28,%%v28 \n\t" - "vflpsb %%v29,%%v29 \n\t" - "vflpsb %%v30,%%v30 \n\t" - "vflpsb %%v31,%%v31 \n\t" - - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v18,%%v18,%%v19 \n\t" - "vfasb %%v20,%%v20,%%v21 \n\t" - "vfasb %%v22,%%v22,%%v23 \n\t" - "vfasb %%v24,%%v24,%%v25 \n\t" - "vfasb %%v26,%%v26,%%v27 \n\t" - "vfasb %%v28,%%v28,%%v29 \n\t" - "vfasb %%v30,%%v30,%%v31 \n\t" - - "vfminsb %%v16,%%v16,%%v24,0 \n\t" - "vfminsb %%v18,%%v18,%%v26,0 \n\t" - "vfminsb %%v20,%%v20,%%v28,0 \n\t" - "vfminsb %%v22,%%v22,%%v30,0 \n\t" - - "vfminsb %%v16,%%v16,%%v20,0 \n\t" - "vfminsb %%v18,%%v18,%%v22,0 \n\t" - - "vfminsb %%v16,%%v16,%%v18,0 \n\t" - - "vfminsb %%v0,%%v0,%%v16,0 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v16,%%v0,32 \n\t" - "vfminsb %%v0,%%v0,%%v16,0 \n\t" - - "vrepf %%v16,%%v0,2 \n\t" - "wfminsb %%v0,%%v0,%%v16,0 \n\t" - "ler %0,%%f0 " - :"=f"(amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amin; +#define CABS1(x,i) (fabsf(x[i]) + 
fabsf(x[i + 1])) + +static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT amin; + + __asm__("vlef %%v0,0(%[x]),0\n\t" + "vlef %%v16,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v16,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v16,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v16,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v16,%%v16\n\t" + "vfasb %%v0,%%v0,%%v16\n\t" + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,8,4\n\t" + "vleib %%v1,9,5\n\t" + "vleib %%v1,10,6\n\t" + "vleib %%v1,11,7\n\t" + "vleib %%v1,16,8\n\t" + "vleib %%v1,17,9\n\t" + "vleib %%v1,18,10\n\t" + "vleib %%v1,19,11\n\t" + "vleib %%v1,24,12\n\t" + "vleib %%v1,25,13\n\t" + "vleib %%v1,26,14\n\t" + "vleib %%v1,27,15\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v2,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v2\n\t" + "vperm %%v16,%%v16,%%v2,%%v1\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v2,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v2\n\t" + "vperm %%v18,%%v18,%%v2,%%v1\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v2,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v2\n\t" + "vperm %%v20,%%v20,%%v2,%%v1\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v2,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v2\n\t" + "vperm %%v22,%%v22,%%v2,%%v1\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v2,144(%%r1,%[x])\n\t" + "vpkg %%v25,%%v24,%%v2\n\t" + "vperm %%v24,%%v24,%%v2,%%v1\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v2,176(%%r1,%[x])\n\t" + "vpkg %%v27,%%v26,%%v2\n\t" + "vperm %%v26,%%v26,%%v2,%%v1\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v2,208(%%r1,%[x])\n\t" + "vpkg %%v29,%%v28,%%v2\n\t" + "vperm %%v28,%%v28,%%v2,%%v1\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v2,240(%%r1,%[x])\n\t" + "vpkg %%v31,%%v30,%%v2\n\t" + "vperm %%v30,%%v30,%%v2,%%v1\n\t" + "vflpsb %%v16,%%v16\n\t" + "vflpsb %%v17,%%v17\n\t" + "vflpsb %%v18,%%v18\n\t" + "vflpsb %%v19,%%v19\n\t" + "vflpsb %%v20,%%v20\n\t" + "vflpsb %%v21,%%v21\n\t" + "vflpsb %%v22,%%v22\n\t" + "vflpsb %%v23,%%v23\n\t" + "vflpsb %%v24,%%v24\n\t" + "vflpsb %%v25,%%v25\n\t" + "vflpsb %%v26,%%v26\n\t" + "vflpsb %%v27,%%v27\n\t" + "vflpsb %%v28,%%v28\n\t" + "vflpsb %%v29,%%v29\n\t" + "vflpsb %%v30,%%v30\n\t" + "vflpsb %%v31,%%v31\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v18,%%v18,%%v19\n\t" + "vfasb %%v20,%%v20,%%v21\n\t" + "vfasb %%v22,%%v22,%%v23\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v26,%%v26,%%v27\n\t" + "vfasb %%v28,%%v28,%%v29\n\t" + "vfasb %%v30,%%v30,%%v31\n\t" + "vfminsb %%v16,%%v16,%%v24,0\n\t" + "vfminsb %%v18,%%v18,%%v26,0\n\t" + "vfminsb %%v20,%%v20,%%v28,0\n\t" + "vfminsb %%v22,%%v22,%%v30,0\n\t" + "vfminsb %%v16,%%v16,%%v20,0\n\t" + "vfminsb %%v18,%%v18,%%v22,0\n\t" + "vfminsb %%v16,%%v16,%%v18,0\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfminsb %%v0,%%v0,%%v16,0\n\t" + "ler %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); + + return amin; } - + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT minf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return (minf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - 
if (n1 > 0) { - - minf = camin_kernel_32(n1, x); - ix = n1 * 2; - i = n1; - } - else - { - minf=CABS1(x,0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (minf); + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + minf = camin_kernel_32(n1, x); + ix = n1 * 2; + i = n1; } else { + minf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (minf); - minf=CABS1(x,0); - inc_x2 = 2 * inc_x; + } else { - BLASLONG n1 = n & -4; - while (i < n1) { + minf = CABS1(x, 0); + inc_x2 = 2 * inc_x; - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - if (CABS1(x,ix+inc_x2) < minf) { - minf = CABS1(x,ix+inc_x2); - } - if (CABS1(x,ix+inc_x2*2) < minf) { - minf = CABS1(x,ix+inc_x2*2); - } - if (CABS1(x,ix+inc_x2*3) < minf) { - minf = CABS1(x,ix+inc_x2*3); - } + BLASLONG n1 = n & -4; + while (i < n1) { - ix += inc_x2 * 4; + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) < minf) { + minf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + inc_x2 * 2) < minf) { + minf = CABS1(x, ix + inc_x2 * 2); + } + if (CABS1(x, ix + inc_x2 * 3) < minf) { + minf = CABS1(x, ix + inc_x2 * 3); + } - i += 4; + ix += inc_x2 * 4; - } + i += 4; + } - while (i < n) { - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (minf); + while (i < n) { + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (minf); + } } diff --git a/kernel/zarch/casum.c b/kernel/zarch/casum.c index f4ebc21bd..f59e5a20b 100644 --- a/kernel/zarch/casum.c +++ b/kernel/zarch/casum.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,140 +28,128 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else #define ABS fabsf -#endif -static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT asum; - - __asm__ ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - "vl %%v20, 64(%%r1,%2) \n\t" - "vl %%v21, 80(%%r1,%2) \n\t" - "vl %%v22, 96(%%r1,%2) \n\t" - "vl %%v23, 112(%%r1,%2) \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfasb %%v0,%%v0,%%v16 \n\t" - "vfasb %%v1,%%v1,%%v17 \n\t" - "vfasb %%v2,%%v2,%%v18 \n\t" - "vfasb %%v3,%%v3,%%v19 \n\t" - "vfasb %%v0,%%v0,%%v20 \n\t" - "vfasb %%v1,%%v1,%%v21 \n\t" - "vfasb %%v2,%%v2,%%v22 \n\t" - "vfasb %%v3,%%v3,%%v23 \n\t" - - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - "vl %%v20, 192(%%r1,%2) \n\t" - "vl %%v21, 208(%%r1,%2) \n\t" - "vl %%v22, 224(%%r1,%2) \n\t" - "vl %%v23, 240(%%r1,%2) \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfasb %%v0,%%v0,%%v16 \n\t" - "vfasb %%v1,%%v1,%%v17 \n\t" - "vfasb %%v2,%%v2,%%v18 \n\t" - "vfasb %%v3,%%v3,%%v19 \n\t" - "vfasb %%v0,%%v0,%%v20 \n\t" - "vfasb %%v1,%%v1,%%v21 \n\t" - "vfasb %%v2,%%v2,%%v22 \n\t" - "vfasb %%v3,%%v3,%%v23 \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vfasb %%v0,%%v0,%%v2 \n\t" - "vfasb %%v0,%%v0,%%v3 \n\t" - "veslg %%v1,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vrepf %%v1,%%v0,2 \n\t" - "aebr %%f0,%%f1 \n\t" - "ler %0,%%f0 " - :"=f"(asum) - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" - ); - - return asum; +static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT asum; + + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 
224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v27\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v29\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vfasb %%v24,%%v24,%%v31\n\t" + "veslg %%v25,%%v24,32\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vrepf %%v25,%%v24,2\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vstef %%v24,%[asum],0" + : [asum] "=m"(asum),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return asum; } -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ip=0; - FLOAT sumf = 0.0; - BLASLONG n1; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return(sumf); +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ip = 0; + FLOAT sumf = 0.0; + BLASLONG n1; + BLASLONG inc_x2; - if ( inc_x == 1 ) - { + if (n <= 0 || inc_x <= 0) + return (sumf); - n1 = n & -32; - if ( n1 > 0 ) - { + if (inc_x == 1) { - sumf = casum_kernel_32(n1, x); - i=n1; - ip=2*n1; - } + n1 = n & -32; + if (n1 > 0) { - while(i < n) - { - sumf += ABS(x[ip]) + ABS(x[ip+1]); - i++; - ip+=2; - } + sumf = casum_kernel_32(n1, x); + i = n1; + ip = 2 * n1; + } + while (i < n) { + sumf += ABS(x[ip]) + ABS(x[ip + 1]); + i++; + ip += 2; } - else - { - inc_x2 = 2* inc_x; - while(i < n) - { - sumf += ABS(x[ip]) + ABS(x[ip+1]); - ip+=inc_x2; - i++; - } + } else { + inc_x2 = 2 * inc_x; + while (i < n) { + sumf += ABS(x[ip]) + ABS(x[ip + 1]); + ip += inc_x2; + i++; } - return(sumf); -} - + } + return (sumf); +} diff --git a/kernel/zarch/caxpy.c b/kernel/zarch/caxpy.c index fe5568cc8..d86342bd0 100644 --- a/kernel/zarch/caxpy.c +++ b/kernel/zarch/caxpy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,148 +27,132 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
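[Note on the reworked casum above: the accumulation is now spread over eight vector registers (v24..v31) instead of four, so consecutive vfasb instructions target independent registers and the floating-point add latency is hidden. An illustrative C analogue of the multi-accumulator idea — a hypothetical helper that assumes <math.h> and n a multiple of 8:

#include <math.h>   /* fabsf */

/* Eight independent partial sums break the loop-carried dependency of a
 * single accumulator, letting the adds overlap, as in the vector code. */
static float asum8(long n, const float *x)
{
    float s0 = 0, s1 = 0, s2 = 0, s3 = 0, s4 = 0, s5 = 0, s6 = 0, s7 = 0;
    for (long i = 0; i < n; i += 8) {
        s0 += fabsf(x[i]);     s1 += fabsf(x[i + 1]);
        s2 += fabsf(x[i + 2]); s3 += fabsf(x[i + 3]);
        s4 += fabsf(x[i + 4]); s5 += fabsf(x[i + 5]);
        s6 += fabsf(x[i + 6]); s7 += fabsf(x[i + 7]);
    }
    return ((s0 + s1) + (s2 + s3)) + ((s4 + s5) + (s6 + s7));
}
]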
#include "common.h" -static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile( +static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { + __asm__( #if !defined(CONJ) - "vlrepf %%v0,0(%3) \n\t" - "vlef %%v1,4(%3),0 \n\t" - "vlef %%v1,4(%3),2 \n\t" - "vflcsb %%v1,%%v1 \n\t" - "vlef %%v1,4(%3),1 \n\t" - "vlef %%v1,4(%3),3 \n\t" -#else - "vlef %%v0,0(%3),1 \n\t" - "vlef %%v0,0(%3),3 \n\t" - "vflcsb %%v0,%%v0 \n\t" - "vlef %%v0,0(%3),0 \n\t" - "vlef %%v0,0(%3),2 \n\t" - "vlrepf %%v1,4(%3) \n\t" + "vlrepf %%v0,0(%[alpha])\n\t" + "vlef %%v1,4(%[alpha]),0\n\t" + "vlef %%v1,4(%[alpha]),2\n\t" + "vflcsb %%v1,%%v1\n\t" + "vlef %%v1,4(%[alpha]),1\n\t" + "vlef %%v1,4(%[alpha]),3\n\t" +#else + "vlef %%v0,0(%[alpha]),1\n\t" + "vlef %%v0,0(%[alpha]),3\n\t" + "vflcsb %%v0,%%v0\n\t" + "vlef %%v0,0(%[alpha]),0\n\t" + "vlef %%v0,0(%[alpha]),2\n\t" + "vlrepf %%v1,4(%[alpha])\n\t" #endif - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,0(%%r1,%2) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,32(%%r1,%2) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - "verllg %%v24,%%v16,32 \n\t" - "verllg %%v25,%%v17,32 \n\t" - "verllg %%v26,%%v18,32 \n\t" - "verllg %%v27,%%v19,32 \n\t" - - "vfmasb %%v28,%%v16,%%v0,%%v20 \n\t" - "vfmasb %%v29,%%v17,%%v0,%%v21 \n\t" - "vfmasb %%v30,%%v18,%%v0,%%v22 \n\t" - "vfmasb %%v31,%%v19,%%v0,%%v23 \n\t" - - "vfmasb %%v28,%%v24,%%v1,%%v28 \n\t" - "vfmasb %%v29,%%v25,%%v1,%%v29 \n\t" - "vfmasb %%v30,%%v26,%%v1,%%v30 \n\t" - "vfmasb %%v31,%%v27,%%v1,%%v31 \n\t" - - "vst %%v28,0(%%r1,%2) \n\t" - "vst %%v29,16(%%r1,%2) \n\t" - "vst %%v30,32(%%r1,%2) \n\t" - "vst %%v31,48(%%r1,%2) \n\t" - - "vl %%v16,64(%%r1,%1) \n\t" - "vl %%v17,80(%%r1,%1) \n\t" - "vl %%v18,96(%%r1,%1) \n\t" - "vl %%v19,112(%%r1,%1) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "verllg %%v24,%%v16,32 \n\t" - "verllg %%v25,%%v17,32 \n\t" - "verllg %%v26,%%v18,32 \n\t" - "verllg %%v27,%%v19,32 \n\t" - - "vfmasb %%v28,%%v16,%%v0,%%v20 \n\t" - "vfmasb %%v29,%%v17,%%v0,%%v21 \n\t" - "vfmasb %%v30,%%v18,%%v0,%%v22 \n\t" - "vfmasb %%v31,%%v19,%%v0,%%v23 \n\t" - - "vfmasb %%v28,%%v24,%%v1,%%v28 \n\t" - "vfmasb %%v29,%%v25,%%v1,%%v29 \n\t" - "vfmasb %%v30,%%v26,%%v1,%%v30 \n\t" - "vfmasb %%v31,%%v27,%%v1,%%v31 \n\t" - - "vst %%v28,64(%%r1,%2) \n\t" - "vst %%v29,80(%%r1,%2) \n\t" - "vst %%v30,96(%%r1,%2) \n\t" - "vst %%v31,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v8,0(%%r1,%[x])\n\t" + "vl %%v9,16(%%r1,%[x])\n\t" + "vl %%v10,32(%%r1,%[x])\n\t" + "vl %%v11,48(%%r1,%[x])\n\t" + "vl %%v12,0(%%r1,%[y])\n\t" + "vl %%v13,16(%%r1,%[y])\n\t" + "vl %%v14,32(%%r1,%[y])\n\t" + "vl %%v15,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[x])\n\t" + "vl %%v17,80(%%r1,%[x])\n\t" + "vl %%v18,96(%%r1,%[x])\n\t" + "vl %%v19,112(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[y])\n\t" + "vl %%v21,80(%%r1,%[y])\n\t" + "vl %%v22,96(%%r1,%[y])\n\t" + "vl 
%%v23,112(%%r1,%[y])\n\t" + "vfmasb %%v8,%%v8,%%v0,%%v12\n\t" + "vfmasb %%v9,%%v9,%%v0,%%v13\n\t" + "vfmasb %%v10,%%v10,%%v0,%%v14\n\t" + "vfmasb %%v11,%%v11,%%v0,%%v15\n\t" + "vfmasb %%v16,%%v16,%%v0,%%v20\n\t" + "vfmasb %%v17,%%v17,%%v0,%%v21\n\t" + "vfmasb %%v18,%%v18,%%v0,%%v22\n\t" + "vfmasb %%v19,%%v19,%%v0,%%v23\n\t" + "vfmasb %%v8,%%v24,%%v1,%%v8\n\t" + "vfmasb %%v9,%%v25,%%v1,%%v9\n\t" + "vfmasb %%v10,%%v26,%%v1,%%v10\n\t" + "vfmasb %%v11,%%v27,%%v1,%%v11\n\t" + "vfmasb %%v16,%%v28,%%v1,%%v16\n\t" + "vfmasb %%v17,%%v29,%%v1,%%v17\n\t" + "vfmasb %%v18,%%v30,%%v1,%%v18\n\t" + "vfmasb %%v19,%%v31,%%v1,%%v19\n\t" + "vst %%v8,0(%%r1,%[y])\n\t" + "vst %%v9,16(%%r1,%[y])\n\t" + "vst %%v10,32(%%r1,%[y])\n\t" + "vst %%v11,48(%%r1,%[y])\n\t" + "vst %%v16,64(%%r1,%[y])\n\t" + "vst %%v17,80(%%r1,%[y])\n\t" + "vst %%v18,96(%%r1,%[y])\n\t" + "vst %%v19,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - FLOAT da[2] __attribute__ ((aligned(16))); +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, + FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT da[2] __attribute__ ((aligned(16))); - if (n <= 0) return (0); + if (n <= 0) + return (0); - if ((inc_x == 1) && (inc_y == 1)) { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -16; + BLASLONG n1 = n & -16; - if (n1) { - da[0] = da_r; - da[1] = da_i; - caxpy_kernel_16(n1, x, y, da); - ix = 2 * n1; - } - i = n1; - while (i < n) { + if (n1) { + da[0] = da_r; + da[1] = da_i; + caxpy_kernel_16(n1, x, y, da); + ix = 2 * n1; + } + i = n1; + while (i < n) { #if !defined(CONJ) - y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); - y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); + y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); + y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); #else - y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); - y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); + y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); + y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); #endif - i++; - ix += 2; - - } - return (0); - + i++; + ix += 2; } + return (0); - inc_x *= 2; - inc_y *= 2; + } - while (i < n) { + inc_x *= 2; + inc_y *= 2; + + while (i < n) { #if !defined(CONJ) - y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); - y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); + y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); + y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); #else - y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); - y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); + y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); + y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); #endif - ix += inc_x; - iy += inc_y; - i++; + ix += inc_x; + iy += inc_y; + i++; - } - return (0); + } + return (0); } - - diff --git a/kernel/zarch/ccopy.c b/kernel/zarch/ccopy.c index fc0b8d648..1b93a812e 100644 --- a/kernel/zarch/ccopy.c +++ b/kernel/zarch/ccopy.c @@ -1,5 +1,5 @@ 
/*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,73 +27,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "lgr %%r1,%1 \n\t" - "lgr %%r2,%2 \n\t" - "srlg %%r0,%0,5 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1) \n\t" - "pfd 2, 1024(%%r2) \n\t" - "mvc 0(256,%%r2),0(%%r1) \n\t" - "agfi %%r1,256 \n\t" - "agfi %%r2,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","r2" - ); +static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],5\n\t" + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y) + : "m"(*(const FLOAT (*)[n * 2]) x) + : "cc"); } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if ( n <= 0 ) return(0); + if (n <= 0) + return (0); - if ( (inc_x == 1) && (inc_y == 1 )) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - ccopy_kernel_32(n1, x, y); - i=n1; - ix=n1*2; - iy=n1*2; - } - - while(i < n) - { - y[iy] = x[iy] ; - y[iy+1] = x[ix+1] ; - ix+=2; - iy+=2; - i++ ; - - } + BLASLONG n1 = n & -32; + if (n1 > 0) { + ccopy_kernel_32(n1, x, y); + i = n1; + ix = n1 * 2; + iy = n1 * 2; + } + while (i < n) { + y[iy] = x[ix]; + y[iy + 1] = x[ix + 1]; + ix += 2; + iy += 2; + i++; } - else - { - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_y2 = 2 * inc_y; + } else { - while(i < n) - { - y[iy] = x[ix] ; - y[iy+1] = x[ix+1] ; - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; - } + while (i < n) { + y[iy] = x[ix]; + y[iy + 1] = x[ix + 1]; + ix += inc_x2; + iy += inc_y2; + i++; } - - return(0); + + } + + return (0); } diff --git a/kernel/zarch/cdot.c b/kernel/zarch/cdot.c index 3eda2979b..64d81ae5c 100644 --- a/kernel/zarch/cdot.c +++ b/kernel/zarch/cdot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,156 +27,150 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
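Note on the cdot hunk that follows: instead of forming each complex product directly, the kernel carries four partial sums in vector registers and lets the C wrapper combine them at the end. A scalar sketch of the accumulation the vector code performs (cdot_ref is illustrative only and not part of the patch):

    static void cdot_ref(BLASLONG n, const FLOAT *x, const FLOAT *y, FLOAT *dot) {
      BLASLONG i;
      for (i = 0; i < n; i++) {
        dot[0] += x[2 * i] * y[2 * i];         /* re(x) * re(y) */
        dot[1] += x[2 * i + 1] * y[2 * i + 1]; /* im(x) * im(y) */
        dot[2] += x[2 * i] * y[2 * i + 1];     /* re(x) * im(y) */
        dot[3] += x[2 * i + 1] * y[2 * i];     /* im(x) * re(y) */
      }
    }

The result is then (dot[0] - dot[1]) + i*(dot[2] + dot[3]) for the plain dot product, or (dot[0] + dot[1]) + i*(dot[2] - dot[3]) under CONJ, exactly as the #if block at the end of the file computes.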
#include "common.h" -static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) -{ - __asm__ volatile( - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "vzero %%v28 \n\t" - "vzero %%v29 \n\t" - "vzero %%v30 \n\t" - "vzero %%v31 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%1) \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16, 0(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - "vl %%v0, 0(%%r1,%2) \n\t" - "vl %%v1, 16(%%r1,%2) \n\t" - "vl %%v2, 32(%%r1,%2) \n\t" - "vl %%v3, 48(%%r1,%2) \n\t" - "verllg %%v20,%%v16,32 \n\t" - "verllg %%v21,%%v17,32 \n\t" - "verllg %%v22,%%v18,32 \n\t" - "verllg %%v23,%%v19,32 \n\t" - - "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" - "vfmasb %%v25,%%v20,%%v0,%%v25 \n\t" - "vfmasb %%v26,%%v17,%%v1,%%v26 \n\t" - "vfmasb %%v27,%%v21,%%v1,%%v27 \n\t" - "vfmasb %%v28,%%v18,%%v2,%%v28 \n\t" - "vfmasb %%v29,%%v22,%%v2,%%v29 \n\t" - "vfmasb %%v30,%%v19,%%v3,%%v30 \n\t" - "vfmasb %%v31,%%v23,%%v3,%%v31 \n\t" - - "vl %%v16, 64(%%r1,%1) \n\t" - "vl %%v17, 80(%%r1,%1) \n\t" - "vl %%v18, 96(%%r1,%1) \n\t" - "vl %%v19, 112(%%r1,%1) \n\t" - "vl %%v0, 64(%%r1,%2) \n\t" - "vl %%v1, 80(%%r1,%2) \n\t" - "vl %%v2, 96(%%r1,%2) \n\t" - "vl %%v3, 112(%%r1,%2) \n\t" - "verllg %%v20,%%v16,32 \n\t" - "verllg %%v21,%%v17,32 \n\t" - "verllg %%v22,%%v18,32 \n\t" - "verllg %%v23,%%v19,32 \n\t" - - "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" - "vfmasb %%v25,%%v20,%%v0,%%v25 \n\t" - "vfmasb %%v26,%%v17,%%v1,%%v26 \n\t" - "vfmasb %%v27,%%v21,%%v1,%%v27 \n\t" - "vfmasb %%v28,%%v18,%%v2,%%v28 \n\t" - "vfmasb %%v29,%%v22,%%v2,%%v29 \n\t" - "vfmasb %%v30,%%v19,%%v3,%%v30 \n\t" - "vfmasb %%v31,%%v23,%%v3,%%v31 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - "vfasb %%v24,%%v24,%%v26 \n\t" - "vfasb %%v24,%%v24,%%v28 \n\t" - "vfasb %%v24,%%v24,%%v30 \n\t" - "vrepg %%v26,%%v24,1 \n\t" - "vfasb %%v24,%%v24,%%v26 \n\t" - "vfasb %%v25,%%v25,%%v27 \n\t" - "vfasb %%v25,%%v25,%%v29 \n\t" - "vfasb %%v25,%%v25,%%v31 \n\t" - "vrepg %%v27,%%v25,1 \n\t" - "vfasb %%v25,%%v25,%%v27 \n\t" - "vstef %%v24,0(%3),0 \n\t" - "vstef %%v24,4(%3),1 \n\t" - "vstef %%v25,8(%3),1 \n\t" - "vstef %%v25,12(%3),0 " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 1, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "verllg %%v20,%%v16,32\n\t" + "verllg %%v21,%%v17,32\n\t" + "verllg %%v22,%%v18,32\n\t" + "verllg %%v23,%%v19,32\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmasb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmasb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmasb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmasb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmasb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmasb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmasb %%v31,%%v23,%%v3,%%v31\n\t" + "vl %%v16, 
64(%%r1,%[x])\n\t" + "vl %%v17, 80(%%r1,%[x])\n\t" + "vl %%v18, 96(%%r1,%[x])\n\t" + "vl %%v19, 112(%%r1,%[x])\n\t" + "vl %%v0, 64(%%r1,%[y])\n\t" + "vl %%v1, 80(%%r1,%[y])\n\t" + "vl %%v2, 96(%%r1,%[y])\n\t" + "vl %%v3, 112(%%r1,%[y])\n\t" + "verllg %%v20,%%v16,32\n\t" + "verllg %%v21,%%v17,32\n\t" + "verllg %%v22,%%v18,32\n\t" + "verllg %%v23,%%v19,32\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmasb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmasb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmasb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmasb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmasb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmasb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmasb %%v31,%%v23,%%v3,%%v31\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vrepg %%v26,%%v24,1\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v25,%%v25,%%v27\n\t" + "vfasb %%v25,%%v25,%%v29\n\t" + "vfasb %%v25,%%v25,%%v31\n\t" + "vrepg %%v27,%%v25,1\n\t" + "vfasb %%v25,%%v25,%%v27\n\t" + "vstef %%v24,0(%[d]),0\n\t" + "vstef %%v24,4(%[d]),1\n\t" + "vstef %%v25,8(%[d]),1\n\t" + "vstef %%v25,12(%[d]),0" + : "=m"(*(FLOAT (*)[4]) d),[n] "+&r"(n) + : [d] "a"(d), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[n * 2]) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i; - BLASLONG ix, iy; - OPENBLAS_COMPLEX_FLOAT result; - FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; - - if (n <= 0) { - CREAL(result) = 0.0; - CIMAG(result) = 0.0; - return (result); - - } +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y) { + BLASLONG i; + BLASLONG ix, iy; + OPENBLAS_COMPLEX_FLOAT result; + FLOAT dot[4] __attribute__ ((aligned(16))) = { + 0.0, 0.0, 0.0, 0.0}; + + if (n <= 0) { + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + return (result); - if ((inc_x == 1) && (inc_y == 1)) { + } - BLASLONG n1 = n & -16; + if ((inc_x == 1) && (inc_y == 1)) { - if (n1) - cdot_kernel_16(n1, x, y, dot); + BLASLONG n1 = n & -16; - i = n1; - BLASLONG j = i * 2; + if (n1) + cdot_kernel_16(n1, x, y, dot); - while (i < n) { + i = n1; + BLASLONG j = i * 2; - dot[0] += x[j] * y[j]; - dot[1] += x[j + 1] * y[j + 1]; - dot[2] += x[j] * y[j + 1]; - dot[3] += x[j + 1] * y[j]; + while (i < n) { - j += 2; - i++; + dot[0] += x[j] * y[j]; + dot[1] += x[j + 1] * y[j + 1]; + dot[2] += x[j] * y[j + 1]; + dot[3] += x[j + 1] * y[j]; - } + j += 2; + i++; + } - } else { - i = 0; - ix = 0; - iy = 0; - inc_x <<= 1; - inc_y <<= 1; - while (i < n) { + } else { + i = 0; + ix = 0; + iy = 0; + inc_x <<= 1; + inc_y <<= 1; + while (i < n) { - dot[0] += x[ix] * y[iy]; - dot[1] += x[ix + 1] * y[iy + 1]; - dot[2] += x[ix] * y[iy + 1]; - dot[3] += x[ix + 1] * y[iy]; + dot[0] += x[ix] * y[iy]; + dot[1] += x[ix + 1] * y[iy + 1]; + dot[2] += x[ix] * y[iy + 1]; + dot[3] += x[ix + 1] * y[iy]; - ix += inc_x; - iy += inc_y; - i++; + ix += inc_x; + iy += inc_y; + i++; - } } + } #if !defined(CONJ) - CREAL(result) = dot[0] - dot[1]; - CIMAG(result) = dot[2] + dot[3]; + CREAL(result) = dot[0] - dot[1]; + CIMAG(result) = dot[2] + dot[3]; #else - CREAL(result) = dot[0] + dot[1]; - CIMAG(result) = dot[2] - dot[3]; + CREAL(result) = dot[0] + dot[1]; + CIMAG(result) = dot[2] - dot[3]; #endif - return (result); + return (result); } - - diff --git 
a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index ed81325e1..db91d9063 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,719 +25,720 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include #include "common.h" #define NBMAX 2048 -static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vlrepg %%v16,0(%5) \n\t" - "vlrepg %%v17,8(%5) \n\t" - "vlrepg %%v18,16(%5) \n\t" - "vlrepg %%v19,24(%5) \n\t" +static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vlrepg %%v16,0(%[x])\n\t" + "vlrepg %%v17,8(%[x])\n\t" + "vlrepg %%v18,16(%[x])\n\t" + "vlrepg %%v19,24(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v20,4(%5),0 \n\t" - "vlef %%v20,4(%5),2 \n\t" - "vflcsb %%v20,%%v20 \n\t" - "vlef %%v20,0(%5),1 \n\t" - "vlef %%v20,0(%5),3 \n\t" - - "vlef %%v21,12(%5),0 \n\t" - "vlef %%v21,12(%5),2 \n\t" - "vflcsb %%v21,%%v21 \n\t" - "vlef %%v21,8(%5),1 \n\t" - "vlef %%v21,8(%5),3 \n\t" - - "vlef %%v22,20(%5),0 \n\t" - "vlef %%v22,20(%5),2 \n\t" - "vflcsb %%v22,%%v22 \n\t" - "vlef %%v22,16(%5),1 \n\t" - "vlef %%v22,16(%5),3 \n\t" - - "vlef %%v23,28(%5),0 \n\t" - "vlef %%v23,28(%5),2 \n\t" - "vflcsb %%v23,%%v23 \n\t" - "vlef %%v23,24(%5),1 \n\t" - "vlef %%v23,24(%5),3 \n\t" + "vlef %%v20,4(%[x]),0\n\t" + "vlef %%v20,4(%[x]),2\n\t" + "vflcsb %%v20,%%v20\n\t" + "vlef %%v20,0(%[x]),1\n\t" + "vlef %%v20,0(%[x]),3\n\t" + "vlef %%v21,12(%[x]),0\n\t" + "vlef %%v21,12(%[x]),2\n\t" + "vflcsb %%v21,%%v21\n\t" + "vlef %%v21,8(%[x]),1\n\t" + "vlef %%v21,8(%[x]),3\n\t" + "vlef %%v22,20(%[x]),0\n\t" + "vlef %%v22,20(%[x]),2\n\t" + "vflcsb %%v22,%%v22\n\t" + "vlef %%v22,16(%[x]),1\n\t" + "vlef %%v22,16(%[x]),3\n\t" + "vlef %%v23,28(%[x]),0\n\t" + "vlef %%v23,28(%[x]),2\n\t" + "vflcsb %%v23,%%v23\n\t" + "vlef %%v23,24(%[x]),1\n\t" + "vlef %%v23,24(%[x]),3\n\t" #else - "vlef %%v20,0(%5),1 \n\t" - "vlef %%v20,0(%5),3 \n\t" - "vflcsb %%v20,%%v20 \n\t" - "vlef %%v20,4(%5),0 \n\t" - "vlef %%v20,4(%5),2 \n\t" - - "vlef %%v21,8(%5),1 \n\t" - "vlef %%v21,8(%5),3 \n\t" - "vflcsb %%v21,%%v21 \n\t" - "vlef %%v21,12(%5),0 \n\t" - "vlef %%v21,12(%5),2 \n\t" - - "vlef %%v22,16(%5),1 \n\t" - "vlef %%v22,16(%5),3 \n\t" - "vflcsb %%v22,%%v22 \n\t" - "vlef %%v22,20(%5),0 \n\t" - "vlef %%v22,20(%5),2 \n\t" - - "vlef %%v23,24(%5),1 \n\t" - "vlef %%v23,24(%5),3 \n\t" - "vflcsb %%v23,%%v23 \n\t" - "vlef %%v23,28(%5),0 \n\t" - "vlef %%v23,28(%5),2 \n\t" + "vlef %%v20,0(%[x]),1\n\t" + "vlef %%v20,0(%[x]),3\n\t" + "vflcsb %%v20,%%v20\n\t" + "vlef %%v20,4(%[x]),0\n\t" + "vlef %%v20,4(%[x]),2\n\t" + "vlef %%v21,8(%[x]),1\n\t" + "vlef %%v21,8(%[x]),3\n\t" + "vflcsb %%v21,%%v21\n\t" + "vlef %%v21,12(%[x]),0\n\t" + "vlef %%v21,12(%[x]),2\n\t" + "vlef %%v22,16(%[x]),1\n\t" + "vlef %%v22,16(%[x]),3\n\t" + "vflcsb %%v22,%%v22\n\t" + "vlef %%v22,20(%[x]),0\n\t" + "vlef %%v22,20(%[x]),2\n\t" + "vlef %%v23,24(%[x]),1\n\t" + "vlef %%v23,24(%[x]),3\n\t" + "vflcsb %%v23,%%v23\n\t" + 
"vlef %%v23,28(%[x]),0\n\t" + "vlef %%v23,28(%[x]),2\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 2,1024(%%r1,%6) \n\t" - - "vlef %%v24,0(%%r1,%1),0 \n\t" - "vlef %%v24,0(%%r1,%1),1 \n\t" - "vlef %%v24,8(%%r1,%1),2 \n\t" - "vlef %%v24,8(%%r1,%1),3 \n\t" - "vlef %%v25,4(%%r1,%1),0 \n\t" - "vlef %%v25,4(%%r1,%1),1 \n\t" - "vlef %%v25,12(%%r1,%1),2 \n\t" - "vlef %%v25,12(%%r1,%1),3 \n\t" - "vlef %%v26,0(%%r1,%2),0 \n\t" - "vlef %%v26,0(%%r1,%2),1 \n\t" - "vlef %%v26,8(%%r1,%2),2 \n\t" - "vlef %%v26,8(%%r1,%2),3 \n\t" - "vlef %%v27,4(%%r1,%2),0 \n\t" - "vlef %%v27,4(%%r1,%2),1 \n\t" - "vlef %%v27,12(%%r1,%2),2 \n\t" - "vlef %%v27,12(%%r1,%2),3 \n\t" - - "vl %%v0,0(%%r1,%6) \n\t" - "vfmasb %%v0,%%v24,%%v16,%%v0 \n\t" - "vfmasb %%v0,%%v25,%%v20,%%v0 \n\t" - "vfmasb %%v0,%%v26,%%v17,%%v0 \n\t" - "vfmasb %%v0,%%v27,%%v21,%%v0 \n\t" - - "vlef %%v28,0(%%r1,%3),0 \n\t" - "vlef %%v28,0(%%r1,%3),1 \n\t" - "vlef %%v28,8(%%r1,%3),2 \n\t" - "vlef %%v28,8(%%r1,%3),3 \n\t" - "vlef %%v29,4(%%r1,%3),0 \n\t" - "vlef %%v29,4(%%r1,%3),1 \n\t" - "vlef %%v29,12(%%r1,%3),2 \n\t" - "vlef %%v29,12(%%r1,%3),3 \n\t" - "vlef %%v30,0(%%r1,%4),0 \n\t" - "vlef %%v30,0(%%r1,%4),1 \n\t" - "vlef %%v30,8(%%r1,%4),2 \n\t" - "vlef %%v30,8(%%r1,%4),3 \n\t" - "vlef %%v31,4(%%r1,%4),0 \n\t" - "vlef %%v31,4(%%r1,%4),1 \n\t" - "vlef %%v31,12(%%r1,%4),2 \n\t" - "vlef %%v31,12(%%r1,%4),3 \n\t" - - "vfmasb %%v0,%%v28,%%v18,%%v0 \n\t" - "vfmasb %%v0,%%v29,%%v22,%%v0 \n\t" - "vfmasb %%v0,%%v30,%%v19,%%v0 \n\t" - "vfmasb %%v0,%%v31,%%v23,%%v0 \n\t" - "vst %%v0,0(%%r1,%6) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,0b \n\t" - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZQ"((const FLOAT (*)[8])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,0,4\n\t" + "vleib %%v1,1,5\n\t" + "vleib %%v1,2,6\n\t" + "vleib %%v1,3,7\n\t" + "vleib %%v1,8,8\n\t" + "vleib %%v1,9,9\n\t" + "vleib %%v1,10,10\n\t" + "vleib %%v1,11,11\n\t" + "vleib %%v1,8,12\n\t" + "vleib %%v1,9,13\n\t" + "vleib %%v1,10,14\n\t" + "vleib %%v1,11,15\n\t" + "vleib %%v2,4,0\n\t" + "vleib %%v2,5,1\n\t" + "vleib %%v2,6,2\n\t" + "vleib %%v2,7,3\n\t" + "vleib %%v2,4,4\n\t" + "vleib %%v2,5,5\n\t" + "vleib %%v2,6,6\n\t" + "vleib %%v2,7,7\n\t" + "vleib %%v2,12,8\n\t" + "vleib %%v2,13,9\n\t" + "vleib %%v2,14,10\n\t" + "vleib %%v2,15,11\n\t" + "vleib %%v2,12,12\n\t" + "vleib %%v2,13,13\n\t" + "vleib %%v2,14,14\n\t" + "vleib %%v2,15,15\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vperm %%v25,%%v24,%%v24,%%v2\n\t" + "vperm %%v24,%%v24,%%v24,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap1])\n\t" + "vperm %%v27,%%v26,%%v26,%%v2\n\t" + "vperm %%v26,%%v26,%%v26,%%v1\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v24,%%v16,%%v0\n\t" + "vfmasb %%v0,%%v25,%%v20,%%v0\n\t" + "vfmasb %%v0,%%v26,%%v17,%%v0\n\t" + "vfmasb %%v0,%%v27,%%v21,%%v0\n\t" + "vl %%v28,0(%%r1,%[ap2])\n\t" + "vperm %%v29,%%v28,%%v28,%%v2\n\t" + "vperm 
%%v28,%%v28,%%v28,%%v1\n\t" + "vl %%v30,0(%%r1,%[ap3])\n\t" + "vperm %%v31,%%v30,%%v30,%%v2\n\t" + "vperm %%v30,%%v30,%%v30,%%v1\n\t" + "vfmasb %%v0,%%v28,%%v18,%%v0\n\t" + "vfmasb %%v0,%%v29,%%v22,%%v0\n\t" + "vfmasb %%v0,%%v30,%%v19,%%v0\n\t" + "vfmasb %%v0,%%v31,%%v23,%%v0\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %[n],0b\n\t" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n * 2]) ap[2]),[ap2] "a"(ap[2]), + "m"(*(const FLOAT (*)[n * 2]) ap[3]),[ap3] "a"(ap[3]), + "m"(*(const FLOAT (*)[8]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vlrepg %%v16,0(%3) \n\t" - "vlrepg %%v17,8(%3) \n\t" +static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vlrepg %%v16,0(%[x])\n\t" + "vlrepg %%v17,8(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v18,4(%3),0 \n\t" - "vlef %%v18,4(%3),2 \n\t" - "vflcsb %%v18,%%v18 \n\t" - "vlef %%v18,0(%3),1 \n\t" - "vlef %%v18,0(%3),3 \n\t" - - "vlef %%v19,12(%3),0 \n\t" - "vlef %%v19,12(%3),2 \n\t" - "vflcsb %%v19,%%v19 \n\t" - "vlef %%v19,8(%3),1 \n\t" - "vlef %%v19,8(%3),3 \n\t" + "vlef %%v18,4(%[x]),0\n\t" + "vlef %%v18,4(%[x]),2\n\t" + "vflcsb %%v18,%%v18\n\t" + "vlef %%v18,0(%[x]),1\n\t" + "vlef %%v18,0(%[x]),3\n\t" + "vlef %%v19,12(%[x]),0\n\t" + "vlef %%v19,12(%[x]),2\n\t" + "vflcsb %%v19,%%v19\n\t" + "vlef %%v19,8(%[x]),1\n\t" + "vlef %%v19,8(%[x]),3\n\t" #else - "vlef %%v18,0(%3),1 \n\t" - "vlef %%v18,0(%3),3 \n\t" - "vflcsb %%v18,%%v18 \n\t" - "vlef %%v18,4(%3),0 \n\t" - "vlef %%v18,4(%3),2 \n\t" - - "vlef %%v19,8(%3),1 \n\t" - "vlef %%v19,8(%3),3 \n\t" - "vflcsb %%v19,%%v19 \n\t" - "vlef %%v19,12(%3),0 \n\t" - "vlef %%v19,12(%3),2 \n\t" + "vlef %%v18,0(%[x]),1\n\t" + "vlef %%v18,0(%[x]),3\n\t" + "vflcsb %%v18,%%v18\n\t" + "vlef %%v18,4(%[x]),0\n\t" + "vlef %%v18,4(%[x]),2\n\t" + "vlef %%v19,8(%[x]),1\n\t" + "vlef %%v19,8(%[x]),3\n\t" + "vflcsb %%v19,%%v19\n\t" + "vlef %%v19,12(%[x]),0\n\t" + "vlef %%v19,12(%[x]),2\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%4) \n\t" - - "vlef %%v20,0(%%r1,%1),0 \n\t" - "vlef %%v20,0(%%r1,%1),1 \n\t" - "vlef %%v20,8(%%r1,%1),2 \n\t" - "vlef %%v20,8(%%r1,%1),3 \n\t" - "vlef %%v21,4(%%r1,%1),0 \n\t" - "vlef %%v21,4(%%r1,%1),1 \n\t" - "vlef %%v21,12(%%r1,%1),2 \n\t" - "vlef %%v21,12(%%r1,%1),3 \n\t" - "vlef %%v22,0(%%r1,%2),0 \n\t" - "vlef %%v22,0(%%r1,%2),1 \n\t" - "vlef %%v22,8(%%r1,%2),2 \n\t" - "vlef %%v22,8(%%r1,%2),3 \n\t" - "vlef %%v23,4(%%r1,%2),0 \n\t" - "vlef %%v23,4(%%r1,%2),1 \n\t" - "vlef %%v23,12(%%r1,%2),2 \n\t" - "vlef %%v23,12(%%r1,%2),3 \n\t" - - "vl %%v0,0(%%r1,%4) \n\t" - "vfmasb %%v0,%%v20,%%v16,%%v0 \n\t" - "vfmasb %%v0,%%v21,%%v18,%%v0 \n\t" - "vfmasb %%v0,%%v22,%%v17,%%v0 \n\t" - "vfmasb %%v0,%%v23,%%v19,%%v0 \n\t" - "vst %%v0,0(%%r1,%4) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,0b \n\t" - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZQ"((const FLOAT (*)[4])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" - ); + "vleib 
%%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,0,4\n\t" + "vleib %%v1,1,5\n\t" + "vleib %%v1,2,6\n\t" + "vleib %%v1,3,7\n\t" + "vleib %%v1,8,8\n\t" + "vleib %%v1,9,9\n\t" + "vleib %%v1,10,10\n\t" + "vleib %%v1,11,11\n\t" + "vleib %%v1,8,12\n\t" + "vleib %%v1,9,13\n\t" + "vleib %%v1,10,14\n\t" + "vleib %%v1,11,15\n\t" + "vleib %%v2,4,0\n\t" + "vleib %%v2,5,1\n\t" + "vleib %%v2,6,2\n\t" + "vleib %%v2,7,3\n\t" + "vleib %%v2,4,4\n\t" + "vleib %%v2,5,5\n\t" + "vleib %%v2,6,6\n\t" + "vleib %%v2,7,7\n\t" + "vleib %%v2,12,8\n\t" + "vleib %%v2,13,9\n\t" + "vleib %%v2,14,10\n\t" + "vleib %%v2,15,11\n\t" + "vleib %%v2,12,12\n\t" + "vleib %%v2,13,13\n\t" + "vleib %%v2,14,14\n\t" + "vleib %%v2,15,15\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v20,0(%%r1,%[ap0])\n\t" + "vperm %%v21,%%v20,%%v20,%%v2\n\t" + "vperm %%v20,%%v20,%%v20,%%v1\n\t" + "vl %%v22,0(%%r1,%[ap1])\n\t" + "vperm %%v23,%%v22,%%v22,%%v2\n\t" + "vperm %%v22,%%v22,%%v22,%%v1\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v20,%%v16,%%v0\n\t" + "vfmasb %%v0,%%v21,%%v18,%%v0\n\t" + "vfmasb %%v0,%%v22,%%v17,%%v0\n\t" + "vfmasb %%v0,%%v23,%%v19,%%v0\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %[n],0b\n\t" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[4]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23"); } -static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vlrepg %%v16,0(%2) \n\t" +static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + __asm__("vlrepg %%v16,0(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v17,4(%2),0 \n\t" - "vlef %%v17,4(%2),2 \n\t" - "vflcsb %%v17,%%v17 \n\t" - "vlef %%v17,0(%2),1 \n\t" - "vlef %%v17,0(%2),3 \n\t" + "vlef %%v17,4(%[x]),0\n\t" + "vlef %%v17,4(%[x]),2\n\t" + "vflcsb %%v17,%%v17\n\t" + "vlef %%v17,0(%[x]),1\n\t" + "vlef %%v17,0(%[x]),3\n\t" #else - "vlef %%v17,0(%2),1 \n\t" - "vlef %%v17,0(%2),3 \n\t" - "vflcsb %%v17,%%v17 \n\t" - "vlef %%v17,4(%2),0 \n\t" - "vlef %%v17,4(%2),2 \n\t" + "vlef %%v17,0(%[x]),1\n\t" + "vlef %%v17,0(%[x]),3\n\t" + "vflcsb %%v17,%%v17\n\t" + "vlef %%v17,4(%[x]),0\n\t" + "vlef %%v17,4(%[x]),2\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vlef %%v18,0(%%r1,%1),0 \n\t" - "vlef %%v18,0(%%r1,%1),1 \n\t" - "vlef %%v18,8(%%r1,%1),2 \n\t" - "vlef %%v18,8(%%r1,%1),3 \n\t" - "vlef %%v19,4(%%r1,%1),0 \n\t" - "vlef %%v19,4(%%r1,%1),1 \n\t" - "vlef %%v19,12(%%r1,%1),2 \n\t" - "vlef %%v19,12(%%r1,%1),3 \n\t" - - "vl %%v0,0(%%r1,%3) \n\t" - "vfmasb %%v0,%%v18,%%v16,%%v0 \n\t" - "vfmasb %%v0,%%v19,%%v17,%%v0 \n\t" - "vst %%v0,0(%%r1,%3) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,0b \n\t" - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZQ"((const FLOAT (*)[2])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19" - ); + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,0,4\n\t" + "vleib %%v1,1,5\n\t" + "vleib %%v1,2,6\n\t" + "vleib %%v1,3,7\n\t" + "vleib %%v1,8,8\n\t" + "vleib %%v1,9,9\n\t" + "vleib 
%%v1,10,10\n\t" + "vleib %%v1,11,11\n\t" + "vleib %%v1,8,12\n\t" + "vleib %%v1,9,13\n\t" + "vleib %%v1,10,14\n\t" + "vleib %%v1,11,15\n\t" + "vleib %%v2,4,0\n\t" + "vleib %%v2,5,1\n\t" + "vleib %%v2,6,2\n\t" + "vleib %%v2,7,3\n\t" + "vleib %%v2,4,4\n\t" + "vleib %%v2,5,5\n\t" + "vleib %%v2,6,6\n\t" + "vleib %%v2,7,7\n\t" + "vleib %%v2,12,8\n\t" + "vleib %%v2,13,9\n\t" + "vleib %%v2,14,10\n\t" + "vleib %%v2,15,11\n\t" + "vleib %%v2,12,12\n\t" + "vleib %%v2,13,13\n\t" + "vleib %%v2,14,14\n\t" + "vleib %%v2,15,15\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v18,0(%%r1,%[ap])\n\t" + "vperm %%v19,%%v18,%%v18,%%v2\n\t" + "vperm %%v18,%%v18,%%v18,%%v1\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v18,%%v16,%%v0\n\t" + "vfmasb %%v0,%%v19,%%v17,%%v0\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %[n],0b\n\t" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), + "m"(*(const FLOAT (*)[2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19"); } -static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i) -{ - __asm__ volatile ( -#if !defined(XCONJ) - "vlrepf %%v0,%3 \n\t" - "vlef %%v1,%4,0 \n\t" - "vlef %%v1,%4,2 \n\t" - "vflcsb %%v1,%%v1 \n\t" - "vlef %%v1,%4,1 \n\t" - "vlef %%v1,%4,3 \n\t" +static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, + FLOAT alpha_i) { + __asm__( +#if !defined(XCONJ) + "vlrepf %%v0,%[alpha_r]\n\t" + "vlef %%v1,%[alpha_i],0\n\t" + "vlef %%v1,%[alpha_i],2\n\t" + "vflcsb %%v1,%%v1\n\t" + "vlef %%v1,%[alpha_i],1\n\t" + "vlef %%v1,%[alpha_i],3\n\t" #else - "vlef %%v0,%3,1 \n\t" - "vlef %%v0,%3,3 \n\t" - "vflcsb %%v0,%%v0 \n\t" - "vlef %%v0,%3,0 \n\t" - "vlef %%v0,%3,2 \n\t" - "vlrepf %%v1,%4 \n\t" + "vlef %%v0,%[alpha_r],1\n\t" + "vlef %%v0,%[alpha_r],3\n\t" + "vflcsb %%v0,%%v0\n\t" + "vlef %%v0,%[alpha_r],0\n\t" + "vlef %%v0,%[alpha_r],2\n\t" + "vlrepf %%v1,%[alpha_i]\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,2 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 2,1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,0(%%r1,%2) \n\t" - "vl %%v19,16(%%r1,%2) \n\t" - "verllg %%v20,%%v16,32 \n\t" - "verllg %%v21,%%v17,32 \n\t" - - "vfmasb %%v22,%%v16,%%v0,%%v18 \n\t" - "vfmasb %%v23,%%v17,%%v0,%%v19 \n\t" - - "vfmasb %%v22,%%v20,%%v1,%%v22 \n\t" - "vfmasb %%v23,%%v21,%%v1,%%v23 \n\t" - - "vst %%v22,0(%%r1,%2) \n\t" - "vst %%v23,16(%%r1,%2) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])src),"ZR"((FLOAT (*)[n * 2])dest),"m"(alpha_r),"m"(alpha_i) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23" - ); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],2\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,0(%%r1,%[dest])\n\t" + "vl %%v19,16(%%r1,%[dest])\n\t" + "verllg %%v20,%%v16,32\n\t" + "verllg %%v21,%%v17,32\n\t" + "vfmasb %%v22,%%v16,%%v0,%%v18\n\t" + "vfmasb %%v23,%%v17,%%v0,%%v19\n\t" + "vfmasb %%v22,%%v20,%%v1,%%v22\n\t" + "vfmasb %%v23,%%v21,%%v1,%%v23\n\t" + "vst %%v22,0(%%r1,%[dest])\n\t" + "vst %%v23,16(%%r1,%[dest])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) dest),[n] "+&r"(n) + : [dest] "a"(dest), "m"(*(const FLOAT (*)[n * 2]) src),[src] "a"(src), + [alpha_r] "m"(alpha_r),[alpha_i] 
"m"(alpha_i) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23"); } -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) -{ - BLASLONG i; +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, + FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; - if ( inc_dest != 2 ) - { + if (inc_dest != 2) { - FLOAT temp_r; - FLOAT temp_i; - for ( i=0; i> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m - m3; - m2 = (m & (NBMAX-1)) - m3 ; - - alpha[0] = alpha_r; - alpha[1] = alpha_i; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - if ( inc_x != 2 ) - copy_x(NB,x_ptr,xbuffer,inc_x); - else - xbuffer = x_ptr; - - if ( inc_y == 2 ) - { - - for( i = 0; i < n1 ; i++) - { - cgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - y_ptr += 8; - - } - - if ( n2 & 2 ) - { - cgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); - a_ptr += lda * 2; - y_ptr += 4; - - } - - if ( n2 & 1 ) - { - cgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); - /* a_ptr += lda; - y_ptr += 2; */ - - } - - } - else - { - - for( i = 0; i < n1 ; i++) - { - memset(ybuffer,0,sizeof(ybuffer)); - cgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[2]; - y_ptr[1] += ybuffer[3]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[4]; - y_ptr[1] += ybuffer[5]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[6]; - y_ptr[1] += ybuffer[7]; - y_ptr += inc_y; - - } - - for( i = 0; i < n2 ; i++) - { - memset(ybuffer,0,sizeof(ybuffer)); - cgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); - a_ptr += lda; - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - - } - - } - a += 2 * NB; - x += NB * inc_x; - } - - - - if ( m3 == 0 ) return(0); - - x_ptr = x; - j=0; - a_ptr = a; - y_ptr = y; - - if ( m3 == 3 ) - { - - FLOAT temp_r ; - FLOAT temp_i ; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x4 = x_ptr[0]; - FLOAT x5 = x_ptr[1]; - while ( j < n) - { +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[8]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4; + FLOAT ybuffer[8], *xbuffer; + FLOAT alpha[2]; + + if (m < 1) + return (0); + if (n < 1) + return (0); + + inc_x <<= 1; + inc_y <<= 1; + lda <<= 1; + lda4 = lda << 2; + + xbuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + alpha[0] = alpha_r; + alpha[1] = alpha_i; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + if (inc_x != 2) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + if (inc_y == 2) { + + for (i = 0; i < n1; i++) { + cgemv_kernel_4x4(NB, ap, xbuffer, 
y_ptr, alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + y_ptr += 8; + + } + + if (n2 & 2) { + cgemv_kernel_4x2(NB, ap, xbuffer, y_ptr, alpha); + a_ptr += lda * 2; + y_ptr += 4; + + } + + if (n2 & 1) { + cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); + /* a_ptr += lda; + y_ptr += 2; */ + + } + + } else { + + for (i = 0; i < n1; i++) { + memset(ybuffer, 0, sizeof(ybuffer)); + cgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for (i = 0; i < n2; i++) { + memset(ybuffer, 0, sizeof(ybuffer)); + cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + if (m3 == 0) + return (0); + + x_ptr = x; + j = 0; + a_ptr = a; + y_ptr = y; + + if (m3 == 3) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; #endif -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return(0); - } - - - if ( m3 == 2 ) - { - - FLOAT temp_r ; - FLOAT temp_i ; - FLOAT temp_r1 ; - FLOAT temp_i1 ; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - FLOAT ar = alpha[0]; - FLOAT ai = alpha[1]; - - while ( j < ( n & -2 )) - { + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + if (m3 == 2) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = 
x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while (j < (n & -2)) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 - ai * temp_i1; - y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 + ai * temp_i1; - y_ptr[1] -= ar * temp_i1 - ai * temp_r1; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j+=2; - } - + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } - while ( j < n) - { + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; 
+ y_ptr[1] += ar * temp_i + ai * temp_r; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } + a_ptr += lda; + y_ptr += inc_y; + j++; + } - return(0); - } + return (0); + } + if (m3 == 1) { - if ( m3 == 1 ) - { + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; - FLOAT temp_r ; - FLOAT temp_i ; - FLOAT temp_r1 ; - FLOAT temp_i1 ; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - FLOAT ar = alpha[0]; - FLOAT ai = alpha[1]; - - while ( j < ( n & -2 )) - { + while (j < (n & -2)) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 - ai * temp_i1; - y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 + ai * temp_i1; - y_ptr[1] -= ar * temp_i1 - ai * temp_r1; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j+=2; - } + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } - while ( j < n) - { + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return(0); - } + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } - return(0); + return (0); } diff --git a/kernel/zarch/crot.c b/kernel/zarch/crot.c 
index f04a624ac..669d78a9d 100644 --- a/kernel/zarch/crot.c +++ b/kernel/zarch/crot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,230 +27,209 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) -{ - __asm__ ( - "vlrepf %%v0,%3 \n\t" - "vlrepf %%v1,%4 \n\t" - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%1) \n\t" - "vl %%v25, 16(%%r1,%1) \n\t" - "vl %%v26, 32(%%r1,%1) \n\t" - "vl %%v27, 48(%%r1,%1) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%1) \n\t" - "vst %%v29, 16(%%r1,%1) \n\t" - "vst %%v30, 32(%%r1,%1) \n\t" - "vst %%v31, 48(%%r1,%1) \n\t" - "vst %%v20, 0(%%r1,%2) \n\t" - "vst %%v21, 16(%%r1,%2) \n\t" - "vst %%v22, 32(%%r1,%2) \n\t" - "vst %%v23, 48(%%r1,%2) \n\t" - - "vl %%v24, 64(%%r1,%1) \n\t" - "vl %%v25, 80(%%r1,%1) \n\t" - "vl %%v26, 96(%%r1,%1) \n\t" - "vl %%v27, 112(%%r1,%1) \n\t" - "vl %%v16, 64(%%r1,%2) \n\t" - "vl %%v17, 80(%%r1,%2) \n\t" - "vl %%v18, 96(%%r1,%2) \n\t" - "vl %%v19, 112(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%1) \n\t" - "vst %%v29, 80(%%r1,%1) \n\t" - "vst %%v30, 96(%%r1,%1) \n\t" - "vst %%v31, 112(%%r1,%1) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - 
"vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%1) \n\t" - "vst %%v29, 144(%%r1,%1) \n\t" - "vst %%v30, 160(%%r1,%1) \n\t" - "vst %%v31, 176(%%r1,%1) \n\t" - "vst %%v20, 128(%%r1,%2) \n\t" - "vst %%v21, 144(%%r1,%2) \n\t" - "vst %%v22, 160(%%r1,%2) \n\t" - "vst %%v23, 176(%%r1,%2) \n\t" - - "vl %%v24, 192(%%r1,%1) \n\t" - "vl %%v25, 208(%%r1,%1) \n\t" - "vl %%v26, 224(%%r1,%1) \n\t" - "vl %%v27, 240(%%r1,%1) \n\t" - "vl %%v16, 192(%%r1,%2) \n\t" - "vl %%v17, 208(%%r1,%2) \n\t" - "vl %%v18, 224(%%r1,%2) \n\t" - "vl %%v19, 240(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%1) \n\t" - "vst %%v29, 208(%%r1,%1) \n\t" - "vst %%v30, 224(%%r1,%1) \n\t" - "vst %%v31, 240(%%r1,%1) \n\t" - "vst %%v20, 192(%%r1,%2) \n\t" - "vst %%v21, 208(%%r1,%2) \n\t" - "vst %%v22, 224(%%r1,%2) \n\t" - "vst %%v23, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { + __asm__("vlrepf %%v0,%[c]\n\t" + "vlrepf %%v1,%[s]\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb 
%%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb 
%%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1) ) - { - - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - FLOAT cosa,sina; - cosa=c; - sina=s; - crot_kernel_32(n1, x, y, &cosa, &sina); - i=n1; - ix=2*n1; - } - - while(i < n) - { - temp[0] = c*x[ix] + s*y[ix] ; - temp[1] = c*x[ix+1] + s*y[ix+1] ; - y[ix] = c*y[ix] - s*x[ix] ; - y[ix+1] = c*y[ix+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += 2 ; - i++ ; +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT c, FLOAT s) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if (n <= 0) + return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + FLOAT cosa, sina; + cosa = c; + sina = s; + crot_kernel_32(n1, x, y, &cosa, &sina); + i = n1; + ix = 2 * n1; + } - } + while (i < n) { + temp[0] = c * x[ix] + s * y[ix]; + temp[1] = c * x[ix + 1] + s * y[ix + 1]; + y[ix] = c * y[ix] - s * x[ix]; + y[ix + 1] = c * y[ix + 1] - s * x[ix + 1]; + x[ix] = temp[0]; + x[ix + 1] = temp[1]; + ix += 2; + i++; } - else - { - inc_x2 = 2 * inc_x ; - inc_y2 = 2 * inc_y ; - while(i < n) - { - temp[0] = c*x[ix] + s*y[iy] ; - temp[1] = c*x[ix+1] + s*y[iy+1] ; - y[iy] = c*y[iy] - s*x[ix] ; - y[iy+1] = c*y[iy+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - } + } else { + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + while (i < n) { + temp[0] = c * x[ix] + s * y[iy]; + temp[1] = c * x[ix + 1] + s * y[iy + 1]; + y[iy] = c * y[iy] - s * x[ix]; + y[iy + 1] = c * y[iy + 1] - s * x[ix + 1]; + x[ix] = temp[0]; + x[ix + 1] = temp[1]; + + ix += inc_x2; + iy += inc_y2; + i++; } - return(0); - -} + } + return (0); +} diff --git a/kernel/zarch/cscal.c b/kernel/zarch/cscal.c index 0c15c5add..a2d5bf223 100644 --- a/kernel/zarch/cscal.c +++ b/kernel/zarch/cscal.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013 - 2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,430 +27,400 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
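The rewritten crot kernel above handles 32 complex single-precision elements (256 bytes) per iteration in four 64-byte blocks. Because BLAS csrot takes real c and s, the rotation acts identically on real and imaginary parts, which is what lets the kernel treat the interleaved array as a flat float vector: vfmsb forms x*c and x*s, then vfmasb and vfmssb fold in y*s and y*c, for two multiplies and two FMAs per register pair. A minimal scalar sketch of the same arithmetic (crot_ref is an illustrative name, not part of the patch):

#include <stddef.h>

/* Scalar reference for the rotation the vector loop computes:
 * x' = c*x + s*y and y' = c*y - s*x, applied to every float of the
 * interleaved (re, im) array, since c and s are real scalars. */
static void crot_ref(size_t n, float *x, float *y, float c, float s) {
  size_t i;
  for (i = 0; i < 2 * n; i++) {
    float t = c * x[i] + s * y[i];
    y[i] = c * y[i] - s * x[i];
    x[i] = t;
  }
}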
#include "common.h" -static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) -{ - __asm__ volatile( - "vlrepf %%v0,0(%1) \n\t" - "vlef %%v1,4(%1),0 \n\t" - "vlef %%v1,4(%1),2 \n\t" - "vflcsb %%v1,%%v1 \n\t" - "vlef %%v1,4(%1),1 \n\t" - "vlef %%v1,4(%1),3 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "verllg %%v24,%%v16,32 \n\t" - "verllg %%v25,%%v17,32 \n\t" - "verllg %%v26,%%v18,32 \n\t" - "verllg %%v27,%%v19,32 \n\t" - "verllg %%v28,%%v20,32 \n\t" - "verllg %%v29,%%v21,32 \n\t" - "verllg %%v30,%%v22,32 \n\t" - "verllg %%v31,%%v23,32 \n\t" - - "vfmsb %%v16,%%v16,%%v0 \n\t" - "vfmsb %%v17,%%v17,%%v0 \n\t" - "vfmsb %%v18,%%v18,%%v0 \n\t" - "vfmsb %%v19,%%v19,%%v0 \n\t" - "vfmsb %%v20,%%v20,%%v0 \n\t" - "vfmsb %%v21,%%v21,%%v0 \n\t" - "vfmsb %%v22,%%v22,%%v0 \n\t" - "vfmsb %%v23,%%v23,%%v0 \n\t" - "vfmasb %%v16,%%v24,%%v1,%%v16 \n\t" - "vfmasb %%v17,%%v25,%%v1,%%v17 \n\t" - "vfmasb %%v18,%%v26,%%v1,%%v18 \n\t" - "vfmasb %%v19,%%v27,%%v1,%%v19 \n\t" - "vfmasb %%v20,%%v28,%%v1,%%v20 \n\t" - "vfmasb %%v21,%%v29,%%v1,%%v21 \n\t" - "vfmasb %%v22,%%v30,%%v1,%%v22 \n\t" - "vfmasb %%v23,%%v31,%%v1,%%v23 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); -} - -static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) -{ - __asm__ volatile( - "vlef %%v0,4(%1),0 \n\t" - "vlef %%v0,4(%1),2 \n\t" - "vflcsb %%v0,%%v0 \n\t" - "vlef %%v0,4(%1),1 \n\t" - "vlef %%v0,4(%1),3 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "verllg %%v16,%%v16,32 \n\t" - "verllg %%v17,%%v17,32 \n\t" - "verllg %%v18,%%v18,32 \n\t" - "verllg %%v19,%%v19,32 \n\t" - "verllg %%v20,%%v20,32 \n\t" - "verllg %%v21,%%v21,32 \n\t" - "verllg %%v22,%%v22,32 \n\t" - "verllg %%v23,%%v23,32 \n\t" - - "vfmsb %%v16,%%v16,%%v0 \n\t" - "vfmsb %%v17,%%v17,%%v0 \n\t" - "vfmsb %%v18,%%v18,%%v0 \n\t" - "vfmsb %%v19,%%v19,%%v0 \n\t" - "vfmsb %%v20,%%v20,%%v0 \n\t" - "vfmsb %%v21,%%v21,%%v0 \n\t" - "vfmsb %%v22,%%v22,%%v0 \n\t" - "vfmsb %%v23,%%v23,%%v0 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" - ); +static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__("vlrepf %%v0,0(%[alpha])\n\t" + 
"vlef %%v1,4(%[alpha]),0\n\t" + "vlef %%v1,4(%[alpha]),2\n\t" + "vflcsb %%v1,%%v1\n\t" + "vlef %%v1,4(%[alpha]),1\n\t" + "vlef %%v1,4(%[alpha]),3\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "verllg %%v24,%%v16,32\n\t" + "verllg %%v25,%%v17,32\n\t" + "verllg %%v26,%%v18,32\n\t" + "verllg %%v27,%%v19,32\n\t" + "verllg %%v28,%%v20,32\n\t" + "verllg %%v29,%%v21,32\n\t" + "verllg %%v30,%%v22,32\n\t" + "verllg %%v31,%%v23,32\n\t" + "vfmsb %%v16,%%v16,%%v0\n\t" + "vfmsb %%v17,%%v17,%%v0\n\t" + "vfmsb %%v18,%%v18,%%v0\n\t" + "vfmsb %%v19,%%v19,%%v0\n\t" + "vfmsb %%v20,%%v20,%%v0\n\t" + "vfmsb %%v21,%%v21,%%v0\n\t" + "vfmsb %%v22,%%v22,%%v0\n\t" + "vfmsb %%v23,%%v23,%%v0\n\t" + "vfmasb %%v16,%%v24,%%v1,%%v16\n\t" + "vfmasb %%v17,%%v25,%%v1,%%v17\n\t" + "vfmasb %%v18,%%v26,%%v1,%%v18\n\t" + "vfmasb %%v19,%%v27,%%v1,%%v19\n\t" + "vfmasb %%v20,%%v28,%%v1,%%v20\n\t" + "vfmasb %%v21,%%v29,%%v1,%%v21\n\t" + "vfmasb %%v22,%%v30,%%v1,%%v22\n\t" + "vfmasb %%v23,%%v31,%%v1,%%v23\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) -{ - __asm__ volatile( - "vlrepf %%v0,0(%1) \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vfmsb %%v16,%%v16,%%v0 \n\t" - "vfmsb %%v17,%%v17,%%v0 \n\t" - "vfmsb %%v18,%%v18,%%v0 \n\t" - "vfmsb %%v19,%%v19,%%v0 \n\t" - "vfmsb %%v20,%%v20,%%v0 \n\t" - "vfmsb %%v21,%%v21,%%v0 \n\t" - "vfmsb %%v22,%%v22,%%v0 \n\t" - "vfmsb %%v23,%%v23,%%v0 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" - ); +static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__("vlef %%v0,4(%[alpha]),0\n\t" + "vlef %%v0,4(%[alpha]),2\n\t" + "vflcsb %%v0,%%v0\n\t" + "vlef %%v0,4(%[alpha]),1\n\t" + "vlef %%v0,4(%[alpha]),3\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl 
%%v23,112(%%r1,%[x])\n\t" + "verllg %%v16,%%v16,32\n\t" + "verllg %%v17,%%v17,32\n\t" + "verllg %%v18,%%v18,32\n\t" + "verllg %%v19,%%v19,32\n\t" + "verllg %%v20,%%v20,32\n\t" + "verllg %%v21,%%v21,32\n\t" + "verllg %%v22,%%v22,32\n\t" + "verllg %%v23,%%v23,32\n\t" + "vfmsb %%v16,%%v16,%%v0\n\t" + "vfmsb %%v17,%%v17,%%v0\n\t" + "vfmsb %%v18,%%v18,%%v0\n\t" + "vfmsb %%v19,%%v19,%%v0\n\t" + "vfmsb %%v20,%%v20,%%v0\n\t" + "vfmsb %%v21,%%v21,%%v0\n\t" + "vfmsb %%v22,%%v22,%%v0\n\t" + "vfmsb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } -static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) -{ - __asm__ volatile( - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - - "vst %%v24,0(%%r1,%1) \n\t" - "vst %%v25,16(%%r1,%1) \n\t" - "vst %%v26,32(%%r1,%1) \n\t" - "vst %%v27,48(%%r1,%1) \n\t" - "vst %%v24,64(%%r1,%1) \n\t" - "vst %%v25,80(%%r1,%1) \n\t" - "vst %%v26,96(%%r1,%1) \n\t" - "vst %%v27,112(%%r1,%1) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v24","v25","v26","v27" - ); +static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__("vlrepf %%v0,0(%[alpha])\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfmsb %%v16,%%v16,%%v0\n\t" + "vfmsb %%v17,%%v17,%%v0\n\t" + "vfmsb %%v18,%%v18,%%v0\n\t" + "vfmsb %%v19,%%v19,%%v0\n\t" + "vfmsb %%v20,%%v20,%%v0\n\t" + "vfmsb %%v21,%%v21,%%v0\n\t" + "vfmsb %%v22,%%v22,%%v0\n\t" + "vfmsb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } -static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i; - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_x3 = inc_x2 + inc_x; - FLOAT t0, t1, t2, t3; - FLOAT da_r = alpha[0]; - FLOAT da_i = alpha[1]; - - for (i = 0; i < n; i += 4) - { - t0 = da_r * x[0] - da_i * x[1]; - t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; - t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; - t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; - - x[1] = da_i * x[0] + da_r * x[1]; - x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; - x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; - x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; - - x[0] = t0; - x[inc_x] = t1; - x[inc_x2] = t2; 
- x[inc_x3] = t3; - - x += 4 * inc_x; - } +static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) { + __asm__("vzero %%v0\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x) + : "cc", "r1", "v0"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i = 0, j = 0; - FLOAT temp0; - FLOAT temp1; - FLOAT alpha[2] __attribute__ ((aligned(16))); - - if (inc_x != 1) { - inc_x <<= 1; - - if (da_r == 0.0) { - - BLASLONG n1 = n & -2; - - if (da_i == 0.0) { +static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, + BLASLONG inc_x) { + BLASLONG i; + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_x3 = inc_x2 + inc_x; + FLOAT t0, t1, t2, t3; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; + + for (i = 0; i < n; i += 4) { + t0 = da_r * x[0] - da_i * x[1]; + t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; + t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; + t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; + + x[1] = da_i * x[0] + da_r * x[1]; + x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; + x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; + x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; + + x[0] = t0; + x[inc_x] = t1; + x[inc_x2] = t2; + x[inc_x3] = t3; + + x += 4 * inc_x; + } +} - while (j < n1) { +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, + FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0, j = 0; + FLOAT temp0; + FLOAT temp1; + FLOAT alpha[2] __attribute__ ((aligned(16))); - x[i] = 0.0; - x[i + 1] = 0.0; - x[i + inc_x] = 0.0; - x[i + 1 + inc_x] = 0.0; - i += 2 * inc_x; - j += 2; + if (inc_x != 1) { + inc_x <<= 1; - } + if (da_r == 0.0) { - while (j < n) { + BLASLONG n1 = n & -2; - x[i] = 0.0; - x[i + 1] = 0.0; - i += inc_x; - j++; + if (da_i == 0.0) { - } + while (j < n1) { - } else { + x[i] = 0.0; + x[i + 1] = 0.0; + x[i + inc_x] = 0.0; + x[i + 1 + inc_x] = 0.0; + i += 2 * inc_x; + j += 2; - while (j < n1) { + } - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - temp1 = -da_i * x[i + 1 + inc_x]; - x[i + 1 + inc_x] = da_i * x[i + inc_x]; - x[i + inc_x] = temp1; - i += 2 * inc_x; - j += 2; + while (j < n) { - } + x[i] = 0.0; + x[i + 1] = 0.0; + i += inc_x; + j++; - while (j < n) { + } - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - i += inc_x; - j++; + } else { - } + while (j < n1) { + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + temp1 = -da_i * x[i + 1 + inc_x]; + x[i + 1 + inc_x] = da_i * x[i + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; + } - } + while (j < n) { - } else { + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; + } - if (da_i == 0.0) { - BLASLONG n1 = n & -2; + } - while (j < n1) { + } else { - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - temp1 = da_r * x[i + inc_x]; - x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; - x[i + inc_x] = temp1; - i += 2 * inc_x; - j += 2; + if (da_i == 0.0) { + BLASLONG n1 = n & -2; - } + while 
(j < n1) { - while (j < n) { + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + temp1 = da_r * x[i + inc_x]; + x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - i += inc_x; - j++; + } - } + while (j < n) { - } else { + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += inc_x; + j++; - BLASLONG n1 = n & -8; - if (n1 > 0) { - alpha[0] = da_r; - alpha[1] = da_i; - cscal_kernel_inc_8(n1, alpha, x, inc_x); - j = n1; - i = n1 * inc_x; - } + } - while (j < n) { + } else { - temp0 = da_r * x[i] - da_i * x[i + 1]; - x[i + 1] = da_r * x[i + 1] + da_i * x[i]; - x[i] = temp0; - i += inc_x; - j++; + BLASLONG n1 = n & -8; + if (n1 > 0) { + alpha[0] = da_r; + alpha[1] = da_i; + cscal_kernel_inc_8(n1, alpha, x, inc_x); + j = n1; + i = n1 * inc_x; + } - } + while (j < n) { - } + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; } - return (0); - } + } + } - BLASLONG n1 = n & -16; - if (n1 > 0) { + return (0); + } - alpha[0] = da_r; - alpha[1] = da_i; + BLASLONG n1 = n & -16; + if (n1 > 0) { - if (da_r == 0.0) - if (da_i == 0) - cscal_kernel_16_zero(n1, x); - else - cscal_kernel_16_zero_r(n1, alpha, x); - else - if (da_i == 0) - cscal_kernel_16_zero_i(n1, alpha, x); - else - cscal_kernel_16(n1, alpha, x); + alpha[0] = da_r; + alpha[1] = da_i; - i = n1 << 1; - j = n1; - } + if (da_r == 0.0) + if (da_i == 0) + cscal_kernel_16_zero(n1, x); + else + cscal_kernel_16_zero_r(n1, alpha, x); + else if (da_i == 0) + cscal_kernel_16_zero_i(n1, alpha, x); + else + cscal_kernel_16(n1, alpha, x); + i = n1 << 1; + j = n1; + } - if (da_r == 0.0) { + if (da_r == 0.0) { - if (da_i == 0.0) { + if (da_i == 0.0) { - while (j < n) { + while (j < n) { - x[i] = 0.0; - x[i + 1] = 0.0; - i += 2; - j++; + x[i] = 0.0; + x[i + 1] = 0.0; + i += 2; + j++; - } + } - } else { + } else { - while (j < n) { + while (j < n) { - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - i += 2; - j++; + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += 2; + j++; - } + } - } + } - } else { + } else { - if (da_i == 0.0) { + if (da_i == 0.0) { - while (j < n) { + while (j < n) { - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - i += 2; - j++; + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += 2; + j++; - } + } - } else { + } else { - while (j < n) { + while (j < n) { - temp0 = da_r * x[i] - da_i * x[i + 1]; - x[i + 1] = da_r * x[i + 1] + da_i * x[i]; - x[i] = temp0; - i += 2; - j++; + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += 2; + j++; - } - - } + } } - return (0); + } + + return (0); } diff --git a/kernel/zarch/cswap.c b/kernel/zarch/cswap.c index 256995d50..92a81591f 100644 --- a/kernel/zarch/cswap.c +++ b/kernel/zarch/cswap.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,157 +27,142 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
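The general cscal kernel above rests on one register trick: v0 is filled with {ar, ar, ar, ar} via vlrepf, v1 with {-ai, ai, -ai, ai} via vlef plus vflcsb, and verllg rotates each 64-bit lane left by 32 bits to swap the (re, im) halves of every complex pair. One vfmsb plus one vfmasb per register then yields both output components at once. A scalar model of what each element undergoes (cscal_ref is illustrative, not part of the patch):

/* Per-element effect of cscal_kernel_16: with the rotated copy
 * supplying (im, re), re' = ar*re - ai*im and im' = ar*im + ai*re
 * come out of a single multiply + fused multiply-add. */
static void cscal_ref(long n, float ar, float ai, float *x) {
  long i;
  for (i = 0; i < 2 * n; i += 2) {
    float re = x[i], im = x[i + 1];
    x[i] = ar * re - ai * im;
    x[i + 1] = ar * im + ai * re;
  }
}

The three specialized variants (zero real part, zero imaginary part, both zero) drop the instructions that would multiply by the known-zero component, which is why CNAME tests da_r and da_i before dispatching to a kernel.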
#include "common.h" -static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16, 0(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - "vl %%v20, 64(%%r1,%1) \n\t" - "vl %%v21, 80(%%r1,%1) \n\t" - "vl %%v22, 96(%%r1,%1) \n\t" - "vl %%v23, 112(%%r1,%1) \n\t" - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v28, 192(%%r1,%1) \n\t" - "vl %%v29, 208(%%r1,%1) \n\t" - "vl %%v30, 224(%%r1,%1) \n\t" - "vl %%v31, 240(%%r1,%1) \n\t" - - "vl %%v0, 0(%%r1,%2) \n\t" - "vl %%v1, 16(%%r1,%2) \n\t" - "vl %%v2, 32(%%r1,%2) \n\t" - "vl %%v3, 48(%%r1,%2) \n\t" - "vl %%v4, 64(%%r1,%2) \n\t" - "vl %%v5, 80(%%r1,%2) \n\t" - "vl %%v6, 96(%%r1,%2) \n\t" - "vl %%v7, 112(%%r1,%2) \n\t" - "vst %%v0, 0(%%r1,%1) \n\t" - "vst %%v1, 16(%%r1,%1) \n\t" - "vst %%v2, 32(%%r1,%1) \n\t" - "vst %%v3, 48(%%r1,%1) \n\t" - "vst %%v4, 64(%%r1,%1) \n\t" - "vst %%v5, 80(%%r1,%1) \n\t" - "vst %%v6, 96(%%r1,%1) \n\t" - "vst %%v7, 112(%%r1,%1) \n\t" - - "vl %%v0, 128(%%r1,%2) \n\t" - "vl %%v1, 144(%%r1,%2) \n\t" - "vl %%v2, 160(%%r1,%2) \n\t" - "vl %%v3, 176(%%r1,%2) \n\t" - "vl %%v4, 192(%%r1,%2) \n\t" - "vl %%v5, 208(%%r1,%2) \n\t" - "vl %%v6, 224(%%r1,%2) \n\t" - "vl %%v7, 240(%%r1,%2) \n\t" - "vst %%v0, 128(%%r1,%1) \n\t" - "vst %%v1, 144(%%r1,%1) \n\t" - "vst %%v2, 160(%%r1,%1) \n\t" - "vst %%v3, 176(%%r1,%1) \n\t" - "vst %%v4, 192(%%r1,%1) \n\t" - "vst %%v5, 208(%%r1,%1) \n\t" - "vst %%v6, 224(%%r1,%1) \n\t" - "vst %%v7, 240(%%r1,%1) \n\t" - - "vst %%v16, 0(%%r1,%2) \n\t" - "vst %%v17, 16(%%r1,%2) \n\t" - "vst %%v18, 32(%%r1,%2) \n\t" - "vst %%v19, 48(%%r1,%2) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - "vst %%v24, 128(%%r1,%2) \n\t" - "vst %%v25, 144(%%r1,%2) \n\t" - "vst %%v26, 160(%%r1,%2) \n\t" - "vst %%v27, 176(%%r1,%2) \n\t" - "vst %%v28, 192(%%r1,%2) \n\t" - "vst %%v29, 208(%%r1,%2) \n\t" - "vst %%v30, 224(%%r1,%2) \n\t" - "vst %%v31, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst 
%%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2, inc_y2; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1 )) - { - - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - cswap_kernel_32(n1, x, y); - i=n1; - ix = 2* n1; - iy = 2* n1; - } - - while(i < n) - { - - temp[0] = x[ix] ; - temp[1] = x[ix+1] ; - x[ix] = y[iy] ; - x[ix+1] = y[iy+1] ; - y[iy] = temp[0] ; - y[iy+1] = temp[1] ; - - ix += 2 ; - iy += 2 ; - i++ ; +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, + FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp[2]; + BLASLONG inc_x2, inc_y2; + + if (n <= 0) + return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + cswap_kernel_32(n1, x, y); + i = n1; + ix = 2 * n1; + iy = 2 * n1; + } + while (i < n) { - } + temp[0] = x[ix]; + temp[1] = x[ix + 1]; + x[ix] = y[iy]; + x[ix + 1] = y[iy + 1]; + y[iy] = temp[0]; + y[iy + 1] = temp[1]; + ix += 2; + iy += 2; + i++; } - else - { - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; + } else { - while(i < n) - { + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; - temp[0] = x[ix] ; - temp[1] = x[ix+1] ; - x[ix] = y[iy] ; - x[ix+1] = y[iy+1] ; - y[iy] = temp[0] ; - y[iy+1] = temp[1] ; + while (i < n) { - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; + temp[0] = x[ix]; + temp[1] = x[ix + 1]; + x[ix] = y[iy]; + x[ix + 1] = y[iy + 1]; + y[iy] = temp[0]; + y[iy + 1] = temp[1]; - } + ix += inc_x2; + iy += inc_y2; + i++; } - return(0); - - -} + } + return (0); +} diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c index 827467189..37008f702 100644 --- a/kernel/zarch/damax.c +++ b/kernel/zarch/damax.c @@ -1,5 +1,5 @@ 
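Beyond the reformatting, the pattern repeated throughout this patch is the move from positional asm operands (%0, %1, ...) with a blanket "memory" clobber to named operands in which the arrays appear as typed "m"/"+m" operands, so the compiler sees the exact memory footprint of each kernel, and the loop counter is shifted in place via [n] "+&r"(n) instead of occupying r0. A minimal standalone sketch of the idiom under the same assumptions (s390x vector facility, n a positive multiple of 2; zero_dw is a hypothetical helper, not part of the patch):

#include <stddef.h>

/* Zero n doubles, 2 per 16-byte vector store. The "=m" array operand
 * declares the written region; only cc, r1 and v0 are clobbered. */
static void zero_dw(size_t n, double *x) {
  __asm__("vzero %%v0\n\t"
          "srlg  %[n],%[n],1\n\t"        /* n/2 vector stores */
          "xgr   %%r1,%%r1\n\t"
          "0:\n\t"
          "vst   %%v0,0(%%r1,%[x])\n\t"
          "agfi  %%r1,16\n\t"
          "brctg %[n],0b"
          : "=m"(*(double (*)[n]) x), [n] "+&r"(n)
          : [x] "a"(x)
          : "cc", "r1", "v0");
}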
/*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,139 +28,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif - -static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT amax; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v25,144(%%r1,%2) \n\t" - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v27,176(%%r1,%2) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfmaxdb %%v16,%%v16,%%v24,8 \n\t" - "vfmaxdb %%v17,%%v17,%%v25,8 \n\t" - "vfmaxdb %%v18,%%v18,%%v26,8 \n\t" - "vfmaxdb %%v19,%%v19,%%v27,8 \n\t" - "vfmaxdb %%v20,%%v20,%%v28,8 \n\t" - "vfmaxdb %%v21,%%v21,%%v29,8 \n\t" - "vfmaxdb %%v22,%%v22,%%v30,8 \n\t" - "vfmaxdb %%v23,%%v23,%%v31,8 \n\t" - - "vfmaxdb %%v16,%%v16,%%v20,8 \n\t" - "vfmaxdb %%v17,%%v17,%%v21,8 \n\t" - "vfmaxdb %%v18,%%v18,%%v22,8 \n\t" - "vfmaxdb %%v19,%%v19,%%v23,8 \n\t" - - "vfmaxdb %%v16,%%v16,%%v18,8 \n\t" - "vfmaxdb %%v17,%%v17,%%v19,8 \n\t" - - "vfmaxdb %%v16,%%v16,%%v17,8 \n\t" - - "vfmaxdb %%v0,%%v0,%%v16,8 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfmaxdb %%v0,%%v0,%%v16,8 \n\t" - "lpdr %0,%%f0 " - :"=f"(amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amax; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - if (n <= 0 || inc_x <= 0) return (maxf); +static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmaxdb %%v16,%%v16,%%v24,8\n\t" + "vfmaxdb %%v17,%%v17,%%v25,8\n\t" + "vfmaxdb %%v18,%%v18,%%v26,8\n\t" + "vfmaxdb %%v19,%%v19,%%v27,8\n\t" + "vfmaxdb %%v20,%%v20,%%v28,8\n\t" + "vfmaxdb %%v21,%%v21,%%v29,8\n\t" + "vfmaxdb %%v22,%%v22,%%v30,8\n\t" + "vfmaxdb %%v23,%%v23,%%v31,8\n\t" + "vfmaxdb %%v16,%%v16,%%v20,8\n\t" + "vfmaxdb %%v17,%%v17,%%v21,8\n\t" + "vfmaxdb %%v18,%%v18,%%v22,8\n\t" + "vfmaxdb %%v19,%%v19,%%v23,8\n\t" + "vfmaxdb %%v16,%%v16,%%v18,8\n\t" + "vfmaxdb %%v17,%%v17,%%v19,8\n\t" + "vfmaxdb %%v16,%%v16,%%v17,8\n\t" + 
"vfmaxdb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmaxdb %%v0,%%v0,%%v16,8\n\t" + "lpdr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amax; +} - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (maxf); - maxf = damax_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - maxf=ABS(x[0]); - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i++; - } - return (maxf); + maxf = damax_kernel_32(n1, x); + i = n1; } else { + maxf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i++; + } + return (maxf); - maxf=ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + maxf = ABS(x[0]); - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - maxf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (maxf); + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; } + return (maxf); + } } diff --git a/kernel/zarch/damax_z13.c b/kernel/zarch/damax_z13.c index 95b94ee4a..530d6e5bb 100644 --- a/kernel/zarch/damax_z13.c +++ b/kernel/zarch/damax_z13.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,177 +28,157 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif - -static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT amax; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "vflpdb %%v0,%%v0 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v17 \n\t" - "ldr %0,%%f0 " - :"=f"(amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amax; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - if (n <= 0 || inc_x <= 0) return (maxf); +static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vl %%v0,0(%[x])\n\t" + "vflpdb %%v0,%%v0\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + 
"vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amax; +} - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (maxf); - maxf = damax_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - maxf=ABS(x[0]); - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i++; - } - return (maxf); + maxf = damax_kernel_32(n1, x); + i = n1; } else { + maxf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i++; + } + return (maxf); - maxf=ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + maxf = ABS(x[0]); - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - maxf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i += inc_x; - 
j++; - } - return (maxf); + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; } + return (maxf); + } } diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c index 821f9eccc..a01791741 100644 --- a/kernel/zarch/damin.c +++ b/kernel/zarch/damin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,139 +28,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif - -static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT amin; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v25,144(%%r1,%2) \n\t" - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v27,176(%%r1,%2) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfmindb %%v16,%%v16,%%v24,8 \n\t" - "vfmindb %%v17,%%v17,%%v25,8 \n\t" - "vfmindb %%v18,%%v18,%%v26,8 \n\t" - "vfmindb %%v19,%%v19,%%v27,8 \n\t" - "vfmindb %%v20,%%v20,%%v28,8 \n\t" - "vfmindb %%v21,%%v21,%%v29,8 \n\t" - "vfmindb %%v22,%%v22,%%v30,8 \n\t" - "vfmindb %%v23,%%v23,%%v31,8 \n\t" - - "vfmindb %%v16,%%v16,%%v20,8 \n\t" - "vfmindb %%v17,%%v17,%%v21,8 \n\t" - "vfmindb %%v18,%%v18,%%v22,8 \n\t" - "vfmindb %%v19,%%v19,%%v23,8 \n\t" - - "vfmindb %%v16,%%v16,%%v18,8 \n\t" - "vfmindb %%v17,%%v17,%%v19,8 \n\t" - - "vfmindb %%v16,%%v16,%%v17,8 \n\t" - - "vfmindb %%v0,%%v0,%%v16,8 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfmindb %%v0,%%v0,%%v16,8 \n\t" - "lpdr %0,%%f0 " - :"=f"(amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amin; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - if (n <= 0 || inc_x <= 0) return (minf); +static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT amin; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmindb %%v16,%%v16,%%v24,8\n\t" + "vfmindb %%v17,%%v17,%%v25,8\n\t" + "vfmindb %%v18,%%v18,%%v26,8\n\t" + "vfmindb %%v19,%%v19,%%v27,8\n\t" + "vfmindb %%v20,%%v20,%%v28,8\n\t" + "vfmindb %%v21,%%v21,%%v29,8\n\t" + "vfmindb 
%%v22,%%v22,%%v30,8\n\t" + "vfmindb %%v23,%%v23,%%v31,8\n\t" + "vfmindb %%v16,%%v16,%%v20,8\n\t" + "vfmindb %%v17,%%v17,%%v21,8\n\t" + "vfmindb %%v18,%%v18,%%v22,8\n\t" + "vfmindb %%v19,%%v19,%%v23,8\n\t" + "vfmindb %%v16,%%v16,%%v18,8\n\t" + "vfmindb %%v17,%%v17,%%v19,8\n\t" + "vfmindb %%v16,%%v16,%%v17,8\n\t" + "vfmindb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmindb %%v0,%%v0,%%v16,8\n\t" + "lpdr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amin; +} - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (minf); - minf = damin_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - minf=ABS(x[0]); - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i++; - } - return (minf); + minf = damin_kernel_32(n1, x); + i = n1; } else { + minf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i++; + } + return (minf); - minf=ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + minf = ABS(x[0]); - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - minf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (minf); + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; } + return (minf); + } } diff --git a/kernel/zarch/damin_z13.c b/kernel/zarch/damin_z13.c index 538690ee5..2172b6d6f 100644 --- a/kernel/zarch/damin_z13.c +++ b/kernel/zarch/damin_z13.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,177 +28,157 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
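The *_z13.c variants (damax_z13 above, damin_z13 below) keep the older compare-and-select pattern because z13 has no vector FP max/min: each pairwise step is a vfchdb (compare high, producing an all-ones mask per lane) feeding a vsel, and the absolute values must be formed up front with vflpdb since the comparison itself is on signed values. A scalar sketch of the damin_z13 reduction (damin_ref_z13 is illustrative, not part of the patch):

#include <math.h>
#include <stddef.h>

/* z13-style min of magnitudes: abs first (vflpdb), then a
 * compare-and-select chain (vfchdb + vsel) keeping the smaller. */
static double damin_ref_z13(size_t n, const double *x) {
  double m = fabs(x[0]);
  size_t i;
  for (i = 1; i < n; i++) {
    double a = fabs(x[i]);
    m = (m > a) ? a : m;   /* mask where m > a, select a */
  }
  return m;
}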
#include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif - -static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT amin; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "vflpdb %%v0,%%v0 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v17 \n\t" - "ldr %0,%%f0 " - :"=f"(amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amin; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - if (n <= 0 || inc_x <= 0) return (minf); +static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT amin; + + __asm__("vl %%v0,0(%[x])\n\t" + "vflpdb %%v0,%%v0\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + 
"vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amin; +} - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (minf); - minf = damin_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - minf=ABS(x[0]); - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i++; - } - return (minf); + minf = damin_kernel_32(n1, x); + i = n1; } else { + minf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i++; + } + return (minf); - minf=ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + minf = ABS(x[0]); - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - minf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i += inc_x; - 
j++; - } - return (minf); + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; } + return (minf); + } } diff --git a/kernel/zarch/dasum.c b/kernel/zarch/dasum.c index fea431c34..9f69a9931 100644 --- a/kernel/zarch/dasum.c +++ b/kernel/zarch/dasum.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,145 +28,139 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT asum; - - __asm__ ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - "vl %%v20, 64(%%r1,%2) \n\t" - "vl %%v21, 80(%%r1,%2) \n\t" - "vl %%v22, 96(%%r1,%2) \n\t" - "vl %%v23, 112(%%r1,%2) \n\t" - - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfadb %%v0,%%v0,%%v16 \n\t" - "vfadb %%v1,%%v1,%%v17 \n\t" - "vfadb %%v2,%%v2,%%v18 \n\t" - "vfadb %%v3,%%v3,%%v19 \n\t" - "vfadb %%v0,%%v0,%%v20 \n\t" - "vfadb %%v1,%%v1,%%v21 \n\t" - "vfadb %%v2,%%v2,%%v22 \n\t" - "vfadb %%v3,%%v3,%%v23 \n\t" - - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - "vl %%v20, 192(%%r1,%2) \n\t" - "vl %%v21, 208(%%r1,%2) \n\t" - "vl %%v22, 224(%%r1,%2) \n\t" - "vl %%v23, 240(%%r1,%2) \n\t" - - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfadb %%v0,%%v0,%%v16 \n\t" - "vfadb %%v1,%%v1,%%v17 \n\t" - "vfadb %%v2,%%v2,%%v18 \n\t" - "vfadb %%v3,%%v3,%%v19 \n\t" - "vfadb %%v0,%%v0,%%v20 \n\t" - "vfadb %%v1,%%v1,%%v21 \n\t" - "vfadb %%v2,%%v2,%%v22 \n\t" - "vfadb %%v3,%%v3,%%v23 \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b \n\t" - "vfadb %%v0,%%v0,%%v1 \n\t" - "vfadb %%v0,%%v0,%%v2 \n\t" - "vfadb %%v0,%%v0,%%v3 \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "ldr %0,%%f0 " - :"=f"(asum) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" - ); - - return asum; +#define ABS fabs + +static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT asum; + + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpdb %%v16, 
%%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v27\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v29\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v24,%%v24,%%v31\n\t" + "vrepg %%v25,%%v24,1\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vsteg %%v24,%[asum],0" + : [asum] "=m"(asum),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return asum; } FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT sumf = 0.0; - BLASLONG n1; + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT sumf = 0.0; + BLASLONG n1; - if (n <= 0 || inc_x <= 0) return sumf; - - if (inc_x == 1) { - - n1 = n & -32; - - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return sumf; - sumf = dasum_kernel_32(n1, x); - i = n1; - } + if (inc_x == 1) { - while (i < n) { - sumf += ABS(x[i]); - i++; - } + n1 = n & -32; - } else { - BLASLONG n1 = n & -4; - register FLOAT sum1, sum2; - sum1 = 0.0; - sum2 = 0.0; - while (j < n1) { + if (n1 > 0) { - sum1 += ABS(x[i]); - sum2 += ABS(x[i + inc_x]); - sum1 += ABS(x[i + 2 * inc_x]); - sum2 += ABS(x[i + 3 * inc_x]); + sumf = dasum_kernel_32(n1, x); + i = n1; + } - i += inc_x * 4; - j += 4; + while (i < n) { + sumf += ABS(x[i]); + i++; + } - } - sumf = sum1 + sum2; - while (j < n) { + } else { + BLASLONG n1 = n & -4; + register FLOAT sum1, sum2; + sum1 = 0.0; + sum2 = 0.0; + while (j < n1) { - sumf += ABS(x[i]); - i += inc_x; - j++; - } + sum1 += ABS(x[i]); + sum2 += ABS(x[i + inc_x]); + sum1 += ABS(x[i + 2 * inc_x]); + sum2 += ABS(x[i + 3 * inc_x]); + i += inc_x * 4; + j += 4; } - return sumf; -} + sumf = sum1 + sum2; + while (j < n) { + sumf += ABS(x[i]); + i += inc_x; + j++; + } + } + return sumf; +} diff --git a/kernel/zarch/daxpy.c b/kernel/zarch/daxpy.c index e8823745e..179ef8834 100644 --- a/kernel/zarch/daxpy.c +++ b/kernel/zarch/daxpy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,158 +27,141 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile( - "vlrepg %%v0,%3 \n\t" - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,0(%%r1,%2) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,32(%%r1,%2) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - - "vfmadb %%v16,%%v0,%%v16,%%v20 \n\t" - "vfmadb %%v17,%%v0,%%v17,%%v21 \n\t" - "vfmadb %%v18,%%v0,%%v18,%%v22 \n\t" - "vfmadb %%v19,%%v0,%%v19,%%v23 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vl %%v25,80(%%r1,%1) \n\t" - "vl %%v26,96(%%r1,%1) \n\t" - "vl %%v27,112(%%r1,%1) \n\t" - "vl %%v28,64(%%r1,%2) \n\t" - "vl %%v29,80(%%r1,%2) \n\t" - "vl %%v30,96(%%r1,%2) \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - - "vfmadb %%v20,%%v0,%%v24,%%v28 \n\t" - "vfmadb %%v21,%%v0,%%v25,%%v29 \n\t" - "vfmadb %%v22,%%v0,%%v26,%%v30 \n\t" - "vfmadb %%v23,%%v0,%%v27,%%v31 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "vl %%v16,128(%%r1,%1) \n\t" - "vl %%v17,144(%%r1,%1) \n\t" - "vl %%v18,160(%%r1,%1) \n\t" - "vl %%v19,176(%%r1,%1) \n\t" - "vl %%v20,128(%%r1,%2) \n\t" - "vl %%v21,144(%%r1,%2) \n\t" - "vl %%v22,160(%%r1,%2) \n\t" - "vl %%v23,176(%%r1,%2) \n\t" - - "vfmadb %%v16,%%v0,%%v16,%%v20 \n\t" - "vfmadb %%v17,%%v0,%%v17,%%v21 \n\t" - "vfmadb %%v18,%%v0,%%v18,%%v22 \n\t" - "vfmadb %%v19,%%v0,%%v19,%%v23 \n\t" - - "vl %%v24,192(%%r1,%1) \n\t" - "vl %%v25,208(%%r1,%1) \n\t" - "vl %%v26,224(%%r1,%1) \n\t" - "vl %%v27,240(%%r1,%1) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfmadb %%v20,%%v0,%%v24,%%v28 \n\t" - "vfmadb %%v21,%%v0,%%v25,%%v29 \n\t" - "vfmadb %%v22,%%v0,%%v26,%%v30 \n\t" - "vfmadb %%v23,%%v0,%%v27,%%v31 \n\t" - - "vst %%v16,128(%%r1,%2) \n\t" - "vst %%v17,144(%%r1,%2) \n\t" - "vst %%v18,160(%%r1,%2) \n\t" - "vst %%v19,176(%%r1,%2) \n\t" - "vst %%v20,192(%%r1,%2) \n\t" - "vst %%v21,208(%%r1,%2) \n\t" - "vst %%v22,224(%%r1,%2) \n\t" - "vst %%v23,240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { + __asm__("vlrepg %%v0,%[alpha]\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,0(%%r1,%[y])\n\t" + "vl %%v21,16(%%r1,%[y])\n\t" + "vl %%v22,32(%%r1,%[y])\n\t" + "vl %%v23,48(%%r1,%[y])\n\t" + "vl %%v24,64(%%r1,%[x])\n\t" + "vl %%v25,80(%%r1,%[x])\n\t" + "vl %%v26,96(%%r1,%[x])\n\t" + "vl %%v27,112(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl 
%%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmadb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmadb %%v24,%%v0,%%v24,%%v28\n\t" + "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" + "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,0(%%r1,%[y])\n\t" + "vst %%v17,16(%%r1,%[y])\n\t" + "vst %%v18,32(%%r1,%[y])\n\t" + "vst %%v19,48(%%r1,%[y])\n\t" + "vst %%v24,64(%%r1,%[y])\n\t" + "vst %%v25,80(%%r1,%[y])\n\t" + "vst %%v26,96(%%r1,%[y])\n\t" + "vst %%v27,112(%%r1,%[y])\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,128(%%r1,%[y])\n\t" + "vl %%v21,144(%%r1,%[y])\n\t" + "vl %%v22,160(%%r1,%[y])\n\t" + "vl %%v23,176(%%r1,%[y])\n\t" + "vl %%v24,192(%%r1,%[x])\n\t" + "vl %%v25,208(%%r1,%[x])\n\t" + "vl %%v26,224(%%r1,%[x])\n\t" + "vl %%v27,240(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[y])\n\t" + "vl %%v29,208(%%r1,%[y])\n\t" + "vl %%v30,224(%%r1,%[y])\n\t" + "vl %%v31,240(%%r1,%[y])\n\t" + "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmadb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmadb %%v24,%%v0,%%v24,%%v28\n\t" + "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" + "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,128(%%r1,%[y])\n\t" + "vst %%v17,144(%%r1,%[y])\n\t" + "vst %%v18,160(%%r1,%[y])\n\t" + "vst %%v19,176(%%r1,%[y])\n\t" + "vst %%v24,192(%%r1,%[y])\n\t" + "vst %%v25,208(%%r1,%[y])\n\t" + "vst %%v26,224(%%r1,%[y])\n\t" + "vst %%v27,240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), + [alpha] "m"(*alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if ( n <= 0 ) return 0 ; + if (n <= 0) + return 0; - if ( (inc_x == 1) && (inc_y == 1) ) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -32; + BLASLONG n1 = n & -32; - if ( n1 ) - daxpy_kernel_32(n1, x, y , &da); + if (n1) + daxpy_kernel_32(n1, x, y, &da); - i = n1; - while(i < n) - { - - y[i] += da * x[i] ; - i++ ; - - } - return 0 ; + i = n1; + while (i < n) { + y[i] += da * x[i]; + i++; } + return 0; - BLASLONG n1 = n & -4; + } - while(i < n1) - { + BLASLONG n1 = n & -4; - FLOAT m1 = da * x[ix] ; - FLOAT m2 = da * x[ix+inc_x] ; - FLOAT m3 = da * x[ix+2*inc_x] ; - FLOAT m4 = da * x[ix+3*inc_x] ; + while (i < n1) { - y[iy] += m1 ; - y[iy+inc_y] += m2 ; - y[iy+2*inc_y] += m3 ; - y[iy+3*inc_y] += m4 ; + FLOAT m1 = da * x[ix]; + FLOAT m2 = da * x[ix + inc_x]; + FLOAT m3 = da * x[ix + 2 * inc_x]; + FLOAT m4 = da * x[ix + 3 * inc_x]; - ix += inc_x*4 ; - iy += inc_y*4 ; - i+=4 ; + y[iy] += m1; + y[iy + inc_y] += m2; + y[iy + 2 * inc_y] += m3; + y[iy + 3 * inc_y] += m4; - } + ix += inc_x * 4; + iy += inc_y * 4; + i += 4; - while(i < n) - { + } - y[iy] += da * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; + while (i < n) { - } - return 
0 ; - -} + y[iy] += da * x[ix]; + ix += inc_x; + iy += inc_y; + i++; + } + return 0; +} diff --git a/kernel/zarch/dcopy.c b/kernel/zarch/dcopy.c index bb5325693..f7cbf54b2 100644 --- a/kernel/zarch/dcopy.c +++ b/kernel/zarch/dcopy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,59 +27,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "lgr %%r1,%1 \n\t" - "lgr %%r2,%2 \n\t" - "srlg %%r0,%0,5 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1) \n\t" - "pfd 2, 1024(%%r2) \n\t" - "mvc 0(256,%%r2),0(%%r1) \n\t" - "agfi %%r1,256 \n\t" - "agfi %%r2,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y) - :"memory","cc","r0","r1","r2" - ); +static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],5\n\t" + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x) + : "cc"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if (n <= 0) return 0; - - if ((inc_x == 1) && (inc_y == 1)) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - dcopy_kernel_32(n1, x, y); - i = n1; - } + if (n <= 0) + return 0; - while (i < n) { - y[i] = x[i]; - i++; + if ((inc_x == 1) && (inc_y == 1)) { - } + BLASLONG n1 = n & -32; + if (n1 > 0) { + dcopy_kernel_32(n1, x, y); + i = n1; + } + while (i < n) { + y[i] = x[i]; + i++; - } else { + } - while (i < n) { + } else { - y[iy] = x[ix]; - ix += inc_x; - iy += inc_y; - i++; + while (i < n) { - } + y[iy] = x[ix]; + ix += inc_x; + iy += inc_y; + i++; } - return 0; + } + return 0; } diff --git a/kernel/zarch/ddot.c b/kernel/zarch/ddot.c index ff4c347a6..f5f601717 100644 --- a/kernel/zarch/ddot.c +++ b/kernel/zarch/ddot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,123 +27,127 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
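/* The rewritten ddot below keeps eight independent vector accumulators
 * (%%v0-%%v7) instead of chaining every vfmadb through a single %%v0, so the
 * latency of one fused multiply-add no longer serializes the whole reduction.
 * A minimal scalar C sketch of the same multi-accumulator idea follows;
 * dot_sketch is illustrative only, not part of this patch, and assumes n is
 * a multiple of 4 (the callers guarantee this with masks like n & -16): */
static double dot_sketch(long n, const double *x, const double *y) {
  double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0; /* independent chains */
  long i;
  for (i = 0; i < n; i += 4) {
    s0 += x[i] * y[i];         /* these four updates have no data    */
    s1 += x[i + 1] * y[i + 1]; /* dependence on one another, so the  */
    s2 += x[i + 2] * y[i + 2]; /* core can overlap their FMAs        */
    s3 += x[i + 3] * y[i + 3];
  }
  return (s0 + s1) + (s2 + s3); /* reduce the partial sums once, at the end */
}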
#include "common.h" -static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) -{ - FLOAT dot; - - __asm__ volatile ( - "vzero %%v0 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%3) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,16(%%r1,%3) \n\t" - "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vfmadb %%v0,%%v18,%%v26,%%v0 \n\t" - "vl %%v27,48(%%r1,%3) \n\t" - "vfmadb %%v0,%%v19,%%v27,%%v0 \n\t" - "vl %%v28,64(%%r1,%3) \n\t" - "vfmadb %%v0,%%v20,%%v28,%%v0 \n\t" - "vl %%v29,80(%%r1,%3) \n\t" - "vfmadb %%v0,%%v21,%%v29,%%v0 \n\t" - "vl %%v30,96(%%r1,%3) \n\t" - "vfmadb %%v0,%%v22,%%v30,%%v0 \n\t" - "vl %%v31,112(%%r1,%3) \n\t" - "vfmadb %%v0,%%v23,%%v31,%%v0 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "ldr %0,%%f0 " - :"=f"(dot) - :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return dot; +static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { + FLOAT dot; + + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "pfd 1,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" + "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v3\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v5\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v0,%%v0,%%v7\n\t" + "vrepg %%v1,%%v0,1\n\t" + "adbr %%f0,%%f1\n\t" + "ldr %[dot],%%f0" + : [dot] "=f"(dot),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y), + [y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return dot; } -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - FLOAT dot = 0.0 ; + FLOAT dot = 0.0; - if ( n <= 0 ) return(dot); + if (n <= 0) + return (dot); - if ( (inc_x == 1) && (inc_y == 1) 
) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -16; + BLASLONG n1 = n & -16; - if ( n1 ) - dot = ddot_kernel_16(n1, x, y); + if (n1) + dot = ddot_kernel_16(n1, x, y); - i = n1; - while(i < n) - { - - dot += y[i] * x[i] ; - i++ ; - - } - return(dot); + i = n1; + while (i < n) { + dot += y[i] * x[i]; + i++; } + return (dot); - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; + } - BLASLONG n1 = n & -4; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; - while(i < n1) - { + BLASLONG n1 = n & -4; - FLOAT m1 = y[iy] * x[ix] ; - FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ; + while (i < n1) { - FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ; - FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ; + FLOAT m1 = y[iy] * x[ix]; + FLOAT m2 = y[iy + inc_y] * x[ix + inc_x]; - ix += inc_x*4 ; - iy += inc_y*4 ; + FLOAT m3 = y[iy + 2 * inc_y] * x[ix + 2 * inc_x]; + FLOAT m4 = y[iy + 3 * inc_y] * x[ix + 3 * inc_x]; - temp1 += m1+m3; - temp2 += m2+m4; + ix += inc_x * 4; + iy += inc_y * 4; - i+=4 ; + temp1 += m1 + m3; + temp2 += m2 + m4; - } + i += 4; - while(i < n) - { + } - temp1 += y[iy] * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; + while (i < n) { - } - dot = temp1 + temp2; - return(dot); - -} + temp1 += y[iy] * x[ix]; + ix += inc_x; + iy += inc_y; + i++; + } + dot = temp1 + temp2; + return (dot); +} diff --git a/kernel/zarch/dgemv_n_4.c b/kernel/zarch/dgemv_n_4.c index ca4fd6170..c93ff9b54 100644 --- a/kernel/zarch/dgemv_n_4.c +++ b/kernel/zarch/dgemv_n_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -29,663 +29,579 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
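/* The dgemv_n kernels below are reworked along the same lines: each pass now
 * loads a block of y into several vector registers, applies all column
 * updates while those registers stay live, and stores the block once,
 * instead of reloading and re-storing y around every 16-byte slice as the
 * old code did. A scalar C sketch of one four-column block follows;
 * gemv_n_4col_sketch is illustrative only, not part of this patch. ap[0..3]
 * point at four consecutive columns of A and x holds the matching x entries: */
static void gemv_n_4col_sketch(long n, const double *ap[4], const double *x,
                               double alpha, double *y) {
  /* pre-scale the four x entries by alpha once, as the kernel does with vfmdb */
  double x0 = alpha * x[0], x1 = alpha * x[1];
  double x2 = alpha * x[2], x3 = alpha * x[3];
  long i;
  for (i = 0; i < n; i++) /* one read-modify-write of y[i] per element */
    y[i] += ap[0][i] * x0 + ap[1][i] * x1 + ap[2][i] * x2 + ap[3][i] * x3;
}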
#define NBMAX 2048 -static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile ( - "vlrepg %%v0,0(%5) \n\t" - "vlrepg %%v1,8(%5) \n\t" - "vlrepg %%v2,16(%5) \n\t" - "vlrepg %%v3,24(%5) \n\t" - "vlrepg %%v4,%7 \n\t" - "vfmdb %%v0,%%v0,%%v4 \n\t" - "vfmdb %%v1,%%v1,%%v4 \n\t" - "vfmdb %%v2,%%v2,%%v4 \n\t" - "vfmdb %%v3,%%v3,%%v4 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 2,1024(%%r1,%6) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - "vl %%v18,0(%%r1,%3) \n\t" - "vl %%v19,0(%%r1,%4) \n\t" - "vl %%v20,16(%%r1,%1) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,16(%%r1,%3) \n\t" - "vl %%v23,16(%%r1,%4) \n\t" - "vl %%v24,32(%%r1,%1) \n\t" - "vl %%v25,32(%%r1,%2) \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vl %%v27,32(%%r1,%4) \n\t" - "vl %%v28,48(%%r1,%1) \n\t" - "vl %%v29,48(%%r1,%2) \n\t" - "vl %%v30,48(%%r1,%3) \n\t" - "vl %%v31,48(%%r1,%4) \n\t" - - "vl %%v4,0(%%r1,%6) \n\t" - "vfmadb %%v4,%%v16,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" - "vst %%v4,0(%%r1,%6) \n\t" - - "vl %%v4,16(%%r1,%6) \n\t" - "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" - "vst %%v4,16(%%r1,%6) \n\t" - - "vl %%v4,32(%%r1,%6) \n\t" - "vfmadb %%v4,%%v24,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v25,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v26,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v27,%%v3,%%v4 \n\t" - "vst %%v4,32(%%r1,%6) \n\t" - - "vl %%v4,48(%%r1,%6) \n\t" - "vfmadb %%v4,%%v28,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v29,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v30,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v31,%%v3,%%v4 \n\t" - "vst %%v4,48(%%r1,%6) \n\t" - - "vl %%v16,64(%%r1,%1) \n\t" - "vl %%v17,64(%%r1,%2) \n\t" - "vl %%v18,64(%%r1,%3) \n\t" - "vl %%v19,64(%%r1,%4) \n\t" - "vl %%v20,80(%%r1,%1) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,80(%%r1,%3) \n\t" - "vl %%v23,80(%%r1,%4) \n\t" - "vl %%v24,96(%%r1,%1) \n\t" - "vl %%v25,96(%%r1,%2) \n\t" - "vl %%v26,96(%%r1,%3) \n\t" - "vl %%v27,96(%%r1,%4) \n\t" - "vl %%v28,112(%%r1,%1) \n\t" - "vl %%v29,112(%%r1,%2) \n\t" - "vl %%v30,112(%%r1,%3) \n\t" - "vl %%v31,112(%%r1,%4) \n\t" - - "vl %%v4,64(%%r1,%6) \n\t" - "vfmadb %%v4,%%v16,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" - "vst %%v4,64(%%r1,%6) \n\t" - - "vl %%v4,80(%%r1,%6) \n\t" - "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" - "vst %%v4,80(%%r1,%6) \n\t" - - "vl %%v4,96(%%r1,%6) \n\t" - "vfmadb %%v4,%%v24,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v25,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v26,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v27,%%v3,%%v4 \n\t" - "vst %%v4,96(%%r1,%6) \n\t" - - "vl %%v4,112(%%r1,%6) \n\t" - "vfmadb %%v4,%%v28,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v29,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v30,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v31,%%v3,%%v4 \n\t" - "vst %%v4,112(%%r1,%6) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) 
\n\t" - "vl %%v18,0(%%r1,%3) \n\t" - "vl %%v19,0(%%r1,%4) \n\t" - "vl %%v20,16(%%r1,%1) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,16(%%r1,%3) \n\t" - "vl %%v23,16(%%r1,%4) \n\t" - - "vl %%v4,0(%%r1,%6) \n\t" - "vfmadb %%v4,%%v16,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v17,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v18,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v19,%%v3,%%v4 \n\t" - "vst %%v4,0(%%r1,%6) \n\t" - - "vl %%v4,16(%%r1,%6) \n\t" - "vfmadb %%v4,%%v20,%%v0,%%v4 \n\t" - "vfmadb %%v4,%%v21,%%v1,%%v4 \n\t" - "vfmadb %%v4,%%v22,%%v2,%%v4 \n\t" - "vfmadb %%v4,%%v23,%%v3,%%v4 \n\t" - "vst %%v4,16(%%r1,%6) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vlrepg %%v0,0(%[x])\n\t" + "vlrepg %%v1,8(%[x])\n\t" + "vlrepg %%v2,16(%[x])\n\t" + "vlrepg %%v3,24(%[x])\n\t" + "vlrepg %%v4,%[alpha]\n\t" + "vfmdb %%v0,%%v0,%%v4\n\t" + "vfmdb %%v1,%%v1,%%v4\n\t" + "vfmdb %%v2,%%v2,%%v4\n\t" + "vfmdb %%v3,%%v3,%%v4\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vl %%v6,32(%%r1,%[y])\n\t" + "vl %%v7,48(%%r1,%[y])\n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmadb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmadb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmadb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmadb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "vst %%v6,32(%%r1,%[y])\n\t" + "vst %%v7,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[ap0])\n\t" + "vl %%v17,64(%%r1,%[ap1])\n\t" + "vl %%v18,64(%%r1,%[ap2])\n\t" + "vl %%v19,64(%%r1,%[ap3])\n\t" + "vl %%v20,80(%%r1,%[ap0])\n\t" + "vl %%v21,80(%%r1,%[ap1])\n\t" + "vl %%v22,80(%%r1,%[ap2])\n\t" + "vl %%v23,80(%%r1,%[ap3])\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vl 
%%v31,112(%%r1,%[ap3])\n\t" + "vl %%v4,64(%%r1,%[y])\n\t" + "vl %%v5,80(%%r1,%[y])\n\t" + "vl %%v6,96(%%r1,%[y])\n\t" + "vl %%v7,112(%%r1,%[y])\n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmadb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmadb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmadb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmadb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,64(%%r1,%[y])\n\t" + "vst %%v5,80(%%r1,%[y])\n\t" + "vst %%v6,96(%%r1,%[y])\n\t" + "vst %%v7,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), + "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), + "m"(*(const FLOAT (*)[4]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile ( - "vlrepg %%v0,0(%3) \n\t" - "vlrepg %%v1,8(%3) \n\t" - "vlrepg %%v2,%5 \n\t" - "vfmdb %%v0,%%v0,%%v2 \n\t" - "vfmdb %%v1,%%v1,%%v2 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%4) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - "vl %%v18,16(%%r1,%1) \n\t" - "vl %%v19,16(%%r1,%2) \n\t" - "vl %%v20,32(%%r1,%1) \n\t" - "vl %%v21,32(%%r1,%2) \n\t" - "vl %%v22,48(%%r1,%1) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - "vl %%v24,64(%%r1,%1) \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vl %%v26,80(%%r1,%1) \n\t" - "vl %%v27,80(%%r1,%2) \n\t" - "vl %%v28,96(%%r1,%1) \n\t" - "vl %%v29,96(%%r1,%2) \n\t" - "vl %%v30,112(%%r1,%1) \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - - "vl %%v2,0(%%r1,%4) \n\t" - "vfmadb %%v2,%%v16,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v17,%%v1,%%v2 \n\t" - "vst %%v2,0(%%r1,%4) \n\t" - - "vl %%v2,16(%%r1,%4) \n\t" - "vfmadb %%v2,%%v18,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v19,%%v1,%%v2 \n\t" - "vst %%v2,16(%%r1,%4) \n\t" - - "vl %%v2,32(%%r1,%4) \n\t" - "vfmadb 
%%v2,%%v20,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v21,%%v1,%%v2 \n\t" - "vst %%v2,32(%%r1,%4) \n\t" - - "vl %%v2,48(%%r1,%4) \n\t" - "vfmadb %%v2,%%v22,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v23,%%v1,%%v2 \n\t" - "vst %%v2,48(%%r1,%4) \n\t" - - "vl %%v2,64(%%r1,%4) \n\t" - "vfmadb %%v2,%%v24,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v25,%%v1,%%v2 \n\t" - "vst %%v2,64(%%r1,%4) \n\t" - - "vl %%v2,80(%%r1,%4) \n\t" - "vfmadb %%v2,%%v26,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v27,%%v1,%%v2 \n\t" - "vst %%v2,80(%%r1,%4) \n\t" - - "vl %%v2,96(%%r1,%4) \n\t" - "vfmadb %%v2,%%v28,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v29,%%v1,%%v2 \n\t" - "vst %%v2,96(%%r1,%4) \n\t" - - "vl %%v2,112(%%r1,%4) \n\t" - "vfmadb %%v2,%%v30,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v31,%%v1,%%v2 \n\t" - "vst %%v2,112(%%r1,%4) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - "vl %%v18,16(%%r1,%1) \n\t" - "vl %%v19,16(%%r1,%2) \n\t" - - "vl %%v2,0(%%r1,%4) \n\t" - "vfmadb %%v2,%%v16,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v17,%%v1,%%v2 \n\t" - "vst %%v2,0(%%r1,%4) \n\t" - - "vl %%v2,16(%%r1,%4) \n\t" - "vfmadb %%v2,%%v18,%%v0,%%v2 \n\t" - "vfmadb %%v2,%%v19,%%v1,%%v2 \n\t" - "vst %%v2,16(%%r1,%4) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vlrepg %%v0,0(%[x])\n\t" + "vlrepg %%v1,8(%[x])\n\t" + "vlrepg %%v2,%[alpha]\n\t" + "vfmdb %%v0,%%v0,%%v2\n\t" + "vfmdb %%v1,%%v1,%%v2\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v20,32(%%r1,%[ap0])\n\t" + "vl %%v21,32(%%r1,%[ap1])\n\t" + "vl %%v22,48(%%r1,%[ap0])\n\t" + "vl %%v23,48(%%r1,%[ap1])\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vl %%v4,32(%%r1,%[y])\n\t" + "vl %%v5,48(%%r1,%[y])\n\t" + "vl %%v6,64(%%r1,%[y])\n\t" + "vl %%v7,80(%%r1,%[y])\n\t" + "vl %%v8,96(%%r1,%[y])\n\t" + "vl %%v9,112(%%r1,%[y])\n\t" + "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" + "vfmadb %%v4,%%v20,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v0,%%v5\n\t" + "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmadb %%v7,%%v26,%%v0,%%v7\n\t" + "vfmadb %%v8,%%v28,%%v0,%%v8\n\t" + "vfmadb %%v9,%%v30,%%v0,%%v9\n\t" + "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" + "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" + "vfmadb %%v4,%%v21,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v1,%%v5\n\t" + "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmadb %%v7,%%v27,%%v1,%%v7\n\t" + "vfmadb %%v8,%%v29,%%v1,%%v8\n\t" + "vfmadb %%v9,%%v31,%%v1,%%v9\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst 
%%v3,16(%%r1,%[y])\n\t" + "vst %%v4,32(%%r1,%[y])\n\t" + "vst %%v5,48(%%r1,%[y])\n\t" + "vst %%v6,64(%%r1,%[y])\n\t" + "vst %%v7,80(%%r1,%[y])\n\t" + "vst %%v8,96(%%r1,%[y])\n\t" + "vst %%v9,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" + "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" + "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst %%v3,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[2]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile ( - "vlrepg %%v0,0(%2) \n\t" - "vlrepg %%v1,%4 \n\t" - "vfmdb %%v0,%%v0,%%v1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,64(%%r1,%1) \n\t" - "vl %%v21,80(%%r1,%1) \n\t" - "vl %%v22,96(%%r1,%1) \n\t" - "vl %%v23,112(%%r1,%1) \n\t" - - "vl %%v1,0(%%r1,%3) \n\t" - "vfmadb %%v1,%%v16,%%v0,%%v1 \n\t" - "vst %%v1,0(%%r1,%3) \n\t" - - "vl %%v1,16(%%r1,%3) \n\t" - "vfmadb %%v1,%%v17,%%v0,%%v1 \n\t" - "vst %%v1,16(%%r1,%3) \n\t" - - "vl %%v1,32(%%r1,%3) \n\t" - "vfmadb %%v1,%%v18,%%v0,%%v1 \n\t" - "vst %%v1,32(%%r1,%3) \n\t" - - "vl %%v1,48(%%r1,%3) \n\t" - "vfmadb %%v1,%%v19,%%v0,%%v1 \n\t" - "vst %%v1,48(%%r1,%3) \n\t" - - "vl %%v1,64(%%r1,%3) \n\t" - "vfmadb %%v1,%%v20,%%v0,%%v1 \n\t" - "vst %%v1,64(%%r1,%3) \n\t" - - "vl %%v1,80(%%r1,%3) \n\t" - "vfmadb %%v1,%%v21,%%v0,%%v1 \n\t" - "vst %%v1,80(%%r1,%3) \n\t" - - "vl %%v1,96(%%r1,%3) \n\t" - "vfmadb %%v1,%%v22,%%v0,%%v1 \n\t" - "vst %%v1,96(%%r1,%3) \n\t" - - "vl %%v1,112(%%r1,%3) \n\t" - "vfmadb %%v1,%%v23,%%v0,%%v1 \n\t" - "vst %%v1,112(%%r1,%3) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - - "vl %%v1,0(%%r1,%3) \n\t" - "vfmadb %%v1,%%v16,%%v0,%%v1 \n\t" - "vst %%v1,0(%%r1,%3) \n\t" - - "vl %%v1,16(%%r1,%3) \n\t" - "vfmadb %%v1,%%v17,%%v0,%%v1 \n\t" - "vst %%v1,16(%%r1,%3) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vlrepg 
%%v0,0(%[x])\n\t" + "vlrepg %%v16,%[alpha]\n\t" + "vfmdb %%v0,%%v0,%%v16\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,32(%%r1,%[a0])\n\t" + "vl %%v19,48(%%r1,%[a0])\n\t" + "vl %%v20,64(%%r1,%[a0])\n\t" + "vl %%v21,80(%%r1,%[a0])\n\t" + "vl %%v22,96(%%r1,%[a0])\n\t" + "vl %%v23,112(%%r1,%[a0])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" + "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" + "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" + "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" + "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" + "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" + "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v24,0(%%r1,%[y])\n\t" + "vst %%v25,16(%%r1,%[y])\n\t" + "vst %%v26,32(%%r1,%[y])\n\t" + "vst %%v27,48(%%r1,%[y])\n\t" + "vst %%v28,64(%%r1,%[y])\n\t" + "vst %%v29,80(%%r1,%[y])\n\t" + "vst %%v30,96(%%r1,%[y])\n\t" + "vst %%v31,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,0(%%r1,%[y])\n\t" + "vl %%v19,16(%%r1,%[y])\n\t" + "vfmadb %%v18,%%v16,%%v0,%%v18\n\t" + "vfmadb %%v19,%%v17,%%v0,%%v19\n\t" + "vst %%v18,0(%%r1,%[y])\n\t" + "vst %%v19,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), + "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "m"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - BLASLONG i; - for (i = 0; i < n; i++) - { - *dest += src[i]; - dest += inc_dest; - } +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest += src[i]; + dest += inc_dest; + } } -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) -{ - BLASLONG i; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - FLOAT *ap[4]; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - BLASLONG lda4 = lda << 2; - FLOAT xbuffer[8],*ybuffer; - - if ( m < 1 ) return(0); - if ( n < 1 ) return(0); - - ybuffer = buffer; - - n1 = n >> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - y_ptr = y; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( inc_y != 1 ) - memset(ybuffer,0,NB*8); - else - ybuffer = y_ptr; - - if ( inc_x == 1 ) - { - - - for( i = 0; i < n1 ; i++) - { - dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr 
+= 4; - } - - if ( n2 & 2 ) - { - dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); - a_ptr += lda*2; - x_ptr += 2; - } - - - if ( n2 & 1 ) - { - dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - /* a_ptr += lda; - x_ptr += 1; */ - - } - - - } - else - { - - for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for( i = 0; i < n2 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); - a_ptr += lda; - - } - - } - - a += NB; - if ( inc_y != 1 ) - { - add_y(NB,ybuffer,y_ptr,inc_y); - y_ptr += NB * inc_y; - } - else - y_ptr += NB ; - +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) { + BLASLONG i; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[4]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4 = lda << 2; + FLOAT xbuffer[8], *ybuffer; + + if (m < 1) + return (0); + if (n < 1) + return (0); + + ybuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m & -4; + m2 = (m & (NBMAX - 1)) - m3; + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; } - if ( m3 == 0 ) return(0); - - if ( m3 == 3 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - if ( lda == 3 && inc_x ==1 ) - { - - for( i = 0; i < ( n & -4 ); i+=4 ) - { - - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - - temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; - temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; - temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; - - a_ptr += 12; - x_ptr += 4; - } - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += 3; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - return(0); + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if (inc_y != 1) + memset(ybuffer, 0, NB * 8); + else + ybuffer = y_ptr; + + if (inc_x == 1) { + + for (i = 0; i < n1; i++) { + dgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if (n2 & 2) { + dgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha); + a_ptr += lda * 2; + x_ptr += 2; + } + + if (n2 & 1) { + dgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha); + /* a_ptr += lda; + x_ptr += 1; */ + + } + + } else { + + for (i = 0; i < n1; i++) { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, &alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] 
+= lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for (i = 0; i < n2; i++) { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha); + a_ptr += lda; + + } + } + a += NB; + if (inc_y != 1) { + add_y(NB, ybuffer, y_ptr, inc_y); + y_ptr += NB * inc_y; + } else + y_ptr += NB; + + } + + if (m3 == 0) + return (0); + + if (m3 == 3) { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if (lda == 3 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for (; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr++; + } + + } else { + + for (i = 0; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + } - if ( m3 == 2 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - if ( lda == 2 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4) ; i+=4 ) - { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - return(0); } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return (0); + } + + if (m3 == 2) { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if (lda == 2 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + for (; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr++; + } + + } else { + + for (i = 0; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + } - if ( m3 == 1 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if ( lda == 1 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4); i+=4 ) - { - temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; - - } - - for( ; i < n; i++ ) - { - temp += a_ptr[i] * x_ptr[i]; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp += a_ptr[0] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } - - } - y_ptr[0] += alpha * temp; - return(0); } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return (0); + } + + if (m3 == 1) { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if (lda == 1 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) 
{ + temp += + a_ptr[i] * x_ptr[i] + a_ptr[i + 1] * x_ptr[i + 1] + a_ptr[i + + 2] * + x_ptr[i + 2] + a_ptr[i + 3] * x_ptr[i + 3]; + + } + + for (; i < n; i++) { + temp += a_ptr[i] * x_ptr[i]; + } + + } else { + + for (i = 0; i < n; i++) { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + } + y_ptr[0] += alpha * temp; + return (0); + } - return(0); + return (0); } - - diff --git a/kernel/zarch/dgemv_t_4.c b/kernel/zarch/dgemv_t_4.c index 2d8fa0d10..24680cf1b 100644 --- a/kernel/zarch/dgemv_t_4.c +++ b/kernel/zarch/dgemv_t_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -29,795 +29,724 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 2048 -static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 1,1024(%%r1,%5) \n\t" - - "vl %%v16,0(%%r1,%5) \n\t" - "vl %%v17,16(%%r1,%5) \n\t" - "vl %%v18,32(%%r1,%5) \n\t" - "vl %%v19,48(%%r1,%5) \n\t" - "vl %%v20,64(%%r1,%5) \n\t" - "vl %%v21,80(%%r1,%5) \n\t" - "vl %%v22,96(%%r1,%5) \n\t" - "vl %%v23,112(%%r1,%5) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" - "vl %%v26,0(%%r1,%3) \n\t" - "vfmadb %%v2,%%v16,%%v26,%%v2 \n\t" - "vl %%v27,0(%%r1,%4) \n\t" - "vfmadb %%v3,%%v16,%%v27,%%v3 \n\t" - - "vl %%v28,16(%%r1,%1) \n\t" - "vfmadb %%v0,%%v17,%%v28,%%v0 \n\t" - "vl %%v29,16(%%r1,%2) \n\t" - "vfmadb %%v1,%%v17,%%v29,%%v1 \n\t" - "vl %%v30,16(%%r1,%3) \n\t" - "vfmadb %%v2,%%v17,%%v30,%%v2 \n\t" - "vl %%v31,16(%%r1,%4) \n\t" - "vfmadb %%v3,%%v17,%%v31,%%v3 \n\t" - - "vl %%v24,32(%%r1,%1) \n\t" - "vfmadb %%v0,%%v18,%%v24,%%v0 \n\t" - "vl %%v25,32(%%r1,%2) \n\t" - "vfmadb %%v1,%%v18,%%v25,%%v1 \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vfmadb %%v2,%%v18,%%v26,%%v2 \n\t" - "vl %%v27,32(%%r1,%4) \n\t" - "vfmadb %%v3,%%v18,%%v27,%%v3 \n\t" - - "vl %%v28,48(%%r1,%1) \n\t" - "vfmadb %%v0,%%v19,%%v28,%%v0 \n\t" - "vl %%v29,48(%%r1,%2) \n\t" - "vfmadb %%v1,%%v19,%%v29,%%v1 \n\t" - "vl %%v30,48(%%r1,%3) \n\t" - "vfmadb %%v2,%%v19,%%v30,%%v2 \n\t" - "vl %%v31,48(%%r1,%4) \n\t" - "vfmadb %%v3,%%v19,%%v31,%%v3 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vfmadb %%v0,%%v20,%%v24,%%v0 \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vfmadb %%v1,%%v20,%%v25,%%v1 \n\t" - "vl %%v26,64(%%r1,%3) \n\t" - "vfmadb %%v2,%%v20,%%v26,%%v2 \n\t" - "vl %%v27,64(%%r1,%4) \n\t" - "vfmadb %%v3,%%v20,%%v27,%%v3 \n\t" - - "vl %%v28,80(%%r1,%1) \n\t" - "vfmadb %%v0,%%v21,%%v28,%%v0 \n\t" - "vl %%v29,80(%%r1,%2) \n\t" - "vfmadb %%v1,%%v21,%%v29,%%v1 \n\t" - "vl %%v30,80(%%r1,%3) \n\t" - "vfmadb %%v2,%%v21,%%v30,%%v2 \n\t" - "vl %%v31,80(%%r1,%4) \n\t" - "vfmadb %%v3,%%v21,%%v31,%%v3 \n\t" - - "vl %%v24,96(%%r1,%1) \n\t" - "vfmadb %%v0,%%v22,%%v24,%%v0 \n\t" - "vl %%v25,96(%%r1,%2) \n\t" - "vfmadb %%v1,%%v22,%%v25,%%v1 \n\t" - "vl %%v26,96(%%r1,%3) \n\t" - "vfmadb %%v2,%%v22,%%v26,%%v2 
\n\t" - "vl %%v27,96(%%r1,%4) \n\t" - "vfmadb %%v3,%%v22,%%v27,%%v3 \n\t" - - "vl %%v28,112(%%r1,%1) \n\t" - "vfmadb %%v0,%%v23,%%v28,%%v0 \n\t" - "vl %%v29,112(%%r1,%2) \n\t" - "vfmadb %%v1,%%v23,%%v29,%%v1 \n\t" - "vl %%v30,112(%%r1,%3) \n\t" - "vfmadb %%v2,%%v23,%%v30,%%v2 \n\t" - "vl %%v31,112(%%r1,%4) \n\t" - "vfmadb %%v3,%%v23,%%v31,%%v3 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%5) \n\t" - "vl %%v17,16(%%r1,%5) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" - "vl %%v26,0(%%r1,%3) \n\t" - "vfmadb %%v2,%%v16,%%v26,%%v2 \n\t" - "vl %%v27,0(%%r1,%4) \n\t" - "vfmadb %%v3,%%v16,%%v27,%%v3 \n\t" - - "vl %%v28,16(%%r1,%1) \n\t" - "vfmadb %%v0,%%v17,%%v28,%%v0 \n\t" - "vl %%v29,16(%%r1,%2) \n\t" - "vfmadb %%v1,%%v17,%%v29,%%v1 \n\t" - "vl %%v30,16(%%r1,%3) \n\t" - "vfmadb %%v2,%%v17,%%v30,%%v2 \n\t" - "vl %%v31,16(%%r1,%4) \n\t" - "vfmadb %%v3,%%v17,%%v31,%%v3 \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "vrepg %%v4,%%v0,1 \n\t" - "adbr %%f0,%%f4 \n\t" - "std %%f0,0(%6) \n\t" - "vrepg %%v4,%%v1,1 \n\t" - "adbr %%f1,%%f4 \n\t" - "std %%f1,8(%6) \n\t" - "vrepg %%v4,%%v2,1 \n\t" - "adbr %%f2,%%f4 \n\t" - "std %%f2,16(%6) \n\t" - "vrepg %%v4,%%v3,1 \n\t" - "adbr %%f3,%%f4 \n\t" - "std %%f3,24(%6) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v18,%%v24,%%v0\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v18,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v18,%%v27,%%v3\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v19,%%v28,%%v4\n\t" + "vl 
%%v29,48(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v19,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,64(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v20,%%v26,%%v2\n\t" + "vl %%v27,64(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v20,%%v27,%%v3\n\t" + "vl %%v28,80(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v21,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,80(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v21,%%v30,%%v6\n\t" + "vl %%v31,80(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v21,%%v31,%%v7\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v22,%%v24,%%v0\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v22,%%v25,%%v1\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v22,%%v26,%%v2\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v22,%%v27,%%v3\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v23,%%v28,%%v4\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v23,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v1,%%v1,%%v5\n\t" + "vfadb %%v2,%%v2,%%v6\n\t" + "vfadb %%v3,%%v3,%%v7\n\t" + "vrepg %%v4,%%v0,1\n\t" + "adbr %%f0,%%f4\n\t" + "std %%f0,0(%[y])\n\t" + "vrepg %%v4,%%v1,1\n\t" + "adbr %%f1,%%f4\n\t" + "std %%f1,8(%[y])\n\t" + "vrepg %%v4,%%v2,1\n\t" + "adbr %%f2,%%f4\n\t" + "std %%f2,16(%[y])\n\t" + "vrepg %%v4,%%v3,1\n\t" + "adbr %%f3,%%f4\n\t" + "std %%f3,24(%[y])" + : "=m"(*(FLOAT (*)[4]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), + "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl 
%%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" - - "vl %%v26,16(%%r1,%1) \n\t" - "vfmadb %%v0,%%v17,%%v26,%%v0 \n\t" - "vl %%v27,16(%%r1,%2) \n\t" - "vfmadb %%v1,%%v17,%%v27,%%v1 \n\t" - - "vl %%v28,32(%%r1,%1) \n\t" - "vfmadb %%v0,%%v18,%%v28,%%v0 \n\t" - "vl %%v29,32(%%r1,%2) \n\t" - "vfmadb %%v1,%%v18,%%v29,%%v1 \n\t" - - "vl %%v30,48(%%r1,%1) \n\t" - "vfmadb %%v0,%%v19,%%v30,%%v0 \n\t" - "vl %%v31,48(%%r1,%2) \n\t" - "vfmadb %%v1,%%v19,%%v31,%%v1 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vfmadb %%v0,%%v20,%%v24,%%v0 \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vfmadb %%v1,%%v20,%%v25,%%v1 \n\t" - - "vl %%v26,80(%%r1,%1) \n\t" - "vfmadb %%v0,%%v21,%%v26,%%v0 \n\t" - "vl %%v27,80(%%r1,%2) \n\t" - "vfmadb %%v1,%%v21,%%v27,%%v1 \n\t" - - "vl %%v28,96(%%r1,%1) \n\t" - "vfmadb %%v0,%%v22,%%v28,%%v0 \n\t" - "vl %%v29,96(%%r1,%2) \n\t" - "vfmadb %%v1,%%v22,%%v29,%%v1 \n\t" - - "vl %%v30,112(%%r1,%1) \n\t" - "vfmadb %%v0,%%v23,%%v30,%%v0 \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - "vfmadb %%v1,%%v23,%%v31,%%v1 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1 \n\t" - - "vl %%v26,16(%%r1,%1) \n\t" - "vfmadb %%v0,%%v17,%%v26,%%v0 \n\t" - "vl %%v27,16(%%r1,%2) \n\t" - "vfmadb %%v1,%%v17,%%v27,%%v1 \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "vrepg %%v2,%%v0,1 \n\t" - "adbr %%f0,%%f2 \n\t" - "std %%f0,0(%4) \n\t" - "vrepg %%v2,%%v1,1 \n\t" - "adbr %%f1,%%f2 \n\t" - "std %%f1,8(%4) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" + "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" + "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" + "vl %%v28,32(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v18,%%v28,%%v4\n\t" + "vl %%v29,32(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v18,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap0])\n\t" + "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap1])\n\t" + "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + 
"vfmadb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vfmadb %%v2,%%v21,%%v26,%%v2\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vfmadb %%v3,%%v21,%%v27,%%v3\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v22,%%v28,%%v4\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v22,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" + "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" + "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v1,%%v1,%%v3\n\t" + "vfadb %%v1,%%v1,%%v5\n\t" + "vfadb %%v1,%%v1,%%v7\n\t" + "vrepg %%v2,%%v0,1\n\t" + "adbr %%f0,%%f2\n\t" + "std %%f0,0(%[y])\n\t" + "vrepg %%v2,%%v1,1\n\t" + "adbr %%f1,%%f2\n\t" + "std %%f1,8(%[y])" + : "=m"(*(FLOAT (*)[2]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - - "vl %%v25,16(%%r1,%1) \n\t" - "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" - - "vl %%v26,32(%%r1,%1) \n\t" - "vfmadb %%v0,%%v18,%%v26,%%v0 \n\t" - - "vl %%v27,48(%%r1,%1) \n\t" - "vfmadb %%v0,%%v19,%%v27,%%v0 \n\t" - - "vl %%v28,64(%%r1,%1) \n\t" - "vfmadb %%v0,%%v20,%%v28,%%v0 \n\t" - - "vl %%v29,80(%%r1,%1) \n\t" - "vfmadb %%v0,%%v21,%%v29,%%v0 \n\t" - - "vl %%v30,96(%%r1,%1) \n\t" - "vfmadb %%v0,%%v22,%%v30,%%v0 \n\t" - - "vl %%v31,112(%%r1,%1) \n\t" - "vfmadb %%v0,%%v23,%%v31,%%v0 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - - "vl %%v25,16(%%r1,%1) \n\t" - "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "std %%f0,0(%3) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y) - 
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[a0])\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,48(%%r1,%[a0])\n\t" + "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" + "vl %%v28,64(%%r1,%[a0])\n\t" + "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[a0])\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,96(%%r1,%[a0])\n\t" + "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[a0])\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v3\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v5\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v0,%%v0,%%v7\n\t" + "vrepg %%v1,%%v0,1\n\t" + "adbr %%f0,%%f1\n\t" + "std %%f0,0(%[y])" + : "=m"(*(FLOAT (*)[1]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) -{ - BLASLONG i; - for (i = 0; i < n; i++) - { - dest[i] = *src; - src += inc_src; - } +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + dest[i] = *src; + src += inc_src; + } } - -static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) -{ - __asm__ volatile ( - "vlrepg %%v0,%1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-16 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,4 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24, 0(%%r1,%3) \n\t" - "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" - "vst %%v24, 0(%%r1,%3) \n\t" - "vl %%v25, 16(%%r1,%3) \n\t" - "vfmadb %%v25,%%v17,%%v0,%%v25 \n\t" - "vst %%v25, 16(%%r1,%3) \n\t" - "vl %%v26, 32(%%r1,%3) \n\t" - "vfmadb 
%%v26,%%v18,%%v0,%%v26 \n\t" - "vst %%v26, 32(%%r1,%3) \n\t" - "vl %%v27, 48(%%r1,%3) \n\t" - "vfmadb %%v27,%%v19,%%v0,%%v27 \n\t" - "vst %%v27, 48(%%r1,%3) \n\t" - "vl %%v28, 64(%%r1,%3) \n\t" - "vfmadb %%v28,%%v20,%%v0,%%v28 \n\t" - "vst %%v28, 64(%%r1,%3) \n\t" - "vl %%v29, 80(%%r1,%3) \n\t" - "vfmadb %%v29,%%v21,%%v0,%%v29 \n\t" - "vst %%v29, 80(%%r1,%3) \n\t" - "vl %%v30, 96(%%r1,%3) \n\t" - "vfmadb %%v30,%%v22,%%v0,%%v30 \n\t" - "vst %%v30, 96(%%r1,%3) \n\t" - "vl %%v31, 112(%%r1,%3) \n\t" - "vfmadb %%v31,%%v23,%%v0,%%v31 \n\t" - "vst %%v31, 112(%%r1,%3) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,12 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - - "vl %%v24, 0(%%r1,%3) \n\t" - "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" - "vst %%v24, 0(%%r1,%3) \n\t" - "vl %%v25, 16(%%r1,%3) \n\t" - "vfmadb %%v25,%%v17,%%v0,%%v25 \n\t" - "vst %%v25, 16(%%r1,%3) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + +static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { + __asm__("vlrepg %%v0,%[da]\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,64(%%r1,%[src])\n\t" + "vl %%v21,80(%%r1,%[src])\n\t" + "vl %%v22,96(%%r1,%[src])\n\t" + "vl %%v23,112(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" + "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" + "vl %%v26, 32(%%r1,%[dest])\n\t" + "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" + "vst %%v26, 32(%%r1,%[dest])\n\t" + "vl %%v27, 48(%%r1,%[dest])\n\t" + "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" + "vst %%v27, 48(%%r1,%[dest])\n\t" + "vl %%v28, 64(%%r1,%[dest])\n\t" + "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" + "vst %%v28, 64(%%r1,%[dest])\n\t" + "vl %%v29, 80(%%r1,%[dest])\n\t" + "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" + "vst %%v29, 80(%%r1,%[dest])\n\t" + "vl %%v30, 96(%%r1,%[dest])\n\t" + "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" + "vst %%v30, 96(%%r1,%[dest])\n\t" + "vl %%v31, 112(%%r1,%[dest])\n\t" + "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v31, 112(%%r1,%[dest])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" + "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) dest) + : [dest] "a"(dest),[da] "m"(da), "m"(*(const FLOAT (*)[n]) src), + [src] "a"(src),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void 
add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - if (inc_dest == 1) - add_y_kernel_4(n, da, src, dest); - else - { - BLASLONG i; - for (i = 0; i < n; i++) - { - *dest += src[i] * da; - dest += inc_dest; - } +static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, + BLASLONG inc_dest) { + if (inc_dest == 1) + add_y_kernel_4(n, da, src, dest); + else { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest += src[i] * da; + dest += inc_dest; } + } } -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) -{ - BLASLONG register i; - BLASLONG register j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - BLASLONG n0; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - FLOAT ybuffer[2] __attribute__ ((aligned(16))); - FLOAT *xbuffer; - FLOAT *ytemp; - - if ( m < 1 ) return(0); - if ( n < 1 ) return(0); - - xbuffer = buffer; - ytemp = buffer + (m < NBMAX ? m : NBMAX); - - n0 = n / NBMAX; - n1 = (n % NBMAX) >> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if ( inc_x == 1 ) - xbuffer = x_ptr; - else - copy_x(NB,x_ptr,xbuffer,inc_x); - - - FLOAT *ap[4]; - FLOAT *yp; - BLASLONG register lda4 = 4 * lda; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( n0 > 0 ) - { - BLASLONG nb1 = NBMAX / 4; - for( j=0; j> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m & -4; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; + } - yp = ytemp; + y_ptr = y; + a_ptr = a; + x_ptr = x; - for( i = 0; i < n1 ; i++) - { - dgemv_kernel_4x4(NB,ap,xbuffer,yp); - ap[0] += lda4 ; - ap[1] += lda4 ; - ap[2] += lda4 ; - ap[3] += lda4 ; - yp += 4; - } - if ( n1 > 0 ) - { - add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); - y_ptr += n1 * inc_y * 4; - a_ptr += n1 * lda4 ; - } + if (inc_x == 1) + xbuffer = x_ptr; + else + copy_x(NB, x_ptr, xbuffer, inc_x); - if ( n2 & 2 ) - { + FLOAT *ap[4]; + FLOAT *yp; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; - dgemv_kernel_4x2(NB,ap,xbuffer,ybuffer); - a_ptr += lda * 2; - *y_ptr += ybuffer[0] * alpha; - y_ptr += inc_y; - *y_ptr += ybuffer[1] * alpha; - y_ptr += inc_y; + if (n0 > 0) { + BLASLONG nb1 = NBMAX / 4; + for (j = 0; j < n0; j++) { + yp = ytemp; + for (i = 0; i < nb1; i++) { + dgemv_kernel_4x4(NB, ap, xbuffer, yp); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + yp += 4; } + add_y(nb1 * 4, alpha, ytemp, y_ptr, inc_y); + y_ptr += nb1 * inc_y * 4; + a_ptr += nb1 * lda4; - if ( n2 & 1 ) - { - - dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); - // a_ptr += lda; - *y_ptr += ybuffer[0] * alpha; - // y_ptr += inc_y; + } - } - a += NB; - x += NB * inc_x; } - if ( m3 == 0 ) return(0); + yp = ytemp; - x_ptr = x; - a_ptr = a; - if ( m3 == 3 ) - { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if ( lda == 3 && inc_y == 1 ) - { - - for ( j=0; j< ( n & -4) ; j+=4 ) - { - - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; - 
y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; - y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; - aj += 12; - } - - for ( ; j 0) { + add_y(n1 * 4, alpha, ytemp, y_ptr, inc_y); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4; + } - if ( inc_y == 1 ) - { + if (n2 & 2) { - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; + dgemv_kernel_4x2(NB, ap, xbuffer, ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; - for ( j=0; j< ( n & -4 ); j+=4 ) - { + } - y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2; - y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 + *(aj+lda+2) * xtemp2; - y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 + *(aj+lda2+2) * xtemp2; - y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 + *(aj+lda3+2) * xtemp2; - aj += lda4; - } + if (n2 & 1) { - for ( ; j< n ; j++ ) - { + dgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer); + // a_ptr += lda; + *y_ptr += ybuffer[0] * alpha; + // y_ptr += inc_y; - y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2 ; - aj += lda; - } + } + a += NB; + x += NB * inc_x; + } + + if (m3 == 0) + return (0); + + x_ptr = x; + a_ptr = a; + if (m3 == 3) { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; - } - else - { + FLOAT *aj = a_ptr; + y_ptr = y; - for ( j=0; j 0) { + if (n <= 0 || inc_x <= 0) + return (maxf); - maxf = dmax_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - maxf=x[0]; - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i++; - } - return (maxf); + maxf = dmax_kernel_32(n1, x); + i = n1; } else { + maxf = x[0]; + i++; + } - maxf=x[0]; + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + maxf = x[0]; - if (x[i] > maxf) { - maxf = x[i]; - } - if (x[i + inc_x] > maxf) { - maxf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] > maxf) { - maxf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] > maxf) { - maxf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i += inc_x; - j++; - } - return (maxf); + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; } + return (maxf); + } } diff --git a/kernel/zarch/dmax_z13.c b/kernel/zarch/dmax_z13.c index 83e7b02a8..87bccbe55 100644 --- a/kernel/zarch/dmax_z13.c +++ b/kernel/zarch/dmax_z13.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,154 +27,138 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
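For reference, the rewritten dgemv_kernel_4x4 earlier in this file computes four dot products at once, one per column pointer in ap[], and now spreads each sum over two accumulator sets (v0..v3 and v4..v7) that are folded together at label 3. A plain-C sketch of the same computation, equivalent up to floating-point reassociation (the _ref name and the general-n tail are illustrative additions, not part of the patch):

#include <stddef.h>

/* Plain-C model of dgemv_kernel_4x4: y[k] = dot(ap[k], x) for k = 0..3.
   Two partial sums per column mimic the v0..v3 / v4..v7 accumulator
   split that breaks the FMA dependency chain in the vector loop. */
static void dgemv_kernel_4x4_ref(size_t n, const double *ap[4],
                                 const double *x, double y[4]) {
  double s0[4] = { 0.0 }, s1[4] = { 0.0 };
  size_t i;
  int k;
  for (i = 0; i + 1 < n; i += 2) {
    for (k = 0; k < 4; k++) {
      s0[k] += ap[k][i] * x[i];         /* accumulator set v0..v3 */
      s1[k] += ap[k][i + 1] * x[i + 1]; /* accumulator set v4..v7 */
    }
  }
  for (; i < n; i++)                    /* general-n tail, for illustration */
    for (k = 0; k < 4; k++)
      s0[k] += ap[k][i] * x[i];
  for (k = 0; k < 4; k++)
    y[k] = s0[k] + s1[k];               /* the vfadb/vrepg/adbr reduction */
}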
#include "common.h" -static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT max; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vfchdb %%v26,%%v20,%%v21 \n\t" - "vfchdb %%v27,%%v22,%%v23 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v24,%%v25 \n\t" - "vfchdb %%v29,%%v26,%%v27 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v28,%%v29 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v30,%%v0 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v17 \n\t" - "ldr %0,%%f0 " - :"=f"(max) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return max; +static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT max; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb 
%%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[max],%%f0" + : [max] "=f"(max),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return max; } - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - - if (n <= 0 || inc_x <= 0) return (maxf); - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (maxf); - maxf = dmax_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - maxf=x[0]; - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i++; - } - return (maxf); + maxf = dmax_kernel_32(n1, x); + i = n1; } else { + maxf = x[0]; + i++; + } - maxf=x[0]; + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + maxf = x[0]; - if (x[i] > maxf) { - maxf = x[i]; - } - if (x[i + inc_x] > maxf) { - maxf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] > maxf) { - maxf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] > maxf) { - maxf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i += inc_x; - j++; - } - return (maxf); + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; } + return (maxf); + } } diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c index 073289186..518cc262c 100644 --- a/kernel/zarch/dmin.c +++ b/kernel/zarch/dmin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,133 +27,121 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT min; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v25,144(%%r1,%2) \n\t" - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v27,176(%%r1,%2) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfmindb %%v16,%%v16,%%v24,0 \n\t" - "vfmindb %%v17,%%v17,%%v25,0 \n\t" - "vfmindb %%v18,%%v18,%%v26,0 \n\t" - "vfmindb %%v19,%%v19,%%v27,0 \n\t" - "vfmindb %%v20,%%v20,%%v28,0 \n\t" - "vfmindb %%v21,%%v21,%%v29,0 \n\t" - "vfmindb %%v22,%%v22,%%v30,0 \n\t" - "vfmindb %%v23,%%v23,%%v31,0 \n\t" - - "vfmindb %%v16,%%v16,%%v20,0 \n\t" - "vfmindb %%v17,%%v17,%%v21,0 \n\t" - "vfmindb %%v18,%%v18,%%v22,0 \n\t" - "vfmindb %%v19,%%v19,%%v23,0 \n\t" - - "vfmindb %%v16,%%v16,%%v18,0 \n\t" - "vfmindb %%v17,%%v17,%%v19,0 \n\t" - - "vfmindb %%v16,%%v16,%%v17,0 \n\t" - - "vfmindb %%v0,%%v0,%%v16,0 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfmindb %%v0,%%v0,%%v16,0 \n\t" - "ldr %0,%%f0 " - :"=f"(min) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return min; +static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT min; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmindb %%v16,%%v16,%%v24,0\n\t" + "vfmindb %%v17,%%v17,%%v25,0\n\t" + "vfmindb %%v18,%%v18,%%v26,0\n\t" + "vfmindb %%v19,%%v19,%%v27,0\n\t" + "vfmindb %%v20,%%v20,%%v28,0\n\t" + "vfmindb %%v21,%%v21,%%v29,0\n\t" + "vfmindb %%v22,%%v22,%%v30,0\n\t" + "vfmindb %%v23,%%v23,%%v31,0\n\t" + "vfmindb %%v16,%%v16,%%v20,0\n\t" + "vfmindb %%v17,%%v17,%%v21,0\n\t" + "vfmindb %%v18,%%v18,%%v22,0\n\t" + "vfmindb %%v19,%%v19,%%v23,0\n\t" + "vfmindb %%v16,%%v16,%%v18,0\n\t" + "vfmindb %%v17,%%v17,%%v19,0\n\t" + "vfmindb %%v16,%%v16,%%v17,0\n\t" + "vfmindb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmindb %%v0,%%v0,%%v16,0\n\t" + "ldr %[min],%%f0" + : [min] "=f"(min),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return min; } - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - - if (n <= 0 || inc_x <= 0) return (minf); - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; 
- BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (minf); - minf = dmin_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - minf=x[0]; - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (x[i] < minf) { - minf = x[i]; - } - i++; - } - return (minf); + minf = dmin_kernel_32(n1, x); + i = n1; } else { + minf = x[0]; + i++; + } - minf=x[0]; + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + minf = x[0]; - if (x[i] < minf) { - minf = x[i]; - } - if (x[i + inc_x] < minf) { - minf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] < minf) { - minf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] < minf) { - minf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] < minf) { - minf = x[i]; - } - i += inc_x; - j++; - } - return (minf); + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; } + return (minf); + } } diff --git a/kernel/zarch/dmin_z13.c b/kernel/zarch/dmin_z13.c index e64f90ee3..91561992f 100644 --- a/kernel/zarch/dmin_z13.c +++ b/kernel/zarch/dmin_z13.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,154 +27,138 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
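dmin.c above relies on z14's vfmindb (mode 0, IEEE minNum-style semantics), while dmin_z13.c below has to synthesize the minimum from a compare (vfchdb) and a select (vsel). The two disagree when NaNs are present: compare-and-select falls through to one fixed operand whenever the comparison is false. A scalar sketch of the difference (min_sel is a stand-in for the vfchdb+vsel pair; fmin models minNum semantics):

#include <math.h>
#include <stdio.h>

/* Compare-and-select, as in the z13 fallback (vfchdb + vsel): if
   either input is a NaN the comparison is false and b is returned,
   so the result depends on operand order. */
static double min_sel(double a, double b) { return b > a ? a : b; }

int main(void) {
  double q = nan("");
  /* fmin() follows IEEE minNum: a quiet NaN operand is ignored. */
  printf("fmin(NaN,1)=%g  sel(NaN,1)=%g  sel(1,NaN)=%g\n",
         fmin(q, 1.0), min_sel(q, 1.0), min_sel(1.0, q));
  return 0;
}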
#include "common.h" -static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) -{ - FLOAT min; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "vl %%v16,128(%%r1,%2) \n\t" - "vl %%v17,144(%%r1,%2) \n\t" - "vl %%v18,160(%%r1,%2) \n\t" - "vl %%v19,176(%%r1,%2) \n\t" - "vl %%v20,192(%%r1,%2) \n\t" - "vl %%v21,208(%%r1,%2) \n\t" - "vl %%v22,224(%%r1,%2) \n\t" - "vl %%v23,240(%%r1,%2) \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vfchdb %%v26,%%v21,%%v20 \n\t" - "vfchdb %%v27,%%v23,%%v22 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - "vsel %%v26,%%v20,%%v21,%%v26 \n\t" - "vsel %%v27,%%v22,%%v23,%%v27 \n\t" - - "vfchdb %%v28,%%v25,%%v24 \n\t" - "vfchdb %%v29,%%v27,%%v26 \n\t" - "vsel %%v28,%%v24,%%v25,%%v28 \n\t" - "vsel %%v29,%%v26,%%v27,%%v29 \n\t" - - "vfchdb %%v30,%%v29,%%v28 \n\t" - "vsel %%v30,%%v28,%%v29,%%v30 \n\t" - - "vfchdb %%v31,%%v0,%%v30 \n\t" - "vsel %%v0,%%v30,%%v0,%%v31 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v17 \n\t" - "ldr %0,%%f0 " - :"=f"(min) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return min; +static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT min; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb 
%%v25,%%v19,%%v18\n\t" + "vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[min],%%f0" + : [min] "=f"(min),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return min; } - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - - if (n <= 0 || inc_x <= 0) return (minf); - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (minf); - minf = dmin_kernel_32(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - minf=x[0]; - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (x[i] < minf) { - minf = x[i]; - } - i++; - } - return (minf); + minf = dmin_kernel_32(n1, x); + i = n1; } else { + minf = x[0]; + i++; + } - minf=x[0]; + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + minf = x[0]; - if (x[i] < minf) { - minf = x[i]; - } - if (x[i + inc_x] < minf) { - minf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] < minf) { - minf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] < minf) { - minf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] < minf) { - minf = x[i]; - } - i += inc_x; - j++; - } - return (minf); + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; } + return (minf); + } } diff --git a/kernel/zarch/drot.c b/kernel/zarch/drot.c index c91f95800..8f0197f02 100644 --- a/kernel/zarch/drot.c +++ b/kernel/zarch/drot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,220 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) -{ - __asm__ ( - "vlrepg %%v0,%3 \n\t" - "vlrepg %%v1,%4 \n\t" - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%1) \n\t" - "vl %%v25, 16(%%r1,%1) \n\t" - "vl %%v26, 32(%%r1,%1) \n\t" - "vl %%v27, 48(%%r1,%1) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%1) \n\t" - "vst %%v29, 16(%%r1,%1) \n\t" - "vst %%v30, 32(%%r1,%1) \n\t" - "vst %%v31, 48(%%r1,%1) \n\t" - "vst %%v20, 0(%%r1,%2) \n\t" - "vst %%v21, 16(%%r1,%2) \n\t" - "vst %%v22, 32(%%r1,%2) \n\t" - "vst %%v23, 48(%%r1,%2) \n\t" - - "vl %%v24, 64(%%r1,%1) \n\t" - "vl %%v25, 80(%%r1,%1) \n\t" - "vl %%v26, 96(%%r1,%1) \n\t" - "vl %%v27, 112(%%r1,%1) \n\t" - "vl %%v16, 64(%%r1,%2) \n\t" - "vl %%v17, 80(%%r1,%2) \n\t" - "vl %%v18, 96(%%r1,%2) \n\t" - "vl %%v19, 112(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%1) \n\t" - "vst %%v29, 80(%%r1,%1) \n\t" - "vst %%v30, 96(%%r1,%1) \n\t" - "vst %%v31, 112(%%r1,%1) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" 
/* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%1) \n\t" - "vst %%v29, 144(%%r1,%1) \n\t" - "vst %%v30, 160(%%r1,%1) \n\t" - "vst %%v31, 176(%%r1,%1) \n\t" - "vst %%v20, 128(%%r1,%2) \n\t" - "vst %%v21, 144(%%r1,%2) \n\t" - "vst %%v22, 160(%%r1,%2) \n\t" - "vst %%v23, 176(%%r1,%2) \n\t" - - "vl %%v24, 192(%%r1,%1) \n\t" - "vl %%v25, 208(%%r1,%1) \n\t" - "vl %%v26, 224(%%r1,%1) \n\t" - "vl %%v27, 240(%%r1,%1) \n\t" - "vl %%v16, 192(%%r1,%2) \n\t" - "vl %%v17, 208(%%r1,%2) \n\t" - "vl %%v18, 224(%%r1,%2) \n\t" - "vl %%v19, 240(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%1) \n\t" - "vst %%v29, 208(%%r1,%1) \n\t" - "vst %%v30, 224(%%r1,%1) \n\t" - "vst %%v31, 240(%%r1,%1) \n\t" - "vst %%v20, 192(%%r1,%2) \n\t" - "vst %%v21, 208(%%r1,%2) \n\t" - "vst %%v22, 224(%%r1,%2) \n\t" - "vst %%v23, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { + __asm__("vlrepg %%v0,%[c]\n\t" + "vlrepg %%v1,%[s]\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl 
%%v19, 112(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [x] "a"(x),[y] 
"a"(y),[c] "m"(*c),[s] "m"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - - FLOAT temp; +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT c, FLOAT s) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if ( n <= 0 ) return(0); + FLOAT temp; - if ( (inc_x == 1) && (inc_y == 1) ) - { + if (n <= 0) + return (0); - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - FLOAT cosa,sina; - cosa=c; - sina=s; - drot_kernel_32(n1, x, y, &cosa, &sina); - i=n1; - } + if ((inc_x == 1) && (inc_y == 1)) { - while(i < n) - { - temp = c*x[i] + s*y[i] ; - y[i] = c*y[i] - s*x[i] ; - x[i] = temp ; - - i++ ; + BLASLONG n1 = n & -32; + if (n1 > 0) { + FLOAT cosa, sina; + cosa = c; + sina = s; + drot_kernel_32(n1, x, y, &cosa, &sina); + i = n1; + } - } + while (i < n) { + temp = c * x[i] + s * y[i]; + y[i] = c * y[i] - s * x[i]; + x[i] = temp; + i++; } - else - { - while(i < n) - { - temp = c*x[ix] + s*y[iy] ; - y[iy] = c*y[iy] - s*x[ix] ; - x[ix] = temp ; + } else { - ix += inc_x ; - iy += inc_y ; - i++ ; + while (i < n) { + temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; - } + ix += inc_x; + iy += inc_y; + i++; } - return(0); - -} + } + return (0); +} diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c index ccc6dd95d..c944990b5 100644 --- a/kernel/zarch/dscal.c +++ b/kernel/zarch/dscal.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,179 +27,151 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) -{ - __asm__ volatile ( - "vlrepg %%v0,%1 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%2) \n\t" - "vfmdb %%v24,%%v24,%%v0 \n\t" - "vst %%v24, 0(%%r1,%2) \n\t" - "vl %%v25, 16(%%r1,%2) \n\t" - "vfmdb %%v25,%%v25,%%v0 \n\t" - "vst %%v25, 16(%%r1,%2) \n\t" - "vl %%v26, 32(%%r1,%2) \n\t" - "vfmdb %%v26,%%v26,%%v0 \n\t" - "vst %%v26, 32(%%r1,%2) \n\t" - "vl %%v27, 48(%%r1,%2) \n\t" - "vfmdb %%v27,%%v27,%%v0 \n\t" - "vst %%v27, 48(%%r1,%2) \n\t" - "vl %%v24, 64(%%r1,%2) \n\t" - "vfmdb %%v24,%%v24,%%v0 \n\t" - "vst %%v24, 64(%%r1,%2) \n\t" - "vl %%v25, 80(%%r1,%2) \n\t" - "vfmdb %%v25,%%v25,%%v0 \n\t" - "vst %%v25, 80(%%r1,%2) \n\t" - "vl %%v26, 96(%%r1,%2) \n\t" - "vfmdb %%v26,%%v26,%%v0 \n\t" - "vst %%v26, 96(%%r1,%2) \n\t" - "vl %%v27, 112(%%r1,%2) \n\t" - "vfmdb %%v27,%%v27,%%v0 \n\t" - "vst %%v27, 112(%%r1,%2) \n\t" - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v24","v25","v26","v27" - ); +static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) { + __asm__("vlrepg %%v0,%[da]\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[x])\n\t" + "vfmdb %%v24,%%v24,%%v0\n\t" + "vst %%v24,0(%%r1,%[x])\n\t" + "vl %%v25,16(%%r1,%[x])\n\t" + "vfmdb %%v25,%%v25,%%v0\n\t" + "vst %%v25,16(%%r1,%[x])\n\t" + "vl %%v26,32(%%r1,%[x])\n\t" + "vfmdb %%v26,%%v26,%%v0\n\t" + "vst %%v26,32(%%r1,%[x])\n\t" + "vl %%v27,48(%%r1,%[x])\n\t" + "vfmdb %%v27,%%v27,%%v0\n\t" + "vst %%v27,48(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[x])\n\t" + "vfmdb %%v28,%%v28,%%v0\n\t" + "vst %%v28,64(%%r1,%[x])\n\t" + "vl %%v29,80(%%r1,%[x])\n\t" + "vfmdb %%v29,%%v29,%%v0\n\t" + "vst %%v29,80(%%r1,%[x])\n\t" + "vl %%v30,96(%%r1,%[x])\n\t" + "vfmdb %%v30,%%v30,%%v0\n\t" + "vst %%v30,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vfmdb %%v31,%%v31,%%v0\n\t" + "vst %%v31,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) + : [x] "a"(x),[da] "m"(da) + : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) -{ - __asm__ volatile( - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - - "vst %%v24,0(%%r1,%1) \n\t" - "vst %%v25,16(%%r1,%1) \n\t" - "vst %%v26,32(%%r1,%1) \n\t" - "vst %%v27,48(%%r1,%1) \n\t" - "vst %%v24,64(%%r1,%1) \n\t" - "vst %%v25,80(%%r1,%1) \n\t" - "vst %%v26,96(%%r1,%1) \n\t" - "vst %%v27,112(%%r1,%1) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n])x) - :"memory","cc","r0","r1","v24","v25","v26","v27" - ); +static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) { + __asm__("vzero %%v0\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) + : [x] "a"(x) + : "cc", "r1", "v0"); } - -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG 
inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0,j=0; - if ( n <= 0 || inc_x <=0 ) - return(0); - - - if ( inc_x == 1 ) - { - - if ( da == 0.0 ) - { - - BLASLONG n1 = n & -16; - if ( n1 > 0 ) - { - - dscal_kernel_16_zero(n1, x); - j=n1; - } - - while(j < n) - { - - x[j]=0.0; - j++; - } - - } - else - { - - BLASLONG n1 = n & -16; - if ( n1 > 0 ) - { - dscal_kernel_16(n1, da, x); - j=n1; - } - while(j < n) - { - - x[j] = da * x[j] ; - j++; - } - } +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0, j = 0; + if (n <= 0 || inc_x <= 0) + return (0); + if (inc_x == 1) { + + if (da == 0.0) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + + dscal_kernel_16_zero(n1, x); + j = n1; + } + + while (j < n) { + + x[j] = 0.0; + j++; + } + + } else { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + dscal_kernel_16(n1, da, x); + j = n1; + } + while (j < n) { + + x[j] = da * x[j]; + j++; + } } - else - { - if ( da == 0.0 ) - { + } else { - BLASLONG n1 = n & -4; + if (da == 0.0) { - while (j < n1) { + BLASLONG n1 = n & -4; - x[i]=0.0; - x[i + inc_x]=0.0; - x[i + 2 * inc_x]=0.0; - x[i + 3 * inc_x]=0.0; + while (j < n1) { - i += inc_x * 4; - j += 4; + x[i] = 0.0; + x[i + inc_x] = 0.0; + x[i + 2 * inc_x] = 0.0; + x[i + 3 * inc_x] = 0.0; - } - while(j < n) - { + i += inc_x * 4; + j += 4; - x[i]=0.0; - i += inc_x ; - j++; - } + } + while (j < n) { - } - else - { - BLASLONG n1 = n & -4; + x[i] = 0.0; + i += inc_x; + j++; + } - while (j < n1) { + } else { + BLASLONG n1 = n & -4; - x[i] = da * x[i] ; - x[i + inc_x] = da * x[i + inc_x]; - x[i + 2 * inc_x] = da * x[i + 2 * inc_x]; - x[i + 3 * inc_x] = da * x[i + 3 * inc_x]; + while (j < n1) { - i += inc_x * 4; - j += 4; + x[i] = da * x[i]; + x[i + inc_x] = da * x[i + inc_x]; + x[i + 2 * inc_x] = da * x[i + 2 * inc_x]; + x[i + 3 * inc_x] = da * x[i + 3 * inc_x]; - } + i += inc_x * 4; + j += 4; - while(j < n) - { + } - x[i] = da * x[i] ; - i += inc_x ; - j++; - } - } + while (j < n) { + x[i] = da * x[i]; + i += inc_x; + j++; + } } - return 0; - -} + } + return 0; +} diff --git a/kernel/zarch/dsdot.c b/kernel/zarch/dsdot.c index 72950c9f4..1ac02d4b9 100644 --- a/kernel/zarch/dsdot.c +++ b/kernel/zarch/dsdot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018,The OpenBLAS Project +Copyright (c) 2013-2019,The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms,with or without modification,are permitted provided that the following conditions are @@ -27,144 +27,146 @@ USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
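dsdot below widens each pair of floats to double before the fused multiply-add (vlef into lanes 0 and 2, then vflls), and the rewrite also splits the sum across eight independent accumulators before the final reduction. A scalar model with two partial sums standing in for v0..v7 (dsdot_ref is illustrative only):

#include <stddef.h>

/* Single-precision dot product accumulated in double precision, as the
   kernel does with vflls + vfmadb.  Two partial sums model the
   independent accumulators that break the FMA dependency chain. */
static double dsdot_ref(size_t n, const float *x, const float *y) {
  double s0 = 0.0, s1 = 0.0;
  size_t i;
  for (i = 0; i + 1 < n; i += 2) {
    s0 += (double) x[i] * (double) y[i];
    s1 += (double) x[i + 1] * (double) y[i + 1];
  }
  if (i < n)
    s0 += (double) x[i] * (double) y[i];
  return s0 + s1;               /* final vfadb / vrepg / adbr reduction */
}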
#include "common.h" -static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) -{ - double dot; - - __asm__ volatile ( - "vzero %%v0 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - - "vlef %%v16,0(%%r1,%2),0 \n\t" - "vlef %%v16,4(%%r1,%2),2 \n\t" - "vlef %%v17,8(%%r1,%2),0 \n\t" - "vlef %%v17,12(%%r1,%2),2 \n\t" - "vlef %%v18,16(%%r1,%2),0 \n\t" - "vlef %%v18,20(%%r1,%2),2 \n\t" - "vlef %%v19,24(%%r1,%2),0 \n\t" - "vlef %%v19,28(%%r1,%2),2 \n\t" - "vlef %%v20,32(%%r1,%2),0 \n\t" - "vlef %%v20,36(%%r1,%2),2 \n\t" - "vlef %%v21,40(%%r1,%2),0 \n\t" - "vlef %%v21,44(%%r1,%2),2 \n\t" - "vlef %%v22,48(%%r1,%2),0 \n\t" - "vlef %%v22,52(%%r1,%2),2 \n\t" - "vlef %%v23,56(%%r1,%2),0 \n\t" - "vlef %%v23,60(%%r1,%2),2 \n\t" - - "vflls %%v16,%%v16 \n\t" - "vflls %%v17,%%v17 \n\t" - "vflls %%v18,%%v18 \n\t" - "vflls %%v19,%%v19 \n\t" - "vflls %%v20,%%v20 \n\t" - "vflls %%v21,%%v21 \n\t" - "vflls %%v22,%%v22 \n\t" - "vflls %%v23,%%v23 \n\t" - - "vlef %%v24,0(%%r1,%3),0 \n\t" - "vlef %%v24,4(%%r1,%3),2 \n\t" - "vflls %%v24,%%v24 \n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0 \n\t" - "vlef %%v25,8(%%r1,%3),0 \n\t" - "vlef %%v25,12(%%r1,%3),2 \n\t" - "vflls %%v25,%%v25 \n\t" - "vfmadb %%v0,%%v17,%%v25,%%v0 \n\t" - "vlef %%v26,16(%%r1,%3),0 \n\t" - "vlef %%v26,20(%%r1,%3),2 \n\t" - "vflls %%v26,%%v26 \n\t" - "vfmadb %%v0,%%v18,%%v26,%%v0 \n\t" - "vlef %%v27,24(%%r1,%3),0 \n\t" - "vlef %%v27,28(%%r1,%3),2 \n\t" - "vflls %%v27,%%v27 \n\t" - "vfmadb %%v0,%%v19,%%v27,%%v0 \n\t" - "vlef %%v28,32(%%r1,%3),0 \n\t" - "vlef %%v28,36(%%r1,%3),2 \n\t" - "vflls %%v28,%%v28 \n\t" - "vfmadb %%v0,%%v20,%%v28,%%v0 \n\t" - "vlef %%v29,40(%%r1,%3),0 \n\t" - "vlef %%v29,44(%%r1,%3),2 \n\t" - "vflls %%v29,%%v29 \n\t" - "vfmadb %%v0,%%v21,%%v29,%%v0 \n\t" - "vlef %%v30,48(%%r1,%3),0 \n\t" - "vlef %%v30,52(%%r1,%3),2 \n\t" - "vflls %%v30,%%v30 \n\t" - "vfmadb %%v0,%%v22,%%v30,%%v0 \n\t" - "vlef %%v31,56(%%r1,%3),0 \n\t" - "vlef %%v31,60(%%r1,%3),2 \n\t" - "vflls %%v31,%%v31 \n\t" - "vfmadb %%v0,%%v23,%%v31,%%v0 \n\t" - - "agfi %%r1,64 \n\t" - "brctg %%r0,0b \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "ldr %0,%%f0 " - :"=f"(dot) - :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return dot; +static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { + double dot; + + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "pfd 1,1024(%%r1,%[y])\n\t" + "vlef %%v16,0(%%r1,%[x]),0\n\t" + "vlef %%v16,4(%%r1,%[x]),2\n\t" + "vlef %%v17,8(%%r1,%[x]),0\n\t" + "vlef %%v17,12(%%r1,%[x]),2\n\t" + "vlef %%v18,16(%%r1,%[x]),0\n\t" + "vlef %%v18,20(%%r1,%[x]),2\n\t" + "vlef %%v19,24(%%r1,%[x]),0\n\t" + "vlef %%v19,28(%%r1,%[x]),2\n\t" + "vlef %%v20,32(%%r1,%[x]),0\n\t" + "vlef %%v20,36(%%r1,%[x]),2\n\t" + "vlef %%v21,40(%%r1,%[x]),0\n\t" + "vlef %%v21,44(%%r1,%[x]),2\n\t" + "vlef %%v22,48(%%r1,%[x]),0\n\t" + "vlef %%v22,52(%%r1,%[x]),2\n\t" + "vlef %%v23,56(%%r1,%[x]),0\n\t" + "vlef %%v23,60(%%r1,%[x]),2\n\t" + "vflls %%v16,%%v16\n\t" + "vflls %%v17,%%v17\n\t" + "vflls %%v18,%%v18\n\t" + "vflls %%v19,%%v19\n\t" + "vflls %%v20,%%v20\n\t" + "vflls %%v21,%%v21\n\t" + "vflls %%v22,%%v22\n\t" + "vflls 
%%v23,%%v23\n\t" + "vlef %%v24,0(%%r1,%[y]),0\n\t" + "vlef %%v24,4(%%r1,%[y]),2\n\t" + "vflls %%v24,%%v24\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vlef %%v25,8(%%r1,%[y]),0\n\t" + "vlef %%v25,12(%%r1,%[y]),2\n\t" + "vflls %%v25,%%v25\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "vlef %%v26,16(%%r1,%[y]),0\n\t" + "vlef %%v26,20(%%r1,%[y]),2\n\t" + "vflls %%v26,%%v26\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vlef %%v27,24(%%r1,%[y]),0\n\t" + "vlef %%v27,28(%%r1,%[y]),2\n\t" + "vflls %%v27,%%v27\n\t" + "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" + "vlef %%v28,32(%%r1,%[y]),0\n\t" + "vlef %%v28,36(%%r1,%[y]),2\n\t" + "vflls %%v28,%%v28\n\t" + "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" + "vlef %%v29,40(%%r1,%[y]),0\n\t" + "vlef %%v29,44(%%r1,%[y]),2\n\t" + "vflls %%v29,%%v29\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vlef %%v30,48(%%r1,%[y]),0\n\t" + "vlef %%v30,52(%%r1,%[y]),2\n\t" + "vflls %%v30,%%v30\n\t" + "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" + "vlef %%v31,56(%%r1,%[y]),0\n\t" + "vlef %%v31,60(%%r1,%[y]),2\n\t" + "vflls %%v31,%%v31\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,64\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v3\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v5\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v0,%%v0,%%v7\n\t" + "vrepg %%v1,%%v0,1\n\t" + "adbr %%f0,%%f1\n\t" + "ldr %[dot],%%f0" + : [dot] "=f"(dot),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return dot; } -double CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; +double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - double dot = 0.0 ; + double dot = 0.0; - if ( n <= 0 ) return(dot); + if (n <= 0) + return (dot); - if ( (inc_x == 1) && (inc_y == 1) ) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -16; + BLASLONG n1 = n & -16; - if ( n1 ) - dot = dsdot_kernel_16(n1,x,y); + if (n1) + dot = dsdot_kernel_16(n1, x, y); - i = n1; - while(i < n) - { + i = n1; + while (i < n) { - dot += (double) y[i] * (double) x[i] ; - i++ ; + dot += (double) y[i] * (double) x[i]; + i++; - } - return(dot); + } + return (dot); + } - } + BLASLONG n1 = n & -2; - BLASLONG n1 = n & -2; + while (i < n1) { - while(i < n1) - { + dot += (double) y[iy] * (double) x[ix]; + dot += (double) y[iy + inc_y] * (double) x[ix + inc_x]; + ix += inc_x * 2; + iy += inc_y * 2; + i += 2; - dot += (double) y[iy] * (double) x[ix]; - dot += (double) y[iy+inc_y] * (double) x[ix+inc_x]; - ix += inc_x*2 ; - iy += inc_y*2 ; - i+=2 ; + } - } + while (i < n) { - while(i < n) - { + dot += (double) y[iy] * (double) x[ix]; + ix += inc_x; + iy += inc_y; + i++; - dot += (double) y[iy] * (double) x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(dot); + } + return (dot); } - - diff --git a/kernel/zarch/dswap.c b/kernel/zarch/dswap.c index 8070ef41a..60ba40bd6 100644 --- a/kernel/zarch/dswap.c +++ b/kernel/zarch/dswap.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. 
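The rewritten dsdot kernel above replaces the single accumulator of the old code (every vfmadb targeted %%v0) with eight independent accumulators %%v0..%%v7 that are folded only after the loop, hiding the FMA latency. A scalar sketch of that reduction, assuming a unit-stride input (helper name is illustrative):

#include <stddef.h>

/* Eight independent partial sums, folded at the end, mirroring how the
   vfadb sequence folds %%v1..%%v7 into %%v0 after the loop. */
double dsdot_sketch(size_t n, const float *x, const float *y)
{
    double acc[8] = { 0.0 };
    size_t i = 0;
    for (; i + 8 <= n; i += 8)
        for (int k = 0; k < 8; k++)
            acc[k] += (double)x[i + k] * (double)y[i + k];
    double dot = 0.0;
    for (int k = 0; k < 8; k++)
        dot += acc[k];
    for (; i < n; i++)           /* scalar tail */
        dot += (double)x[i] * (double)y[i];
    return dot;
}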
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,136 +27,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16, 0(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - "vl %%v20, 64(%%r1,%1) \n\t" - "vl %%v21, 80(%%r1,%1) \n\t" - "vl %%v22, 96(%%r1,%1) \n\t" - "vl %%v23, 112(%%r1,%1) \n\t" - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v28, 192(%%r1,%1) \n\t" - "vl %%v29, 208(%%r1,%1) \n\t" - "vl %%v30, 224(%%r1,%1) \n\t" - "vl %%v31, 240(%%r1,%1) \n\t" - - "vl %%v0, 0(%%r1,%2) \n\t" - "vl %%v1, 16(%%r1,%2) \n\t" - "vl %%v2, 32(%%r1,%2) \n\t" - "vl %%v3, 48(%%r1,%2) \n\t" - "vl %%v4, 64(%%r1,%2) \n\t" - "vl %%v5, 80(%%r1,%2) \n\t" - "vl %%v6, 96(%%r1,%2) \n\t" - "vl %%v7, 112(%%r1,%2) \n\t" - "vst %%v0, 0(%%r1,%1) \n\t" - "vst %%v1, 16(%%r1,%1) \n\t" - "vst %%v2, 32(%%r1,%1) \n\t" - "vst %%v3, 48(%%r1,%1) \n\t" - "vst %%v4, 64(%%r1,%1) \n\t" - "vst %%v5, 80(%%r1,%1) \n\t" - "vst %%v6, 96(%%r1,%1) \n\t" - "vst %%v7, 112(%%r1,%1) \n\t" - - "vl %%v0, 128(%%r1,%2) \n\t" - "vl %%v1, 144(%%r1,%2) \n\t" - "vl %%v2, 160(%%r1,%2) \n\t" - "vl %%v3, 176(%%r1,%2) \n\t" - "vl %%v4, 192(%%r1,%2) \n\t" - "vl %%v5, 208(%%r1,%2) \n\t" - "vl %%v6, 224(%%r1,%2) \n\t" - "vl %%v7, 240(%%r1,%2) \n\t" - "vst %%v0, 128(%%r1,%1) \n\t" - "vst %%v1, 144(%%r1,%1) \n\t" - "vst %%v2, 160(%%r1,%1) \n\t" - "vst %%v3, 176(%%r1,%1) \n\t" - "vst %%v4, 192(%%r1,%1) \n\t" - "vst %%v5, 208(%%r1,%1) \n\t" - "vst %%v6, 224(%%r1,%1) \n\t" - "vst %%v7, 240(%%r1,%1) \n\t" - - "vst %%v16, 0(%%r1,%2) \n\t" - "vst %%v17, 16(%%r1,%2) \n\t" - "vst %%v18, 32(%%r1,%2) \n\t" - "vst %%v19, 48(%%r1,%2) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - "vst %%v24, 128(%%r1,%2) \n\t" - "vst %%v25, 144(%%r1,%2) \n\t" - "vst %%v26, 160(%%r1,%2) \n\t" - "vst %%v27, 176(%%r1,%2) \n\t" - "vst %%v28, 192(%%r1,%2) \n\t" - "vst %%v29, 208(%%r1,%2) \n\t" - "vst %%v30, 224(%%r1,%2) \n\t" - "vst %%v31, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + 
"vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp; - - if ( n <= 0 ) return(0); +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp; - if ( (inc_x == 1) && (inc_y == 1 )) - { + if (n <= 0) + return (0); - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - dswap_kernel_32(n1, x, y); - i=n1; - } + if ((inc_x == 1) && (inc_y == 1)) { - while(i < n) - { - temp = y[i]; - y[i] = x[i] ; - x[i] = temp; - i++ ; - - } + BLASLONG n1 = n & -32; + if (n1 > 0) { + dswap_kernel_32(n1, x, y); + i = n1; + } + while (i < n) { + temp = y[i]; + y[i] = x[i]; + x[i] = temp; + i++; } - else - { - while(i < n) - { - temp = y[iy]; - y[iy] = x[ix] ; - x[ix] = temp; - ix += inc_x ; - iy += inc_y ; - i++ ; + } else { - } + while (i < n) { + temp = y[iy]; + y[iy] = x[ix]; + x[ix] = temp; + ix += inc_x; + iy += inc_y; + i++; } - return(0); - + + } + return (0); } diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 5129ca6ee..1e1040a6e 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,285 +27,276 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) -static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) -{ - BLASLONG iamax; +#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1])) - __asm__ volatile ( - "vlef %%v0,0(%3),0 \n\t" - "vlef %%v1,4(%3),0 \n\t" - "vlef %%v0,8(%3),1 \n\t" - "vlef %%v1,12(%3),1 \n\t" - "vlef %%v0,16(%3),2 \n\t" - "vlef %%v1,20(%3),2 \n\t" - "vlef %%v0,24(%3),3 \n\t" - "vlef %%v1,28(%3),3 \n\t" - "vflpsb %%v0,%%v0 \n\t" - "vflpsb %%v1,%%v1 \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,2,1 \n\t" - "vleig %%v2,1,0 \n\t" - "vleig %%v2,3,1 \n\t" - "vrepig %%v3,16 \n\t" - "vzero %%v4 \n\t" - "vleib %%v9,0,0 \n\t" - "vleib %%v9,1,1 \n\t" - "vleib %%v9,2,2 \n\t" - "vleib %%v9,3,3 \n\t" - "vleib %%v9,8,4 \n\t" - "vleib %%v9,9,5 \n\t" - "vleib %%v9,10,6 \n\t" - "vleib %%v9,11,7 \n\t" - "vleib %%v9,16,8 \n\t" - "vleib %%v9,17,9 \n\t" - "vleib %%v9,18,10 \n\t" - "vleib %%v9,19,11 \n\t" - "vleib %%v9,24,12 \n\t" - "vleib %%v9,25,13 \n\t" - "vleib %%v9,26,14 \n\t" - "vleib %%v9,27,15 \n\t" - "vleif %%v24,0,0 \n\t" - "vleif %%v24,1,1 \n\t" - "vleif %%v24,2,2 \n\t" - "vleif %%v24,3,3 \n\t" - "vleif %%v25,4,0 \n\t" - "vleif %%v25,5,1 \n\t" - "vleif %%v25,6,2 \n\t" - "vleif %%v25,7,3 \n\t" - "vleif %%v26,8,0 \n\t" - "vleif %%v26,9,1 \n\t" - "vleif %%v26,10,2 \n\t" - "vleif %%v26,11,3 \n\t" - "vleif %%v27,12,0 \n\t" - "vleif %%v27,13,1 \n\t" - "vleif %%v27,14,2 \n\t" - "vleif %%v27,15,3 \n\t" - "srlg %%r0,%2,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" +static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { + BLASLONG iamax; - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v28,16(%%r1,%3) \n\t" - "vpkg %%v17,%%v16,%%v28 \n\t" - "vperm %%v16,%%v16,%%v28,%%v9 \n\t" + __asm__("vlef %%v0,0(%[x]),0\n\t" + "vlef %%v1,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v1,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v1,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v1,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v1,%%v1\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,16\n\t" + "vzero %%v4\n\t" + "vleib %%v9,0,0\n\t" + "vleib %%v9,1,1\n\t" + "vleib %%v9,2,2\n\t" + "vleib %%v9,3,3\n\t" + "vleib %%v9,8,4\n\t" + "vleib %%v9,9,5\n\t" + "vleib %%v9,10,6\n\t" + "vleib %%v9,11,7\n\t" + "vleib %%v9,16,8\n\t" + "vleib %%v9,17,9\n\t" + "vleib %%v9,18,10\n\t" + "vleib %%v9,19,11\n\t" + "vleib %%v9,24,12\n\t" + "vleib %%v9,25,13\n\t" + "vleib %%v9,26,14\n\t" + "vleib %%v9,27,15\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v28,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm %%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v29,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v30,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm 
%%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v28,144(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm %%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v29,176(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v30,208(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm %%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v0,%%v3\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v2,%%v0\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] 
"a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v29,48(%%r1,%3) \n\t" - "vpkg %%v19,%%v18,%%v29 \n\t" - "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v30,80(%%r1,%3) \n\t" - "vpkg %%v21,%%v20,%%v30 \n\t" - "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v31,112(%%r1,%3) \n\t" - "vpkg %%v23,%%v22,%%v31 \n\t" - "vperm %%v22,%%v22,%%v31,%%v9 \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" - - "vfchesb %%v5,%%v16,%%v17 \n\t" - "vfchesb %%v6,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - - "vfchesb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v28,144(%%r1,%3) \n\t" - "vpkg %%v17,%%v16,%%v28 \n\t" - "vperm %%v16,%%v16,%%v28,%%v9 \n\t" - - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v29,176(%%r1,%3) \n\t" - "vpkg %%v19,%%v18,%%v29 \n\t" - "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v30,208(%%r1,%3) \n\t" - "vpkg %%v21,%%v20,%%v30 \n\t" - "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v31,240(%%r1,%3) \n\t" - "vpkg %%v23,%%v22,%%v31 \n\t" - "vperm %%v22,%%v22,%%v31,%%v9 \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" - - "vfchesb %%v5,%%v16,%%v17 \n\t" - "vfchesb %%v6,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + return iamax; +} - "vfchesb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; - "vfchesb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" + if (n <= 0 || inc_x <= 0) + return (max); - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" + if (inc_x == 1) { - "veslg %%v3,%%v0,32 \n\t" - "vfchsb %%v4,%%v0,%%v3 
\n\t" - "vchlg %%v5,%%v2,%%v1 \n\t" - "vfcesb %%v6,%%v0,%%v3 \n\t" - "vn %%v5,%%v5,%%v6 \n\t" - "vo %%v4,%%v4,%%v5 \n\t" - "vsel %%v0,%%v0,%%v3,%%v4 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + BLASLONG n1 = n & -32; + if (n1 > 0) { - "vrepf %%v2,%%v0,2 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcsb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vstef %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchsb %%v4,%%v2,%%v0 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "ste %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamax),"=m"(*amax) - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + max = icamax_kernel_32(n1, x, &maxf); + ix = n1 * 2; + i = n1; + } else { + maxf = CABS1(x, 0); + ix += 2; + i++; + } - return iamax; -} + while (i < n) { + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (max + 1); -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0; - BLASLONG max = 0; - BLASLONG inc_x2; + } else { - if (n <= 0 || inc_x <= 0) return(max); - - if (inc_x == 1) { + max = 0; + maxf = CABS1(x, 0); + inc_x2 = 2 * inc_x; - BLASLONG n1 = n & -32; - if (n1 > 0) { + BLASLONG n1 = n & -4; + while (i < n1) { - max = icamax_kernel_32(n1, x, &maxf); - ix = n1 * 2; - i = n1; + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) > maxf) { + max = i + 1; + maxf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + 2 * inc_x2) > maxf) { + max = i + 2; + maxf = CABS1(x, ix + 2 * inc_x2); + } + if (CABS1(x, ix + 3 * inc_x2) > maxf) { + max = i + 3; + maxf = CABS1(x, ix + 3 * inc_x2); } - else - { - maxf = CABS1(x,0); - ix += 2; - i++; - } - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (max + 1); + ix += inc_x2 * 4; - } else { - - max = 0; - maxf = CABS1(x,0); - inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; + i += 4; - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; } - return (max + 1); + + while (i < n) { + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (max + 1); + } } - - diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index 05068b212..d1c0e32a1 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,285 +27,276 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) -static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) -{ - BLASLONG iamin; +#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1])) - __asm__ volatile ( - "vlef %%v0,0(%3),0 \n\t" - "vlef %%v1,4(%3),0 \n\t" - "vlef %%v0,8(%3),1 \n\t" - "vlef %%v1,12(%3),1 \n\t" - "vlef %%v0,16(%3),2 \n\t" - "vlef %%v1,20(%3),2 \n\t" - "vlef %%v0,24(%3),3 \n\t" - "vlef %%v1,28(%3),3 \n\t" - "vflpsb %%v0,%%v0 \n\t" - "vflpsb %%v1,%%v1 \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,2,1 \n\t" - "vleig %%v2,1,0 \n\t" - "vleig %%v2,3,1 \n\t" - "vrepig %%v3,16 \n\t" - "vzero %%v4 \n\t" - "vleib %%v9,0,0 \n\t" - "vleib %%v9,1,1 \n\t" - "vleib %%v9,2,2 \n\t" - "vleib %%v9,3,3 \n\t" - "vleib %%v9,8,4 \n\t" - "vleib %%v9,9,5 \n\t" - "vleib %%v9,10,6 \n\t" - "vleib %%v9,11,7 \n\t" - "vleib %%v9,16,8 \n\t" - "vleib %%v9,17,9 \n\t" - "vleib %%v9,18,10 \n\t" - "vleib %%v9,19,11 \n\t" - "vleib %%v9,24,12 \n\t" - "vleib %%v9,25,13 \n\t" - "vleib %%v9,26,14 \n\t" - "vleib %%v9,27,15 \n\t" - "vleif %%v24,0,0 \n\t" - "vleif %%v24,1,1 \n\t" - "vleif %%v24,2,2 \n\t" - "vleif %%v24,3,3 \n\t" - "vleif %%v25,4,0 \n\t" - "vleif %%v25,5,1 \n\t" - "vleif %%v25,6,2 \n\t" - "vleif %%v25,7,3 \n\t" - "vleif %%v26,8,0 \n\t" - "vleif %%v26,9,1 \n\t" - "vleif %%v26,10,2 \n\t" - "vleif %%v26,11,3 \n\t" - "vleif %%v27,12,0 \n\t" - "vleif %%v27,13,1 \n\t" - "vleif %%v27,14,2 \n\t" - "vleif %%v27,15,3 \n\t" - "srlg %%r0,%2,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" +static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { + BLASLONG iamin; - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v28,16(%%r1,%3) \n\t" - "vpkg %%v17,%%v16,%%v28 \n\t" - "vperm %%v16,%%v16,%%v28,%%v9 \n\t" + __asm__("vlef %%v0,0(%[x]),0\n\t" + "vlef %%v1,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v1,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v1,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v1,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v1,%%v1\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,16\n\t" + "vzero %%v4\n\t" + "vleib %%v9,0,0\n\t" + "vleib %%v9,1,1\n\t" + "vleib %%v9,2,2\n\t" + "vleib %%v9,3,3\n\t" + "vleib %%v9,8,4\n\t" + "vleib %%v9,9,5\n\t" + "vleib %%v9,10,6\n\t" + "vleib %%v9,11,7\n\t" + "vleib %%v9,16,8\n\t" + "vleib %%v9,17,9\n\t" + "vleib %%v9,18,10\n\t" + "vleib %%v9,19,11\n\t" + "vleib %%v9,24,12\n\t" + "vleib %%v9,25,13\n\t" + "vleib %%v9,26,14\n\t" + "vleib %%v9,27,15\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v28,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm %%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v29,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v30,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm 
%%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v28,144(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm %%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v29,176(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v30,208(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm %%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v3,%%v0\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v0,%%v2\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] 
"a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v29,48(%%r1,%3) \n\t" - "vpkg %%v19,%%v18,%%v29 \n\t" - "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v30,80(%%r1,%3) \n\t" - "vpkg %%v21,%%v20,%%v30 \n\t" - "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v31,112(%%r1,%3) \n\t" - "vpkg %%v23,%%v22,%%v31 \n\t" - "vperm %%v22,%%v22,%%v31,%%v9 \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" - - "vfchesb %%v5,%%v17,%%v16 \n\t" - "vfchesb %%v6,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - - "vfchesb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v28,144(%%r1,%3) \n\t" - "vpkg %%v17,%%v16,%%v28 \n\t" - "vperm %%v16,%%v16,%%v28,%%v9 \n\t" - - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v29,176(%%r1,%3) \n\t" - "vpkg %%v19,%%v18,%%v29 \n\t" - "vperm %%v18,%%v18,%%v29,%%v9 \n\t" - - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v30,208(%%r1,%3) \n\t" - "vpkg %%v21,%%v20,%%v30 \n\t" - "vperm %%v20,%%v20,%%v30,%%v9 \n\t" - - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v31,240(%%r1,%3) \n\t" - "vpkg %%v23,%%v22,%%v31 \n\t" - "vperm %%v22,%%v22,%%v31,%%v9 \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - "vfasb %%v16,%%v16,%%v17 \n\t" - "vfasb %%v17,%%v18,%%v19 \n\t" - "vfasb %%v18,%%v20,%%v21 \n\t" - "vfasb %%v19,%%v22,%%v23 \n\t" - - "vfchesb %%v5,%%v17,%%v16 \n\t" - "vfchesb %%v6,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" + return iamin; +} - "vfchesb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0; + BLASLONG min = 0; + BLASLONG inc_x2; - "vfchesb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" + if (n <= 0 || inc_x <= 0) + return (min); - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" + if (inc_x == 1) { - "veslg %%v3,%%v0,32 \n\t" - "vfchsb %%v4,%%v3,%%v0 
\n\t" - "vchlg %%v5,%%v2,%%v1 \n\t" - "vfcesb %%v6,%%v0,%%v3 \n\t" - "vn %%v5,%%v5,%%v6 \n\t" - "vo %%v4,%%v4,%%v5 \n\t" - "vsel %%v0,%%v0,%%v3,%%v4 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v1,%%v2,%%v4 \n\t" + BLASLONG n1 = n & -32; + if (n1 > 0) { - "vrepf %%v2,%%v0,2 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcsb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vstef %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchsb %%v4,%%v0,%%v2 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "ste %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamin),"=m"(*amin) - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + min = icamin_kernel_32(n1, x, &minf); + ix = n1 * 2; + i = n1; + } else { + minf = CABS1(x, 0); + ix += 2; + i++; + } - return iamin; -} + while (i < n) { + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (min + 1); -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT minf = 0; - BLASLONG min = 0; - BLASLONG inc_x2; + } else { - if (n <= 0 || inc_x <= 0) return(min); - - if (inc_x == 1) { + min = 0; + minf = CABS1(x, 0); + inc_x2 = 2 * inc_x; - BLASLONG n1 = n & -32; - if (n1 > 0) { + BLASLONG n1 = n & -4; + while (i < n1) { - min = icamin_kernel_32(n1, x, &minf); - ix = n1 * 2; - i = n1; + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) < minf) { + min = i + 1; + minf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + 2 * inc_x2) < minf) { + min = i + 2; + minf = CABS1(x, ix + 2 * inc_x2); + } + if (CABS1(x, ix + 3 * inc_x2) < minf) { + min = i + 3; + minf = CABS1(x, ix + 3 * inc_x2); } - else - { - minf = CABS1(x,0); - ix += 2; - i++; - } - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (min + 1); + ix += inc_x2 * 4; - } else { - - min = 0; - minf = CABS1(x,0); - inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; + i += 4; - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; } - return (min + 1); + + while (i < n) { + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (min + 1); + } } - - diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index e5a1d3a7c..8434c811f 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,237 +28,218 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif -static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) -{ - BLASLONG iamax; - - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,1,1 \n\t" - "vrepig %%v2,16 \n\t" - "vzero %%v3 \n\t" - "vleig %%v24,0,0 \n\t" - "vleig %%v24,1,1 \n\t" - "vleig %%v25,2,0 \n\t" - "vleig %%v25,3,1 \n\t" - "vleig %%v26,4,0 \n\t" - "vleig %%v26,5,1 \n\t" - "vleig %%v27,6,0 \n\t" - "vleig %%v27,7,1 \n\t" - "vleig %%v28,8,0 \n\t" - "vleig %%v28,9,1 \n\t" - "vleig %%v29,10,0 \n\t" - "vleig %%v29,11,1 \n\t" - "vleig %%v30,12,0 \n\t" - "vleig %%v30,13,1 \n\t" - "vleig %%v31,14,0 \n\t" - "vleig %%v31,15,1 \n\t" - "srlg %%r0,%2,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchedb %%v4,%%v16,%%v17 \n\t" - "vfchedb %%v5,%%v18,%%v19 \n\t" - "vfchedb %%v6,%%v20,%%v21 \n\t" - "vfchedb %%v7,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v16,%%v17 \n\t" - "vfchedb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchedb %%v4,%%v16,%%v17 \n\t" - "vfchedb %%v5,%%v18,%%v19 \n\t" - "vfchedb %%v6,%%v20,%%v21 \n\t" - "vfchedb %%v7,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v16,%%v17 \n\t" - "vfchedb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v0,%%v16 \n\t" - "vsel 
%%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v2,%%v0,1 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcdb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vsteg %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchdb %%v4,%%v2,%%v0 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "std %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamax),"=m"(*amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return iamax; +static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { + BLASLONG iamax; + + __asm__("vl %%v0,0(%[x])\n\t" + "vflpdb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + 
"vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v2,%%v0\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return iamax; } - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; - - if (n <= 0 || inc_x <= 0) return (max); - if (inc_x == 1) { +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (max); - max = idamax_kernel_32(n1, x, &maxf); + if (inc_x == 1) { - i = n1; - } - else - { - maxf = ABS(x[0]); - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) > maxf) { - max = i; - maxf = ABS(x[i]); - } - i++; - } - return (max + 1); + max = idamax_kernel_32(n1, x, &maxf); + i = n1; } else { + maxf = ABS(x[0]); + i++; + } - max = 0; - maxf = ABS(x[0]); + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); - BLASLONG n1 = n & -4; - while (j < n1) { + } else { - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - max = j + 1; - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - max = j + 2; - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - max = j + 3; - maxf = ABS(x[i + 3 * inc_x]); - } + max = 0; + maxf = ABS(x[0]); - i += inc_x * 4; + BLASLONG n1 = n & -4; + while (j < n1) { - j += 4; + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } - } + i += inc_x * 4; + j += 4; + + } - while (j < n) { - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (max + 1); + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; } + return (max + 1); + } } diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index a68f7282f..80a37e6c2 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -1,5 +1,5 @@ 
/*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,237 +28,218 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif -static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) -{ - BLASLONG iamin; - - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,1,1 \n\t" - "vrepig %%v2,16 \n\t" - "vzero %%v3 \n\t" - "vleig %%v24,0,0 \n\t" - "vleig %%v24,1,1 \n\t" - "vleig %%v25,2,0 \n\t" - "vleig %%v25,3,1 \n\t" - "vleig %%v26,4,0 \n\t" - "vleig %%v26,5,1 \n\t" - "vleig %%v27,6,0 \n\t" - "vleig %%v27,7,1 \n\t" - "vleig %%v28,8,0 \n\t" - "vleig %%v28,9,1 \n\t" - "vleig %%v29,10,0 \n\t" - "vleig %%v29,11,1 \n\t" - "vleig %%v30,12,0 \n\t" - "vleig %%v30,13,1 \n\t" - "vleig %%v31,14,0 \n\t" - "vleig %%v31,15,1 \n\t" - "srlg %%r0,%2,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchedb %%v4,%%v17,%%v16 \n\t" - "vfchedb %%v5,%%v19,%%v18 \n\t" - "vfchedb %%v6,%%v21,%%v20 \n\t" - "vfchedb %%v7,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v17,%%v16 \n\t" - "vfchedb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfchedb %%v4,%%v17,%%v16 \n\t" - "vfchedb %%v5,%%v19,%%v18 \n\t" - "vfchedb %%v6,%%v21,%%v20 \n\t" - "vfchedb %%v7,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel 
%%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v17,%%v16 \n\t" - "vfchedb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v2,%%v0,1 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcdb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vsteg %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchdb %%v4,%%v0,%%v2 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "std %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamin),"=m"(*amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return iamin; +static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { + BLASLONG iamin; + + __asm__("vl %%v0,0(%[x])\n\t" + "vflpdb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb 
%%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v0,%%v2\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return iamin; } - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - BLASLONG min = 0; - - if (n <= 0 || inc_x <= 0) return (min); - if (inc_x == 1) { +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; - BLASLONG n1 = n & -32; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (min); - min = idamin_kernel_32(n1, x, &minf); + if (inc_x == 1) { - i = n1; - } - else - { - minf = ABS(x[0]); - i++; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) < minf) { - min = i; - minf = ABS(x[i]); - } - i++; - } - return (min + 1); + min = idamin_kernel_32(n1, x, &minf); + i = n1; } else { + minf = ABS(x[0]); + i++; + } - min = 0; - minf = ABS(x[0]); + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + return (min + 1); - BLASLONG n1 = n & -4; - while (j < n1) { + } else { - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - min = j + 1; - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - min = j + 2; - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - min = j + 3; - minf = ABS(x[i + 3 * inc_x]); - } + min = 0; + minf = ABS(x[0]); - i += inc_x * 4; + BLASLONG n1 = n & -4; + while (j < n1) { - j += 4; + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } - } + i += inc_x * 4; + j += 4; + + } - while (j < n) { 
- if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (min + 1); + while (j < n) { + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; } + return (min + 1); + } } diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 4c3040779..18cdba437 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,214 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) -{ - BLASLONG imax; - - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,1,1 \n\t" - "vrepig %%v2,16 \n\t" - "vzero %%v3 \n\t" - "vleig %%v24,0,0 \n\t" - "vleig %%v24,1,1 \n\t" - "vleig %%v25,2,0 \n\t" - "vleig %%v25,3,1 \n\t" - "vleig %%v26,4,0 \n\t" - "vleig %%v26,5,1 \n\t" - "vleig %%v27,6,0 \n\t" - "vleig %%v27,7,1 \n\t" - "vleig %%v28,8,0 \n\t" - "vleig %%v28,9,1 \n\t" - "vleig %%v29,10,0 \n\t" - "vleig %%v29,11,1 \n\t" - "vleig %%v30,12,0 \n\t" - "vleig %%v30,13,1 \n\t" - "vleig %%v31,14,0 \n\t" - "vleig %%v31,15,1 \n\t" - "srlg %%r0,%2,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vfchedb %%v4,%%v16,%%v17 \n\t" - "vfchedb %%v5,%%v18,%%v19 \n\t" - "vfchedb %%v6,%%v20,%%v21 \n\t" - "vfchedb %%v7,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v16,%%v17 \n\t" - "vfchedb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - - "vfchedb %%v4,%%v16,%%v17 \n\t" - "vfchedb %%v5,%%v18,%%v19 \n\t" - "vfchedb %%v6,%%v20,%%v21 \n\t" - "vfchedb %%v7,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v16,%%v17 \n\t" - "vfchedb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel 
%%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v2,%%v0,1 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcdb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vsteg %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchdb %%v4,%%v2,%%v0 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "std %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(imax),"=m"(*max) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return imax; +static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { + BLASLONG imax; + + __asm__("vl %%v0,0(%[x])\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb 
%%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[max],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v2,%%v0\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[max]\n\t" + "vlgvg %[imax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imax] "=r"(imax),[max] "=m"(*max),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return imax; } - + BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; + + if (n <= 0 || inc_x <= 0) + return (max); - if (n <= 0 || inc_x <= 0) return (max); + if (inc_x == 1) { - if (inc_x == 1) { + BLASLONG n1 = n & -32; + if (n1 > 0) { - BLASLONG n1 = n & -32; - if (n1 > 0) { + max = idmax_kernel_32(n1, x, &maxf); - max = idmax_kernel_32(n1, x, &maxf); + i = n1; + } else { + maxf = x[0]; + i++; + } - i = n1; - } - else - { - maxf = x[0]; - i++; - } + while (i < n) { + if (x[i] > maxf) { + max = i; + maxf = x[i]; + } + i++; + } + return (max + 1); - while (i < n) { - if (x[i] > maxf) { - max = i; - maxf = x[i]; - } - i++; - } - return (max + 1); + } else { - } else { + max = 0; + maxf = x[0]; + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + max = j + 1; + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + max = j + 2; + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + max = j + 3; + maxf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } - max = 0; - maxf = x[0]; - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (x[i] > maxf) { - max = j; - maxf = x[i]; - } - if (x[i + inc_x] > maxf) { - max = j + 1; - maxf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] > maxf) { - max = j + 2; - maxf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] > maxf) { - max = j + 3; - maxf = x[i + 3 * inc_x]; - } - - i += inc_x * 4; - - j += 4; - - } - - - while (j < n) { - if (x[i] > maxf) { - max = j; - maxf = x[i]; - } - i += inc_x; - j++; - } - return (max + 1); + while (j < n) { + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + i += inc_x; + j++; } + return (max + 1); + } } diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index ba1776a49..02ca427e4 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,214 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) -{ - BLASLONG imin; - - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,1,1 \n\t" - "vrepig %%v2,16 \n\t" - "vzero %%v3 \n\t" - "vleig %%v24,0,0 \n\t" - "vleig %%v24,1,1 \n\t" - "vleig %%v25,2,0 \n\t" - "vleig %%v25,3,1 \n\t" - "vleig %%v26,4,0 \n\t" - "vleig %%v26,5,1 \n\t" - "vleig %%v27,6,0 \n\t" - "vleig %%v27,7,1 \n\t" - "vleig %%v28,8,0 \n\t" - "vleig %%v28,9,1 \n\t" - "vleig %%v29,10,0 \n\t" - "vleig %%v29,11,1 \n\t" - "vleig %%v30,12,0 \n\t" - "vleig %%v30,13,1 \n\t" - "vleig %%v31,14,0 \n\t" - "vleig %%v31,15,1 \n\t" - "srlg %%r0,%2,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vfchedb %%v4,%%v17,%%v16 \n\t" - "vfchedb %%v5,%%v19,%%v18 \n\t" - "vfchedb %%v6,%%v21,%%v20 \n\t" - "vfchedb %%v7,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v17,%%v16 \n\t" - "vfchedb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - - "vfchedb %%v4,%%v17,%%v16 \n\t" - "vfchedb %%v5,%%v19,%%v18 \n\t" - "vfchedb %%v6,%%v21,%%v20 \n\t" - "vfchedb %%v7,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - "vsel %%v18,%%v20,%%v21,%%v6 \n\t" - "vsel %%v6,%%v28,%%v29,%%v6 \n\t" - "vsel %%v19,%%v22,%%v23,%%v7 \n\t" - "vsel %%v7,%%v30,%%v31,%%v7 \n\t" - - "vfchedb %%v20,%%v17,%%v16 \n\t" - "vfchedb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v4,%%v4,%%v5,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v5,%%v6,%%v7,%%v21 \n\t" - - "vfchedb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v2,%%v0,1 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcdb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vsteg %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" 
- "wfchdb %%v4,%%v0,%%v2 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "std %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(imin),"=m"(*min) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return imin; +static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { + BLASLONG imin; + + __asm__("vl %%v0,0(%[x])\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[min],0\n\t" + "vmnlg 
%%v0,%%v1,%%v3\n\t" + "vlgvg %[imin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v0,%%v2\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[min]\n\t" + "vlgvg %[imin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imin] "=r"(imin),[min] "=m"(*min),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return imin; } - + BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - BLASLONG min = 0; + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; + + if (n <= 0 || inc_x <= 0) + return (min); - if (n <= 0 || inc_x <= 0) return (min); + if (inc_x == 1) { - if (inc_x == 1) { + BLASLONG n1 = n & -32; + if (n1 > 0) { - BLASLONG n1 = n & -32; - if (n1 > 0) { + min = idmin_kernel_32(n1, x, &minf); - min = idmin_kernel_32(n1, x, &minf); + i = n1; + } else { + minf = x[0]; + i++; + } - i = n1; - } - else - { - minf = x[0]; - i++; - } + while (i < n) { + if (x[i] < minf) { + min = i; + minf = x[i]; + } + i++; + } + return (min + 1); - while (i < n) { - if (x[i] < minf) { - min = i; - minf = x[i]; - } - i++; - } - return (min + 1); + } else { - } else { + min = 0; + minf = x[0]; + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (x[i] < minf) { + min = j; + minf = x[i]; + } + if (x[i + inc_x] < minf) { + min = j + 1; + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + min = j + 2; + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + min = j + 3; + minf = x[i + 3 * inc_x]; + } + + i += inc_x * 4; + + j += 4; + + } - min = 0; - minf = x[0]; - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (x[i] < minf) { - min = j; - minf = x[i]; - } - if (x[i + inc_x] < minf) { - min = j + 1; - minf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] < minf) { - min = j + 2; - minf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] < minf) { - min = j + 3; - minf = x[i + 3 * inc_x]; - } - - i += inc_x * 4; - - j += 4; - - } - - - while (j < n) { - if (x[i] < minf) { - min = j; - minf = x[i]; - } - i += inc_x; - j++; - } - return (min + 1); + while (j < n) { + if (x[i] < minf) { + min = j; + minf = x[i]; + } + i += inc_x; + j++; } + return (min + 1); + } } diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index 6e0aaa162..bbb4012aa 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,282 +28,262 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
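For orientation before the isamax changes: every i?amax/i?amin kernel in this patch has to honour the plain BLAS contract, i.e. return the 1-based index of the first element whose absolute value is largest (or smallest), or 0 for invalid arguments. A minimal scalar sketch of that contract, using a hypothetical isamax_ref helper that is not part of the patch:

#include <math.h>

/* Scalar reference for ISAMAX semantics, illustration only: 1-based
   index of the first element with the largest absolute value,
   0 for invalid input. */
static long isamax_ref(long n, const float *x, long inc_x) {
    long i, best = 0;
    float maxf;
    if (n <= 0 || inc_x <= 0) return 0;
    maxf = fabsf(x[0]);
    for (i = 1; i < n; i++) {
        float v = fabsf(x[i * inc_x]);
        if (v > maxf) {       /* strict '>' keeps the first maximum */
            maxf = v;
            best = i;
        }
    }
    return best + 1;          /* BLAS indices are 1-based */
}

The vectorized kernel below computes the same result for the contiguous case; the C driver around it handles strided input and the tail elements.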
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else #define ABS fabsf -#endif -static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) -{ - BLASLONG iamax; +static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) { + BLASLONG iamax; - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vflpsb %%v0,%%v0 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,2,1 \n\t" - "vleig %%v2,1,0 \n\t" - "vleig %%v2,3,1 \n\t" - "vrepig %%v3,32 \n\t" - "vzero %%v4 \n\t" - "vleif %%v24,0,0 \n\t" - "vleif %%v24,1,1 \n\t" - "vleif %%v24,2,2 \n\t" - "vleif %%v24,3,3 \n\t" - "vleif %%v25,4,0 \n\t" - "vleif %%v25,5,1 \n\t" - "vleif %%v25,6,2 \n\t" - "vleif %%v25,7,3 \n\t" - "vleif %%v26,8,0 \n\t" - "vleif %%v26,9,1 \n\t" - "vleif %%v26,10,2 \n\t" - "vleif %%v26,11,3 \n\t" - "vleif %%v27,12,0 \n\t" - "vleif %%v27,13,1 \n\t" - "vleif %%v27,14,2 \n\t" - "vleif %%v27,15,3 \n\t" - "vleif %%v28,16,0 \n\t" - "vleif %%v28,17,1 \n\t" - "vleif %%v28,18,2 \n\t" - "vleif %%v28,19,3 \n\t" - "vleif %%v29,20,0 \n\t" - "vleif %%v29,21,1 \n\t" - "vleif %%v29,22,2 \n\t" - "vleif %%v29,23,3 \n\t" - "vleif %%v30,24,0 \n\t" - "vleif %%v30,25,1 \n\t" - "vleif %%v30,26,2 \n\t" - "vleif %%v30,27,3 \n\t" - "vleif %%v31,28,0 \n\t" - "vleif %%v31,29,1 \n\t" - "vleif %%v31,30,2 \n\t" - "vleif %%v31,31,3 \n\t" - "srlg %%r0,%2,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" + __asm__("vl %%v0,0(%[x])\n\t" + "vflpsb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel 
%%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v0,%%v3\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v2,%%v0\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfchesb 
%%v5,%%v16,%%v17 \n\t" - "vfchesb %%v6,%%v18,%%v19 \n\t" - "vfchesb %%v7,%%v20,%%v21 \n\t" - "vfchesb %%v8,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v16,%%v17 \n\t" - "vfchesb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfchesb %%v5,%%v16,%%v17 \n\t" - "vfchesb %%v6,%%v18,%%v19 \n\t" - "vfchesb %%v7,%%v20,%%v21 \n\t" - "vfchesb %%v8,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v16,%%v17 \n\t" - "vfchesb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v3,%%v0,32 \n\t" - "vfchsb %%v4,%%v0,%%v3 \n\t" - "vchlg %%v5,%%v2,%%v1 \n\t" - "vfcesb %%v6,%%v0,%%v3 \n\t" - "vn %%v5,%%v5,%%v6 \n\t" - "vo %%v4,%%v4,%%v5 \n\t" - "vsel %%v0,%%v0,%%v3,%%v4 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v1,%%v2,%%v4 \n\t" - - "vrepf %%v2,%%v0,2 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcsb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vstef %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchsb %%v4,%%v2,%%v0 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "ste %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamax),"=m"(*amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - 
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return iamax; + return iamax; } - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; - if (n <= 0 || inc_x <= 0) return (max); - - if (inc_x == 1) { +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (max); - max = isamax_kernel_64(n1, x, &maxf); + if (inc_x == 1) { - i = n1; - } - else - { - maxf = ABS(x[0]); - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) > maxf) { - max = i; - maxf = ABS(x[i]); - } - i++; - } - return (max + 1); + max = isamax_kernel_64(n1, x, &maxf); + i = n1; } else { + maxf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); - max = 0; - maxf = ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + max = 0; + maxf = ABS(x[0]); - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - max = j + 1; - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - max = j + 2; - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - max = j + 3; - maxf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (max + 1); + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; } + return (max + 1); + } } diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index 266c48f7f..e8b34b934 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,282 +28,262 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else #define ABS fabsf -#endif -static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) -{ - BLASLONG iamin; +static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) { + BLASLONG iamin; - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vflpsb %%v0,%%v0 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,2,1 \n\t" - "vleig %%v2,1,0 \n\t" - "vleig %%v2,3,1 \n\t" - "vrepig %%v3,32 \n\t" - "vzero %%v4 \n\t" - "vleif %%v24,0,0 \n\t" - "vleif %%v24,1,1 \n\t" - "vleif %%v24,2,2 \n\t" - "vleif %%v24,3,3 \n\t" - "vleif %%v25,4,0 \n\t" - "vleif %%v25,5,1 \n\t" - "vleif %%v25,6,2 \n\t" - "vleif %%v25,7,3 \n\t" - "vleif %%v26,8,0 \n\t" - "vleif %%v26,9,1 \n\t" - "vleif %%v26,10,2 \n\t" - "vleif %%v26,11,3 \n\t" - "vleif %%v27,12,0 \n\t" - "vleif %%v27,13,1 \n\t" - "vleif %%v27,14,2 \n\t" - "vleif %%v27,15,3 \n\t" - "vleif %%v28,16,0 \n\t" - "vleif %%v28,17,1 \n\t" - "vleif %%v28,18,2 \n\t" - "vleif %%v28,19,3 \n\t" - "vleif %%v29,20,0 \n\t" - "vleif %%v29,21,1 \n\t" - "vleif %%v29,22,2 \n\t" - "vleif %%v29,23,3 \n\t" - "vleif %%v30,24,0 \n\t" - "vleif %%v30,25,1 \n\t" - "vleif %%v30,26,2 \n\t" - "vleif %%v30,27,3 \n\t" - "vleif %%v31,28,0 \n\t" - "vleif %%v31,29,1 \n\t" - "vleif %%v31,30,2 \n\t" - "vleif %%v31,31,3 \n\t" - "srlg %%r0,%2,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" + __asm__("vl %%v0,0(%[x])\n\t" + "vflpsb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel 
%%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v3,%%v0\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v0,%%v2\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfchesb 
%%v5,%%v17,%%v16 \n\t" - "vfchesb %%v6,%%v19,%%v18 \n\t" - "vfchesb %%v7,%%v21,%%v20 \n\t" - "vfchesb %%v8,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v17,%%v16 \n\t" - "vfchesb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfchesb %%v5,%%v17,%%v16 \n\t" - "vfchesb %%v6,%%v19,%%v18 \n\t" - "vfchesb %%v7,%%v21,%%v20 \n\t" - "vfchesb %%v8,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v17,%%v16 \n\t" - "vfchesb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v3,%%v0,32 \n\t" - "vfchsb %%v4,%%v3,%%v0 \n\t" - "vchlg %%v5,%%v2,%%v1 \n\t" - "vfcesb %%v6,%%v0,%%v3 \n\t" - "vn %%v5,%%v5,%%v6 \n\t" - "vo %%v4,%%v4,%%v5 \n\t" - "vsel %%v0,%%v0,%%v3,%%v4 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v1,%%v2,%%v4 \n\t" - - "vrepf %%v2,%%v0,2 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcsb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vstef %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchsb %%v4,%%v0,%%v2 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "ste %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamin),"=m"(*amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - 
:"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return iamin; + return iamin; } - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - BLASLONG min = 0; - if (n <= 0 || inc_x <= 0) return (min); - - if (inc_x == 1) { +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (min); - min = isamin_kernel_64(n1, x, &minf); + if (inc_x == 1) { - i = n1; - } - else - { - minf = ABS(x[0]); - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) < minf) { - min = i; - minf = ABS(x[i]); - } - i++; - } - return (min + 1); + min = isamin_kernel_64(n1, x, &minf); + i = n1; } else { + minf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + return (min + 1); - min = 0; - minf = ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + min = 0; + minf = ABS(x[0]); - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - min = j + 1; - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - min = j + 2; - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - min = j + 3; - minf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (min + 1); + while (j < n) { + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; } + return (min + 1); + } } diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index c968ce6fa..a565df503 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,259 +27,243 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) -{ - BLASLONG imax; +static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) { + BLASLONG imax; - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,2,1 \n\t" - "vleig %%v2,1,0 \n\t" - "vleig %%v2,3,1 \n\t" - "vrepig %%v3,32 \n\t" - "vzero %%v4 \n\t" - "vleif %%v24,0,0 \n\t" - "vleif %%v24,1,1 \n\t" - "vleif %%v24,2,2 \n\t" - "vleif %%v24,3,3 \n\t" - "vleif %%v25,4,0 \n\t" - "vleif %%v25,5,1 \n\t" - "vleif %%v25,6,2 \n\t" - "vleif %%v25,7,3 \n\t" - "vleif %%v26,8,0 \n\t" - "vleif %%v26,9,1 \n\t" - "vleif %%v26,10,2 \n\t" - "vleif %%v26,11,3 \n\t" - "vleif %%v27,12,0 \n\t" - "vleif %%v27,13,1 \n\t" - "vleif %%v27,14,2 \n\t" - "vleif %%v27,15,3 \n\t" - "vleif %%v28,16,0 \n\t" - "vleif %%v28,17,1 \n\t" - "vleif %%v28,18,2 \n\t" - "vleif %%v28,19,3 \n\t" - "vleif %%v29,20,0 \n\t" - "vleif %%v29,21,1 \n\t" - "vleif %%v29,22,2 \n\t" - "vleif %%v29,23,3 \n\t" - "vleif %%v30,24,0 \n\t" - "vleif %%v30,25,1 \n\t" - "vleif %%v30,26,2 \n\t" - "vleif %%v30,27,3 \n\t" - "vleif %%v31,28,0 \n\t" - "vleif %%v31,29,1 \n\t" - "vleif %%v31,30,2 \n\t" - "vleif %%v31,31,3 \n\t" - "srlg %%r0,%2,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" + __asm__("vl %%v0,0(%[x])\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf 
%%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v0,%%v3\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[max],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v2,%%v0\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[max]\n\t" + "vlgvg %[imax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imax] "=r"(imax),[max] "=m"(*max),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vfchesb %%v5,%%v16,%%v17 \n\t" - "vfchesb %%v6,%%v18,%%v19 \n\t" - "vfchesb %%v7,%%v20,%%v21 \n\t" - "vfchesb %%v8,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v16,%%v17 \n\t" - "vfchesb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag 
%%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - - "vfchesb %%v5,%%v16,%%v17 \n\t" - "vfchesb %%v6,%%v18,%%v19 \n\t" - "vfchesb %%v7,%%v20,%%v21 \n\t" - "vfchesb %%v8,%%v22,%%v23 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v16,%%v17 \n\t" - "vfchesb %%v21,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v3,%%v0,32 \n\t" - "vfchsb %%v4,%%v0,%%v3 \n\t" - "vchlg %%v5,%%v2,%%v1 \n\t" - "vfcesb %%v6,%%v0,%%v3 \n\t" - "vn %%v5,%%v5,%%v6 \n\t" - "vo %%v4,%%v4,%%v5 \n\t" - "vsel %%v0,%%v0,%%v3,%%v4 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v1,%%v2,%%v4 \n\t" - - "vrepf %%v2,%%v0,2 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcsb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vstef %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchsb %%v4,%%v2,%%v0 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "ste %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(imax),"=m"(*max) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return imax; + return imax; } - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; - if (n <= 0 || inc_x <= 0) return (max); - - if (inc_x == 1) { +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (max); - max = ismax_kernel_64(n1, x, &maxf); + if (inc_x == 1) { - i = n1; - } - else - { - maxf = x[0]; - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (x[i] > maxf) { - max = i; - maxf = x[i]; - } - i++; - } - return (max + 1); + max = ismax_kernel_64(n1, x, &maxf); + i = n1; } else { + maxf = x[0]; + i++; + } + + while (i < n) { + if (x[i] > maxf) { + max = i; + maxf = x[i]; + } + i++; + } + return (max + 1); - max = 0; - maxf = x[0]; + } else { 
- BLASLONG n1 = n & -4; - while (j < n1) { + max = 0; + maxf = x[0]; - if (x[i] > maxf) { - max = j; - maxf = x[i]; - } - if (x[i + inc_x] > maxf) { - max = j + 1; - maxf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] > maxf) { - max = j + 2; - maxf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] > maxf) { - max = j + 3; - maxf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + max = j + 1; + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + max = j + 2; + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + max = j + 3; + maxf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] > maxf) { - max = j; - maxf = x[i]; - } - i += inc_x; - j++; - } - return (max + 1); + while (j < n) { + if (x[i] > maxf) { + max = j; + maxf = x[i]; + } + i += inc_x; + j++; } + return (max + 1); + } } diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index 0145b31b3..ff72b2c64 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,259 +27,243 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) -{ - BLASLONG imin; +static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) { + BLASLONG imin; - __asm__ volatile ( - "vl %%v0,0(%3) \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,2,1 \n\t" - "vleig %%v2,1,0 \n\t" - "vleig %%v2,3,1 \n\t" - "vrepig %%v3,32 \n\t" - "vzero %%v4 \n\t" - "vleif %%v24,0,0 \n\t" - "vleif %%v24,1,1 \n\t" - "vleif %%v24,2,2 \n\t" - "vleif %%v24,3,3 \n\t" - "vleif %%v25,4,0 \n\t" - "vleif %%v25,5,1 \n\t" - "vleif %%v25,6,2 \n\t" - "vleif %%v25,7,3 \n\t" - "vleif %%v26,8,0 \n\t" - "vleif %%v26,9,1 \n\t" - "vleif %%v26,10,2 \n\t" - "vleif %%v26,11,3 \n\t" - "vleif %%v27,12,0 \n\t" - "vleif %%v27,13,1 \n\t" - "vleif %%v27,14,2 \n\t" - "vleif %%v27,15,3 \n\t" - "vleif %%v28,16,0 \n\t" - "vleif %%v28,17,1 \n\t" - "vleif %%v28,18,2 \n\t" - "vleif %%v28,19,3 \n\t" - "vleif %%v29,20,0 \n\t" - "vleif %%v29,21,1 \n\t" - "vleif %%v29,22,2 \n\t" - "vleif %%v29,23,3 \n\t" - "vleif %%v30,24,0 \n\t" - "vleif %%v30,25,1 \n\t" - "vleif %%v30,26,2 \n\t" - "vleif %%v30,27,3 \n\t" - "vleif %%v31,28,0 \n\t" - "vleif %%v31,29,1 \n\t" - "vleif %%v31,30,2 \n\t" - "vleif %%v31,31,3 \n\t" - "srlg %%r0,%2,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" + __asm__("vl %%v0,0(%[x])\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + 
"vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v3,%%v0\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[min],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v0,%%v2\n\t" + 
"vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[min]\n\t" + "vlgvg %[imin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imin] "=r"(imin),[min] "=m"(*min),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vfchesb %%v5,%%v17,%%v16 \n\t" - "vfchesb %%v6,%%v19,%%v18 \n\t" - "vfchesb %%v7,%%v21,%%v20 \n\t" - "vfchesb %%v8,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v17,%%v16 \n\t" - "vfchesb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vl %%v16,128(%%r1,%3) \n\t" - "vl %%v17,144(%%r1,%3) \n\t" - "vl %%v18,160(%%r1,%3) \n\t" - "vl %%v19,176(%%r1,%3) \n\t" - "vl %%v20,192(%%r1,%3) \n\t" - "vl %%v21,208(%%r1,%3) \n\t" - "vl %%v22,224(%%r1,%3) \n\t" - "vl %%v23,240(%%r1,%3) \n\t" - - "vfchesb %%v5,%%v17,%%v16 \n\t" - "vfchesb %%v6,%%v19,%%v18 \n\t" - "vfchesb %%v7,%%v21,%%v20 \n\t" - "vfchesb %%v8,%%v23,%%v22 \n\t" - "vsel %%v16,%%v16,%%v17,%%v5 \n\t" - "vsel %%v5,%%v24,%%v25,%%v5 \n\t" - "vsel %%v17,%%v18,%%v19,%%v6 \n\t" - "vsel %%v6,%%v26,%%v27,%%v6 \n\t" - "vsel %%v18,%%v20,%%v21,%%v7 \n\t" - "vsel %%v7,%%v28,%%v29,%%v7 \n\t" - "vsel %%v19,%%v22,%%v23,%%v8 \n\t" - "vsel %%v8,%%v30,%%v31,%%v8 \n\t" - - "vfchesb %%v20,%%v17,%%v16 \n\t" - "vfchesb %%v21,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v20 \n\t" - "vsel %%v5,%%v5,%%v6,%%v20 \n\t" - "vsel %%v17,%%v18,%%v19,%%v21 \n\t" - "vsel %%v6,%%v7,%%v8,%%v21 \n\t" - - "vfchesb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v5,%%v5,%%v6,%%v18 \n\t" - "vsegf %%v6,%%v5 \n\t" - "vesrlg %%v5,%%v5,32 \n\t" - "vag %%v5,%%v5,%%v4 \n\t" - "vag %%v6,%%v6,%%v4 \n\t" - - "vfchesb %%v7,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v7 \n\t" - "vsegf %%v8,%%v7 \n\t" - "vesrlg %%v7,%%v7,32 \n\t" - "vsegf %%v7,%%v7 \n\t" - "vsel %%v1,%%v1,%%v5,%%v7 \n\t" - "vsel %%v2,%%v2,%%v6,%%v8 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v3,%%v0,32 \n\t" - "vfchsb %%v4,%%v3,%%v0 \n\t" - "vchlg %%v5,%%v2,%%v1 \n\t" - "vfcesb %%v6,%%v0,%%v3 \n\t" - "vn %%v5,%%v5,%%v6 \n\t" - "vo %%v4,%%v4,%%v5 \n\t" - "vsel %%v0,%%v0,%%v3,%%v4 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v1,%%v2,%%v4 \n\t" - - "vrepf %%v2,%%v0,2 
\n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcsb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vstef %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchsb %%v4,%%v0,%%v2 \n\t" - "vesrlg %%v4,%%v4,32 \n\t" - "vsegf %%v4,%%v4 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "ste %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(imin),"=m"(*min) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v4","v5","v6","v7","v8","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return imin; + return imin; } - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - BLASLONG min = 0; - if (n <= 0 || inc_x <= 0) return (min); - - if (inc_x == 1) { +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; + BLASLONG min = 0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (min); - min = ismin_kernel_64(n1, x, &minf); + if (inc_x == 1) { - i = n1; - } - else - { - minf = x[0]; - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (x[i] < minf) { - min = i; - minf = x[i]; - } - i++; - } - return (min + 1); + min = ismin_kernel_64(n1, x, &minf); + i = n1; } else { + minf = x[0]; + i++; + } + + while (i < n) { + if (x[i] < minf) { + min = i; + minf = x[i]; + } + i++; + } + return (min + 1); - min = 0; - minf = x[0]; + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + min = 0; + minf = x[0]; - if (x[i] < minf) { - min = j; - minf = x[i]; - } - if (x[i + inc_x] < minf) { - min = j + 1; - minf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] < minf) { - min = j + 2; - minf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] < minf) { - min = j + 3; - minf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] < minf) { + min = j; + minf = x[i]; + } + if (x[i + inc_x] < minf) { + min = j + 1; + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + min = j + 2; + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + min = j + 3; + minf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] < minf) { - min = j; - minf = x[i]; - } - i += inc_x; - j++; - } - return (min + 1); + while (j < n) { + if (x[i] < minf) { + min = j; + minf = x[i]; + } + i += inc_x; + j++; } + return (min + 1); + } } diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 2d1cc2365..48afb8215 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,220 +27,219 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) -{ - BLASLONG iamax; - - __asm__ volatile ( - "vleg %%v0,0(%3),0 \n\t" - "vleg %%v1,8(%3),0 \n\t" - "vleg %%v0,16(%3),1 \n\t" - "vleg %%v1,24(%3),1 \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vflpdb %%v1,%%v1 \n\t" - "vfadb %%v0,%%v0,%%v1 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,1,1 \n\t" - "vrepig %%v2,8 \n\t" - "vzero %%v3 \n\t" - "vleig %%v24,0,0 \n\t" - "vleig %%v24,1,1 \n\t" - "vleig %%v25,2,0 \n\t" - "vleig %%v25,3,1 \n\t" - "vleig %%v26,4,0 \n\t" - "vleig %%v26,5,1 \n\t" - "vleig %%v27,6,0 \n\t" - "vleig %%v27,7,1 \n\t" - "srlg %%r0,%2,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" - - "vleg %%v16,0(%%r1,%3),0 \n\t" - "vleg %%v17,8(%%r1,%3),0 \n\t" - "vleg %%v16,16(%%r1,%3),1 \n\t" - "vleg %%v17,24(%%r1,%3),1 \n\t" - "vleg %%v18,32(%%r1,%3),0 \n\t" - "vleg %%v19,40(%%r1,%3),0 \n\t" - "vleg %%v18,48(%%r1,%3),1 \n\t" - "vleg %%v19,56(%%r1,%3),1 \n\t" - "vleg %%v20,64(%%r1,%3),0 \n\t" - "vleg %%v21,72(%%r1,%3),0 \n\t" - "vleg %%v20,80(%%r1,%3),1 \n\t" - "vleg %%v21,88(%%r1,%3),1 \n\t" - "vleg %%v22,96(%%r1,%3),0 \n\t" - "vleg %%v23,104(%%r1,%3),0 \n\t" - "vleg %%v22,112(%%r1,%3),1 \n\t" - "vleg %%v23,120(%%r1,%3),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchedb %%v4,%%v16,%%v17 \n\t" - "vfchedb %%v5,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - - "vfchedb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "vleg %%v16,128(%%r1,%3),0 \n\t" - "vleg %%v17,136(%%r1,%3),0 \n\t" - "vleg %%v16,144(%%r1,%3),1 \n\t" - "vleg %%v17,152(%%r1,%3),1 \n\t" - "vleg %%v18,160(%%r1,%3),0 \n\t" - "vleg %%v19,168(%%r1,%3),0 \n\t" - "vleg %%v18,176(%%r1,%3),1 \n\t" - "vleg %%v19,184(%%r1,%3),1 \n\t" - "vleg %%v20,192(%%r1,%3),0 \n\t" - "vleg %%v21,200(%%r1,%3),0 \n\t" - "vleg %%v20,208(%%r1,%3),1 \n\t" - "vleg %%v21,216(%%r1,%3),1 \n\t" - "vleg %%v22,224(%%r1,%3),0 \n\t" - "vleg %%v23,232(%%r1,%3),0 \n\t" - "vleg %%v22,240(%%r1,%3),1 \n\t" - "vleg %%v23,248(%%r1,%3),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchedb %%v4,%%v16,%%v17 \n\t" - "vfchedb %%v5,%%v18,%%v19 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - - "vfchedb %%v18,%%v16,%%v17 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v0,%%v16 \n\t" - 
"vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v2,%%v0,1 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcdb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vsteg %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchdb %%v4,%%v2,%%v0 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "std %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamax),"=m"(*amax) - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" - ); - - return iamax; + +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) { + BLASLONG iamax; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v1,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v1,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v1,%%v1\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,8\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg %%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb 
%%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v2,%%v0\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); + + return iamax; } -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0; - BLASLONG max = 0; - BLASLONG inc_x2; +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (max); + + if (inc_x == 1) { - if (n <= 0 || inc_x <= 0) return(max); - - if (inc_x == 1) { + BLASLONG n1 = n & -16; + if (n1 > 0) { - BLASLONG n1 = n & -16; - if (n1 > 0) { + max = izamax_kernel_16(n1, x, &maxf); + ix = n1 * 2; + i = n1; + } else { + maxf = CABS1(x, 0); + ix += 2; + i++; + } - max = izamax_kernel_16(n1, x, &maxf); - ix = n1 * 2; - i = n1; + while (i < n) { + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); } - else - { - maxf = CABS1(x,0); - ix += 2; - i++; - } - - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += 2; - i++; + ix += 2; + i++; } - return (max + 1); + return (max + 1); + + } else { - } else { - max = 0; - maxf = CABS1(x,0); + maxf = CABS1(x, 0); inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) > maxf ) - { - max = i; - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) > maxf) { + max = i + 1; + maxf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + 2 * inc_x2) > maxf) { + max = i + 2; + maxf = CABS1(x, ix + 2 * inc_x2); + } + if (CABS1(x, ix + 3 * inc_x2) > maxf) { + max = i + 3; + maxf = CABS1(x, ix + 3 * inc_x2); + } + + ix += inc_x2 * 4; + + i += 4; + } - return (max + 1); + + while (i < n) { + if (CABS1(x, ix) > maxf) { + max = i; + maxf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (max + 1); + } } - - diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 676fd7c6d..3edbe3d58 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. 
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,220 +27,219 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include - -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) -{ - BLASLONG iamin; - - __asm__ volatile ( - "vleg %%v0,0(%3),0 \n\t" - "vleg %%v1,8(%3),0 \n\t" - "vleg %%v0,16(%3),1 \n\t" - "vleg %%v1,24(%3),1 \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vflpdb %%v1,%%v1 \n\t" - "vfadb %%v0,%%v0,%%v1 \n\t" - "vleig %%v1,0,0 \n\t" - "vleig %%v1,1,1 \n\t" - "vrepig %%v2,8 \n\t" - "vzero %%v3 \n\t" - "vleig %%v24,0,0 \n\t" - "vleig %%v24,1,1 \n\t" - "vleig %%v25,2,0 \n\t" - "vleig %%v25,3,1 \n\t" - "vleig %%v26,4,0 \n\t" - "vleig %%v26,5,1 \n\t" - "vleig %%v27,6,0 \n\t" - "vleig %%v27,7,1 \n\t" - "srlg %%r0,%2,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%3) \n\t" - - "vleg %%v16,0(%%r1,%3),0 \n\t" - "vleg %%v17,8(%%r1,%3),0 \n\t" - "vleg %%v16,16(%%r1,%3),1 \n\t" - "vleg %%v17,24(%%r1,%3),1 \n\t" - "vleg %%v18,32(%%r1,%3),0 \n\t" - "vleg %%v19,40(%%r1,%3),0 \n\t" - "vleg %%v18,48(%%r1,%3),1 \n\t" - "vleg %%v19,56(%%r1,%3),1 \n\t" - "vleg %%v20,64(%%r1,%3),0 \n\t" - "vleg %%v21,72(%%r1,%3),0 \n\t" - "vleg %%v20,80(%%r1,%3),1 \n\t" - "vleg %%v21,88(%%r1,%3),1 \n\t" - "vleg %%v22,96(%%r1,%3),0 \n\t" - "vleg %%v23,104(%%r1,%3),0 \n\t" - "vleg %%v22,112(%%r1,%3),1 \n\t" - "vleg %%v23,120(%%r1,%3),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchedb %%v4,%%v17,%%v16 \n\t" - "vfchedb %%v5,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel %%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - - "vfchedb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "vleg %%v16,128(%%r1,%3),0 \n\t" - "vleg %%v17,136(%%r1,%3),0 \n\t" - "vleg %%v16,144(%%r1,%3),1 \n\t" - "vleg %%v17,152(%%r1,%3),1 \n\t" - "vleg %%v18,160(%%r1,%3),0 \n\t" - "vleg %%v19,168(%%r1,%3),0 \n\t" - "vleg %%v18,176(%%r1,%3),1 \n\t" - "vleg %%v19,184(%%r1,%3),1 \n\t" - "vleg %%v20,192(%%r1,%3),0 \n\t" - "vleg %%v21,200(%%r1,%3),0 \n\t" - "vleg %%v20,208(%%r1,%3),1 \n\t" - "vleg %%v21,216(%%r1,%3),1 \n\t" - "vleg %%v22,224(%%r1,%3),0 \n\t" - "vleg %%v23,232(%%r1,%3),0 \n\t" - "vleg %%v22,240(%%r1,%3),1 \n\t" - "vleg %%v23,248(%%r1,%3),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchedb %%v4,%%v17,%%v16 \n\t" - "vfchedb %%v5,%%v19,%%v18 \n\t" - "vsel %%v16,%%v16,%%v17,%%v4 \n\t" - "vsel %%v4,%%v24,%%v25,%%v4 \n\t" - "vsel 
%%v17,%%v18,%%v19,%%v5 \n\t" - "vsel %%v5,%%v26,%%v27,%%v5 \n\t" - - "vfchedb %%v18,%%v17,%%v16 \n\t" - "vsel %%v16,%%v16,%%v17,%%v18 \n\t" - "vsel %%v4,%%v4,%%v5,%%v18 \n\t" - "vag %%v4,%%v4,%%v3 \n\t" - - "vfchedb %%v5,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v5 \n\t" - "vsel %%v1,%%v1,%%v4,%%v5 \n\t" - "vag %%v3,%%v3,%%v2 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v2,%%v0,1 \n\t" - "vrepg %%v3,%%v1,1 \n\t" - "wfcdb %%v2,%%v0 \n\t" - "jne 1f \n\t" - "vsteg %%v0,%1,0 \n\t" - "vmnlg %%v0,%%v1,%%v3 \n\t" - "vlgvg %0,%%v0,0 \n\t" - "j 2f \n\t" - "1: \n\t" - "wfchdb %%v4,%%v0,%%v2 \n\t" - "vsel %%v1,%%v3,%%v1,%%v4 \n\t" - "vsel %%v0,%%v2,%%v0,%%v4 \n\t" - "std %%f0,%1 \n\t" - "vlgvg %0,%%v1,0 \n\t" - "2: \n\t" - "nop " - :"=r"(iamin),"=m"(*amin) - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" - ); - - return iamin; + +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) { + BLASLONG iamin; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v1,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v1,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v1,%%v1\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,8\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg 
%%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v0,%%v2\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); + + return iamin; } -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT minf = 0; - BLASLONG min = 0; - BLASLONG inc_x2; +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0; + BLASLONG min = 0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (min); + + if (inc_x == 1) { - if (n <= 0 || inc_x <= 0) return(min); - - if (inc_x == 1) { + BLASLONG n1 = n & -16; + if (n1 > 0) { - BLASLONG n1 = n & -16; - if (n1 > 0) { + min = izamin_kernel_16(n1, x, &minf); + ix = n1 * 2; + i = n1; + } else { + minf = CABS1(x, 0); + ix += 2; + i++; + } - min = izamin_kernel_16(n1, x, &minf); - ix = n1 * 2; - i = n1; + while (i < n) { + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); } - else - { - minf = CABS1(x,0); - ix += 2; - i++; - } - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += 2; - i++; + ix += 2; + i++; } - return (min + 1); + return (min + 1); + + } else { - } else { - min = 0; - minf = CABS1(x,0); + minf = CABS1(x, 0); inc_x2 = 2 * inc_x; - ix += inc_x2; - i++; - - while(i < n) - { - if( CABS1(x,ix) < minf ) - { - min = i; - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) < minf) { + min = i + 1; + minf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + 2 * inc_x2) < minf) { + min = i + 2; + minf = CABS1(x, ix + 2 * inc_x2); + } + if (CABS1(x, ix + 3 * inc_x2) < minf) { + min = i + 3; + minf = CABS1(x, ix + 3 * inc_x2); + } + + ix += inc_x2 * 4; + + i += 4; + } - return (min + 1); + + while (i < n) { + if (CABS1(x, ix) < minf) { + min = i; + minf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (min + 1); + } } - - diff --git a/kernel/zarch/samax.c b/kernel/zarch/samax.c index b629d64c0..efbc0318c 100644 --- a/kernel/zarch/samax.c +++ 
b/kernel/zarch/samax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,142 +28,125 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else #define ABS fabsf -#endif - -static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) -{ - FLOAT amax; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v25,144(%%r1,%2) \n\t" - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v27,176(%%r1,%2) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfmaxsb %%v16,%%v16,%%v24,8 \n\t" - "vfmaxsb %%v17,%%v17,%%v25,8 \n\t" - "vfmaxsb %%v18,%%v18,%%v26,8 \n\t" - "vfmaxsb %%v19,%%v19,%%v27,8 \n\t" - "vfmaxsb %%v20,%%v20,%%v28,8 \n\t" - "vfmaxsb %%v21,%%v21,%%v29,8 \n\t" - "vfmaxsb %%v22,%%v22,%%v30,8 \n\t" - "vfmaxsb %%v23,%%v23,%%v31,8 \n\t" - - "vfmaxsb %%v16,%%v16,%%v20,8 \n\t" - "vfmaxsb %%v17,%%v17,%%v21,8 \n\t" - "vfmaxsb %%v18,%%v18,%%v22,8 \n\t" - "vfmaxsb %%v19,%%v19,%%v23,8 \n\t" - - "vfmaxsb %%v16,%%v16,%%v18,8 \n\t" - "vfmaxsb %%v17,%%v17,%%v19,8 \n\t" - - "vfmaxsb %%v16,%%v16,%%v17,8 \n\t" - - "vfmaxsb %%v0,%%v0,%%v16,8 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v16,%%v0,32 \n\t" - "vfmaxsb %%v0,%%v0,%%v16,8 \n\t" - - "vrepf %%v16,%%v0,2 \n\t" - "wfmaxsb %%v0,%%v0,%%v16,8 \n\t" - "lper %0,%%f0 " - :"=f"(amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amax; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - if (n <= 0 || inc_x <= 0) return (maxf); +static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmaxsb %%v16,%%v16,%%v24,8\n\t" + "vfmaxsb %%v17,%%v17,%%v25,8\n\t" + "vfmaxsb %%v18,%%v18,%%v26,8\n\t" + "vfmaxsb %%v19,%%v19,%%v27,8\n\t" + "vfmaxsb %%v20,%%v20,%%v28,8\n\t" + "vfmaxsb %%v21,%%v21,%%v29,8\n\t" + "vfmaxsb %%v22,%%v22,%%v30,8\n\t" + "vfmaxsb %%v23,%%v23,%%v31,8\n\t" + "vfmaxsb %%v16,%%v16,%%v20,8\n\t" + "vfmaxsb %%v17,%%v17,%%v21,8\n\t" + "vfmaxsb %%v18,%%v18,%%v22,8\n\t" + "vfmaxsb %%v19,%%v19,%%v23,8\n\t" + 
"vfmaxsb %%v16,%%v16,%%v18,8\n\t" + "vfmaxsb %%v17,%%v17,%%v19,8\n\t" + "vfmaxsb %%v16,%%v16,%%v17,8\n\t" + "vfmaxsb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfmaxsb %%v0,%%v0,%%v16,8\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfmaxsb %%v0,%%v0,%%v16,8\n\t" + "lper %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amax; +} - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT maxf = 0.0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (maxf); - maxf = samax_kernel_64(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - maxf=ABS(x[0]); - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i++; - } - return (maxf); + maxf = samax_kernel_64(n1, x); + i = n1; } else { + maxf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i++; + } + return (maxf); - maxf=ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + maxf = ABS(x[0]); - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - maxf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + maxf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (maxf); + while (j < n) { + if (ABS(x[i]) > maxf) { + maxf = ABS(x[i]); + } + i += inc_x; + j++; } + return (maxf); + } } diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c index 7ce6ee657..138836ce5 100644 --- a/kernel/zarch/samin.c +++ b/kernel/zarch/samin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,142 +28,125 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else #define ABS fabsf -#endif - -static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) -{ - FLOAT amin; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v25,144(%%r1,%2) \n\t" - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v27,176(%%r1,%2) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfminsb %%v16,%%v16,%%v24,8 \n\t" - "vfminsb %%v17,%%v17,%%v25,8 \n\t" - "vfminsb %%v18,%%v18,%%v26,8 \n\t" - "vfminsb %%v19,%%v19,%%v27,8 \n\t" - "vfminsb %%v20,%%v20,%%v28,8 \n\t" - "vfminsb %%v21,%%v21,%%v29,8 \n\t" - "vfminsb %%v22,%%v22,%%v30,8 \n\t" - "vfminsb %%v23,%%v23,%%v31,8 \n\t" - - "vfminsb %%v16,%%v16,%%v20,8 \n\t" - "vfminsb %%v17,%%v17,%%v21,8 \n\t" - "vfminsb %%v18,%%v18,%%v22,8 \n\t" - "vfminsb %%v19,%%v19,%%v23,8 \n\t" - - "vfminsb %%v16,%%v16,%%v18,8 \n\t" - "vfminsb %%v17,%%v17,%%v19,8 \n\t" - - "vfminsb %%v16,%%v16,%%v17,8 \n\t" - - "vfminsb %%v0,%%v0,%%v16,8 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v16,%%v0,32 \n\t" - "vfminsb %%v0,%%v0,%%v16,8 \n\t" - - "vrepf %%v16,%%v0,2 \n\t" - "wfminsb %%v0,%%v0,%%v16,8 \n\t" - "lper %0,%%f0 " - :"=f"(amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amin; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - if (n <= 0 || inc_x <= 0) return (minf); +static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) { + FLOAT amin; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfminsb %%v16,%%v16,%%v24,8\n\t" + "vfminsb %%v17,%%v17,%%v25,8\n\t" + "vfminsb %%v18,%%v18,%%v26,8\n\t" + "vfminsb %%v19,%%v19,%%v27,8\n\t" + "vfminsb %%v20,%%v20,%%v28,8\n\t" + "vfminsb %%v21,%%v21,%%v29,8\n\t" + "vfminsb %%v22,%%v22,%%v30,8\n\t" + "vfminsb %%v23,%%v23,%%v31,8\n\t" + "vfminsb %%v16,%%v16,%%v20,8\n\t" + "vfminsb %%v17,%%v17,%%v21,8\n\t" + "vfminsb %%v18,%%v18,%%v22,8\n\t" + "vfminsb %%v19,%%v19,%%v23,8\n\t" + "vfminsb %%v16,%%v16,%%v18,8\n\t" + "vfminsb %%v17,%%v17,%%v19,8\n\t" + "vfminsb %%v16,%%v16,%%v17,8\n\t" + "vfminsb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfminsb %%v0,%%v0,%%v16,8\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfminsb %%v0,%%v0,%%v16,8\n\t" + "lper %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", 
"v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amin; +} - if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (minf); - minf = samin_kernel_64(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - minf=ABS(x[0]); - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i++; - } - return (minf); + minf = samin_kernel_64(n1, x); + i = n1; } else { + minf = ABS(x[0]); + i++; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i++; + } + return (minf); - minf=ABS(x[0]); + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + minf = ABS(x[0]); - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - minf = ABS(x[i + 3 * inc_x]); - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + minf = ABS(x[i + 3 * inc_x]); + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (minf); + while (j < n) { + if (ABS(x[i]) < minf) { + minf = ABS(x[i]); + } + i += inc_x; + j++; } + return (minf); + } } diff --git a/kernel/zarch/sasum.c b/kernel/zarch/sasum.c index 2c59ab2e5..0c3057a92 100644 --- a/kernel/zarch/sasum.c +++ b/kernel/zarch/sasum.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,147 +28,141 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) -{ - FLOAT asum; - - __asm__ ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "srlg %%r0,%1,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - "vl %%v20, 64(%%r1,%2) \n\t" - "vl %%v21, 80(%%r1,%2) \n\t" - "vl %%v22, 96(%%r1,%2) \n\t" - "vl %%v23, 112(%%r1,%2) \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfasb %%v0,%%v0,%%v16 \n\t" - "vfasb %%v1,%%v1,%%v17 \n\t" - "vfasb %%v2,%%v2,%%v18 \n\t" - "vfasb %%v3,%%v3,%%v19 \n\t" - "vfasb %%v0,%%v0,%%v20 \n\t" - "vfasb %%v1,%%v1,%%v21 \n\t" - "vfasb %%v2,%%v2,%%v22 \n\t" - "vfasb %%v3,%%v3,%%v23 \n\t" - - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - "vl %%v20, 192(%%r1,%2) \n\t" - "vl %%v21, 208(%%r1,%2) \n\t" - "vl %%v22, 224(%%r1,%2) \n\t" - "vl %%v23, 240(%%r1,%2) \n\t" - - "vflpsb %%v16, %%v16 \n\t" - "vflpsb %%v17, %%v17 \n\t" - "vflpsb %%v18, %%v18 \n\t" - "vflpsb %%v19, %%v19 \n\t" - "vflpsb %%v20, %%v20 \n\t" - "vflpsb %%v21, %%v21 \n\t" - "vflpsb %%v22, %%v22 \n\t" - "vflpsb %%v23, %%v23 \n\t" - - "vfasb %%v0,%%v0,%%v16 \n\t" - "vfasb %%v1,%%v1,%%v17 \n\t" - "vfasb %%v2,%%v2,%%v18 \n\t" - "vfasb %%v3,%%v3,%%v19 \n\t" - "vfasb %%v0,%%v0,%%v20 \n\t" - "vfasb %%v1,%%v1,%%v21 \n\t" - "vfasb %%v2,%%v2,%%v22 \n\t" - "vfasb %%v3,%%v3,%%v23 \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vfasb %%v0,%%v0,%%v2 \n\t" - "vfasb %%v0,%%v0,%%v3 \n\t" - "veslg %%v1,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vrepf %%v1,%%v0,2 \n\t" - "aebr %%f0,%%f1 \n\t" - "ler %0,%%f0 " - :"=f"(asum) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" - ); - - return asum; +#define ABS fabsf + +static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) { + FLOAT asum; + + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl 
%%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v27\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v29\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vfasb %%v24,%%v24,%%v31\n\t" + "veslg %%v25,%%v24,32\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vrepf %%v25,%%v24,2\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vstef %%v24,%[asum],0" + : [asum] "=m"(asum),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return asum; } FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT sumf = 0.0; - BLASLONG n1; + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT sumf = 0.0; + BLASLONG n1; - if (n <= 0 || inc_x <= 0) return sumf; - - if (inc_x == 1) { - - n1 = n & -64; - - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return sumf; - sumf = sasum_kernel_64(n1, x); - i = n1; - } + if (inc_x == 1) { - while (i < n) { - sumf += ABS(x[i]); - i++; - } + n1 = n & -64; - } else { - BLASLONG n1 = n & -4; - register FLOAT sum1, sum2; - sum1 = 0.0; - sum2 = 0.0; - while (j < n1) { + if (n1 > 0) { - sum1 += ABS(x[i]); - sum2 += ABS(x[i + inc_x]); - sum1 += ABS(x[i + 2 * inc_x]); - sum2 += ABS(x[i + 3 * inc_x]); + sumf = sasum_kernel_64(n1, x); + i = n1; + } - i += inc_x * 4; - j += 4; + while (i < n) { + sumf += ABS(x[i]); + i++; + } - } - sumf = sum1 + sum2; - while (j < n) { + } else { + BLASLONG n1 = n & -4; + register FLOAT sum1, sum2; + sum1 = 0.0; + sum2 = 0.0; + while (j < n1) { - sumf += ABS(x[i]); - i += inc_x; - j++; - } + sum1 += ABS(x[i]); + sum2 += ABS(x[i + inc_x]); + sum1 += ABS(x[i + 2 * inc_x]); + sum2 += ABS(x[i + 3 * inc_x]); + i += inc_x * 4; + j += 4; } - return sumf; -} + sumf = sum1 + sum2; + while (j < n) { + sumf += ABS(x[i]); + i += inc_x; + j++; + } + } + return sumf; +} diff --git a/kernel/zarch/saxpy.c b/kernel/zarch/saxpy.c index 26ead310c..e41e87af0 100644 --- a/kernel/zarch/saxpy.c +++ b/kernel/zarch/saxpy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,158 +27,141 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
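The interesting part of the sasum rewrite above is not the operand renaming but the accumulator count: the old kernel summed into four vectors (v0-v3), the new one into eight (v24-v31), halving the pressure on the floating-point add latency chain. As with any reassociated sum, the result can differ from a strictly sequential sum in the last bit, which is acceptable for BLAS asum. A scalar sketch of the idea (asum8 is hypothetical):

    #include <math.h>

    /* Eight independent partial sums, merged once at the end.
     * Assumes n is a multiple of 8, as the real kernel assumes a
     * multiple of 64. */
    static float asum8(long n, const float *x) {
      float s[8] = {0};
      for (long i = 0; i < n; i += 8)
        for (int k = 0; k < 8; k++)
          s[k] += fabsf(x[i + k]);
      float total = 0;
      for (int k = 0; k < 8; k++)
        total += s[k];
      return total;
    }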
#include "common.h" -static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile( - "vlrepf %%v0,%3 \n\t" - "srlg %%r0,%0,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,0(%%r1,%2) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,32(%%r1,%2) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - - "vfmasb %%v16,%%v0,%%v16,%%v20 \n\t" - "vfmasb %%v17,%%v0,%%v17,%%v21 \n\t" - "vfmasb %%v18,%%v0,%%v18,%%v22 \n\t" - "vfmasb %%v19,%%v0,%%v19,%%v23 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vl %%v25,80(%%r1,%1) \n\t" - "vl %%v26,96(%%r1,%1) \n\t" - "vl %%v27,112(%%r1,%1) \n\t" - "vl %%v28,64(%%r1,%2) \n\t" - "vl %%v29,80(%%r1,%2) \n\t" - "vl %%v30,96(%%r1,%2) \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - - "vfmasb %%v20,%%v0,%%v24,%%v28 \n\t" - "vfmasb %%v21,%%v0,%%v25,%%v29 \n\t" - "vfmasb %%v22,%%v0,%%v26,%%v30 \n\t" - "vfmasb %%v23,%%v0,%%v27,%%v31 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "vl %%v16,128(%%r1,%1) \n\t" - "vl %%v17,144(%%r1,%1) \n\t" - "vl %%v18,160(%%r1,%1) \n\t" - "vl %%v19,176(%%r1,%1) \n\t" - "vl %%v20,128(%%r1,%2) \n\t" - "vl %%v21,144(%%r1,%2) \n\t" - "vl %%v22,160(%%r1,%2) \n\t" - "vl %%v23,176(%%r1,%2) \n\t" - - "vfmasb %%v16,%%v0,%%v16,%%v20 \n\t" - "vfmasb %%v17,%%v0,%%v17,%%v21 \n\t" - "vfmasb %%v18,%%v0,%%v18,%%v22 \n\t" - "vfmasb %%v19,%%v0,%%v19,%%v23 \n\t" - - "vl %%v24,192(%%r1,%1) \n\t" - "vl %%v25,208(%%r1,%1) \n\t" - "vl %%v26,224(%%r1,%1) \n\t" - "vl %%v27,240(%%r1,%1) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfmasb %%v20,%%v0,%%v24,%%v28 \n\t" - "vfmasb %%v21,%%v0,%%v25,%%v29 \n\t" - "vfmasb %%v22,%%v0,%%v26,%%v30 \n\t" - "vfmasb %%v23,%%v0,%%v27,%%v31 \n\t" - - "vst %%v16,128(%%r1,%2) \n\t" - "vst %%v17,144(%%r1,%2) \n\t" - "vst %%v18,160(%%r1,%2) \n\t" - "vst %%v19,176(%%r1,%2) \n\t" - "vst %%v20,192(%%r1,%2) \n\t" - "vst %%v21,208(%%r1,%2) \n\t" - "vst %%v22,224(%%r1,%2) \n\t" - "vst %%v23,240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { + __asm__("vlrepf %%v0,%[alpha]\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,0(%%r1,%[y])\n\t" + "vl %%v21,16(%%r1,%[y])\n\t" + "vl %%v22,32(%%r1,%[y])\n\t" + "vl %%v23,48(%%r1,%[y])\n\t" + "vl %%v24,64(%%r1,%[x])\n\t" + "vl %%v25,80(%%r1,%[x])\n\t" + "vl %%v26,96(%%r1,%[x])\n\t" + "vl %%v27,112(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmasb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmasb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmasb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmasb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmasb 
%%v24,%%v0,%%v24,%%v28\n\t" + "vfmasb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmasb %%v26,%%v0,%%v26,%%v30\n\t" + "vfmasb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,0(%%r1,%[y])\n\t" + "vst %%v17,16(%%r1,%[y])\n\t" + "vst %%v18,32(%%r1,%[y])\n\t" + "vst %%v19,48(%%r1,%[y])\n\t" + "vst %%v24,64(%%r1,%[y])\n\t" + "vst %%v25,80(%%r1,%[y])\n\t" + "vst %%v26,96(%%r1,%[y])\n\t" + "vst %%v27,112(%%r1,%[y])\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,128(%%r1,%[y])\n\t" + "vl %%v21,144(%%r1,%[y])\n\t" + "vl %%v22,160(%%r1,%[y])\n\t" + "vl %%v23,176(%%r1,%[y])\n\t" + "vl %%v24,192(%%r1,%[x])\n\t" + "vl %%v25,208(%%r1,%[x])\n\t" + "vl %%v26,224(%%r1,%[x])\n\t" + "vl %%v27,240(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[y])\n\t" + "vl %%v29,208(%%r1,%[y])\n\t" + "vl %%v30,224(%%r1,%[y])\n\t" + "vl %%v31,240(%%r1,%[y])\n\t" + "vfmasb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmasb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmasb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmasb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmasb %%v24,%%v0,%%v24,%%v28\n\t" + "vfmasb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmasb %%v26,%%v0,%%v26,%%v30\n\t" + "vfmasb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,128(%%r1,%[y])\n\t" + "vst %%v17,144(%%r1,%[y])\n\t" + "vst %%v18,160(%%r1,%[y])\n\t" + "vst %%v19,176(%%r1,%[y])\n\t" + "vst %%v24,192(%%r1,%[y])\n\t" + "vst %%v25,208(%%r1,%[y])\n\t" + "vst %%v26,224(%%r1,%[y])\n\t" + "vst %%v27,240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), + [alpha] "m"(*alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if ( n <= 0 ) return 0 ; + if (n <= 0) + return 0; - if ( (inc_x == 1) && (inc_y == 1) ) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -64; + BLASLONG n1 = n & -64; - if ( n1 ) - saxpy_kernel_64(n1, x, y , &da); + if (n1) + saxpy_kernel_64(n1, x, y, &da); - i = n1; - while(i < n) - { - - y[i] += da * x[i] ; - i++ ; - - } - return 0 ; + i = n1; + while (i < n) { + y[i] += da * x[i]; + i++; } + return 0; - BLASLONG n1 = n & -4; + } - while(i < n1) - { + BLASLONG n1 = n & -4; - FLOAT m1 = da * x[ix] ; - FLOAT m2 = da * x[ix+inc_x] ; - FLOAT m3 = da * x[ix+2*inc_x] ; - FLOAT m4 = da * x[ix+3*inc_x] ; + while (i < n1) { - y[iy] += m1 ; - y[iy+inc_y] += m2 ; - y[iy+2*inc_y] += m3 ; - y[iy+3*inc_y] += m4 ; + FLOAT m1 = da * x[ix]; + FLOAT m2 = da * x[ix + inc_x]; + FLOAT m3 = da * x[ix + 2 * inc_x]; + FLOAT m4 = da * x[ix + 3 * inc_x]; - ix += inc_x*4 ; - iy += inc_y*4 ; - i+=4 ; + y[iy] += m1; + y[iy + inc_y] += m2; + y[iy + 2 * inc_y] += m3; + y[iy + 3 * inc_y] += m4; - } + ix += inc_x * 4; + iy += inc_y * 4; + i += 4; - while(i < n) - { + } - y[iy] += da * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; + while (i < n) { - } - return 0 ; - -} + y[iy] += da * x[ix]; + ix += inc_x; + iy += inc_y; + i++; + } + return 0; +} diff --git a/kernel/zarch/scopy.c b/kernel/zarch/scopy.c index ff4227595..44d27b062 100644 --- a/kernel/zarch/scopy.c +++ 
b/kernel/zarch/scopy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,59 +27,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "lgr %%r1,%1 \n\t" - "lgr %%r2,%2 \n\t" - "srlg %%r0,%0,6 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1) \n\t" - "pfd 2, 1024(%%r2) \n\t" - "mvc 0(256,%%r2),0(%%r1) \n\t" - "agfi %%r1,256 \n\t" - "agfi %%r2,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"a"((const FLOAT (*)[n])x),"a"((FLOAT (*)[n])y) - :"memory","cc","r0","r1","r2" - ); +static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],6\n\t" + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y) + : "m"(*(const FLOAT (*)[n]) x) + : "cc"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if (n <= 0) return 0; - - if ((inc_x == 1) && (inc_y == 1)) { - - BLASLONG n1 = n & -64; - if (n1 > 0) { - scopy_kernel_64(n1, x, y); - i = n1; - } + if (n <= 0) + return 0; - while (i < n) { - y[i] = x[i]; - i++; + if ((inc_x == 1) && (inc_y == 1)) { - } + BLASLONG n1 = n & -64; + if (n1 > 0) { + scopy_kernel_64(n1, x, y); + i = n1; + } + while (i < n) { + y[i] = x[i]; + i++; - } else { + } - while (i < n) { + } else { - y[iy] = x[ix]; - ix += inc_x; - iy += inc_y; - i++; + while (i < n) { - } + y[iy] = x[ix]; + ix += inc_x; + iy += inc_y; + i++; } - return 0; + } + return 0; } diff --git a/kernel/zarch/sdot.c b/kernel/zarch/sdot.c index 5ddbc69bd..f659b0c8a 100644 --- a/kernel/zarch/sdot.c +++ b/kernel/zarch/sdot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018,The OpenBLAS Project +Copyright (c) 2013-2019,The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms,with or without modification,are permitted provided that the following conditions are @@ -27,114 +27,118 @@ USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
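[The sdot hunk that follows replaces the single accumulator v0 with eight (v0-v7) so that consecutive vfmasb instructions no longer serialize on one register; the partials are folded only after the loop. A scalar sketch of the same reduction strategy is given below; it is illustrative only, plain C types and the hypothetical name sdot_block_ref stand in for FLOAT/BLASLONG, and n is assumed to be a multiple of 32 as in sdot_kernel_32.

/* Eight independent partial sums mimic the kernel's v0-v7 accumulators;
 * keeping them independent breaks the FMA dependency chain. */
static float sdot_block_ref(long n, const float *x, const float *y) {
  float s[8] = {0.0f};
  long i;
  int k;
  for (i = 0; i < n; i += 8)
    for (k = 0; k < 8; k++)          /* one FMA per accumulator */
      s[k] += x[i + k] * y[i + k];
  /* pairwise fold, as the kernel does with vfasb after the loop */
  return ((s[0] + s[1]) + (s[2] + s[3])) + ((s[4] + s[5]) + (s[6] + s[7]));
}
]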
#include "common.h" -static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) -{ - FLOAT dot; - - __asm__ volatile ( - "vzero %%v0 \n\t" - "srlg %%r0,%1,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%3) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,16(%%r1,%3) \n\t" - "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t" - "vl %%v27,48(%%r1,%3) \n\t" - "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t" - "vl %%v28,64(%%r1,%3) \n\t" - "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t" - "vl %%v29,80(%%r1,%3) \n\t" - "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t" - "vl %%v30,96(%%r1,%3) \n\t" - "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t" - "vl %%v31,112(%%r1,%3) \n\t" - "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - "vrepf %%v1,%%v0,1 \n\t" - "vrepf %%v2,%%v0,2 \n\t" - "vrepf %%v3,%%v0,3 \n\t" - "aebr %%f0,%%f1 \n\t" - "aebr %%f0,%%f2 \n\t" - "aebr %%f0,%%f3 \n\t" - "ler %0,%%f0 " - :"=f"(dot) - :"r"(n),"ZR"((const FLOAT (*)[n])x),"ZR"((const FLOAT (*)[n])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return dot; +static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { + FLOAT dot; + + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "pfd 1,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vfmasb %%v1,%%v17,%%v25,%%v1\n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" + "vfmasb %%v3,%%v19,%%v27,%%v3\n\t" + "vfmasb %%v4,%%v20,%%v28,%%v4\n\t" + "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" + "vfmasb %%v6,%%v22,%%v30,%%v6\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vfasb %%v0,%%v0,%%v3\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v0,%%v0,%%v5\n\t" + "vfasb %%v0,%%v0,%%v6\n\t" + "vfasb %%v0,%%v0,%%v7\n\t" + "vrepf %%v1,%%v0,1\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepf %%v3,%%v0,3\n\t" + "aebr %%f0,%%f1\n\t" + "aebr %%f0,%%f2\n\t" + "aebr %%f0,%%f3\n\t" + "ler %[dot],%%f0" + : [dot] "=f"(dot),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y), + [y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); + + return dot; } -FLOAT CNAME(BLASLONG n,FLOAT *x,BLASLONG inc_x,FLOAT *y,BLASLONG inc_y) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, 
FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - FLOAT dot = 0.0 ; + FLOAT dot = 0.0; - if ( n <= 0 ) return(dot); + if (n <= 0) + return (dot); - if ( (inc_x == 1) && (inc_y == 1) ) - { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -32; + BLASLONG n1 = n & -32; - if ( n1 ) - dot = sdot_kernel_32(n1,x,y); + if (n1) + dot = sdot_kernel_32(n1, x, y); - i = n1; - while(i < n) - { + i = n1; + while (i < n) { - dot += y[i] * x[i] ; - i++ ; + dot += y[i] * x[i]; + i++; - } - return(dot); + } + return (dot); + } - } + BLASLONG n1 = n & -2; - BLASLONG n1 = n & -2; + while (i < n1) { - while(i < n1) - { + dot += y[iy] * x[ix] + y[iy + inc_y] * x[ix + inc_x]; + ix += inc_x * 2; + iy += inc_y * 2; + i += 2; - dot += y[iy] * x[ix] + y[iy+inc_y] * x[ix+inc_x]; - ix += inc_x*2 ; - iy += inc_y*2 ; - i+=2 ; + } - } + while (i < n) { - while(i < n) - { + dot += y[iy] * x[ix]; + ix += inc_x; + iy += inc_y; + i++; - dot += y[iy] * x[ix] ; - ix += inc_x ; - iy += inc_y ; - i++ ; - - } - return(dot); + } + return (dot); } - - diff --git a/kernel/zarch/sgemv_n_4.c b/kernel/zarch/sgemv_n_4.c index 01d8414de..86ac24993 100644 --- a/kernel/zarch/sgemv_n_4.c +++ b/kernel/zarch/sgemv_n_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -29,640 +29,559 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 2048 -static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile ( - "vlrepf %%v0,0(%5) \n\t" - "vlrepf %%v1,4(%5) \n\t" - "vlrepf %%v2,8(%5) \n\t" - "vlrepf %%v3,12(%5) \n\t" - "vlrepf %%v4,%7 \n\t" - "vfmsb %%v0,%%v0,%%v4 \n\t" - "vfmsb %%v1,%%v1,%%v4 \n\t" - "vfmsb %%v2,%%v2,%%v4 \n\t" - "vfmsb %%v3,%%v3,%%v4 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 2,1024(%%r1,%6) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - "vl %%v18,0(%%r1,%3) \n\t" - "vl %%v19,0(%%r1,%4) \n\t" - "vl %%v20,16(%%r1,%1) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,16(%%r1,%3) \n\t" - "vl %%v23,16(%%r1,%4) \n\t" - "vl %%v24,32(%%r1,%1) \n\t" - "vl %%v25,32(%%r1,%2) \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vl %%v27,32(%%r1,%4) \n\t" - "vl %%v28,48(%%r1,%1) \n\t" - "vl %%v29,48(%%r1,%2) \n\t" - "vl %%v30,48(%%r1,%3) \n\t" - "vl %%v31,48(%%r1,%4) \n\t" - - "vl %%v4,0(%%r1,%6) \n\t" - "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" - "vst %%v4,0(%%r1,%6) \n\t" - - "vl %%v4,16(%%r1,%6) \n\t" - "vfmasb %%v4,%%v20,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v21,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v22,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v23,%%v3,%%v4 \n\t" - "vst %%v4,16(%%r1,%6) \n\t" - - "vl %%v4,32(%%r1,%6) \n\t" - "vfmasb %%v4,%%v24,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v25,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v26,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v27,%%v3,%%v4 \n\t" - "vst %%v4,32(%%r1,%6) \n\t" - - "vl %%v4,48(%%r1,%6) \n\t" - "vfmasb %%v4,%%v28,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v29,%%v1,%%v4 \n\t" - "vfmasb 
%%v4,%%v30,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v31,%%v3,%%v4 \n\t" - "vst %%v4,48(%%r1,%6) \n\t" - - "vl %%v16,64(%%r1,%1) \n\t" - "vl %%v17,64(%%r1,%2) \n\t" - "vl %%v18,64(%%r1,%3) \n\t" - "vl %%v19,64(%%r1,%4) \n\t" - "vl %%v20,80(%%r1,%1) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,80(%%r1,%3) \n\t" - "vl %%v23,80(%%r1,%4) \n\t" - "vl %%v24,96(%%r1,%1) \n\t" - "vl %%v25,96(%%r1,%2) \n\t" - "vl %%v26,96(%%r1,%3) \n\t" - "vl %%v27,96(%%r1,%4) \n\t" - "vl %%v28,112(%%r1,%1) \n\t" - "vl %%v29,112(%%r1,%2) \n\t" - "vl %%v30,112(%%r1,%3) \n\t" - "vl %%v31,112(%%r1,%4) \n\t" - - "vl %%v4,64(%%r1,%6) \n\t" - "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" - "vst %%v4,64(%%r1,%6) \n\t" - - "vl %%v4,80(%%r1,%6) \n\t" - "vfmasb %%v4,%%v20,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v21,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v22,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v23,%%v3,%%v4 \n\t" - "vst %%v4,80(%%r1,%6) \n\t" - - "vl %%v4,96(%%r1,%6) \n\t" - "vfmasb %%v4,%%v24,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v25,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v26,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v27,%%v3,%%v4 \n\t" - "vst %%v4,96(%%r1,%6) \n\t" - - "vl %%v4,112(%%r1,%6) \n\t" - "vfmasb %%v4,%%v28,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v29,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v30,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v31,%%v3,%%v4 \n\t" - "vst %%v4,112(%%r1,%6) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - "vl %%v18,0(%%r1,%3) \n\t" - "vl %%v19,0(%%r1,%4) \n\t" - - "vl %%v4,0(%%r1,%6) \n\t" - "vfmasb %%v4,%%v16,%%v0,%%v4 \n\t" - "vfmasb %%v4,%%v17,%%v1,%%v4 \n\t" - "vfmasb %%v4,%%v18,%%v2,%%v4 \n\t" - "vfmasb %%v4,%%v19,%%v3,%%v4 \n\t" - "vst %%v4,0(%%r1,%6) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZQ"((const FLOAT (*)[4])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vlrepf %%v0,0(%[x])\n\t" + "vlrepf %%v1,4(%[x])\n\t" + "vlrepf %%v2,8(%[x])\n\t" + "vlrepf %%v3,12(%[x])\n\t" + "vlrepf %%v4,%[alpha]\n\t" + "vfmsb %%v0,%%v0,%%v4\n\t" + "vfmsb %%v1,%%v1,%%v4\n\t" + "vfmsb %%v2,%%v2,%%v4\n\t" + "vfmsb %%v3,%%v3,%%v4\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vl 
%%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vl %%v6,32(%%r1,%[y])\n\t" + "vl %%v7,48(%%r1,%[y])\n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmasb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmasb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmasb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmasb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmasb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmasb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmasb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmasb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmasb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmasb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "vst %%v6,32(%%r1,%[y])\n\t" + "vst %%v7,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[ap0])\n\t" + "vl %%v17,64(%%r1,%[ap1])\n\t" + "vl %%v18,64(%%r1,%[ap2])\n\t" + "vl %%v19,64(%%r1,%[ap3])\n\t" + "vl %%v20,80(%%r1,%[ap0])\n\t" + "vl %%v21,80(%%r1,%[ap1])\n\t" + "vl %%v22,80(%%r1,%[ap2])\n\t" + "vl %%v23,80(%%r1,%[ap3])\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vl %%v4,64(%%r1,%[y])\n\t" + "vl %%v5,80(%%r1,%[y])\n\t" + "vl %%v6,96(%%r1,%[y])\n\t" + "vl %%v7,112(%%r1,%[y])\n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmasb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmasb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmasb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmasb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmasb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmasb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmasb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmasb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmasb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmasb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,64(%%r1,%[y])\n\t" + "vst %%v5,80(%%r1,%[y])\n\t" + "vst %%v6,96(%%r1,%[y])\n\t" + "vst %%v7,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), + "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), + "m"(*(const FLOAT (*)[4]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile ( - "vlrepf %%v0,0(%3) \n\t" - "vlrepf %%v1,4(%3) \n\t" - "vlrepf %%v2,%5 \n\t" - "vfmsb %%v0,%%v0,%%v2 \n\t" - "vfmsb %%v1,%%v1,%%v2 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr 
%%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%4) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - "vl %%v18,16(%%r1,%1) \n\t" - "vl %%v19,16(%%r1,%2) \n\t" - "vl %%v20,32(%%r1,%1) \n\t" - "vl %%v21,32(%%r1,%2) \n\t" - "vl %%v22,48(%%r1,%1) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - "vl %%v24,64(%%r1,%1) \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vl %%v26,80(%%r1,%1) \n\t" - "vl %%v27,80(%%r1,%2) \n\t" - "vl %%v28,96(%%r1,%1) \n\t" - "vl %%v29,96(%%r1,%2) \n\t" - "vl %%v30,112(%%r1,%1) \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - - "vl %%v2,0(%%r1,%4) \n\t" - "vfmasb %%v2,%%v16,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v17,%%v1,%%v2 \n\t" - "vst %%v2,0(%%r1,%4) \n\t" - - "vl %%v2,16(%%r1,%4) \n\t" - "vfmasb %%v2,%%v18,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v19,%%v1,%%v2 \n\t" - "vst %%v2,16(%%r1,%4) \n\t" - - "vl %%v2,32(%%r1,%4) \n\t" - "vfmasb %%v2,%%v20,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v21,%%v1,%%v2 \n\t" - "vst %%v2,32(%%r1,%4) \n\t" - - "vl %%v2,48(%%r1,%4) \n\t" - "vfmasb %%v2,%%v22,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v23,%%v1,%%v2 \n\t" - "vst %%v2,48(%%r1,%4) \n\t" - - "vl %%v2,64(%%r1,%4) \n\t" - "vfmasb %%v2,%%v24,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v25,%%v1,%%v2 \n\t" - "vst %%v2,64(%%r1,%4) \n\t" - - "vl %%v2,80(%%r1,%4) \n\t" - "vfmasb %%v2,%%v26,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v27,%%v1,%%v2 \n\t" - "vst %%v2,80(%%r1,%4) \n\t" - - "vl %%v2,96(%%r1,%4) \n\t" - "vfmasb %%v2,%%v28,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v29,%%v1,%%v2 \n\t" - "vst %%v2,96(%%r1,%4) \n\t" - - "vl %%v2,112(%%r1,%4) \n\t" - "vfmasb %%v2,%%v30,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v31,%%v1,%%v2 \n\t" - "vst %%v2,112(%%r1,%4) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,0(%%r1,%2) \n\t" - - "vl %%v2,0(%%r1,%4) \n\t" - "vfmasb %%v2,%%v16,%%v0,%%v2 \n\t" - "vfmasb %%v2,%%v17,%%v1,%%v2 \n\t" - "vst %%v2,0(%%r1,%4) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZQ"((const FLOAT (*)[2])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vlrepf %%v0,0(%[x])\n\t" + "vlrepf %%v1,4(%[x])\n\t" + "vlrepf %%v2,%[alpha]\n\t" + "vfmsb %%v0,%%v0,%%v2\n\t" + "vfmsb %%v1,%%v1,%%v2\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v20,32(%%r1,%[ap0])\n\t" + "vl %%v21,32(%%r1,%[ap1])\n\t" + "vl %%v22,48(%%r1,%[ap0])\n\t" + "vl %%v23,48(%%r1,%[ap1])\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vl 
%%v4,32(%%r1,%[y])\n\t" + "vl %%v5,48(%%r1,%[y])\n\t" + "vl %%v6,64(%%r1,%[y])\n\t" + "vl %%v7,80(%%r1,%[y])\n\t" + "vl %%v8,96(%%r1,%[y])\n\t" + "vl %%v9,112(%%r1,%[y])\n\t" + "vfmasb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmasb %%v3,%%v18,%%v0,%%v3\n\t" + "vfmasb %%v4,%%v20,%%v0,%%v4\n\t" + "vfmasb %%v5,%%v22,%%v0,%%v5\n\t" + "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmasb %%v7,%%v26,%%v0,%%v7\n\t" + "vfmasb %%v8,%%v28,%%v0,%%v8\n\t" + "vfmasb %%v9,%%v30,%%v0,%%v9\n\t" + "vfmasb %%v2,%%v17,%%v1,%%v2\n\t" + "vfmasb %%v3,%%v19,%%v1,%%v3\n\t" + "vfmasb %%v4,%%v21,%%v1,%%v4\n\t" + "vfmasb %%v5,%%v23,%%v1,%%v5\n\t" + "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmasb %%v7,%%v27,%%v1,%%v7\n\t" + "vfmasb %%v8,%%v29,%%v1,%%v8\n\t" + "vfmasb %%v9,%%v31,%%v1,%%v9\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst %%v3,16(%%r1,%[y])\n\t" + "vst %%v4,32(%%r1,%[y])\n\t" + "vst %%v5,48(%%r1,%[y])\n\t" + "vst %%v6,64(%%r1,%[y])\n\t" + "vst %%v7,80(%%r1,%[y])\n\t" + "vst %%v8,96(%%r1,%[y])\n\t" + "vst %%v9,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vfmasb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmasb %%v2,%%v17,%%v1,%%v2\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[2]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile ( - "vlrepf %%v0,0(%2) \n\t" - "vlrepf %%v1,%4 \n\t" - "vfmsb %%v0,%%v0,%%v1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,64(%%r1,%1) \n\t" - "vl %%v21,80(%%r1,%1) \n\t" - "vl %%v22,96(%%r1,%1) \n\t" - "vl %%v23,112(%%r1,%1) \n\t" - - "vl %%v1,0(%%r1,%3) \n\t" - "vfmasb %%v1,%%v16,%%v0,%%v1 \n\t" - "vst %%v1,0(%%r1,%3) \n\t" - - "vl %%v1,16(%%r1,%3) \n\t" - "vfmasb %%v1,%%v17,%%v0,%%v1 \n\t" - "vst %%v1,16(%%r1,%3) \n\t" - - "vl %%v1,32(%%r1,%3) \n\t" - "vfmasb %%v1,%%v18,%%v0,%%v1 \n\t" - "vst %%v1,32(%%r1,%3) \n\t" - - "vl %%v1,48(%%r1,%3) \n\t" - "vfmasb %%v1,%%v19,%%v0,%%v1 \n\t" - "vst %%v1,48(%%r1,%3) \n\t" - - "vl %%v1,64(%%r1,%3) \n\t" - "vfmasb %%v1,%%v20,%%v0,%%v1 \n\t" - "vst %%v1,64(%%r1,%3) \n\t" - - "vl %%v1,80(%%r1,%3) \n\t" - "vfmasb %%v1,%%v21,%%v0,%%v1 \n\t" - "vst %%v1,80(%%r1,%3) \n\t" - - "vl %%v1,96(%%r1,%3) \n\t" - "vfmasb %%v1,%%v22,%%v0,%%v1 \n\t" - "vst %%v1,96(%%r1,%3) \n\t" - - "vl %%v1,112(%%r1,%3) \n\t" - "vfmasb %%v1,%%v23,%%v0,%%v1 \n\t" - "vst %%v1,112(%%r1,%3) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%1) \n\t" - - "vl %%v1,0(%%r1,%3) \n\t" - "vfmasb %%v1,%%v16,%%v0,%%v1 \n\t" 
- "vst %%v1,0(%%r1,%3) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZQ"((const FLOAT (*)[1])xo),"ZR"((FLOAT (*)[n])y),"m"(*alpha) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, + FLOAT *alpha) { + __asm__("vlrepf %%v0,0(%[x])\n\t" + "vlrepf %%v16,%[alpha]\n\t" + "vfmsb %%v0,%%v0,%%v16\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,32(%%r1,%[a0])\n\t" + "vl %%v19,48(%%r1,%[a0])\n\t" + "vl %%v20,64(%%r1,%[a0])\n\t" + "vl %%v21,80(%%r1,%[a0])\n\t" + "vl %%v22,96(%%r1,%[a0])\n\t" + "vl %%v23,112(%%r1,%[a0])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmasb %%v25,%%v17,%%v0,%%v25\n\t" + "vfmasb %%v26,%%v18,%%v0,%%v26\n\t" + "vfmasb %%v27,%%v19,%%v0,%%v27\n\t" + "vfmasb %%v28,%%v20,%%v0,%%v28\n\t" + "vfmasb %%v29,%%v21,%%v0,%%v29\n\t" + "vfmasb %%v30,%%v22,%%v0,%%v30\n\t" + "vfmasb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v24,0(%%r1,%[y])\n\t" + "vst %%v25,16(%%r1,%[y])\n\t" + "vst %%v26,32(%%r1,%[y])\n\t" + "vst %%v27,48(%%r1,%[y])\n\t" + "vst %%v28,64(%%r1,%[y])\n\t" + "vst %%v29,80(%%r1,%[y])\n\t" + "vst %%v30,96(%%r1,%[y])\n\t" + "vst %%v31,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,0(%%r1,%[y])\n\t" + "vfmasb %%v17,%%v16,%%v0,%%v17\n\t" + "vst %%v17,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), + "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "m"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - BLASLONG i; - for (i = 0; i < n; i++) - { - *dest += src[i]; - dest += inc_dest; - } +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest += src[i]; + dest += inc_dest; + } } -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) -{ - BLASLONG i; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - FLOAT *ap[4]; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - BLASLONG lda4 = lda << 2; - FLOAT xbuffer[8],*ybuffer; - - if ( m < 1 ) return(0); - if ( n < 1 ) return(0); - - ybuffer = buffer; - - n1 = n >> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - y_ptr = y; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - 
ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( inc_y != 1 ) - memset(ybuffer,0,NB*4); - else - ybuffer = y_ptr; - - if ( inc_x == 1 ) - { - - - for( i = 0; i < n1 ; i++) - { - sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr += 4; - } - - if ( n2 & 2 ) - { - sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); - a_ptr += lda*2; - x_ptr += 2; - } - - - if ( n2 & 1 ) - { - sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - /* a_ptr += lda; - x_ptr += 1; */ - - } - - - } - else - { - - for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for( i = 0; i < n2 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); - a_ptr += lda; - - } - - } - - a += NB; - if ( inc_y != 1 ) - { - add_y(NB,ybuffer,y_ptr,inc_y); - y_ptr += NB * inc_y; - } - else - y_ptr += NB ; - +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, + BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *buffer) { + BLASLONG i; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[4]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4 = lda << 2; + FLOAT xbuffer[8], *ybuffer; + + if (m < 1) + return (0); + if (n < 1) + return (0); + + ybuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m & -4; + m2 = (m & (NBMAX - 1)) - m3; + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; } - if ( m3 == 0 ) return(0); - - if ( m3 == 3 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - if ( lda == 3 && inc_x ==1 ) - { - - for( i = 0; i < ( n & -4 ); i+=4 ) - { - - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - - temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; - temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; - temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; - - a_ptr += 12; - x_ptr += 4; - } - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += 3; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - return(0); + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if (inc_y != 1) + memset(ybuffer, 0, NB * 4); + else + ybuffer = y_ptr; + + if (inc_x == 1) { + + for (i = 0; i < n1; i++) { + sgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if (n2 & 2) { + sgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha); + a_ptr += lda * 2; + x_ptr += 2; + } + + if (n2 & 1) { + sgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha); + /* a_ptr += lda; + x_ptr += 1; 
*/ + + } + + } else { + + for (i = 0; i < n1; i++) { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, &alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for (i = 0; i < n2; i++) { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + sgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha); + a_ptr += lda; + + } + } + a += NB; + if (inc_y != 1) { + add_y(NB, ybuffer, y_ptr, inc_y); + y_ptr += NB * inc_y; + } else + y_ptr += NB; + + } + + if (m3 == 0) + return (0); + + if (m3 == 3) { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if (lda == 3 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for (; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr++; + } + + } else { + + for (i = 0; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + } - if ( m3 == 2 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - if ( lda == 2 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4) ; i+=4 ) - { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - return(0); } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return (0); + } + + if (m3 == 2) { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if (lda == 2 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + for (; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr++; + } + + } else { + + for (i = 0; i < n; i++) { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + } - if ( m3 == 1 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if ( lda == 1 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4); i+=4 ) - { - temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; - - } - - for( ; i < n; i++ ) - { - temp += a_ptr[i] * x_ptr[i]; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp += a_ptr[0] * 
x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } - - } - y_ptr[0] += alpha * temp; - return(0); } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return (0); + } + + if (m3 == 1) { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if (lda == 1 && inc_x == 1) { + + for (i = 0; i < (n & -4); i += 4) { + temp += + a_ptr[i] * x_ptr[i] + a_ptr[i + 1] * x_ptr[i + 1] + a_ptr[i + + 2] * + x_ptr[i + 2] + a_ptr[i + 3] * x_ptr[i + 3]; + + } + + for (; i < n; i++) { + temp += a_ptr[i] * x_ptr[i]; + } + + } else { + + for (i = 0; i < n; i++) { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + } + y_ptr[0] += alpha * temp; + return (0); + } - return(0); + return (0); } - - diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c index a3136723a..6ae9b6d7f 100644 --- a/kernel/zarch/sgemv_t_4.c +++ b/kernel/zarch/sgemv_t_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -29,783 +29,717 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 2048 -static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - "pfd 1,1024(%%r1,%4) \n\t" - "pfd 1,1024(%%r1,%5) \n\t" - - "vl %%v16,0(%%r1,%5) \n\t" - "vl %%v17,16(%%r1,%5) \n\t" - "vl %%v18,32(%%r1,%5) \n\t" - "vl %%v19,48(%%r1,%5) \n\t" - "vl %%v20,64(%%r1,%5) \n\t" - "vl %%v21,80(%%r1,%5) \n\t" - "vl %%v22,96(%%r1,%5) \n\t" - "vl %%v23,112(%%r1,%5) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - "vl %%v26,0(%%r1,%3) \n\t" - "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" - "vl %%v27,0(%%r1,%4) \n\t" - "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" - - "vl %%v28,16(%%r1,%1) \n\t" - "vfmasb %%v0,%%v17,%%v28,%%v0 \n\t" - "vl %%v29,16(%%r1,%2) \n\t" - "vfmasb %%v1,%%v17,%%v29,%%v1 \n\t" - "vl %%v30,16(%%r1,%3) \n\t" - "vfmasb %%v2,%%v17,%%v30,%%v2 \n\t" - "vl %%v31,16(%%r1,%4) \n\t" - "vfmasb %%v3,%%v17,%%v31,%%v3 \n\t" - - "vl %%v24,32(%%r1,%1) \n\t" - "vfmasb %%v0,%%v18,%%v24,%%v0 \n\t" - "vl %%v25,32(%%r1,%2) \n\t" - "vfmasb %%v1,%%v18,%%v25,%%v1 \n\t" - "vl %%v26,32(%%r1,%3) \n\t" - "vfmasb %%v2,%%v18,%%v26,%%v2 \n\t" - "vl %%v27,32(%%r1,%4) \n\t" - "vfmasb %%v3,%%v18,%%v27,%%v3 \n\t" - - "vl %%v28,48(%%r1,%1) \n\t" - "vfmasb %%v0,%%v19,%%v28,%%v0 \n\t" - "vl %%v29,48(%%r1,%2) \n\t" - "vfmasb %%v1,%%v19,%%v29,%%v1 \n\t" - "vl %%v30,48(%%r1,%3) \n\t" - "vfmasb %%v2,%%v19,%%v30,%%v2 \n\t" - "vl %%v31,48(%%r1,%4) \n\t" - "vfmasb %%v3,%%v19,%%v31,%%v3 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" - "vl %%v26,64(%%r1,%3) \n\t" - "vfmasb %%v2,%%v20,%%v26,%%v2 \n\t" - "vl %%v27,64(%%r1,%4) \n\t" - "vfmasb %%v3,%%v20,%%v27,%%v3 \n\t" - - "vl %%v28,80(%%r1,%1) \n\t" - "vfmasb %%v0,%%v21,%%v28,%%v0 \n\t" - "vl %%v29,80(%%r1,%2) \n\t" - "vfmasb %%v1,%%v21,%%v29,%%v1 \n\t" - "vl 
%%v30,80(%%r1,%3) \n\t" - "vfmasb %%v2,%%v21,%%v30,%%v2 \n\t" - "vl %%v31,80(%%r1,%4) \n\t" - "vfmasb %%v3,%%v21,%%v31,%%v3 \n\t" - - "vl %%v24,96(%%r1,%1) \n\t" - "vfmasb %%v0,%%v22,%%v24,%%v0 \n\t" - "vl %%v25,96(%%r1,%2) \n\t" - "vfmasb %%v1,%%v22,%%v25,%%v1 \n\t" - "vl %%v26,96(%%r1,%3) \n\t" - "vfmasb %%v2,%%v22,%%v26,%%v2 \n\t" - "vl %%v27,96(%%r1,%4) \n\t" - "vfmasb %%v3,%%v22,%%v27,%%v3 \n\t" - - "vl %%v28,112(%%r1,%1) \n\t" - "vfmasb %%v0,%%v23,%%v28,%%v0 \n\t" - "vl %%v29,112(%%r1,%2) \n\t" - "vfmasb %%v1,%%v23,%%v29,%%v1 \n\t" - "vl %%v30,112(%%r1,%3) \n\t" - "vfmasb %%v2,%%v23,%%v30,%%v2 \n\t" - "vl %%v31,112(%%r1,%4) \n\t" - "vfmasb %%v3,%%v23,%%v31,%%v3 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%5) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - "vl %%v26,0(%%r1,%3) \n\t" - "vfmasb %%v2,%%v16,%%v26,%%v2 \n\t" - "vl %%v27,0(%%r1,%4) \n\t" - "vfmasb %%v3,%%v16,%%v27,%%v3 \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "veslg %%v4,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v4 \n\t" - "vrepg %%v4,%%v0,1 \n\t" - "aebr %%f0,%%f4 \n\t" - "ste %%f0,0(%6) \n\t" - "veslg %%v4,%%v1,32 \n\t" - "vfasb %%v1,%%v1,%%v4 \n\t" - "vrepg %%v4,%%v1,1 \n\t" - "aebr %%f1,%%f4 \n\t" - "ste %%f1,4(%6) \n\t" - "veslg %%v4,%%v2,32 \n\t" - "vfasb %%v2,%%v2,%%v4 \n\t" - "vrepg %%v4,%%v2,1 \n\t" - "aebr %%f2,%%f4 \n\t" - "ste %%f2,8(%6) \n\t" - "veslg %%v4,%%v3,32 \n\t" - "vfasb %%v3,%%v3,%%v4 \n\t" - "vrepg %%v4,%%v3,1 \n\t" - "aebr %%f3,%%f4 \n\t" - "ste %%f3,12(%6) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])ap[2]),"ZR"((const FLOAT (*)[n])ap[3]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[4])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v17,%%v28,%%v4\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v17,%%v29,%%v5\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v17,%%v30,%%v6\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v17,%%v31,%%v7\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v18,%%v24,%%v0\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vfmasb 
%%v1,%%v18,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v18,%%v27,%%v3\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v19,%%v28,%%v4\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v19,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,64(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v20,%%v26,%%v2\n\t" + "vl %%v27,64(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v20,%%v27,%%v3\n\t" + "vl %%v28,80(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v21,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,80(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v21,%%v30,%%v6\n\t" + "vl %%v31,80(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v21,%%v31,%%v7\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v22,%%v24,%%v0\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v22,%%v25,%%v1\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v22,%%v26,%%v2\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v22,%%v27,%%v3\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v23,%%v28,%%v4\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v23,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v1,%%v1,%%v5\n\t" + "vfasb %%v2,%%v2,%%v6\n\t" + "vfasb %%v3,%%v3,%%v7\n\t" + "veslg %%v4,%%v0,32\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vrepg %%v4,%%v0,1\n\t" + "aebr %%f0,%%f4\n\t" + "ste %%f0,0(%[y])\n\t" + "veslg %%v4,%%v1,32\n\t" + "vfasb %%v1,%%v1,%%v4\n\t" + "vrepg %%v4,%%v1,1\n\t" + "aebr %%f1,%%f4\n\t" + "ste %%f1,4(%[y])\n\t" + "veslg %%v4,%%v2,32\n\t" + "vfasb %%v2,%%v2,%%v4\n\t" + "vrepg %%v4,%%v2,1\n\t" + "aebr %%f2,%%f4\n\t" + "ste %%f2,8(%[y])\n\t" + "veslg %%v4,%%v3,32\n\t" + "vfasb %%v3,%%v3,%%v4\n\t" + "vrepg %%v4,%%v3,1\n\t" + "aebr %%f3,%%f4\n\t" + "ste %%f3,12(%[y])" + : "=m"(*(FLOAT (*)[4]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), + "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 
1,1024(%%r1,%2) \n\t" - "pfd 1,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%3) \n\t" - "vl %%v17,16(%%r1,%3) \n\t" - "vl %%v18,32(%%r1,%3) \n\t" - "vl %%v19,48(%%r1,%3) \n\t" - "vl %%v20,64(%%r1,%3) \n\t" - "vl %%v21,80(%%r1,%3) \n\t" - "vl %%v22,96(%%r1,%3) \n\t" - "vl %%v23,112(%%r1,%3) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - - "vl %%v26,16(%%r1,%1) \n\t" - "vfmasb %%v0,%%v17,%%v26,%%v0 \n\t" - "vl %%v27,16(%%r1,%2) \n\t" - "vfmasb %%v1,%%v17,%%v27,%%v1 \n\t" - - "vl %%v28,32(%%r1,%1) \n\t" - "vfmasb %%v0,%%v18,%%v28,%%v0 \n\t" - "vl %%v29,32(%%r1,%2) \n\t" - "vfmasb %%v1,%%v18,%%v29,%%v1 \n\t" - - "vl %%v30,48(%%r1,%1) \n\t" - "vfmasb %%v0,%%v19,%%v30,%%v0 \n\t" - "vl %%v31,48(%%r1,%2) \n\t" - "vfmasb %%v1,%%v19,%%v31,%%v1 \n\t" - - "vl %%v24,64(%%r1,%1) \n\t" - "vfmasb %%v0,%%v20,%%v24,%%v0 \n\t" - "vl %%v25,64(%%r1,%2) \n\t" - "vfmasb %%v1,%%v20,%%v25,%%v1 \n\t" - - "vl %%v26,80(%%r1,%1) \n\t" - "vfmasb %%v0,%%v21,%%v26,%%v0 \n\t" - "vl %%v27,80(%%r1,%2) \n\t" - "vfmasb %%v1,%%v21,%%v27,%%v1 \n\t" - - "vl %%v28,96(%%r1,%1) \n\t" - "vfmasb %%v0,%%v22,%%v28,%%v0 \n\t" - "vl %%v29,96(%%r1,%2) \n\t" - "vfmasb %%v1,%%v22,%%v29,%%v1 \n\t" - - "vl %%v30,112(%%r1,%1) \n\t" - "vfmasb %%v0,%%v23,%%v30,%%v0 \n\t" - "vl %%v31,112(%%r1,%2) \n\t" - "vfmasb %%v1,%%v23,%%v31,%%v1 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%3) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - "vl %%v25,0(%%r1,%2) \n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1 \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "veslg %%v2,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v2 \n\t" - "vrepg %%v2,%%v0,1 \n\t" - "aebr %%f0,%%f2 \n\t" - "ste %%f0,0(%4) \n\t" - "veslg %%v2,%%v1,32 \n\t" - "vfasb %%v1,%%v1,%%v2 \n\t" - "vrepg %%v2,%%v1,1 \n\t" - "aebr %%f1,%%f2 \n\t" - "ste %%f1,4(%4) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])ap[0]),"ZR"((const FLOAT (*)[n])ap[1]),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[2])y) - :"memory","cc","r0","r1","v0","v1","v2","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" + "vfmasb %%v2,%%v17,%%v26,%%v2\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" + "vfmasb %%v3,%%v17,%%v27,%%v3\n\t" + "vl %%v28,32(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v18,%%v28,%%v4\n\t" + "vl %%v29,32(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v18,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap0])\n\t" + "vfmasb %%v6,%%v19,%%v30,%%v6\n\t" + "vl 
%%v31,48(%%r1,%[ap1])\n\t" + "vfmasb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vfmasb %%v2,%%v21,%%v26,%%v2\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vfmasb %%v3,%%v21,%%v27,%%v3\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v22,%%v28,%%v4\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v22,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vfmasb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v0,%%v0,%%v6\n\t" + "vfasb %%v1,%%v1,%%v3\n\t" + "vfasb %%v1,%%v1,%%v5\n\t" + "vfasb %%v1,%%v1,%%v7\n\t" + "veslg %%v2,%%v0,32\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vrepg %%v2,%%v0,1\n\t" + "aebr %%f0,%%f2\n\t" + "ste %%f0,0(%[y])\n\t" + "veslg %%v2,%%v1,32\n\t" + "vfasb %%v1,%%v1,%%v2\n\t" + "vrepg %%v2,%%v1,1\n\t" + "aebr %%f1,%%f2\n\t" + "ste %%f1,4(%[y])" + : "=m"(*(FLOAT (*)[2]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vzero %%v0 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - - "vl %%v25,16(%%r1,%1) \n\t" - "vfmasb %%v0,%%v17,%%v25,%%v0 \n\t" - - "vl %%v26,32(%%r1,%1) \n\t" - "vfmasb %%v0,%%v18,%%v26,%%v0 \n\t" - - "vl %%v27,48(%%r1,%1) \n\t" - "vfmasb %%v0,%%v19,%%v27,%%v0 \n\t" - - "vl %%v28,64(%%r1,%1) \n\t" - "vfmasb %%v0,%%v20,%%v28,%%v0 \n\t" - - "vl %%v29,80(%%r1,%1) \n\t" - "vfmasb %%v0,%%v21,%%v29,%%v0 \n\t" - - "vl %%v30,96(%%r1,%1) \n\t" - "vfmasb %%v0,%%v22,%%v30,%%v0 \n\t" - - "vl %%v31,112(%%r1,%1) \n\t" - "vfmasb %%v0,%%v23,%%v31,%%v0 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%2) \n\t" - - "vl %%v24,0(%%r1,%1) \n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0 \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "veslg %%v1,%%v0,32 \n\t" - "vfasb %%v0,%%v0,%%v1 \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "aebr %%f0,%%f1 \n\t" - "ste %%f0,0(%3) " - : - :"r"(n),"ZR"((const FLOAT (*)[n])a0),"ZR"((const FLOAT (*)[n])x),"ZQ"((FLOAT (*)[1])y) - 
:"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { + __asm__("vzero %%v0\n\t" + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" + "vfmasb %%v1,%%v17,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[a0])\n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,48(%%r1,%[a0])\n\t" + "vfmasb %%v3,%%v19,%%v27,%%v3\n\t" + "vl %%v28,64(%%r1,%[a0])\n\t" + "vfmasb %%v4,%%v20,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[a0])\n\t" + "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,96(%%r1,%[a0])\n\t" + "vfmasb %%v6,%%v22,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[a0])\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vfasb %%v0,%%v0,%%v3\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v0,%%v0,%%v5\n\t" + "vfasb %%v0,%%v0,%%v6\n\t" + "vfasb %%v0,%%v0,%%v7\n\t" + "veslg %%v1,%%v0,32\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vrepg %%v1,%%v0,1\n\t" + "aebr %%f0,%%f1\n\t" + "ste %%f0,0(%[y])" + : "=m"(*(FLOAT (*)[1]) y) + : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), + "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) -{ - BLASLONG i; - for (i = 0; i < n; i++) - { - dest[i] = *src; - src += inc_src; - } + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + dest[i] = *src; + src += inc_src; + } } - -static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) -{ - __asm__ volatile ( - "vlrepf %%v0,%1 \n\t" - "xgr %%r1,%%r1 \n\t" - - "lghi %%r0,-32 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 1f \n\t" - - "srlg %%r0,%%r0,5 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vl %%v24, 0(%%r1,%3) \n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" - "vst %%v24, 0(%%r1,%3) \n\t" - "vl %%v25, 16(%%r1,%3) \n\t" - "vfmasb %%v25,%%v17,%%v0,%%v25 \n\t" - "vst %%v25, 16(%%r1,%3) \n\t" - "vl %%v26, 32(%%r1,%3) \n\t" - "vfmasb %%v26,%%v18,%%v0,%%v26 \n\t" - "vst %%v26, 32(%%r1,%3) 
\n\t" - "vl %%v27, 48(%%r1,%3) \n\t" - "vfmasb %%v27,%%v19,%%v0,%%v27 \n\t" - "vst %%v27, 48(%%r1,%3) \n\t" - "vl %%v28, 64(%%r1,%3) \n\t" - "vfmasb %%v28,%%v20,%%v0,%%v28 \n\t" - "vst %%v28, 64(%%r1,%3) \n\t" - "vl %%v29, 80(%%r1,%3) \n\t" - "vfmasb %%v29,%%v21,%%v0,%%v29 \n\t" - "vst %%v29, 80(%%r1,%3) \n\t" - "vl %%v30, 96(%%r1,%3) \n\t" - "vfmasb %%v30,%%v22,%%v0,%%v30 \n\t" - "vst %%v30, 96(%%r1,%3) \n\t" - "vl %%v31, 112(%%r1,%3) \n\t" - "vfmasb %%v31,%%v23,%%v0,%%v31 \n\t" - "vst %%v31, 112(%%r1,%3) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - - "1: \n\t" - "lghi %%r0,28 \n\t" - "ngr %%r0,%0 \n\t" - "ltgr %%r0,%%r0 \n\t" - "jz 3f \n\t" - - "srlg %%r0,%%r0,2 \n\t" - "2: \n\t" - "vl %%v16,0(%%r1,%2) \n\t" - - "vl %%v24, 0(%%r1,%3) \n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24 \n\t" - "vst %%v24, 0(%%r1,%3) \n\t" - - "agfi %%r1,16 \n\t" - "brctg %%r0,2b \n\t" - - "3: \n\t" - "nop " - : - :"r"(n),"m"(da),"ZR"((const FLOAT (*)[n])src),"ZR"((FLOAT (*)[n])dest) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + +static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { + __asm__("vlrepf %%v0,%[da]\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,64(%%r1,%[src])\n\t" + "vl %%v21,80(%%r1,%[src])\n\t" + "vl %%v22,96(%%r1,%[src])\n\t" + "vl %%v23,112(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" + "vfmasb %%v25,%%v17,%%v0,%%v25\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" + "vl %%v26, 32(%%r1,%[dest])\n\t" + "vfmasb %%v26,%%v18,%%v0,%%v26\n\t" + "vst %%v26, 32(%%r1,%[dest])\n\t" + "vl %%v27, 48(%%r1,%[dest])\n\t" + "vfmasb %%v27,%%v19,%%v0,%%v27\n\t" + "vst %%v27, 48(%%r1,%[dest])\n\t" + "vl %%v28, 64(%%r1,%[dest])\n\t" + "vfmasb %%v28,%%v20,%%v0,%%v28\n\t" + "vst %%v28, 64(%%r1,%[dest])\n\t" + "vl %%v29, 80(%%r1,%[dest])\n\t" + "vfmasb %%v29,%%v21,%%v0,%%v29\n\t" + "vst %%v29, 80(%%r1,%[dest])\n\t" + "vl %%v30, 96(%%r1,%[dest])\n\t" + "vfmasb %%v30,%%v22,%%v0,%%v30\n\t" + "vst %%v30, 96(%%r1,%[dest])\n\t" + "vl %%v31, 112(%%r1,%[dest])\n\t" + "vfmasb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v31, 112(%%r1,%[dest])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(FLOAT (*)[n]) dest) + : [dest] "a"(dest),[da] "m"(da), "m"(*(const FLOAT (*)[n]) src), + [src] "a"(src),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - if (inc_dest == 1) - add_y_kernel_4(n, da, src, dest); - else - { - BLASLONG i; - for (i = 0; i < n; i++) - { - *dest += src[i] * da; - dest += inc_dest; - } +static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, + BLASLONG inc_dest) { + if 
(inc_dest == 1) + add_y_kernel_4(n, da, src, dest); + else { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest += src[i] * da; + dest += inc_dest; } + } } -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) -{ - BLASLONG register i; - BLASLONG register j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - BLASLONG n0; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - FLOAT ybuffer[2] __attribute__ ((aligned(16))); - FLOAT *xbuffer; - FLOAT *ytemp; - - if ( m < 1 ) return(0); - if ( n < 1 ) return(0); - - xbuffer = buffer; - ytemp = buffer + (m < NBMAX ? m : NBMAX); - - n0 = n / NBMAX; - n1 = (n % NBMAX) >> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if ( inc_x == 1 ) - xbuffer = x_ptr; - else - copy_x(NB,x_ptr,xbuffer,inc_x); - - - FLOAT *ap[4]; - FLOAT *yp; - BLASLONG register lda4 = 4 * lda; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( n0 > 0 ) - { - BLASLONG nb1 = NBMAX / 4; - for( j=0; j> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m & -4; + m2 = (m & (NBMAX - 1)) - m3; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; + } - } + y_ptr = y; + a_ptr = a; + x_ptr = x; + if (inc_x == 1) + xbuffer = x_ptr; + else + copy_x(NB, x_ptr, xbuffer, inc_x); - yp = ytemp; + FLOAT *ap[4]; + FLOAT *yp; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; - for( i = 0; i < n1 ; i++) - { - sgemv_kernel_4x4(NB,ap,xbuffer,yp); - ap[0] += lda4 ; - ap[1] += lda4 ; - ap[2] += lda4 ; - ap[3] += lda4 ; - yp += 4; - } - if ( n1 > 0 ) - { - add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); - y_ptr += n1 * inc_y * 4; - a_ptr += n1 * lda4 ; + if (n0 > 0) { + BLASLONG nb1 = NBMAX / 4; + for (j = 0; j < n0; j++) { + + yp = ytemp; + for (i = 0; i < nb1; i++) { + sgemv_kernel_4x4(NB, ap, xbuffer, yp); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + yp += 4; } + add_y(nb1 * 4, alpha, ytemp, y_ptr, inc_y); + y_ptr += nb1 * inc_y * 4; + a_ptr += nb1 * lda4; - if ( n2 & 2 ) - { + } - sgemv_kernel_4x2(NB,ap,xbuffer,ybuffer); - a_ptr += lda * 2; - *y_ptr += ybuffer[0] * alpha; - y_ptr += inc_y; - *y_ptr += ybuffer[1] * alpha; - y_ptr += inc_y; + } - } + yp = ytemp; - if ( n2 & 1 ) - { + for (i = 0; i < n1; i++) { + sgemv_kernel_4x4(NB, ap, xbuffer, yp); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + yp += 4; + } + if (n1 > 0) { + add_y(n1 * 4, alpha, ytemp, y_ptr, inc_y); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4; + } - sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); - // a_ptr += lda; - *y_ptr += ybuffer[0] * alpha; - // y_ptr += inc_y; + if (n2 & 2) { + + sgemv_kernel_4x2(NB, ap, xbuffer, ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; - } - a += NB; - x += NB * inc_x; } - if ( m3 == 0 ) return(0); + if (n2 & 1) { - x_ptr = x; - a_ptr = a; - if ( m3 == 3 ) - { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if ( lda == 3 && inc_y == 1 ) - { - - for ( j=0; j< ( n & -4) ; 
j+=4 ) - { - - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; - y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; - y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; - aj += 12; - } - - for ( ; j 0) { + if (n <= 0 || inc_x <= 0) + return (maxf); - maxf = smax_kernel_64(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - maxf=x[0]; - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i++; - } - return (maxf); + maxf = smax_kernel_64(n1, x); + i = n1; } else { + maxf = x[0]; + i++; + } - maxf=x[0]; + while (i < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i++; + } + return (maxf); + + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + maxf = x[0]; - if (x[i] > maxf) { - maxf = x[i]; - } - if (x[i + inc_x] > maxf) { - maxf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] > maxf) { - maxf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] > maxf) { - maxf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] > maxf) { + maxf = x[i]; + } + if (x[i + inc_x] > maxf) { + maxf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] > maxf) { + maxf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] > maxf) { + maxf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i += inc_x; - j++; - } - return (maxf); + while (j < n) { + if (x[i] > maxf) { + maxf = x[i]; + } + i += inc_x; + j++; } + return (maxf); + } } diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c index e7d83441b..2e9c793c4 100644 --- a/kernel/zarch/smin.c +++ b/kernel/zarch/smin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,136 +27,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
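
[Note on the sgemv_n rewrite above] Every path in the new kernels uses the same blocking: the "lghi %%r0,-32 / ngr" pair masks n down to a multiple of 32 for the main loop, a 4-wide loop covers the remaining "n & 28" elements, and the accumulator is now spread over eight vector registers (v0..v7) instead of two, so consecutive vfmasb instructions do not serialize on one destination. A minimal scalar sketch of what sgemv_kernel_4x2 computes, two dot products over a shared x, one per column pointer; the _ref name and the typedefs standing in for common.h are illustrative only, not part of the patch:

typedef float FLOAT;
typedef long BLASLONG;

/* Scalar reference: y[0] = dot(ap[0], x), y[1] = dot(ap[1], x).
 * The asm computes the same sums 32 floats at a time, then 4 at a
 * time for the tail; the caller guarantees n is a multiple of 4. */
static void sgemv_kernel_4x2_ref(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
  FLOAT s0 = 0.0f, s1 = 0.0f;
  BLASLONG i;
  for (i = 0; i < n; i++) {
    s0 += ap[0][i] * x[i];   /* vfmasb into v0/v2/v4/v6 */
    s1 += ap[1][i] * x[i];   /* vfmasb into v1/v3/v5/v7 */
  }
  y[0] = s0;                 /* asm: "ste %f0,0(%[y])" after the lane fold */
  y[1] = s1;                 /* asm: "ste %f1,4(%[y])" */
}
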
#include "common.h" -static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) -{ - FLOAT min; - - __asm__ volatile ( - "vl %%v0,0(%2) \n\t" - "srlg %%r0,%1,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vl %%v24,128(%%r1,%2) \n\t" - "vl %%v25,144(%%r1,%2) \n\t" - "vl %%v26,160(%%r1,%2) \n\t" - "vl %%v27,176(%%r1,%2) \n\t" - "vl %%v28,192(%%r1,%2) \n\t" - "vl %%v29,208(%%r1,%2) \n\t" - "vl %%v30,224(%%r1,%2) \n\t" - "vl %%v31,240(%%r1,%2) \n\t" - - "vfminsb %%v16,%%v16,%%v24,0 \n\t" - "vfminsb %%v17,%%v17,%%v25,0 \n\t" - "vfminsb %%v18,%%v18,%%v26,0 \n\t" - "vfminsb %%v19,%%v19,%%v27,0 \n\t" - "vfminsb %%v20,%%v20,%%v28,0 \n\t" - "vfminsb %%v21,%%v21,%%v29,0 \n\t" - "vfminsb %%v22,%%v22,%%v30,0 \n\t" - "vfminsb %%v23,%%v23,%%v31,0 \n\t" - - "vfminsb %%v16,%%v16,%%v20,0 \n\t" - "vfminsb %%v17,%%v17,%%v21,0 \n\t" - "vfminsb %%v18,%%v18,%%v22,0 \n\t" - "vfminsb %%v19,%%v19,%%v23,0 \n\t" - - "vfminsb %%v16,%%v16,%%v18,0 \n\t" - "vfminsb %%v17,%%v17,%%v19,0 \n\t" - - "vfminsb %%v16,%%v16,%%v17,0 \n\t" - - "vfminsb %%v0,%%v0,%%v16,0 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "veslg %%v16,%%v0,32 \n\t" - "vfminsb %%v0,%%v0,%%v16,0 \n\t" - - "vrepf %%v16,%%v0,2 \n\t" - "wfminsb %%v0,%%v0,%%v16,0 \n\t" - "ler %0,%%f0 " - :"=f"(min) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return min; +static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) { + FLOAT min; + + __asm__("vl %%v0,0(%[x])\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfminsb %%v16,%%v16,%%v24,0\n\t" + "vfminsb %%v17,%%v17,%%v25,0\n\t" + "vfminsb %%v18,%%v18,%%v26,0\n\t" + "vfminsb %%v19,%%v19,%%v27,0\n\t" + "vfminsb %%v20,%%v20,%%v28,0\n\t" + "vfminsb %%v21,%%v21,%%v29,0\n\t" + "vfminsb %%v22,%%v22,%%v30,0\n\t" + "vfminsb %%v23,%%v23,%%v31,0\n\t" + "vfminsb %%v16,%%v16,%%v20,0\n\t" + "vfminsb %%v17,%%v17,%%v21,0\n\t" + "vfminsb %%v18,%%v18,%%v22,0\n\t" + "vfminsb %%v19,%%v19,%%v23,0\n\t" + "vfminsb %%v16,%%v16,%%v18,0\n\t" + "vfminsb %%v17,%%v17,%%v19,0\n\t" + "vfminsb %%v16,%%v16,%%v17,0\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfminsb %%v0,%%v0,%%v16,0\n\t" + "ler %[min],%%f0" + : [min] "=f"(min),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return min; } - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - - if (n <= 0 || inc_x <= 0) return (minf); 
- if (inc_x == 1) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT minf = 0.0; - BLASLONG n1 = n & -64; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) + return (minf); - minf = smin_kernel_64(n1, x); + if (inc_x == 1) { - i = n1; - } - else - { - minf=x[0]; - i++; - } + BLASLONG n1 = n & -64; + if (n1 > 0) { - while (i < n) { - if (x[i] < minf) { - minf = x[i]; - } - i++; - } - return (minf); + minf = smin_kernel_64(n1, x); + i = n1; } else { + minf = x[0]; + i++; + } - minf=x[0]; + while (i < n) { + if (x[i] < minf) { + minf = x[i]; + } + i++; + } + return (minf); + + } else { - BLASLONG n1 = n & -4; - while (j < n1) { + minf = x[0]; - if (x[i] < minf) { - minf = x[i]; - } - if (x[i + inc_x] < minf) { - minf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] < minf) { - minf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] < minf) { - minf = x[i + 3 * inc_x]; - } + BLASLONG n1 = n & -4; + while (j < n1) { - i += inc_x * 4; + if (x[i] < minf) { + minf = x[i]; + } + if (x[i + inc_x] < minf) { + minf = x[i + inc_x]; + } + if (x[i + 2 * inc_x] < minf) { + minf = x[i + 2 * inc_x]; + } + if (x[i + 3 * inc_x] < minf) { + minf = x[i + 3 * inc_x]; + } - j += 4; + i += inc_x * 4; - } + j += 4; + } - while (j < n) { - if (x[i] < minf) { - minf = x[i]; - } - i += inc_x; - j++; - } - return (minf); + while (j < n) { + if (x[i] < minf) { + minf = x[i]; + } + i += inc_x; + j++; } + return (minf); + } } diff --git a/kernel/zarch/srot.c b/kernel/zarch/srot.c index 763cc664a..5b21a19dc 100644 --- a/kernel/zarch/srot.c +++ b/kernel/zarch/srot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,220 +27,199 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
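
[Note on the smin/smax kernels above] Both files keep one pattern: the vector kernel only ever sees the "n & -64" head, and the C wrapper walks the remainder, plus the whole strided case, element by element. The kernel itself is a tournament reduction: sixteen loaded vectors are pairwise-reduced down to one, which is folded into the running v0. A sketch of that shape, with fminf standing in for vfminsb mode 0; the _ref name and typedefs are assumptions of the sketch:

#include <math.h>

typedef float FLOAT;
typedef long BLASLONG;

/* Reduction shape of smin_kernel_64: four independent float lanes are
 * carried through the loop, then folded at the end, mirroring the
 * veslg/vrepf cross-lane steps. Assumes n is a nonzero multiple of 64,
 * as the wrapper guarantees. */
static FLOAT smin_ref(BLASLONG n, const FLOAT *x) {
  FLOAT lane[4];
  BLASLONG i;
  int l;
  for (l = 0; l < 4; l++)
    lane[l] = x[l];                      /* "vl %%v0,0(%[x])" seeds the result */
  for (i = 0; i < n; i += 4)             /* the asm consumes 64 floats per branch */
    for (l = 0; l < 4; l++)
      lane[l] = fminf(lane[l], x[i + l]);
  return fminf(fminf(lane[0], lane[1]), fminf(lane[2], lane[3]));
}
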
#include "common.h" -static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) -{ - __asm__ ( - "vlrepf %%v0,%3 \n\t" - "vlrepf %%v1,%4 \n\t" - "srlg %%r0,%0,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%1) \n\t" - "vl %%v25, 16(%%r1,%1) \n\t" - "vl %%v26, 32(%%r1,%1) \n\t" - "vl %%v27, 48(%%r1,%1) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%1) \n\t" - "vst %%v29, 16(%%r1,%1) \n\t" - "vst %%v30, 32(%%r1,%1) \n\t" - "vst %%v31, 48(%%r1,%1) \n\t" - "vst %%v20, 0(%%r1,%2) \n\t" - "vst %%v21, 16(%%r1,%2) \n\t" - "vst %%v22, 32(%%r1,%2) \n\t" - "vst %%v23, 48(%%r1,%2) \n\t" - - "vl %%v24, 64(%%r1,%1) \n\t" - "vl %%v25, 80(%%r1,%1) \n\t" - "vl %%v26, 96(%%r1,%1) \n\t" - "vl %%v27, 112(%%r1,%1) \n\t" - "vl %%v16, 64(%%r1,%2) \n\t" - "vl %%v17, 80(%%r1,%2) \n\t" - "vl %%v18, 96(%%r1,%2) \n\t" - "vl %%v19, 112(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%1) \n\t" - "vst %%v29, 80(%%r1,%1) \n\t" - "vst %%v30, 96(%%r1,%1) \n\t" - "vst %%v31, 112(%%r1,%1) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" 
/* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%1) \n\t" - "vst %%v29, 144(%%r1,%1) \n\t" - "vst %%v30, 160(%%r1,%1) \n\t" - "vst %%v31, 176(%%r1,%1) \n\t" - "vst %%v20, 128(%%r1,%2) \n\t" - "vst %%v21, 144(%%r1,%2) \n\t" - "vst %%v22, 160(%%r1,%2) \n\t" - "vst %%v23, 176(%%r1,%2) \n\t" - - "vl %%v24, 192(%%r1,%1) \n\t" - "vl %%v25, 208(%%r1,%1) \n\t" - "vl %%v26, 224(%%r1,%1) \n\t" - "vl %%v27, 240(%%r1,%1) \n\t" - "vl %%v16, 192(%%r1,%2) \n\t" - "vl %%v17, 208(%%r1,%2) \n\t" - "vl %%v18, 224(%%r1,%2) \n\t" - "vl %%v19, 240(%%r1,%2) \n\t" - - "vfmsb %%v28,%%v24,%%v0 \n\t" - "vfmsb %%v29,%%v25,%%v0 \n\t" - "vfmsb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0 \n\t" - "vfmsb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0 \n\t" - "vfmsb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmasb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%1) \n\t" - "vst %%v29, 208(%%r1,%1) \n\t" - "vst %%v30, 224(%%r1,%1) \n\t" - "vst %%v31, 240(%%r1,%1) \n\t" - "vst %%v20, 192(%%r1,%2) \n\t" - "vst %%v21, 208(%%r1,%2) \n\t" - "vst %%v22, 224(%%r1,%2) \n\t" - "vst %%v23, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y),"m"(*c),"m"(*s) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { + __asm__("vlrepf %%v0,%[c]\n\t" + "vlrepf %%v1,%[s]\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl 
%%v19, 112(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [x] "a"(x),[y] 
"a"(y),[c] "m"(*c),[s] "m"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - - FLOAT temp; +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT c, FLOAT s) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if ( n <= 0 ) return(0); + FLOAT temp; - if ( (inc_x == 1) && (inc_y == 1) ) - { + if (n <= 0) + return (0); - BLASLONG n1 = n & -64; - if ( n1 > 0 ) - { - FLOAT cosa,sina; - cosa=c; - sina=s; - srot_kernel_64(n1, x, y, &cosa, &sina); - i=n1; - } + if ((inc_x == 1) && (inc_y == 1)) { - while(i < n) - { - temp = c*x[i] + s*y[i] ; - y[i] = c*y[i] - s*x[i] ; - x[i] = temp ; - - i++ ; + BLASLONG n1 = n & -64; + if (n1 > 0) { + FLOAT cosa, sina; + cosa = c; + sina = s; + srot_kernel_64(n1, x, y, &cosa, &sina); + i = n1; + } - } + while (i < n) { + temp = c * x[i] + s * y[i]; + y[i] = c * y[i] - s * x[i]; + x[i] = temp; + i++; } - else - { - while(i < n) - { - temp = c*x[ix] + s*y[iy] ; - y[iy] = c*y[iy] - s*x[ix] ; - x[ix] = temp ; + } else { - ix += inc_x ; - iy += inc_y ; - i++ ; + while (i < n) { + temp = c * x[ix] + s * y[iy]; + y[iy] = c * y[iy] - s * x[ix]; + x[ix] = temp; - } + ix += inc_x; + iy += inc_y; + i++; } - return(0); - -} + } + return (0); +} diff --git a/kernel/zarch/sscal.c b/kernel/zarch/sscal.c index c18a7e56f..07e6845c6 100644 --- a/kernel/zarch/sscal.c +++ b/kernel/zarch/sscal.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,175 +27,147 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) -{ - __asm__ volatile ( - "vlrepf %%v0,%1 \n\t" - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%2) \n\t" - "vfmsb %%v24,%%v24,%%v0 \n\t" - "vst %%v24, 0(%%r1,%2) \n\t" - "vl %%v25, 16(%%r1,%2) \n\t" - "vfmsb %%v25,%%v25,%%v0 \n\t" - "vst %%v25, 16(%%r1,%2) \n\t" - "vl %%v26, 32(%%r1,%2) \n\t" - "vfmsb %%v26,%%v26,%%v0 \n\t" - "vst %%v26, 32(%%r1,%2) \n\t" - "vl %%v27, 48(%%r1,%2) \n\t" - "vfmsb %%v27,%%v27,%%v0 \n\t" - "vst %%v27, 48(%%r1,%2) \n\t" - "vl %%v24, 64(%%r1,%2) \n\t" - "vfmsb %%v24,%%v24,%%v0 \n\t" - "vst %%v24, 64(%%r1,%2) \n\t" - "vl %%v25, 80(%%r1,%2) \n\t" - "vfmsb %%v25,%%v25,%%v0 \n\t" - "vst %%v25, 80(%%r1,%2) \n\t" - "vl %%v26, 96(%%r1,%2) \n\t" - "vfmsb %%v26,%%v26,%%v0 \n\t" - "vst %%v26, 96(%%r1,%2) \n\t" - "vl %%v27, 112(%%r1,%2) \n\t" - "vfmsb %%v27,%%v27,%%v0 \n\t" - "vst %%v27, 112(%%r1,%2) \n\t" - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"m"(da),"ZR"((FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v24","v25","v26","v27" - ); +static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) { + __asm__("vlrepf %%v0,%[da]\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[x])\n\t" + "vfmsb %%v24,%%v24,%%v0\n\t" + "vst %%v24,0(%%r1,%[x])\n\t" + "vl %%v25,16(%%r1,%[x])\n\t" + "vfmsb %%v25,%%v25,%%v0\n\t" + "vst %%v25,16(%%r1,%[x])\n\t" + "vl %%v26,32(%%r1,%[x])\n\t" + "vfmsb %%v26,%%v26,%%v0\n\t" + "vst %%v26,32(%%r1,%[x])\n\t" + "vl %%v27,48(%%r1,%[x])\n\t" + "vfmsb %%v27,%%v27,%%v0\n\t" + "vst %%v27,48(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[x])\n\t" + "vfmsb %%v28,%%v28,%%v0\n\t" + "vst %%v28,64(%%r1,%[x])\n\t" + "vl %%v29,80(%%r1,%[x])\n\t" + "vfmsb %%v29,%%v29,%%v0\n\t" + "vst %%v29,80(%%r1,%[x])\n\t" + "vl %%v30,96(%%r1,%[x])\n\t" + "vfmsb %%v30,%%v30,%%v0\n\t" + "vst %%v30,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vfmsb %%v31,%%v31,%%v0\n\t" + "vst %%v31,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) + : [x] "a"(x),[da] "m"(da) + : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) -{ - __asm__ volatile( - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "srlg %%r0,%0,5 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - - "vst %%v24,0(%%r1,%1) \n\t" - "vst %%v25,16(%%r1,%1) \n\t" - "vst %%v26,32(%%r1,%1) \n\t" - "vst %%v27,48(%%r1,%1) \n\t" - "vst %%v24,64(%%r1,%1) \n\t" - "vst %%v25,80(%%r1,%1) \n\t" - "vst %%v26,96(%%r1,%1) \n\t" - "vst %%v27,112(%%r1,%1) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n])x) - :"memory","cc","r0","r1","v24","v25","v26","v27" - ); +static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) { + __asm__("vzero %%v0\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) + : [x] "a"(x) + : "cc", "r1", "v0"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, 
FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0,j=0; - if ( n <= 0 || inc_x <=0 ) - return(0); - - - if ( inc_x == 1 ) - { - - if ( da == 0.0 ) - { - - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - - sscal_kernel_32_zero(n1, x); - j=n1; - } - - while(j < n) - { - - x[j]=0.0; - j++; - } - - } - else - { - - BLASLONG n1 = n & -32; - if ( n1 > 0 ) - { - sscal_kernel_32(n1, da, x); - j=n1; - } - while(j < n) - { - - x[j] = da * x[j] ; - j++; - } - } +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0, j = 0; + if (n <= 0 || inc_x <= 0) + return (0); + if (inc_x == 1) { + if (da == 0.0) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + sscal_kernel_32_zero(n1, x); + j = n1; + } + + while (j < n) { + + x[j] = 0.0; + j++; + } + + } else { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + sscal_kernel_32(n1, da, x); + j = n1; + } + while (j < n) { + + x[j] = da * x[j]; + j++; + } } - else - { - if ( da == 0.0 ) - { + } else { - BLASLONG n1 = n & -2; + if (da == 0.0) { - while (j < n1) { + BLASLONG n1 = n & -2; - x[i]=0.0; - x[i + inc_x]=0.0; + while (j < n1) { - i += inc_x * 2; - j += 2; + x[i] = 0.0; + x[i + inc_x] = 0.0; - } - while(j < n) - { + i += inc_x * 2; + j += 2; - x[i]=0.0; - i += inc_x ; - j++; - } + } + while (j < n) { - } - else - { - BLASLONG n1 = n & -2; + x[i] = 0.0; + i += inc_x; + j++; + } - while (j < n1) { + } else { + BLASLONG n1 = n & -2; - x[i] = da * x[i] ; - x[i + inc_x] = da * x[i + inc_x]; + while (j < n1) { - i += inc_x * 2; - j += 2; + x[i] = da * x[i]; + x[i + inc_x] = da * x[i + inc_x]; - } + i += inc_x * 2; + j += 2; - while(j < n) - { + } - x[i] = da * x[i] ; - i += inc_x ; - j++; - } - } + while (j < n) { + x[i] = da * x[i]; + i += inc_x; + j++; + } } - return 0; - -} + } + return 0; +} diff --git a/kernel/zarch/sswap.c b/kernel/zarch/sswap.c index d0c0dc3f4..dc7113143 100644 --- a/kernel/zarch/sswap.c +++ b/kernel/zarch/sswap.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,138 +27,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
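
[Note on the sscal kernels above] There are two kernels on purpose: when da == 0.0 the zero kernel never loads x, it only streams stores, and the rewrite collapses its four zeroed registers down to a single vzero'd v0 since the stored value never changes. A sketch of the driver's split; sscal_ref is a made-up name:

typedef float FLOAT;
typedef long BLASLONG;

static void sscal_ref(BLASLONG n, FLOAT da, FLOAT *x) {
  BLASLONG n1 = n & -32;      /* portion the vector kernel handles, 32/iter */
  BLASLONG j;
  if (da == 0.0f) {
    for (j = 0; j < n1; j++)  /* kernel path: eight vst of one vzero'd reg */
      x[j] = 0.0f;
  } else {
    for (j = 0; j < n1; j++)  /* kernel path: vl / vfmsb / vst per vector */
      x[j] = da * x[j];
  }
  for (j = n1; j < n; j++)    /* scalar tail, as in the wrapper above */
    x[j] = (da == 0.0f) ? 0.0f : da * x[j];
}
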
#include "common.h" -static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "srlg %%r0,%0,6 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16, 0(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - "vl %%v20, 64(%%r1,%1) \n\t" - "vl %%v21, 80(%%r1,%1) \n\t" - "vl %%v22, 96(%%r1,%1) \n\t" - "vl %%v23, 112(%%r1,%1) \n\t" - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v28, 192(%%r1,%1) \n\t" - "vl %%v29, 208(%%r1,%1) \n\t" - "vl %%v30, 224(%%r1,%1) \n\t" - "vl %%v31, 240(%%r1,%1) \n\t" - - "vl %%v0, 0(%%r1,%2) \n\t" - "vl %%v1, 16(%%r1,%2) \n\t" - "vl %%v2, 32(%%r1,%2) \n\t" - "vl %%v3, 48(%%r1,%2) \n\t" - "vl %%v4, 64(%%r1,%2) \n\t" - "vl %%v5, 80(%%r1,%2) \n\t" - "vl %%v6, 96(%%r1,%2) \n\t" - "vl %%v7, 112(%%r1,%2) \n\t" - "vst %%v0, 0(%%r1,%1) \n\t" - "vst %%v1, 16(%%r1,%1) \n\t" - "vst %%v2, 32(%%r1,%1) \n\t" - "vst %%v3, 48(%%r1,%1) \n\t" - "vst %%v4, 64(%%r1,%1) \n\t" - "vst %%v5, 80(%%r1,%1) \n\t" - "vst %%v6, 96(%%r1,%1) \n\t" - "vst %%v7, 112(%%r1,%1) \n\t" - - "vl %%v0, 128(%%r1,%2) \n\t" - "vl %%v1, 144(%%r1,%2) \n\t" - "vl %%v2, 160(%%r1,%2) \n\t" - "vl %%v3, 176(%%r1,%2) \n\t" - "vl %%v4, 192(%%r1,%2) \n\t" - "vl %%v5, 208(%%r1,%2) \n\t" - "vl %%v6, 224(%%r1,%2) \n\t" - "vl %%v7, 240(%%r1,%2) \n\t" - "vst %%v0, 128(%%r1,%1) \n\t" - "vst %%v1, 144(%%r1,%1) \n\t" - "vst %%v2, 160(%%r1,%1) \n\t" - "vst %%v3, 176(%%r1,%1) \n\t" - "vst %%v4, 192(%%r1,%1) \n\t" - "vst %%v5, 208(%%r1,%1) \n\t" - "vst %%v6, 224(%%r1,%1) \n\t" - "vst %%v7, 240(%%r1,%1) \n\t" - - "vst %%v16, 0(%%r1,%2) \n\t" - "vst %%v17, 16(%%r1,%2) \n\t" - "vst %%v18, 32(%%r1,%2) \n\t" - "vst %%v19, 48(%%r1,%2) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - "vst %%v24, 128(%%r1,%2) \n\t" - "vst %%v25, 144(%%r1,%2) \n\t" - "vst %%v26, 160(%%r1,%2) \n\t" - "vst %%v27, 176(%%r1,%2) \n\t" - "vst %%v28, 192(%%r1,%2) \n\t" - "vst %%v29, 208(%%r1,%2) \n\t" - "vst %%v30, 224(%%r1,%2) \n\t" - "vst %%v31, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n])x),"ZR"((FLOAT (*)[n])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 
32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1 )) - { +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, + BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp; - BLASLONG n1 = n & -64; - if ( n1 > 0 ) - { - sswap_kernel_64(n1, x, y); - i=n1; - } + if (n <= 0) + return (0); - while(i < n) - { - temp = y[i]; - y[i] = x[i] ; - x[i] = temp; - i++ ; + if ((inc_x == 1) && (inc_y == 1)) { - } + BLASLONG n1 = n & -64; + if (n1 > 0) { + sswap_kernel_64(n1, x, y); + i = n1; + } + while (i < n) { + temp = y[i]; + y[i] = x[i]; + x[i] = temp; + i++; } - else - { - while(i < n) - { - temp = y[iy]; - y[iy] = x[ix] ; - x[ix] = temp; - ix += inc_x ; - iy += inc_y ; - i++ ; + } else { - } + while (i < n) { + temp = y[iy]; + y[iy] = x[ix]; + x[ix] = temp; + ix += inc_x; + iy += inc_y; + i++; } - return(0); - - -} + } + return (0); +} diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index cc6347127..531e47a0b 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,184 +28,165 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) -{ - FLOAT amax; - - __asm__ volatile ( - "vleg %%v0,0(%2),0 \n\t" - "vleg %%v16,8(%2),0 \n\t" - "vleg %%v0,16(%2),1 \n\t" - "vleg %%v16,24(%2),1 \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vflpdb %%v16,%%v16 \n\t" - "vfadb %%v0,%%v0,%%v16 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vleg %%v16,0(%%r1,%2),0 \n\t" - "vleg %%v17,8(%%r1,%2),0 \n\t" - "vleg %%v16,16(%%r1,%2),1 \n\t" - "vleg %%v17,24(%%r1,%2),1 \n\t" - "vleg %%v18,32(%%r1,%2),0 \n\t" - "vleg %%v19,40(%%r1,%2),0 \n\t" - "vleg %%v18,48(%%r1,%2),1 \n\t" - "vleg %%v19,56(%%r1,%2),1 \n\t" - "vleg %%v20,64(%%r1,%2),0 \n\t" - "vleg %%v21,72(%%r1,%2),0 \n\t" - "vleg %%v20,80(%%r1,%2),1 \n\t" - "vleg %%v21,88(%%r1,%2),1 \n\t" - "vleg %%v22,96(%%r1,%2),0 \n\t" - "vleg %%v23,104(%%r1,%2),0 \n\t" - "vleg %%v22,112(%%r1,%2),1 \n\t" - "vleg %%v23,120(%%r1,%2),1 \n\t" - "vleg %%v24,128(%%r1,%2),0 \n\t" - "vleg %%v25,136(%%r1,%2),0 \n\t" - "vleg %%v24,144(%%r1,%2),1 \n\t" - "vleg %%v25,152(%%r1,%2),1 \n\t" - "vleg %%v26,160(%%r1,%2),0 \n\t" - "vleg %%v27,168(%%r1,%2),0 \n\t" - "vleg %%v26,176(%%r1,%2),1 \n\t" - "vleg %%v27,184(%%r1,%2),1 \n\t" - "vleg %%v28,192(%%r1,%2),0 \n\t" - "vleg %%v29,200(%%r1,%2),0 \n\t" - "vleg %%v28,208(%%r1,%2),1 \n\t" - "vleg %%v29,216(%%r1,%2),1 \n\t" - "vleg %%v30,224(%%r1,%2),0 \n\t" - "vleg %%v31,232(%%r1,%2),0 \n\t" - "vleg %%v30,240(%%r1,%2),1 \n\t" - "vleg %%v31,248(%%r1,%2),1 \n\t" - - "vflpdb %%v16,%%v16 \n\t" - "vflpdb %%v17,%%v17 \n\t" - "vflpdb %%v18,%%v18 \n\t" - "vflpdb %%v19,%%v19 \n\t" - "vflpdb %%v20,%%v20 \n\t" - "vflpdb %%v21,%%v21 \n\t" - "vflpdb %%v22,%%v22 \n\t" - "vflpdb %%v23,%%v23 \n\t" - "vflpdb %%v24,%%v24 \n\t" - "vflpdb %%v25,%%v25 \n\t" - "vflpdb %%v26,%%v26 \n\t" - "vflpdb %%v27,%%v27 \n\t" - "vflpdb %%v28,%%v28 \n\t" - "vflpdb %%v29,%%v29 \n\t" - "vflpdb %%v30,%%v30 \n\t" - "vflpdb %%v31,%%v31 \n\t" - - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v18,%%v18,%%v19 \n\t" - "vfadb %%v20,%%v20,%%v21 \n\t" - "vfadb %%v22,%%v22,%%v23 \n\t" - "vfadb %%v24,%%v24,%%v25 \n\t" - "vfadb %%v26,%%v26,%%v27 \n\t" - "vfadb %%v28,%%v28,%%v29 \n\t" - "vfadb %%v30,%%v30,%%v31 \n\t" - - "vfmaxdb %%v16,%%v16,%%v24,0 \n\t" - "vfmaxdb %%v18,%%v18,%%v26,0 \n\t" - "vfmaxdb %%v20,%%v20,%%v28,0 \n\t" - "vfmaxdb %%v22,%%v22,%%v30,0 \n\t" - - "vfmaxdb %%v16,%%v16,%%v20,0 \n\t" - "vfmaxdb %%v18,%%v18,%%v22,0 \n\t" - - "vfmaxdb %%v16,%%v16,%%v18,0 \n\t" - - "vfmaxdb %%v0,%%v0,%%v16,0 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfmaxdb %%v0,%%v0,%%v16,0 \n\t" - "ldr %0,%%f0 " - :"=f"(amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amax; +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg 
%%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vleg %%v24,128(%%r1,%[x]),0\n\t" + "vleg %%v25,136(%%r1,%[x]),0\n\t" + "vleg %%v24,144(%%r1,%[x]),1\n\t" + "vleg %%v25,152(%%r1,%[x]),1\n\t" + "vleg %%v26,160(%%r1,%[x]),0\n\t" + "vleg %%v27,168(%%r1,%[x]),0\n\t" + "vleg %%v26,176(%%r1,%[x]),1\n\t" + "vleg %%v27,184(%%r1,%[x]),1\n\t" + "vleg %%v28,192(%%r1,%[x]),0\n\t" + "vleg %%v29,200(%%r1,%[x]),0\n\t" + "vleg %%v28,208(%%r1,%[x]),1\n\t" + "vleg %%v29,216(%%r1,%[x]),1\n\t" + "vleg %%v30,224(%%r1,%[x]),0\n\t" + "vleg %%v31,232(%%r1,%[x]),0\n\t" + "vleg %%v30,240(%%r1,%[x]),1\n\t" + "vleg %%v31,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16,%%v16\n\t" + "vflpdb %%v17,%%v17\n\t" + "vflpdb %%v18,%%v18\n\t" + "vflpdb %%v19,%%v19\n\t" + "vflpdb %%v20,%%v20\n\t" + "vflpdb %%v21,%%v21\n\t" + "vflpdb %%v22,%%v22\n\t" + "vflpdb %%v23,%%v23\n\t" + "vflpdb %%v24,%%v24\n\t" + "vflpdb %%v25,%%v25\n\t" + "vflpdb %%v26,%%v26\n\t" + "vflpdb %%v27,%%v27\n\t" + "vflpdb %%v28,%%v28\n\t" + "vflpdb %%v29,%%v29\n\t" + "vflpdb %%v30,%%v30\n\t" + "vflpdb %%v31,%%v31\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v18,%%v18,%%v19\n\t" + "vfadb %%v20,%%v20,%%v21\n\t" + "vfadb %%v22,%%v22,%%v23\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v26,%%v26,%%v27\n\t" + "vfadb %%v28,%%v28,%%v29\n\t" + "vfadb %%v30,%%v30,%%v31\n\t" + "vfmaxdb %%v16,%%v16,%%v24,0\n\t" + "vfmaxdb %%v18,%%v18,%%v26,0\n\t" + "vfmaxdb %%v20,%%v20,%%v28,0\n\t" + "vfmaxdb %%v22,%%v22,%%v30,0\n\t" + "vfmaxdb %%v16,%%v16,%%v20,0\n\t" + "vfmaxdb %%v18,%%v18,%%v22,0\n\t" + "vfmaxdb %%v16,%%v16,%%v18,0\n\t" + "vfmaxdb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmaxdb %%v0,%%v0,%%v16,0\n\t" + "ldr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amax; } - + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return (maxf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -16; - if (n1 > 0) { - - maxf = zamax_kernel_16(n1, x); - ix = n1 * 2; - i = n1; - } - else - { - maxf=CABS1(x,0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (maxf); + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + maxf = zamax_kernel_16(n1, x); + ix = n1 * 2; + i = n1; } else { - - maxf=CABS1(x,0); - inc_x2 = 2 * inc_x; - - BLASLONG n1 = n & -4; - while (i < n1) { - - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - if (CABS1(x,ix+inc_x2) > maxf) { - maxf = CABS1(x,ix+inc_x2); - } - if (CABS1(x,ix+inc_x2*2) > maxf) { - maxf = CABS1(x,ix+inc_x2*2); - } - if (CABS1(x,ix+inc_x2*3) > maxf) { - maxf = CABS1(x,ix+inc_x2*3); - } - - ix += inc_x2 * 4; - - i += 4; - - } - - - while (i < n) { - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; - 
} - return (maxf); + maxf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (maxf); + + } else { + + maxf = CABS1(x, 0); + inc_x2 = 2 * inc_x; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) > maxf) { + maxf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + inc_x2 * 2) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 2); + } + if (CABS1(x, ix + inc_x2 * 3) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 3); + } + + ix += inc_x2 * 4; + + i += 4; + + } + + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (maxf); + } } diff --git a/kernel/zarch/zamax_z13.c b/kernel/zarch/zamax_z13.c index ae711c173..cac2da938 100644 --- a/kernel/zarch/zamax_z13.c +++ b/kernel/zarch/zamax_z13.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,194 +28,174 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) -{ - FLOAT amax; - - __asm__ volatile ( - "vleg %%v0,0(%2),0 \n\t" - "vleg %%v16,8(%2),0 \n\t" - "vleg %%v0,16(%2),1 \n\t" - "vleg %%v16,24(%2),1 \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vflpdb %%v16,%%v16 \n\t" - "vfadb %%v0,%%v0,%%v16 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vleg %%v16,0(%%r1,%2),0 \n\t" - "vleg %%v17,8(%%r1,%2),0 \n\t" - "vleg %%v16,16(%%r1,%2),1 \n\t" - "vleg %%v17,24(%%r1,%2),1 \n\t" - "vleg %%v18,32(%%r1,%2),0 \n\t" - "vleg %%v19,40(%%r1,%2),0 \n\t" - "vleg %%v18,48(%%r1,%2),1 \n\t" - "vleg %%v19,56(%%r1,%2),1 \n\t" - "vleg %%v20,64(%%r1,%2),0 \n\t" - "vleg %%v21,72(%%r1,%2),0 \n\t" - "vleg %%v20,80(%%r1,%2),1 \n\t" - "vleg %%v21,88(%%r1,%2),1 \n\t" - "vleg %%v22,96(%%r1,%2),0 \n\t" - "vleg %%v23,104(%%r1,%2),0 \n\t" - "vleg %%v22,112(%%r1,%2),1 \n\t" - "vleg %%v23,120(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchdb %%v26,%%v24,%%v25 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchdb %%v27,%%v26,%%v0 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "vleg %%v16,128(%%r1,%2),0 \n\t" - "vleg %%v17,136(%%r1,%2),0 \n\t" - "vleg %%v16,144(%%r1,%2),1 \n\t" - "vleg %%v17,152(%%r1,%2),1 \n\t" - "vleg %%v18,160(%%r1,%2),0 \n\t" - "vleg %%v19,168(%%r1,%2),0 \n\t" - "vleg %%v18,176(%%r1,%2),1 \n\t" - "vleg %%v19,184(%%r1,%2),1 \n\t" - "vleg %%v20,192(%%r1,%2),0 \n\t" - "vleg %%v21,200(%%r1,%2),0 \n\t" - "vleg %%v20,208(%%r1,%2),1 \n\t" - "vleg %%v21,216(%%r1,%2),1 \n\t" - "vleg %%v22,224(%%r1,%2),0 \n\t" - "vleg 
%%v23,232(%%r1,%2),0 \n\t" - "vleg %%v22,240(%%r1,%2),1 \n\t" - "vleg %%v23,248(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchdb %%v24,%%v16,%%v17 \n\t" - "vfchdb %%v25,%%v18,%%v19 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchdb %%v26,%%v24,%%v25 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchdb %%v27,%%v26,%%v0 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v0,%%v16 \n\t" - "vsel %%v0,%%v0,%%v16,%%v17 \n\t" - "ldr %0,%%f0 " - :"=f"(amax) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" - ); - - return amax; +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) { + FLOAT amax; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vfchdb %%v26,%%v24,%%v25\n\t" + "vsel %%v26,%%v24,%%v25,%%v26\n\t" + "vfchdb %%v27,%%v26,%%v0\n\t" + "vsel %%v0,%%v26,%%v0,%%v27\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg %%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb 
%%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vfchdb %%v26,%%v24,%%v25\n\t" + "vsel %%v26,%%v24,%%v25,%%v26\n\t" + "vfchdb %%v27,%%v26,%%v0\n\t" + "vsel %%v0,%%v26,%%v0,%%v27\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27"); + + return amax; } - + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return (maxf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -16; - if (n1 > 0) { - - maxf = zamax_kernel_16(n1, x); - ix = n1 * 2; - i = n1; - } - else - { - maxf=CABS1(x,0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (maxf); + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (maxf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + maxf = zamax_kernel_16(n1, x); + ix = n1 * 2; + i = n1; } else { - - maxf=CABS1(x,0); - inc_x2 = 2 * inc_x; - - BLASLONG n1 = n & -4; - while (i < n1) { - - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - if (CABS1(x,ix+inc_x2) > maxf) { - maxf = CABS1(x,ix+inc_x2); - } - if (CABS1(x,ix+inc_x2*2) > maxf) { - maxf = CABS1(x,ix+inc_x2*2); - } - if (CABS1(x,ix+inc_x2*3) > maxf) { - maxf = CABS1(x,ix+inc_x2*3); - } - - ix += inc_x2 * 4; - - i += 4; - - } - - - while (i < n) { - if (CABS1(x,ix) > maxf) { - maxf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (maxf); + maxf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (maxf); + + } else { + + maxf = CABS1(x, 0); + inc_x2 = 2 * inc_x; + + BLASLONG n1 = n & -4; + while (i < n1) { + + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) > maxf) { + maxf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + inc_x2 * 2) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 2); + } + if (CABS1(x, ix + inc_x2 * 3) > maxf) { + maxf = CABS1(x, ix + inc_x2 * 3); + } + + ix += inc_x2 * 4; + + i += 4; + + } + + while (i < n) { + if (CABS1(x, ix) > maxf) { + maxf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (maxf); + } } diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index 18610daea..940d81dd2 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,184 +28,165 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) -#define ABS fabs -#else -#define ABS fabsf -#endif - -#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1])) - -static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) -{ - FLOAT amin; - - __asm__ volatile ( - "vleg %%v0,0(%2),0 \n\t" - "vleg %%v16,8(%2),0 \n\t" - "vleg %%v0,16(%2),1 \n\t" - "vleg %%v16,24(%2),1 \n\t" - "vflpdb %%v0,%%v0 \n\t" - "vflpdb %%v16,%%v16 \n\t" - "vfadb %%v0,%%v0,%%v16 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vleg %%v16,0(%%r1,%2),0 \n\t" - "vleg %%v17,8(%%r1,%2),0 \n\t" - "vleg %%v16,16(%%r1,%2),1 \n\t" - "vleg %%v17,24(%%r1,%2),1 \n\t" - "vleg %%v18,32(%%r1,%2),0 \n\t" - "vleg %%v19,40(%%r1,%2),0 \n\t" - "vleg %%v18,48(%%r1,%2),1 \n\t" - "vleg %%v19,56(%%r1,%2),1 \n\t" - "vleg %%v20,64(%%r1,%2),0 \n\t" - "vleg %%v21,72(%%r1,%2),0 \n\t" - "vleg %%v20,80(%%r1,%2),1 \n\t" - "vleg %%v21,88(%%r1,%2),1 \n\t" - "vleg %%v22,96(%%r1,%2),0 \n\t" - "vleg %%v23,104(%%r1,%2),0 \n\t" - "vleg %%v22,112(%%r1,%2),1 \n\t" - "vleg %%v23,120(%%r1,%2),1 \n\t" - "vleg %%v24,128(%%r1,%2),0 \n\t" - "vleg %%v25,136(%%r1,%2),0 \n\t" - "vleg %%v24,144(%%r1,%2),1 \n\t" - "vleg %%v25,152(%%r1,%2),1 \n\t" - "vleg %%v26,160(%%r1,%2),0 \n\t" - "vleg %%v27,168(%%r1,%2),0 \n\t" - "vleg %%v26,176(%%r1,%2),1 \n\t" - "vleg %%v27,184(%%r1,%2),1 \n\t" - "vleg %%v28,192(%%r1,%2),0 \n\t" - "vleg %%v29,200(%%r1,%2),0 \n\t" - "vleg %%v28,208(%%r1,%2),1 \n\t" - "vleg %%v29,216(%%r1,%2),1 \n\t" - "vleg %%v30,224(%%r1,%2),0 \n\t" - "vleg %%v31,232(%%r1,%2),0 \n\t" - "vleg %%v30,240(%%r1,%2),1 \n\t" - "vleg %%v31,248(%%r1,%2),1 \n\t" - - "vflpdb %%v16,%%v16 \n\t" - "vflpdb %%v17,%%v17 \n\t" - "vflpdb %%v18,%%v18 \n\t" - "vflpdb %%v19,%%v19 \n\t" - "vflpdb %%v20,%%v20 \n\t" - "vflpdb %%v21,%%v21 \n\t" - "vflpdb %%v22,%%v22 \n\t" - "vflpdb %%v23,%%v23 \n\t" - "vflpdb %%v24,%%v24 \n\t" - "vflpdb %%v25,%%v25 \n\t" - "vflpdb %%v26,%%v26 \n\t" - "vflpdb %%v27,%%v27 \n\t" - "vflpdb %%v28,%%v28 \n\t" - "vflpdb %%v29,%%v29 \n\t" - "vflpdb %%v30,%%v30 \n\t" - "vflpdb %%v31,%%v31 \n\t" - - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v18,%%v18,%%v19 \n\t" - "vfadb %%v20,%%v20,%%v21 \n\t" - "vfadb %%v22,%%v22,%%v23 \n\t" - "vfadb %%v24,%%v24,%%v25 \n\t" - "vfadb %%v26,%%v26,%%v27 \n\t" - "vfadb %%v28,%%v28,%%v29 \n\t" - "vfadb %%v30,%%v30,%%v31 \n\t" - - "vfmindb %%v16,%%v16,%%v24,0 \n\t" - "vfmindb %%v18,%%v18,%%v26,0 \n\t" - "vfmindb %%v20,%%v20,%%v28,0 \n\t" - "vfmindb %%v22,%%v22,%%v30,0 \n\t" - - "vfmindb %%v16,%%v16,%%v20,0 \n\t" - "vfmindb %%v18,%%v18,%%v22,0 \n\t" - - "vfmindb %%v16,%%v16,%%v18,0 \n\t" - - "vfmindb %%v0,%%v0,%%v16,0 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfmindb %%v0,%%v0,%%v16,0 \n\t" - "ldr %0,%%f0 " - :"=f"(amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); - - return amin; +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) { + FLOAT amin; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg 
%%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vleg %%v24,128(%%r1,%[x]),0\n\t" + "vleg %%v25,136(%%r1,%[x]),0\n\t" + "vleg %%v24,144(%%r1,%[x]),1\n\t" + "vleg %%v25,152(%%r1,%[x]),1\n\t" + "vleg %%v26,160(%%r1,%[x]),0\n\t" + "vleg %%v27,168(%%r1,%[x]),0\n\t" + "vleg %%v26,176(%%r1,%[x]),1\n\t" + "vleg %%v27,184(%%r1,%[x]),1\n\t" + "vleg %%v28,192(%%r1,%[x]),0\n\t" + "vleg %%v29,200(%%r1,%[x]),0\n\t" + "vleg %%v28,208(%%r1,%[x]),1\n\t" + "vleg %%v29,216(%%r1,%[x]),1\n\t" + "vleg %%v30,224(%%r1,%[x]),0\n\t" + "vleg %%v31,232(%%r1,%[x]),0\n\t" + "vleg %%v30,240(%%r1,%[x]),1\n\t" + "vleg %%v31,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16,%%v16\n\t" + "vflpdb %%v17,%%v17\n\t" + "vflpdb %%v18,%%v18\n\t" + "vflpdb %%v19,%%v19\n\t" + "vflpdb %%v20,%%v20\n\t" + "vflpdb %%v21,%%v21\n\t" + "vflpdb %%v22,%%v22\n\t" + "vflpdb %%v23,%%v23\n\t" + "vflpdb %%v24,%%v24\n\t" + "vflpdb %%v25,%%v25\n\t" + "vflpdb %%v26,%%v26\n\t" + "vflpdb %%v27,%%v27\n\t" + "vflpdb %%v28,%%v28\n\t" + "vflpdb %%v29,%%v29\n\t" + "vflpdb %%v30,%%v30\n\t" + "vflpdb %%v31,%%v31\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v18,%%v18,%%v19\n\t" + "vfadb %%v20,%%v20,%%v21\n\t" + "vfadb %%v22,%%v22,%%v23\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v26,%%v26,%%v27\n\t" + "vfadb %%v28,%%v28,%%v29\n\t" + "vfadb %%v30,%%v30,%%v31\n\t" + "vfmindb %%v16,%%v16,%%v24,0\n\t" + "vfmindb %%v18,%%v18,%%v26,0\n\t" + "vfmindb %%v20,%%v20,%%v28,0\n\t" + "vfmindb %%v22,%%v22,%%v30,0\n\t" + "vfmindb %%v16,%%v16,%%v20,0\n\t" + "vfmindb %%v18,%%v18,%%v22,0\n\t" + "vfmindb %%v16,%%v16,%%v18,0\n\t" + "vfmindb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmindb %%v0,%%v0,%%v16,0\n\t" + "ldr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return amin; } - + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT minf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return (minf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -16; - if (n1 > 0) { - - minf = zamin_kernel_16(n1, x); - ix = n1 * 2; - i = n1; - } - else - { - minf=CABS1(x,0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (minf); + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + minf = zamin_kernel_16(n1, x); + ix = n1 * 2; + i = n1; } else { + minf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (minf); - minf=CABS1(x,0); - inc_x2 = 2 * inc_x; + } else { - BLASLONG n1 = n & -4; - while (i < n1) { + minf = CABS1(x, 0); + inc_x2 = 2 * inc_x; - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - if (CABS1(x,ix+inc_x2) < minf) { - minf = CABS1(x,ix+inc_x2); - } - if (CABS1(x,ix+inc_x2*2) < minf) { - minf = CABS1(x,ix+inc_x2*2); - } - if 
(CABS1(x,ix+inc_x2*3) < minf) {
- minf = CABS1(x,ix+inc_x2*3);
- }
+ BLASLONG n1 = n & -4;
+ while (i < n1) {
- ix += inc_x2 * 4;
+ if (CABS1(x, ix) < minf) {
+ minf = CABS1(x, ix);
+ }
+ if (CABS1(x, ix + inc_x2) < minf) {
+ minf = CABS1(x, ix + inc_x2);
+ }
+ if (CABS1(x, ix + inc_x2 * 2) < minf) {
+ minf = CABS1(x, ix + inc_x2 * 2);
+ }
+ if (CABS1(x, ix + inc_x2 * 3) < minf) {
+ minf = CABS1(x, ix + inc_x2 * 3);
+ }
- i += 4;
+ ix += inc_x2 * 4;
- }
+ i += 4;
+ }
-
- while (i < n) {
- if (CABS1(x,ix) < minf) {
- minf = CABS1(x,ix);
- }
- ix += inc_x2;
- i++;
- }
- return (minf);
+ while (i < n) {
+ if (CABS1(x, ix) < minf) {
+ minf = CABS1(x, ix);
+ }
+ ix += inc_x2;
+ i++;
 }
+ return (minf);
+
+ }
 }
diff --git a/kernel/zarch/zamin_z13.c b/kernel/zarch/zamin_z13.c
index f82c57e81..7417e0b74 100644
--- a/kernel/zarch/zamin_z13.c
+++ b/kernel/zarch/zamin_z13.c
@@ -1,5 +1,5 @@
 /***************************************************************************
-Copyright (c) 2013-2017, The OpenBLAS Project
+Copyright (c) 2013-2019, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
@@ -28,194 +28,174 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"
 #include <math.h>

-#if defined(DOUBLE)
-#define ABS fabs
-#else
-#define ABS fabsf
-#endif
-
-#define CABS1(x,i) (ABS(x[i]) + ABS(x[i + 1]))
-
-static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x)
-{
- FLOAT amin;
-
- __asm__ volatile (
- "vleg %%v0,0(%2),0 \n\t"
- "vleg %%v16,8(%2),0 \n\t"
- "vleg %%v0,16(%2),1 \n\t"
- "vleg %%v16,24(%2),1 \n\t"
- "vflpdb %%v0,%%v0 \n\t"
- "vflpdb %%v16,%%v16 \n\t"
- "vfadb %%v0,%%v0,%%v16 \n\t"
- "srlg %%r0,%1,4 \n\t"
- "xgr %%r1,%%r1 \n\t"
- "0: \n\t"
- "pfd 1, 1024(%%r1,%2) \n\t"
-
- "vleg %%v16,0(%%r1,%2),0 \n\t"
- "vleg %%v17,8(%%r1,%2),0 \n\t"
- "vleg %%v16,16(%%r1,%2),1 \n\t"
- "vleg %%v17,24(%%r1,%2),1 \n\t"
- "vleg %%v18,32(%%r1,%2),0 \n\t"
- "vleg %%v19,40(%%r1,%2),0 \n\t"
- "vleg %%v18,48(%%r1,%2),1 \n\t"
- "vleg %%v19,56(%%r1,%2),1 \n\t"
- "vleg %%v20,64(%%r1,%2),0 \n\t"
- "vleg %%v21,72(%%r1,%2),0 \n\t"
- "vleg %%v20,80(%%r1,%2),1 \n\t"
- "vleg %%v21,88(%%r1,%2),1 \n\t"
- "vleg %%v22,96(%%r1,%2),0 \n\t"
- "vleg %%v23,104(%%r1,%2),0 \n\t"
- "vleg %%v22,112(%%r1,%2),1 \n\t"
- "vleg %%v23,120(%%r1,%2),1 \n\t"
- "vflpdb %%v16, %%v16 \n\t"
- "vflpdb %%v17, %%v17 \n\t"
- "vflpdb %%v18, %%v18 \n\t"
- "vflpdb %%v19, %%v19 \n\t"
- "vflpdb %%v20, %%v20 \n\t"
- "vflpdb %%v21, %%v21 \n\t"
- "vflpdb %%v22, %%v22 \n\t"
- "vflpdb %%v23, %%v23 \n\t"
- "vfadb %%v16,%%v16,%%v17 \n\t"
- "vfadb %%v17,%%v18,%%v19 \n\t"
- "vfadb %%v18,%%v20,%%v21 \n\t"
- "vfadb %%v19,%%v22,%%v23 \n\t"
-
- "vfchdb %%v24,%%v17,%%v16 \n\t"
- "vfchdb %%v25,%%v19,%%v18 \n\t"
- "vsel %%v24,%%v16,%%v17,%%v24 \n\t"
- "vsel %%v25,%%v18,%%v19,%%v25 \n\t"
-
- "vfchdb %%v26,%%v25,%%v24 \n\t"
- "vsel %%v26,%%v24,%%v25,%%v26 \n\t"
-
- "vfchdb %%v27,%%v0,%%v26 \n\t"
- "vsel %%v0,%%v26,%%v0,%%v27 \n\t"
-
- "vleg %%v16,128(%%r1,%2),0 \n\t"
- "vleg %%v17,136(%%r1,%2),0 \n\t"
- "vleg %%v16,144(%%r1,%2),1 \n\t"
- "vleg %%v17,152(%%r1,%2),1 \n\t"
- "vleg %%v18,160(%%r1,%2),0 \n\t"
- "vleg %%v19,168(%%r1,%2),0 \n\t"
- "vleg %%v18,176(%%r1,%2),1 \n\t"
- "vleg %%v19,184(%%r1,%2),1 \n\t"
- "vleg %%v20,192(%%r1,%2),0 \n\t"
- "vleg %%v21,200(%%r1,%2),0 \n\t"
- "vleg %%v20,208(%%r1,%2),1 \n\t"
- "vleg %%v21,216(%%r1,%2),1 \n\t"
- "vleg %%v22,224(%%r1,%2),0 \n\t"
- "vleg %%v23,232(%%r1,%2),0 \n\t"
- "vleg 
%%v22,240(%%r1,%2),1 \n\t" - "vleg %%v23,248(%%r1,%2),1 \n\t" - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - "vfadb %%v16,%%v16,%%v17 \n\t" - "vfadb %%v17,%%v18,%%v19 \n\t" - "vfadb %%v18,%%v20,%%v21 \n\t" - "vfadb %%v19,%%v22,%%v23 \n\t" - - "vfchdb %%v24,%%v17,%%v16 \n\t" - "vfchdb %%v25,%%v19,%%v18 \n\t" - "vsel %%v24,%%v16,%%v17,%%v24 \n\t" - "vsel %%v25,%%v18,%%v19,%%v25 \n\t" - - "vfchdb %%v26,%%v25,%%v24 \n\t" - "vsel %%v26,%%v24,%%v25,%%v26 \n\t" - - "vfchdb %%v27,%%v0,%%v26 \n\t" - "vsel %%v0,%%v26,%%v0,%%v27 \n\t" - - "agfi %%r1, 256 \n\t" - "brctg %%r0, 0b \n\t" - - "vrepg %%v16,%%v0,1 \n\t" - "wfchdb %%v17,%%v16,%%v0 \n\t" - "vsel %%v0,%%v0,%%v16,%%v17 \n\t" - "ldr %0,%%f0 " - :"=f"(amin) - :"r"(n),"ZR"((const FLOAT (*)[n])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27" - ); - - return amin; +#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) + +static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) { + FLOAT amin; + + __asm__("vleg %%v0,0(%[x]),0\n\t" + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vfchdb %%v26,%%v25,%%v24\n\t" + "vsel %%v26,%%v24,%%v25,%%v26\n\t" + "vfchdb %%v27,%%v0,%%v26\n\t" + "vsel %%v0,%%v26,%%v0,%%v27\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg %%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb 
%%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vfchdb %%v26,%%v25,%%v24\n\t" + "vsel %%v26,%%v24,%%v25,%%v26\n\t" + "vfchdb %%v27,%%v0,%%v26\n\t" + "vsel %%v0,%%v26,%%v0,%%v27\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27"); + + return amin; } - + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT minf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return (minf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -16; - if (n1 > 0) { - - minf = zamin_kernel_16(n1, x); - ix = n1 * 2; - i = n1; - } - else - { - minf=CABS1(x,0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - ix += 2; - i++; - } - return (minf); + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT minf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) + return (minf); + + if (inc_x == 1) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + minf = zamin_kernel_16(n1, x); + ix = n1 * 2; + i = n1; } else { + minf = CABS1(x, 0); + ix += 2; + i++; + } + + while (i < n) { + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + ix += 2; + i++; + } + return (minf); - minf=CABS1(x,0); - inc_x2 = 2 * inc_x; + } else { - BLASLONG n1 = n & -4; - while (i < n1) { + minf = CABS1(x, 0); + inc_x2 = 2 * inc_x; - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - if (CABS1(x,ix+inc_x2) < minf) { - minf = CABS1(x,ix+inc_x2); - } - if (CABS1(x,ix+inc_x2*2) < minf) { - minf = CABS1(x,ix+inc_x2*2); - } - if (CABS1(x,ix+inc_x2*3) < minf) { - minf = CABS1(x,ix+inc_x2*3); - } + BLASLONG n1 = n & -4; + while (i < n1) { - ix += inc_x2 * 4; + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + if (CABS1(x, ix + inc_x2) < minf) { + minf = CABS1(x, ix + inc_x2); + } + if (CABS1(x, ix + inc_x2 * 2) < minf) { + minf = CABS1(x, ix + inc_x2 * 2); + } + if (CABS1(x, ix + inc_x2 * 3) < minf) { + minf = CABS1(x, ix + inc_x2 * 3); + } - i += 4; + ix += inc_x2 * 4; - } + i += 4; + } - while (i < n) { - if (CABS1(x,ix) < minf) { - minf = CABS1(x,ix); - } - ix += inc_x2; - i++; - } - return (minf); + while (i < n) { + if (CABS1(x, ix) < minf) { + minf = CABS1(x, ix); + } + ix += inc_x2; + i++; } + return (minf); + } } diff --git a/kernel/zarch/zasum.c b/kernel/zarch/zasum.c index 8faaf20eb..43ae8ff8b 100644 --- a/kernel/zarch/zasum.c +++ b/kernel/zarch/zasum.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -28,138 +28,126 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" #include -#if defined(DOUBLE) #define ABS fabs -#else -#define ABS fabsf -#endif -static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) -{ - FLOAT asum; - - __asm__ ( - "vzero %%v0 \n\t" - "vzero %%v1 \n\t" - "vzero %%v2 \n\t" - "vzero %%v3 \n\t" - "srlg %%r0,%1,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - "vl %%v20, 64(%%r1,%2) \n\t" - "vl %%v21, 80(%%r1,%2) \n\t" - "vl %%v22, 96(%%r1,%2) \n\t" - "vl %%v23, 112(%%r1,%2) \n\t" - - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfadb %%v0,%%v0,%%v16 \n\t" - "vfadb %%v1,%%v1,%%v17 \n\t" - "vfadb %%v2,%%v2,%%v18 \n\t" - "vfadb %%v3,%%v3,%%v19 \n\t" - "vfadb %%v0,%%v0,%%v20 \n\t" - "vfadb %%v1,%%v1,%%v21 \n\t" - "vfadb %%v2,%%v2,%%v22 \n\t" - "vfadb %%v3,%%v3,%%v23 \n\t" - - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - "vl %%v20, 192(%%r1,%2) \n\t" - "vl %%v21, 208(%%r1,%2) \n\t" - "vl %%v22, 224(%%r1,%2) \n\t" - "vl %%v23, 240(%%r1,%2) \n\t" - - "vflpdb %%v16, %%v16 \n\t" - "vflpdb %%v17, %%v17 \n\t" - "vflpdb %%v18, %%v18 \n\t" - "vflpdb %%v19, %%v19 \n\t" - "vflpdb %%v20, %%v20 \n\t" - "vflpdb %%v21, %%v21 \n\t" - "vflpdb %%v22, %%v22 \n\t" - "vflpdb %%v23, %%v23 \n\t" - - "vfadb %%v0,%%v0,%%v16 \n\t" - "vfadb %%v1,%%v1,%%v17 \n\t" - "vfadb %%v2,%%v2,%%v18 \n\t" - "vfadb %%v3,%%v3,%%v19 \n\t" - "vfadb %%v0,%%v0,%%v20 \n\t" - "vfadb %%v1,%%v1,%%v21 \n\t" - "vfadb %%v2,%%v2,%%v22 \n\t" - "vfadb %%v3,%%v3,%%v23 \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b \n\t" - "vfadb %%v0,%%v0,%%v1 \n\t" - "vfadb %%v0,%%v0,%%v2 \n\t" - "vfadb %%v0,%%v0,%%v3 \n\t" - "vrepg %%v1,%%v0,1 \n\t" - "adbr %%f0,%%f1 \n\t" - "ldr %0,%%f0 " - :"=f"(asum) - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23" - ); - - return asum; +static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { + FLOAT asum; + + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpdb %%v16, 
%%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v27\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v29\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v24,%%v24,%%v31\n\t" + "vrepg %%v25,%%v24,1\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vsteg %%v24,%[asum],0" + : [asum] "=m"(asum),[n] "+&r"(n) + : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + + return asum; } -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - BLASLONG ip=0; - FLOAT sumf = 0.0; - BLASLONG n1; - BLASLONG inc_x2; +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG ip = 0; + FLOAT sumf = 0.0; + BLASLONG n1; + BLASLONG inc_x2; - if (n <= 0 || inc_x <= 0) return(sumf); + if (n <= 0 || inc_x <= 0) + return (sumf); - if ( inc_x == 1 ) - { + if (inc_x == 1) { - n1 = n & -16; - if ( n1 > 0 ) - { + n1 = n & -16; + if (n1 > 0) { - sumf = zasum_kernel_16(n1, x); - i=n1; - ip=2*n1; - } - - while(i < n) - { - sumf += ABS(x[ip]) + ABS(x[ip+1]); - i++; - ip+=2; - } + sumf = zasum_kernel_16(n1, x); + i = n1; + ip = 2 * n1; + } + while (i < n) { + sumf += ABS(x[ip]) + ABS(x[ip + 1]); + i++; + ip += 2; } - else - { - inc_x2 = 2* inc_x; - while(i < n) - { - sumf += ABS(x[ip]) + ABS(x[ip+1]); - ip+=inc_x2; - i++; - } + } else { + inc_x2 = 2 * inc_x; + while (i < n) { + sumf += ABS(x[ip]) + ABS(x[ip + 1]); + ip += inc_x2; + i++; } - return(sumf); -} - + } + return (sumf); +} diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index f0e993d2f..31549849d 100644 --- a/kernel/zarch/zaxpy.c +++ b/kernel/zarch/zaxpy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,144 +27,136 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common.h" -static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - __asm__ volatile( +static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { + __asm__( #if !defined(CONJ) - "vlrepg %%v0,0(%3) \n\t" - "vleg %%v1,8(%3),0 \n\t" - "wflcdb %%v1,%%v1 \n\t" - "vleg %%v1,8(%3),1 \n\t" -#else - "vleg %%v0,0(%3),1 \n\t" - "vflcdb %%v0,%%v0 \n\t" - "vleg %%v0,0(%3),0 \n\t" - "vlrepg %%v1,8(%3) \n\t" + "vlrepg %%v0,0(%[alpha])\n\t" + "vleg %%v1,8(%[alpha]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%[alpha]),1\n\t" +#else + "vleg %%v0,0(%[alpha]),1\n\t" + "vflcdb %%v0,%%v0\n\t" + "vleg %%v0,0(%[alpha]),0\n\t" + "vlrepg %%v1,8(%[alpha])\n\t" #endif - "srlg %%r0,%0,3 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,0(%%r1,%2) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,32(%%r1,%2) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - "vpdi %%v24,%%v16,%%v16,4 \n\t" - "vpdi %%v25,%%v17,%%v17,4 \n\t" - "vpdi %%v26,%%v18,%%v18,4 \n\t" - "vpdi %%v27,%%v19,%%v19,4 \n\t" - - "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" - "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" - "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" - "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" - - "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" - "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" - "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" - "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" - - "vst %%v28,0(%%r1,%2) \n\t" - "vst %%v29,16(%%r1,%2) \n\t" - "vst %%v30,32(%%r1,%2) \n\t" - "vst %%v31,48(%%r1,%2) \n\t" - - "vl %%v16,64(%%r1,%1) \n\t" - "vl %%v17,80(%%r1,%1) \n\t" - "vl %%v18,96(%%r1,%1) \n\t" - "vl %%v19,112(%%r1,%1) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vpdi %%v24,%%v16,%%v16,4 \n\t" - "vpdi %%v25,%%v17,%%v17,4 \n\t" - "vpdi %%v26,%%v18,%%v18,4 \n\t" - "vpdi %%v27,%%v19,%%v19,4 \n\t" - - "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" - "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" - "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" - "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" - - "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" - "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" - "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" - "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" - - "vst %%v28,64(%%r1,%2) \n\t" - "vst %%v29,80(%%r1,%2) \n\t" - "vst %%v30,96(%%r1,%2) \n\t" - "vst %%v31,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"ZQ"((const FLOAT (*)[2])alpha) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v8,0(%%r1,%[x])\n\t" + "vl %%v9,16(%%r1,%[x])\n\t" + "vl %%v10,32(%%r1,%[x])\n\t" + "vl %%v11,48(%%r1,%[x])\n\t" + "vl %%v12,0(%%r1,%[y])\n\t" + "vl %%v13,16(%%r1,%[y])\n\t" + "vl %%v14,32(%%r1,%[y])\n\t" + "vl %%v15,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[x])\n\t" + "vl %%v17,80(%%r1,%[x])\n\t" + "vl %%v18,96(%%r1,%[x])\n\t" + "vl %%v19,112(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[y])\n\t" + "vl %%v21,80(%%r1,%[y])\n\t" + "vl %%v22,96(%%r1,%[y])\n\t" + "vl %%v23,112(%%r1,%[y])\n\t" + "vpdi %%v24,%%v8,%%v8,4\n\t" + "vpdi %%v25,%%v9,%%v9,4\n\t" + "vpdi %%v26,%%v10,%%v10,4\n\t" + "vpdi %%v27,%%v11,%%v11,4\n\t" + "vpdi %%v28,%%v16,%%v16,4\n\t" + "vpdi %%v29,%%v17,%%v17,4\n\t" + 
"vpdi %%v30,%%v18,%%v18,4\n\t" + "vpdi %%v31,%%v19,%%v19,4\n\t" + "vfmadb %%v8,%%v8,%%v0,%%v12\n\t" + "vfmadb %%v9,%%v9,%%v0,%%v13\n\t" + "vfmadb %%v10,%%v10,%%v0,%%v14\n\t" + "vfmadb %%v11,%%v11,%%v0,%%v15\n\t" + "vfmadb %%v16,%%v16,%%v0,%%v20\n\t" + "vfmadb %%v17,%%v17,%%v0,%%v21\n\t" + "vfmadb %%v18,%%v18,%%v0,%%v22\n\t" + "vfmadb %%v19,%%v19,%%v0,%%v23\n\t" + "vfmadb %%v8,%%v24,%%v1,%%v8\n\t" + "vfmadb %%v9,%%v25,%%v1,%%v9\n\t" + "vfmadb %%v10,%%v26,%%v1,%%v10\n\t" + "vfmadb %%v11,%%v27,%%v1,%%v11\n\t" + "vfmadb %%v16,%%v28,%%v1,%%v16\n\t" + "vfmadb %%v17,%%v29,%%v1,%%v17\n\t" + "vfmadb %%v18,%%v30,%%v1,%%v18\n\t" + "vfmadb %%v19,%%v31,%%v1,%%v19\n\t" + "vst %%v8,0(%%r1,%[y])\n\t" + "vst %%v9,16(%%r1,%[y])\n\t" + "vst %%v10,32(%%r1,%[y])\n\t" + "vst %%v11,48(%%r1,%[y])\n\t" + "vst %%v16,64(%%r1,%[y])\n\t" + "vst %%v17,80(%%r1,%[y])\n\t" + "vst %%v18,96(%%r1,%[y])\n\t" + "vst %%v19,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - FLOAT da[2] __attribute__ ((aligned(16))); +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, + FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT da[2] __attribute__ ((aligned(16))); - if (n <= 0) return (0); + if (n <= 0) + return (0); - if ((inc_x == 1) && (inc_y == 1)) { + if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -8; + BLASLONG n1 = n & -8; - if (n1) { - da[0] = da_r; - da[1] = da_i; - zaxpy_kernel_8(n1, x, y, da); - ix = 2 * n1; - } - i = n1; - while (i < n) { + if (n1) { + da[0] = da_r; + da[1] = da_i; + zaxpy_kernel_8(n1, x, y, da); + ix = 2 * n1; + } + i = n1; + while (i < n) { #if !defined(CONJ) - y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); - y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); + y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); + y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); #else - y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); - y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); + y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); + y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); #endif - i++; - ix += 2; - - } - return (0); - + i++; + ix += 2; } + return (0); - inc_x *= 2; - inc_y *= 2; + } - while (i < n) { + inc_x *= 2; + inc_y *= 2; + + while (i < n) { #if !defined(CONJ) - y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); - y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); + y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); + y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); #else - y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); - y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); + y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); + y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); #endif - ix += inc_x; - iy += inc_y; - i++; + ix += inc_x; + iy += inc_y; + i++; - } - return (0); + } + return (0); } - - diff --git a/kernel/zarch/zcopy.c b/kernel/zarch/zcopy.c index 8c940bba3..2f80cedce 100644 --- a/kernel/zarch/zcopy.c +++ b/kernel/zarch/zcopy.c @@ -1,5 +1,5 @@ 
 /***************************************************************************
-Copyright (c) 2013-2018, The OpenBLAS Project
+Copyright (c) 2013-2019, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
@@ -27,73 +27,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "common.h"

-static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
-{
- __asm__ volatile (
- "lgr %%r1,%1 \n\t"
- "lgr %%r2,%2 \n\t"
- "srlg %%r0,%0,4 \n\t"
- "0: \n\t"
- "pfd 1, 1024(%%r1) \n\t"
- "pfd 2, 1024(%%r2) \n\t"
- "mvc 0(256,%%r2),0(%%r1) \n\t"
- "agfi %%r1,256 \n\t"
- "agfi %%r2,256 \n\t"
- "brctg %%r0,0b "
- :
- :"r"(n),"a"((const FLOAT (*)[n * 2])x),"a"((FLOAT (*)[n * 2])y)
- :"memory","cc","r0","r1","r2"
- );
+static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) {
+ __asm__("srlg %[n],%[n],4\n\t"
+ "0:\n\t"
+ "pfd 1, 1024(%[x])\n\t"
+ "pfd 2, 1024(%[y])\n\t"
+ "mvc 0(256,%[y]),0(%[x])\n\t"
+ "la %[x],256(%[x])\n\t"
+ "la %[y],256(%[y])\n\t"
+ "brctg %[n],0b"
+ : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y)
+ : "m"(*(const FLOAT (*)[n * 2]) x)
+ : "cc");
 }

-int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
-{
- BLASLONG i=0;
- BLASLONG ix=0,iy=0;
+int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) {
+ BLASLONG i = 0;
+ BLASLONG ix = 0, iy = 0;

- if ( n <= 0 ) return(0);
+ if (n <= 0)
+ return (0);

- if ( (inc_x == 1) && (inc_y == 1 ))
- {
+ if ((inc_x == 1) && (inc_y == 1)) {

- BLASLONG n1 = n & -16;
- if ( n1 > 0 )
- {
- zcopy_kernel_16(n1, x, y);
- i=n1;
- ix=n1*2;
- iy=n1*2;
- }
-
- while(i < n)
- {
- y[iy] = x[iy] ;
- y[iy+1] = x[ix+1] ;
- ix+=2;
- iy+=2;
- i++ ;
-
- }
+ BLASLONG n1 = n & -16;
+ if (n1 > 0) {
+ zcopy_kernel_16(n1, x, y);
+ i = n1;
+ ix = n1 * 2;
+ iy = n1 * 2;
+ }
+ while (i < n) {
+ y[iy] = x[ix];
+ y[iy + 1] = x[ix + 1];
+ ix += 2;
+ iy += 2;
+ i++;
 }
- else
- {
- BLASLONG inc_x2 = 2 * inc_x;
- BLASLONG inc_y2 = 2 * inc_y;
+ } else {

- while(i < n)
- {
- y[iy] = x[ix] ;
- y[iy+1] = x[ix+1] ;
- ix += inc_x2 ;
- iy += inc_y2 ;
- i++ ;
+ BLASLONG inc_x2 = 2 * inc_x;
+ BLASLONG inc_y2 = 2 * inc_y;

- }
+ while (i < n) {
+ y[iy] = x[ix];
+ y[iy + 1] = x[ix + 1];
+ ix += inc_x2;
+ iy += inc_y2;
+ i++;
 }
-
- return(0);
+
+ }
+
+ return (0);
 }
diff --git a/kernel/zarch/zdot.c b/kernel/zarch/zdot.c
index aab18e2e9..7a67ef734 100644
--- a/kernel/zarch/zdot.c
+++ b/kernel/zarch/zdot.c
@@ -1,5 +1,5 @@
 /***************************************************************************
-Copyright (c) 2013-2017, The OpenBLAS Project
+Copyright (c) 2013-2019, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
@@ -27,152 +27,146 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" -static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) -{ - __asm__ volatile( - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "vzero %%v28 \n\t" - "vzero %%v29 \n\t" - "vzero %%v30 \n\t" - "vzero %%v31 \n\t" - "srlg %%r0,%0,3 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 1, 1024(%%r1,%1) \n\t" - "pfd 1, 1024(%%r1,%2) \n\t" - - "vl %%v16, 0(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - "vl %%v0, 0(%%r1,%2) \n\t" - "vl %%v1, 16(%%r1,%2) \n\t" - "vl %%v2, 32(%%r1,%2) \n\t" - "vl %%v3, 48(%%r1,%2) \n\t" - "vpdi %%v20,%%v16,%%v16,4 \n\t" - "vpdi %%v21,%%v17,%%v17,4 \n\t" - "vpdi %%v22,%%v18,%%v18,4 \n\t" - "vpdi %%v23,%%v19,%%v19,4 \n\t" - - "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" - "vfmadb %%v25,%%v20,%%v0,%%v25 \n\t" - "vfmadb %%v26,%%v17,%%v1,%%v26 \n\t" - "vfmadb %%v27,%%v21,%%v1,%%v27 \n\t" - "vfmadb %%v28,%%v18,%%v2,%%v28 \n\t" - "vfmadb %%v29,%%v22,%%v2,%%v29 \n\t" - "vfmadb %%v30,%%v19,%%v3,%%v30 \n\t" - "vfmadb %%v31,%%v23,%%v3,%%v31 \n\t" - - "vl %%v16, 64(%%r1,%1) \n\t" - "vl %%v17, 80(%%r1,%1) \n\t" - "vl %%v18, 96(%%r1,%1) \n\t" - "vl %%v19, 112(%%r1,%1) \n\t" - "vl %%v0, 64(%%r1,%2) \n\t" - "vl %%v1, 80(%%r1,%2) \n\t" - "vl %%v2, 96(%%r1,%2) \n\t" - "vl %%v3, 112(%%r1,%2) \n\t" - "vpdi %%v20,%%v16,%%v16,4 \n\t" - "vpdi %%v21,%%v17,%%v17,4 \n\t" - "vpdi %%v22,%%v18,%%v18,4 \n\t" - "vpdi %%v23,%%v19,%%v19,4 \n\t" - - "vfmadb %%v24,%%v16,%%v0,%%v24 \n\t" - "vfmadb %%v25,%%v20,%%v0,%%v25 \n\t" - "vfmadb %%v26,%%v17,%%v1,%%v26 \n\t" - "vfmadb %%v27,%%v21,%%v1,%%v27 \n\t" - "vfmadb %%v28,%%v18,%%v2,%%v28 \n\t" - "vfmadb %%v29,%%v22,%%v2,%%v29 \n\t" - "vfmadb %%v30,%%v19,%%v3,%%v30 \n\t" - "vfmadb %%v31,%%v23,%%v3,%%v31 \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b \n\t" - "vfadb %%v24,%%v24,%%v26 \n\t" - "vfadb %%v24,%%v24,%%v28 \n\t" - "vfadb %%v24,%%v24,%%v30 \n\t" - "vfadb %%v25,%%v25,%%v27 \n\t" - "vfadb %%v25,%%v25,%%v29 \n\t" - "vfadb %%v25,%%v25,%%v31 \n\t" - "vsteg %%v24,0(%3),0 \n\t" - "vsteg %%v24,8(%3),1 \n\t" - "vsteg %%v25,16(%3),1 \n\t" - "vsteg %%v25,24(%3),0 " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])x),"ZR"((const FLOAT (*)[n * 2])y),"ZQ"((FLOAT (*)[4])d) - :"memory","cc","r0","r1","v0","v1","v2","v3","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { + __asm__("vzero %%v24\n\t" + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 1, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vpdi %%v20,%%v16,%%v16,4\n\t" + "vpdi %%v21,%%v17,%%v17,4\n\t" + "vpdi %%v22,%%v18,%%v18,4\n\t" + "vpdi %%v23,%%v19,%%v19,4\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmadb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmadb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmadb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmadb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" + "vl %%v16, 64(%%r1,%[x])\n\t" + "vl %%v17, 80(%%r1,%[x])\n\t" + "vl %%v18, 96(%%r1,%[x])\n\t" + "vl %%v19, 
112(%%r1,%[x])\n\t" + "vl %%v0, 64(%%r1,%[y])\n\t" + "vl %%v1, 80(%%r1,%[y])\n\t" + "vl %%v2, 96(%%r1,%[y])\n\t" + "vl %%v3, 112(%%r1,%[y])\n\t" + "vpdi %%v20,%%v16,%%v16,4\n\t" + "vpdi %%v21,%%v17,%%v17,4\n\t" + "vpdi %%v22,%%v18,%%v18,4\n\t" + "vpdi %%v23,%%v19,%%v19,4\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmadb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmadb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmadb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmadb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v25,%%v25,%%v27\n\t" + "vfadb %%v25,%%v25,%%v29\n\t" + "vfadb %%v25,%%v25,%%v31\n\t" + "vsteg %%v24,0(%[d]),0\n\t" + "vsteg %%v24,8(%[d]),1\n\t" + "vsteg %%v25,16(%[d]),1\n\t" + "vsteg %%v25,24(%[d]),0" + : "=m"(*(FLOAT (*)[4]) d),[n] "+&r"(n) + : [d] "a"(d), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), + "m"(*(const FLOAT (*)[n * 2]) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i; - BLASLONG ix, iy; - OPENBLAS_COMPLEX_FLOAT result; - FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; - - if (n <= 0) { - CREAL(result) = 0.0; - CIMAG(result) = 0.0; - return (result); - - } +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y) { + BLASLONG i; + BLASLONG ix, iy; + OPENBLAS_COMPLEX_FLOAT result; + FLOAT dot[4] __attribute__ ((aligned(16))) = { + 0.0, 0.0, 0.0, 0.0}; + + if (n <= 0) { + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + return (result); - if ((inc_x == 1) && (inc_y == 1)) { + } - BLASLONG n1 = n & -8; + if ((inc_x == 1) && (inc_y == 1)) { - if (n1) - zdot_kernel_8(n1, x, y, dot); + BLASLONG n1 = n & -8; - i = n1; - BLASLONG j = i * 2; + if (n1) + zdot_kernel_8(n1, x, y, dot); - while (i < n) { + i = n1; + BLASLONG j = i * 2; - dot[0] += x[j] * y[j]; - dot[1] += x[j + 1] * y[j + 1]; - dot[2] += x[j] * y[j + 1]; - dot[3] += x[j + 1] * y[j]; + while (i < n) { - j += 2; - i++; + dot[0] += x[j] * y[j]; + dot[1] += x[j + 1] * y[j + 1]; + dot[2] += x[j] * y[j + 1]; + dot[3] += x[j + 1] * y[j]; - } + j += 2; + i++; + } - } else { - i = 0; - ix = 0; - iy = 0; - inc_x <<= 1; - inc_y <<= 1; - while (i < n) { + } else { + i = 0; + ix = 0; + iy = 0; + inc_x <<= 1; + inc_y <<= 1; + while (i < n) { - dot[0] += x[ix] * y[iy]; - dot[1] += x[ix + 1] * y[iy + 1]; - dot[2] += x[ix] * y[iy + 1]; - dot[3] += x[ix + 1] * y[iy]; + dot[0] += x[ix] * y[iy]; + dot[1] += x[ix + 1] * y[iy + 1]; + dot[2] += x[ix] * y[iy + 1]; + dot[3] += x[ix + 1] * y[iy]; - ix += inc_x; - iy += inc_y; - i++; + ix += inc_x; + iy += inc_y; + i++; - } } + } #if !defined(CONJ) - CREAL(result) = dot[0] - dot[1]; - CIMAG(result) = dot[2] + dot[3]; + CREAL(result) = dot[0] - dot[1]; + CIMAG(result) = dot[2] + dot[3]; #else - CREAL(result) = dot[0] + dot[1]; - CIMAG(result) = dot[2] - dot[3]; + CREAL(result) = dot[0] + dot[1]; + CIMAG(result) = dot[2] - dot[3]; #endif - return (result); + return (result); } - - diff --git a/kernel/zarch/zgemv_n_4.c b/kernel/zarch/zgemv_n_4.c index 9472b5d5a..7f21985ec 100644 --- a/kernel/zarch/zgemv_n_4.c +++ b/kernel/zarch/zgemv_n_4.c @@ -1,5 +1,5 @@ 
 /***************************************************************************
-Copyright (c) 2014, The OpenBLAS Project
+Copyright (c) 2019, The OpenBLAS Project
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions are
@@ -25,691 +25,632 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
-#include <stdlib.h>
-#include <stdio.h>
 #include "common.h"

 #define NBMAX 1024

-static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y)
-{
- __asm__ volatile (
- "vl %%v16,0(%5) \n\t"
- "vl %%v17,16(%5) \n\t"
- "vl %%v18,32(%5) \n\t"
- "vl %%v19,48(%5) \n\t"
+static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
+ __asm__("vl %%v16,0(%[x])\n\t"
+ "vl %%v17,16(%[x])\n\t"
+ "vl %%v18,32(%[x])\n\t"
+ "vl %%v19,48(%[x])\n\t"
 #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
- "vleg %%v20,8(%5),0 \n\t"
- "wflcdb %%v20,%%v20 \n\t"
- "vleg %%v20,0(%5),1 \n\t"
- "vleg %%v21,24(%5),0 \n\t"
- "wflcdb %%v21,%%v21 \n\t"
- "vleg %%v21,16(%5),1 \n\t"
- "vleg %%v22,40(%5),0 \n\t"
- "wflcdb %%v22,%%v22 \n\t"
- "vleg %%v22,32(%5),1 \n\t"
- "vleg %%v23,56(%5),0 \n\t"
- "wflcdb %%v23,%%v23 \n\t"
- "vleg %%v23,48(%5),1 \n\t"
+ "vleg %%v20,8(%[x]),0\n\t"
+ "wflcdb %%v20,%%v20\n\t"
+ "vleg %%v20,0(%[x]),1\n\t"
+ "vleg %%v21,24(%[x]),0\n\t"
+ "wflcdb %%v21,%%v21\n\t"
+ "vleg %%v21,16(%[x]),1\n\t"
+ "vleg %%v22,40(%[x]),0\n\t"
+ "wflcdb %%v22,%%v22\n\t"
+ "vleg %%v22,32(%[x]),1\n\t"
+ "vleg %%v23,56(%[x]),0\n\t"
+ "wflcdb %%v23,%%v23\n\t"
+ "vleg %%v23,48(%[x]),1\n\t"
 #else
- "vleg %%v20,0(%5),1 \n\t"
- "vflcdb %%v20,%%v20 \n\t"
- "vleg %%v20,8(%5),0 \n\t"
- "vleg %%v21,16(%5),1 \n\t"
- "vflcdb %%v21,%%v21 \n\t"
- "vleg %%v21,24(%5),0 \n\t"
- "vleg %%v22,32(%5),1 \n\t"
- "vflcdb %%v22,%%v22 \n\t"
- "vleg %%v22,40(%5),0 \n\t"
- "vleg %%v23,48(%5),1 \n\t"
- "vflcdb %%v23,%%v23 \n\t"
- "vleg %%v23,56(%5),0 \n\t"
+ "vleg %%v20,0(%[x]),1\n\t"
+ "vflcdb %%v20,%%v20\n\t"
+ "vleg %%v20,8(%[x]),0\n\t"
+ "vleg %%v21,16(%[x]),1\n\t"
+ "vflcdb %%v21,%%v21\n\t"
+ "vleg %%v21,24(%[x]),0\n\t"
+ "vleg %%v22,32(%[x]),1\n\t"
+ "vflcdb %%v22,%%v22\n\t"
+ "vleg %%v22,40(%[x]),0\n\t"
+ "vleg %%v23,48(%[x]),1\n\t"
+ "vflcdb %%v23,%%v23\n\t"
+ "vleg %%v23,56(%[x]),0\n\t"
 #endif
- "xgr %%r1,%%r1 \n\t"
- "srlg %%r0,%0,1 \n\t"
- "0: \n\t"
- "pfd 1,1024(%%r1,%1) \n\t"
- "pfd 1,1024(%%r1,%2) \n\t"
- "pfd 1,1024(%%r1,%3) \n\t"
- "pfd 1,1024(%%r1,%4) \n\t"
- "pfd 2,1024(%%r1,%6) \n\t"
-
- "vlrepg %%v24,0(%%r1,%1) \n\t"
- "vlrepg %%v25,8(%%r1,%1) \n\t"
- "vlrepg %%v26,0(%%r1,%2) \n\t"
- "vlrepg %%v27,8(%%r1,%2) \n\t"
-
- "vl %%v0,0(%%r1,%6) \n\t"
- "vfmadb %%v0,%%v24,%%v16,%%v0 \n\t"
- "vfmadb %%v0,%%v25,%%v20,%%v0 \n\t"
- "vfmadb %%v0,%%v26,%%v17,%%v0 \n\t"
- "vfmadb %%v0,%%v27,%%v21,%%v0 \n\t"
-
- "vlrepg %%v28,0(%%r1,%3) \n\t"
- "vlrepg %%v29,8(%%r1,%3) \n\t"
- "vlrepg %%v30,0(%%r1,%4) \n\t"
- "vlrepg %%v31,8(%%r1,%4) \n\t"
-
- "vfmadb %%v0,%%v28,%%v18,%%v0 \n\t"
- "vfmadb %%v0,%%v29,%%v22,%%v0 \n\t"
- "vfmadb %%v0,%%v30,%%v19,%%v0 \n\t"
- "vfmadb %%v0,%%v31,%%v23,%%v0 \n\t"
- "vst %%v0,0(%%r1,%6) \n\t"
-
- "vlrepg %%v24,16(%%r1,%1) \n\t"
- "vlrepg %%v25,24(%%r1,%1) \n\t"
- "vlrepg %%v26,16(%%r1,%2) \n\t"
- "vlrepg %%v27,24(%%r1,%2) \n\t"
-
- "vl %%v0,16(%%r1,%6) \n\t"
- "vfmadb %%v0,%%v24,%%v16,%%v0 \n\t"
- "vfmadb 
%%v0,%%v25,%%v20,%%v0 \n\t" - "vfmadb %%v0,%%v26,%%v17,%%v0 \n\t" - "vfmadb %%v0,%%v27,%%v21,%%v0 \n\t" - - "vlrepg %%v28,16(%%r1,%3) \n\t" - "vlrepg %%v29,24(%%r1,%3) \n\t" - "vlrepg %%v30,16(%%r1,%4) \n\t" - "vlrepg %%v31,24(%%r1,%4) \n\t" - - "vfmadb %%v0,%%v28,%%v18,%%v0 \n\t" - "vfmadb %%v0,%%v29,%%v22,%%v0 \n\t" - "vfmadb %%v0,%%v30,%%v19,%%v0 \n\t" - "vfmadb %%v0,%%v31,%%v23,%%v0 \n\t" - "vst %%v0,16(%%r1,%6) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZR"((const FLOAT (*)[n * 2])ap[2]),"ZR"((const FLOAT (*)[n * 2])ap[3]),"ZQ"((const FLOAT (*)[8])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" + "vlrepg %%v24,0(%%r1,%[ap0])\n\t" + "vlrepg %%v25,8(%%r1,%[ap0])\n\t" + "vlrepg %%v26,0(%%r1,%[ap1])\n\t" + "vlrepg %%v27,8(%%r1,%[ap1])\n\t" + "vlrepg %%v28,16(%%r1,%[ap0])\n\t" + "vlrepg %%v29,24(%%r1,%[ap0])\n\t" + "vlrepg %%v30,16(%%r1,%[ap1])\n\t" + "vlrepg %%v31,24(%%r1,%[ap1])\n\t" + "vfmadb %%v0,%%v24,%%v16,%%v0\n\t" + "vfmadb %%v1,%%v28,%%v16,%%v1\n\t" + "vfmadb %%v0,%%v25,%%v20,%%v0\n\t" + "vfmadb %%v1,%%v29,%%v20,%%v1\n\t" + "vfmadb %%v0,%%v26,%%v17,%%v0\n\t" + "vfmadb %%v1,%%v30,%%v17,%%v1\n\t" + "vfmadb %%v0,%%v27,%%v21,%%v0\n\t" + "vfmadb %%v1,%%v31,%%v21,%%v1\n\t" + "vlrepg %%v24,0(%%r1,%[ap2])\n\t" + "vlrepg %%v25,8(%%r1,%[ap2])\n\t" + "vlrepg %%v26,0(%%r1,%[ap3])\n\t" + "vlrepg %%v27,8(%%r1,%[ap3])\n\t" + "vlrepg %%v28,16(%%r1,%[ap2])\n\t" + "vlrepg %%v29,24(%%r1,%[ap2])\n\t" + "vlrepg %%v30,16(%%r1,%[ap3])\n\t" + "vlrepg %%v31,24(%%r1,%[ap3])\n\t" + "vfmadb %%v0,%%v24,%%v18,%%v0\n\t" + "vfmadb %%v1,%%v28,%%v18,%%v1\n\t" + "vfmadb %%v0,%%v25,%%v22,%%v0\n\t" + "vfmadb %%v1,%%v29,%%v22,%%v1\n\t" + "vfmadb %%v0,%%v26,%%v19,%%v0\n\t" + "vfmadb %%v1,%%v30,%%v19,%%v1\n\t" + "vfmadb %%v0,%%v27,%%v23,%%v0\n\t" + "vfmadb %%v1,%%v31,%%v23,%%v1\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[n * 2]) ap[2]),[ap2] "a"(ap[2]), + "m"(*(const FLOAT (*)[n * 2]) ap[3]),[ap3] "a"(ap[3]), + "m"(*(const FLOAT (*)[8]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vl %%v16,0(%3) \n\t" - "vl %%v17,16(%3) \n\t" +static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + __asm__("vl %%v16,0(%[x])\n\t" + "vl %%v17,16(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v18,8(%3),0 \n\t" - "wflcdb %%v18,%%v18 \n\t" - "vleg %%v18,0(%3),1 \n\t" - "vleg %%v19,24(%3),0 \n\t" - "wflcdb %%v19,%%v19 \n\t" - "vleg %%v19,16(%3),1 \n\t" + "vleg %%v18,8(%[x]),0\n\t" + "wflcdb %%v18,%%v18\n\t" + "vleg %%v18,0(%[x]),1\n\t" + "vleg %%v19,24(%[x]),0\n\t" + "wflcdb %%v19,%%v19\n\t" + "vleg %%v19,16(%[x]),1\n\t" #else - 
"vleg %%v18,0(%3),1 \n\t" - "vflcdb %%v18,%%v18 \n\t" - "vleg %%v18,8(%3),0 \n\t" - "vleg %%v19,16(%3),1 \n\t" - "vflcdb %%v19,%%v19 \n\t" - "vleg %%v19,24(%3),0 \n\t" + "vleg %%v18,0(%[x]),1\n\t" + "vflcdb %%v18,%%v18\n\t" + "vleg %%v18,8(%[x]),0\n\t" + "vleg %%v19,16(%[x]),1\n\t" + "vflcdb %%v19,%%v19\n\t" + "vleg %%v19,24(%[x]),0\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 1,1024(%%r1,%2) \n\t" - "pfd 2,1024(%%r1,%4) \n\t" - - "vlrepg %%v20,0(%%r1,%1) \n\t" - "vlrepg %%v21,8(%%r1,%1) \n\t" - "vlrepg %%v22,0(%%r1,%2) \n\t" - "vlrepg %%v23,8(%%r1,%2) \n\t" - - "vl %%v0,0(%%r1,%4) \n\t" - "vfmadb %%v0,%%v20,%%v16,%%v0 \n\t" - "vfmadb %%v0,%%v21,%%v18,%%v0 \n\t" - "vfmadb %%v0,%%v22,%%v17,%%v0 \n\t" - "vfmadb %%v0,%%v23,%%v19,%%v0 \n\t" - "vst %%v0,0(%%r1,%4) \n\t" - - "vlrepg %%v20,16(%%r1,%1) \n\t" - "vlrepg %%v21,24(%%r1,%1) \n\t" - "vlrepg %%v22,16(%%r1,%2) \n\t" - "vlrepg %%v23,24(%%r1,%2) \n\t" - - "vl %%v0,16(%%r1,%4) \n\t" - "vfmadb %%v0,%%v20,%%v16,%%v0 \n\t" - "vfmadb %%v0,%%v21,%%v18,%%v0 \n\t" - "vfmadb %%v0,%%v22,%%v17,%%v0 \n\t" - "vfmadb %%v0,%%v23,%%v19,%%v0 \n\t" - "vst %%v0,16(%%r1,%4) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap[0]),"ZR"((const FLOAT (*)[n * 2])ap[1]),"ZQ"((const FLOAT (*)[4])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" - ); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" + "vlrepg %%v20,0(%%r1,%[ap0])\n\t" + "vlrepg %%v21,8(%%r1,%[ap0])\n\t" + "vlrepg %%v22,0(%%r1,%[ap1])\n\t" + "vlrepg %%v23,8(%%r1,%[ap1])\n\t" + "vlrepg %%v24,16(%%r1,%[ap0])\n\t" + "vlrepg %%v25,24(%%r1,%[ap0])\n\t" + "vlrepg %%v26,16(%%r1,%[ap1])\n\t" + "vlrepg %%v27,24(%%r1,%[ap1])\n\t" + "vfmadb %%v0,%%v20,%%v16,%%v0\n\t" + "vfmadb %%v1,%%v24,%%v16,%%v1\n\t" + "vfmadb %%v0,%%v21,%%v18,%%v0\n\t" + "vfmadb %%v1,%%v25,%%v18,%%v1\n\t" + "vfmadb %%v0,%%v22,%%v17,%%v0\n\t" + "vfmadb %%v1,%%v26,%%v17,%%v1\n\t" + "vfmadb %%v0,%%v23,%%v19,%%v0\n\t" + "vfmadb %%v1,%%v27,%%v19,%%v1\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), + "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), + "m"(*(const FLOAT (*)[4]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27"); } -static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) -{ - __asm__ volatile ( - "vl %%v16,0(%2) \n\t" +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + __asm__("vl %%v16,0(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v17,8(%2),0 \n\t" - "wflcdb %%v17,%%v17 \n\t" - "vleg %%v17,0(%2),1 \n\t" + "vleg %%v17,8(%[x]),0\n\t" + "wflcdb %%v17,%%v17\n\t" + "vleg %%v17,0(%[x]),1\n\t" #else - "vleg %%v17,0(%2),1 \n\t" - "vflcdb %%v17,%%v17 \n\t" - "vleg %%v17,8(%2),0 \n\t" + "vleg %%v17,0(%[x]),1\n\t" + "vflcdb %%v17,%%v17\n\t" + "vleg %%v17,8(%[x]),0\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,1 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 2,1024(%%r1,%3) \n\t" - - "vlrepg %%v18,0(%%r1,%1) \n\t" - "vlrepg %%v19,8(%%r1,%1) \n\t" - - "vl 
%%v0,0(%%r1,%3) \n\t" - "vfmadb %%v0,%%v18,%%v16,%%v0 \n\t" - "vfmadb %%v0,%%v19,%%v17,%%v0 \n\t" - "vst %%v0,0(%%r1,%3) \n\t" - - "vlrepg %%v18,16(%%r1,%1) \n\t" - "vlrepg %%v19,24(%%r1,%1) \n\t" - - "vl %%v0,16(%%r1,%3) \n\t" - "vfmadb %%v0,%%v18,%%v16,%%v0 \n\t" - "vfmadb %%v0,%%v19,%%v17,%%v0 \n\t" - "vst %%v0,16(%%r1,%3) \n\t" - - "agfi %%r1,32 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])ap),"ZQ"((const FLOAT (*)[2])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19" - ); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" + "vlrepg %%v18,0(%%r1,%[ap])\n\t" + "vlrepg %%v19,8(%%r1,%[ap])\n\t" + "vlrepg %%v20,16(%%r1,%[ap])\n\t" + "vlrepg %%v21,24(%%r1,%[ap])\n\t" + "vfmadb %%v0,%%v18,%%v16,%%v0\n\t" + "vfmadb %%v1,%%v20,%%v16,%%v1\n\t" + "vfmadb %%v0,%%v19,%%v17,%%v0\n\t" + "vfmadb %%v1,%%v21,%%v17,%%v1\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), + "m"(*(const FLOAT (*)[2]) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21"); } -static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i) -{ - __asm__ volatile ( -#if !defined(XCONJ) - "vlrepg %%v0,%3 \n\t" - "vleg %%v1,%4,0 \n\t" - "wflcdb %%v1,%%v1 \n\t" - "vleg %%v1,%4,1 \n\t" +static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, + FLOAT alpha_i) { + __asm__( +#if !defined(XCONJ) + "vlrepg %%v0,%[alpha_r]\n\t" + "vleg %%v1,%[alpha_i],0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,%[alpha_i],1\n\t" #else - "vleg %%v0,%3,1 \n\t" - "vflcdb %%v0,%%v0 \n\t" - "vleg %%v0,%3,0 \n\t" - "vlrepg %%v1,%4 \n\t" + "vleg %%v0,%[alpha_r],1\n\t" + "vflcdb %%v0,%%v0\n\t" + "vleg %%v0,%[alpha_r],0\n\t" + "vlrepg %%v1,%[alpha_i]\n\t" #endif - "xgr %%r1,%%r1 \n\t" - "srlg %%r0,%0,2 \n\t" - "0: \n\t" - "pfd 1,1024(%%r1,%1) \n\t" - "pfd 2,1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%1) \n\t" - "vl %%v17,16(%%r1,%1) \n\t" - "vl %%v18,32(%%r1,%1) \n\t" - "vl %%v19,48(%%r1,%1) \n\t" - "vl %%v20,0(%%r1,%2) \n\t" - "vl %%v21,16(%%r1,%2) \n\t" - "vl %%v22,32(%%r1,%2) \n\t" - "vl %%v23,48(%%r1,%2) \n\t" - "vpdi %%v24,%%v16,%%v16,4 \n\t" - "vpdi %%v25,%%v17,%%v17,4 \n\t" - "vpdi %%v26,%%v18,%%v18,4 \n\t" - "vpdi %%v27,%%v19,%%v19,4 \n\t" - - "vfmadb %%v28,%%v16,%%v0,%%v20 \n\t" - "vfmadb %%v29,%%v17,%%v0,%%v21 \n\t" - "vfmadb %%v30,%%v18,%%v0,%%v22 \n\t" - "vfmadb %%v31,%%v19,%%v0,%%v23 \n\t" - - "vfmadb %%v28,%%v24,%%v1,%%v28 \n\t" - "vfmadb %%v29,%%v25,%%v1,%%v29 \n\t" - "vfmadb %%v30,%%v26,%%v1,%%v30 \n\t" - "vfmadb %%v31,%%v27,%%v1,%%v31 \n\t" - - "vst %%v28,0(%%r1,%2) \n\t" - "vst %%v29,16(%%r1,%2) \n\t" - "vst %%v30,32(%%r1,%2) \n\t" - "vst %%v31,48(%%r1,%2) \n\t" - - "agfi %%r1,64 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((const FLOAT (*)[n * 2])src),"ZR"((FLOAT (*)[n * 2])dest),"m"(alpha_r),"m"(alpha_i) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],2\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,0(%%r1,%[dest])\n\t" + "vl 
%%v21,16(%%r1,%[dest])\n\t" + "vl %%v22,32(%%r1,%[dest])\n\t" + "vl %%v23,48(%%r1,%[dest])\n\t" + "vpdi %%v24,%%v16,%%v16,4\n\t" + "vpdi %%v25,%%v17,%%v17,4\n\t" + "vpdi %%v26,%%v18,%%v18,4\n\t" + "vpdi %%v27,%%v19,%%v19,4\n\t" + "vfmadb %%v28,%%v16,%%v0,%%v20\n\t" + "vfmadb %%v29,%%v17,%%v0,%%v21\n\t" + "vfmadb %%v30,%%v18,%%v0,%%v22\n\t" + "vfmadb %%v31,%%v19,%%v0,%%v23\n\t" + "vfmadb %%v28,%%v24,%%v1,%%v28\n\t" + "vfmadb %%v29,%%v25,%%v1,%%v29\n\t" + "vfmadb %%v30,%%v26,%%v1,%%v30\n\t" + "vfmadb %%v31,%%v27,%%v1,%%v31\n\t" + "vst %%v28,0(%%r1,%[dest])\n\t" + "vst %%v29,16(%%r1,%[dest])\n\t" + "vst %%v30,32(%%r1,%[dest])\n\t" + "vst %%v31,48(%%r1,%[dest])\n\t" + "agfi %%r1,64\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) dest),[n] "+&r"(n) + : [dest] "a"(dest), "m"(*(const FLOAT (*)[n * 2]) src),[src] "a"(src), + [alpha_r] "m"(alpha_r),[alpha_i] "m"(alpha_i) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) -{ - BLASLONG i; +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, + FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; - if ( inc_dest != 2 ) - { + if (inc_dest != 2) { - FLOAT temp_r; - FLOAT temp_i; - for ( i=0; i> 2 ; - n2 = n & 3 ; - - m3 = m & 3 ; - m1 = m - m3; - m2 = (m & (NBMAX-1)) - m3 ; - - alpha[0] = alpha_r; - alpha[1] = alpha_i; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - if ( inc_x != 2 ) - copy_x(NB,x_ptr,xbuffer,inc_x); - else - xbuffer = x_ptr; - - if ( inc_y == 2 ) - { - - for( i = 0; i < n1 ; i++) - { - zgemv_kernel_4x4(NB,ap,xbuffer,y_ptr,alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - y_ptr += 8; - - } - - if ( n2 & 2 ) - { - zgemv_kernel_4x2(NB,ap,xbuffer,y_ptr,alpha); - a_ptr += lda * 2; - y_ptr += 4; - - } - - if ( n2 & 1 ) - { - zgemv_kernel_4x1(NB,a_ptr,xbuffer,y_ptr,alpha); - /* a_ptr += lda; - y_ptr += 2; */ - - } - - } - else - { - - for( i = 0; i < n1 ; i++) - { - memset(ybuffer,0,sizeof(ybuffer)); - zgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[2]; - y_ptr[1] += ybuffer[3]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[4]; - y_ptr[1] += ybuffer[5]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[6]; - y_ptr[1] += ybuffer[7]; - y_ptr += inc_y; - - } - - for( i = 0; i < n2 ; i++) - { - memset(ybuffer,0,sizeof(ybuffer)); - zgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha); - a_ptr += lda; - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - - } - - } - a += 2 * NB; - x += NB * inc_x; - } - - - - if ( m3 == 0 ) return(0); - - x_ptr = x; - j=0; - a_ptr = a; - y_ptr = y; - - if ( m3 == 3 ) - { - - FLOAT temp_r ; - FLOAT temp_i ; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x4 = x_ptr[0]; - FLOAT x5 = x_ptr[1]; - while ( j < n) - { +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, + FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, + BLASLONG inc_y, FLOAT *buffer) { + 
BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; + FLOAT *ap[8]; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; + BLASLONG lda4; + FLOAT ybuffer[8], *xbuffer; + FLOAT alpha[2]; + + if (m < 1) + return (0); + if (n < 1) + return (0); + + inc_x <<= 1; + inc_y <<= 1; + lda <<= 1; + lda4 = lda << 2; + + xbuffer = buffer; + + n1 = n >> 2; + n2 = n & 3; + + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; + + alpha[0] = alpha_r; + alpha[1] = alpha_i; + + BLASLONG NB = NBMAX; + + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) + break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + if (inc_x != 2) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + if (inc_y == 2) { + + for (i = 0; i < n1; i++) { + zgemv_kernel_4x4(NB, ap, xbuffer, y_ptr, alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + y_ptr += 8; + + } + + if (n2 & 2) { + zgemv_kernel_4x2(NB, ap, xbuffer, y_ptr, alpha); + a_ptr += lda * 2; + y_ptr += 4; + + } + + if (n2 & 1) { + zgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); + /* a_ptr += lda; + y_ptr += 2; */ + + } + + } else { + + for (i = 0; i < n1; i++) { + memset(ybuffer, 0, sizeof(ybuffer)); + zgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for (i = 0; i < n2; i++) { + memset(ybuffer, 0, sizeof(ybuffer)); + zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; + } + + if (m3 == 0) + return (0); + + x_ptr = x; + j = 0; + a_ptr = a; + y_ptr = y; + + if (m3 == 3) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; #endif -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * 
temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return(0); - } - - - if ( m3 == 2 ) - { - - FLOAT temp_r ; - FLOAT temp_i ; - FLOAT temp_r1 ; - FLOAT temp_i1 ; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - FLOAT ar = alpha[0]; - FLOAT ai = alpha[1]; - - while ( j < ( n & -2 )) - { + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } + + if (m3 == 2) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; + + while (j < (n & -2)) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 - ai * temp_i1; - y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 + ai * temp_i1; - y_ptr[1] -= ar * temp_i1 - ai * temp_r1; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j+=2; - } - + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } - while ( j < n) - { + while (j < n) { #if ( 
!defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - - return(0); - } + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } - if ( m3 == 1 ) - { + if (m3 == 1) { - FLOAT temp_r ; - FLOAT temp_i ; - FLOAT temp_r1 ; - FLOAT temp_i1 ; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - FLOAT ar = alpha[0]; - FLOAT ai = alpha[1]; + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + FLOAT ar = alpha[0]; + FLOAT ai = alpha[1]; - while ( j < ( n & -2 )) - { + while (j < (n & -2)) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 - ai * temp_i1; - y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 - ai * temp_i1; + y_ptr[1] += ar * temp_i1 + ai * temp_r1; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 + ai * temp_i1; - y_ptr[1] -= ar * temp_i1 - ai * temp_r1; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr += inc_y; + y_ptr[0] += ar * temp_r1 + ai * temp_i1; + y_ptr[1] -= ar * temp_i1 - ai * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j+=2; - } + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } - while ( j < n) - { + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = 
a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; +#if !defined(XCONJ) + y_ptr[0] += ar * temp_r - ai * temp_i; + y_ptr[1] += ar * temp_i + ai * temp_r; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr[0] += ar * temp_r + ai * temp_i; + y_ptr[1] -= ar * temp_i - ai * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return(0); - } + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); + } - return(0); + return (0); } diff --git a/kernel/zarch/zrot.c b/kernel/zarch/zrot.c index 75027a06c..aa7f16605 100644 --- a/kernel/zarch/zrot.c +++ b/kernel/zarch/zrot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2018, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,230 +27,209 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) -{ - __asm__ ( - "vlrepg %%v0,%3 \n\t" - "vlrepg %%v1,%4 \n\t" - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - "vl %%v24, 0(%%r1,%1) \n\t" - "vl %%v25, 16(%%r1,%1) \n\t" - "vl %%v26, 32(%%r1,%1) \n\t" - "vl %%v27, 48(%%r1,%1) \n\t" - "vl %%v16, 0(%%r1,%2) \n\t" - "vl %%v17, 16(%%r1,%2) \n\t" - "vl %%v18, 32(%%r1,%2) \n\t" - "vl %%v19, 48(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 0(%%r1,%1) \n\t" - "vst %%v29, 16(%%r1,%1) \n\t" - "vst %%v30, 32(%%r1,%1) \n\t" - "vst %%v31, 48(%%r1,%1) \n\t" - "vst %%v20, 0(%%r1,%2) \n\t" - "vst %%v21, 16(%%r1,%2) \n\t" - "vst %%v22, 32(%%r1,%2) \n\t" - "vst %%v23, 48(%%r1,%2) \n\t" - - "vl %%v24, 64(%%r1,%1) \n\t" - "vl %%v25, 80(%%r1,%1) \n\t" - "vl %%v26, 96(%%r1,%1) \n\t" - "vl %%v27, 112(%%r1,%1) \n\t" - "vl %%v16, 64(%%r1,%2) \n\t" - "vl %%v17, 80(%%r1,%2) \n\t" - "vl %%v18, 96(%%r1,%2) \n\t" - "vl %%v19, 112(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb 
%%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 64(%%r1,%1) \n\t" - "vst %%v29, 80(%%r1,%1) \n\t" - "vst %%v30, 96(%%r1,%1) \n\t" - "vst %%v31, 112(%%r1,%1) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v16, 128(%%r1,%2) \n\t" - "vl %%v17, 144(%%r1,%2) \n\t" - "vl %%v18, 160(%%r1,%2) \n\t" - "vl %%v19, 176(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 128(%%r1,%1) \n\t" - "vst %%v29, 144(%%r1,%1) \n\t" - "vst %%v30, 160(%%r1,%1) \n\t" - "vst %%v31, 176(%%r1,%1) \n\t" - "vst %%v20, 128(%%r1,%2) \n\t" - "vst %%v21, 144(%%r1,%2) \n\t" - "vst %%v22, 160(%%r1,%2) \n\t" - "vst %%v23, 176(%%r1,%2) \n\t" - - "vl %%v24, 192(%%r1,%1) \n\t" - "vl %%v25, 208(%%r1,%1) \n\t" - "vl %%v26, 224(%%r1,%1) \n\t" - "vl %%v27, 240(%%r1,%1) \n\t" - "vl %%v16, 192(%%r1,%2) \n\t" - "vl %%v17, 208(%%r1,%2) \n\t" - "vl %%v18, 224(%%r1,%2) \n\t" - "vl %%v19, 240(%%r1,%2) \n\t" - - "vfmdb %%v28,%%v24,%%v0 \n\t" - "vfmdb %%v29,%%v25,%%v0 \n\t" - "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0 \n\t" - "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0 \n\t" - "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ - /* 2nd parts*/ - "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ - - "vst %%v28, 192(%%r1,%1) \n\t" - "vst %%v29, 208(%%r1,%1) \n\t" - "vst %%v30, 224(%%r1,%1) \n\t" - "vst %%v31, 240(%%r1,%1) \n\t" - "vst %%v20, 192(%%r1,%2) \n\t" - "vst %%v21, 208(%%r1,%2) \n\t" - "vst %%v22, 224(%%r1,%2) \n\t" - "vst %%v23, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y),"m"(*c),"m"(*s) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { + __asm__("vlrepg %%v0,%[c]\n\t" + "vlrepg %%v1,%[s]\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 
1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + 
"vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1) ) - { - - BLASLONG n1 = n & -16; - if ( n1 > 0 ) - { - FLOAT cosa,sina; - cosa=c; - sina=s; - zrot_kernel_16(n1, x, y, &cosa, &sina); - i=n1; - ix=2*n1; - } - - while(i < n) - { - temp[0] = c*x[ix] + s*y[ix] ; - temp[1] = c*x[ix+1] + s*y[ix+1] ; - y[ix] = c*y[ix] - s*x[ix] ; - y[ix+1] = c*y[ix+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += 2 ; - i++ ; +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT c, FLOAT s) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if (n <= 0) + return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + FLOAT cosa, sina; + cosa = c; + sina = s; + zrot_kernel_16(n1, x, y, &cosa, &sina); + i = n1; + ix = 2 * n1; + } - } + while (i < n) { + temp[0] = c * x[ix] + s * y[ix]; + temp[1] = c * x[ix + 1] + s * y[ix + 1]; + y[ix] = c * y[ix] - s * x[ix]; + y[ix + 1] = c * y[ix + 1] - s * x[ix + 1]; + x[ix] = temp[0]; + x[ix + 1] = temp[1]; + ix += 2; + i++; } - else - { - inc_x2 = 2 * inc_x ; - inc_y2 = 2 * inc_y ; - while(i < n) - { - temp[0] = c*x[ix] + s*y[iy] ; - temp[1] = c*x[ix+1] + s*y[iy+1] ; - y[iy] = c*y[iy] - s*x[ix] ; - y[iy+1] = c*y[iy+1] - s*x[ix+1] ; - x[ix] = temp[0] ; - x[ix+1] = temp[1] ; - - ix += inc_x2 ; - iy += inc_y2 ; - i++ ; - } + } else { + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + while (i < n) { + temp[0] = c * x[ix] + s * y[iy]; + temp[1] = c * x[ix + 1] + s * y[iy + 1]; + y[iy] = c * y[iy] - s * x[ix]; + y[iy + 1] = c * y[iy + 1] - s * x[ix + 1]; + x[ix] = temp[0]; + x[ix + 1] = temp[1]; + + ix += inc_x2; + 
iy += inc_y2; + i++; } - return(0); - -} + } + return (0); +} diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index 4d8ee960f..fbcc0c5b9 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013 - 2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,426 +27,396 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) -{ - __asm__ volatile( - "vlrepg %%v0,0(%1) \n\t" - "vleg %%v1,8(%1),0 \n\t" - "wflcdb %%v1,%%v1 \n\t" - "vleg %%v1,8(%1),1 \n\t" - "srlg %%r0,%0,3 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vpdi %%v24,%%v16,%%v16,4 \n\t" - "vpdi %%v25,%%v17,%%v17,4 \n\t" - "vpdi %%v26,%%v18,%%v18,4 \n\t" - "vpdi %%v27,%%v19,%%v19,4 \n\t" - "vpdi %%v28,%%v20,%%v20,4 \n\t" - "vpdi %%v29,%%v21,%%v21,4 \n\t" - "vpdi %%v30,%%v22,%%v22,4 \n\t" - "vpdi %%v31,%%v23,%%v23,4 \n\t" - - "vfmdb %%v16,%%v16,%%v0 \n\t" - "vfmdb %%v17,%%v17,%%v0 \n\t" - "vfmdb %%v18,%%v18,%%v0 \n\t" - "vfmdb %%v19,%%v19,%%v0 \n\t" - "vfmdb %%v20,%%v20,%%v0 \n\t" - "vfmdb %%v21,%%v21,%%v0 \n\t" - "vfmdb %%v22,%%v22,%%v0 \n\t" - "vfmdb %%v23,%%v23,%%v0 \n\t" - "vfmadb %%v16,%%v24,%%v1,%%v16 \n\t" - "vfmadb %%v17,%%v25,%%v1,%%v17 \n\t" - "vfmadb %%v18,%%v26,%%v1,%%v18 \n\t" - "vfmadb %%v19,%%v27,%%v1,%%v19 \n\t" - "vfmadb %%v20,%%v28,%%v1,%%v20 \n\t" - "vfmadb %%v21,%%v29,%%v1,%%v21 \n\t" - "vfmadb %%v22,%%v30,%%v1,%%v22 \n\t" - "vfmadb %%v23,%%v31,%%v1,%%v23 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v1","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); -} - -static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) -{ - __asm__ volatile( - "vleg %%v0,8(%1),0 \n\t" - "wflcdb %%v0,%%v0 \n\t" - "vleg %%v0,8(%1),1 \n\t" - "srlg %%r0,%0,3 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - "vpdi %%v16,%%v16,%%v16,4 \n\t" - "vpdi %%v17,%%v17,%%v17,4 \n\t" - "vpdi %%v18,%%v18,%%v18,4 \n\t" - "vpdi %%v19,%%v19,%%v19,4 \n\t" - "vpdi %%v20,%%v20,%%v20,4 \n\t" - "vpdi %%v21,%%v21,%%v21,4 \n\t" - "vpdi %%v22,%%v22,%%v22,4 \n\t" - "vpdi %%v23,%%v23,%%v23,4 \n\t" - - "vfmdb %%v16,%%v16,%%v0 \n\t" - "vfmdb %%v17,%%v17,%%v0 \n\t" - "vfmdb %%v18,%%v18,%%v0 \n\t" - "vfmdb %%v19,%%v19,%%v0 \n\t" - "vfmdb %%v20,%%v20,%%v0 \n\t" - "vfmdb %%v21,%%v21,%%v0 \n\t" - "vfmdb %%v22,%%v22,%%v0 \n\t" - 
"vfmdb %%v23,%%v23,%%v0 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" - ); +static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__("vlrepg %%v0,0(%[alpha])\n\t" + "vleg %%v1,8(%[alpha]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%[alpha]),1\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vpdi %%v24,%%v16,%%v16,4\n\t" + "vpdi %%v25,%%v17,%%v17,4\n\t" + "vpdi %%v26,%%v18,%%v18,4\n\t" + "vpdi %%v27,%%v19,%%v19,4\n\t" + "vpdi %%v28,%%v20,%%v20,4\n\t" + "vpdi %%v29,%%v21,%%v21,4\n\t" + "vpdi %%v30,%%v22,%%v22,4\n\t" + "vpdi %%v31,%%v23,%%v23,4\n\t" + "vfmdb %%v16,%%v16,%%v0\n\t" + "vfmdb %%v17,%%v17,%%v0\n\t" + "vfmdb %%v18,%%v18,%%v0\n\t" + "vfmdb %%v19,%%v19,%%v0\n\t" + "vfmdb %%v20,%%v20,%%v0\n\t" + "vfmdb %%v21,%%v21,%%v0\n\t" + "vfmdb %%v22,%%v22,%%v0\n\t" + "vfmdb %%v23,%%v23,%%v0\n\t" + "vfmadb %%v16,%%v24,%%v1,%%v16\n\t" + "vfmadb %%v17,%%v25,%%v1,%%v17\n\t" + "vfmadb %%v18,%%v26,%%v1,%%v18\n\t" + "vfmadb %%v19,%%v27,%%v1,%%v19\n\t" + "vfmadb %%v20,%%v28,%%v1,%%v20\n\t" + "vfmadb %%v21,%%v29,%%v1,%%v21\n\t" + "vfmadb %%v22,%%v30,%%v1,%%v22\n\t" + "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } -static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) -{ - __asm__ volatile( - "vlrepg %%v0,0(%1) \n\t" - "srlg %%r0,%0,3 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16,0(%%r1,%2) \n\t" - "vl %%v17,16(%%r1,%2) \n\t" - "vl %%v18,32(%%r1,%2) \n\t" - "vl %%v19,48(%%r1,%2) \n\t" - "vl %%v20,64(%%r1,%2) \n\t" - "vl %%v21,80(%%r1,%2) \n\t" - "vl %%v22,96(%%r1,%2) \n\t" - "vl %%v23,112(%%r1,%2) \n\t" - - "vfmdb %%v16,%%v16,%%v0 \n\t" - "vfmdb %%v17,%%v17,%%v0 \n\t" - "vfmdb %%v18,%%v18,%%v0 \n\t" - "vfmdb %%v19,%%v19,%%v0 \n\t" - "vfmdb %%v20,%%v20,%%v0 \n\t" - "vfmdb %%v21,%%v21,%%v0 \n\t" - "vfmdb %%v22,%%v22,%%v0 \n\t" - "vfmdb %%v23,%%v23,%%v0 \n\t" - - "vst %%v16,0(%%r1,%2) \n\t" - "vst %%v17,16(%%r1,%2) \n\t" - "vst %%v18,32(%%r1,%2) \n\t" - "vst %%v19,48(%%r1,%2) \n\t" - "vst %%v20,64(%%r1,%2) \n\t" - "vst %%v21,80(%%r1,%2) \n\t" - "vst %%v22,96(%%r1,%2) \n\t" - "vst %%v23,112(%%r1,%2) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZQ"((const FLOAT (*)[2])alpha),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v0","v16","v17","v18","v19","v20","v21","v22","v23" - ); +static 
void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__("vleg %%v0,8(%[alpha]),0\n\t" + "wflcdb %%v0,%%v0\n\t" + "vleg %%v0,8(%[alpha]),1\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vpdi %%v16,%%v16,%%v16,4\n\t" + "vpdi %%v17,%%v17,%%v17,4\n\t" + "vpdi %%v18,%%v18,%%v18,4\n\t" + "vpdi %%v19,%%v19,%%v19,4\n\t" + "vpdi %%v20,%%v20,%%v20,4\n\t" + "vpdi %%v21,%%v21,%%v21,4\n\t" + "vpdi %%v22,%%v22,%%v22,4\n\t" + "vpdi %%v23,%%v23,%%v23,4\n\t" + "vfmdb %%v16,%%v16,%%v0\n\t" + "vfmdb %%v17,%%v17,%%v0\n\t" + "vfmdb %%v18,%%v18,%%v0\n\t" + "vfmdb %%v19,%%v19,%%v0\n\t" + "vfmdb %%v20,%%v20,%%v0\n\t" + "vfmdb %%v21,%%v21,%%v0\n\t" + "vfmdb %%v22,%%v22,%%v0\n\t" + "vfmdb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } -static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) -{ - __asm__ volatile( - "vzero %%v24 \n\t" - "vzero %%v25 \n\t" - "vzero %%v26 \n\t" - "vzero %%v27 \n\t" - "srlg %%r0,%0,3 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - - "vst %%v24,0(%%r1,%1) \n\t" - "vst %%v25,16(%%r1,%1) \n\t" - "vst %%v26,32(%%r1,%1) \n\t" - "vst %%v27,48(%%r1,%1) \n\t" - "vst %%v24,64(%%r1,%1) \n\t" - "vst %%v25,80(%%r1,%1) \n\t" - "vst %%v26,96(%%r1,%1) \n\t" - "vst %%v27,112(%%r1,%1) \n\t" - - "agfi %%r1,128 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n * 2])x) - :"memory","cc","r0","r1","v24","v25","v26","v27" - ); +static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { + __asm__("vlrepg %%v0,0(%[alpha])\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfmdb %%v16,%%v16,%%v0\n\t" + "vfmdb %%v17,%%v17,%%v0\n\t" + "vfmdb %%v18,%%v18,%%v0\n\t" + "vfmdb %%v19,%%v19,%%v0\n\t" + "vfmdb %%v20,%%v20,%%v0\n\t" + "vfmdb %%v21,%%v21,%%v0\n\t" + "vfmdb %%v22,%%v22,%%v0\n\t" + "vfmdb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } -static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i; - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_x3 = inc_x2 + inc_x; - FLOAT t0, t1, t2, t3; - FLOAT da_r = alpha[0]; - 
FLOAT da_i = alpha[1]; - - for (i = 0; i < n; i += 4) - { - t0 = da_r * x[0] - da_i * x[1]; - t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; - t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; - t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; - - x[1] = da_i * x[0] + da_r * x[1]; - x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; - x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; - x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; - - x[0] = t0; - x[inc_x] = t1; - x[inc_x2] = t2; - x[inc_x3] = t3; - - x += 4 * inc_x; - } +static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { + __asm__("vzero %%v0\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) + : [x] "a"(x) + : "cc", "r1", "v0"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i = 0, j = 0; - FLOAT temp0; - FLOAT temp1; - FLOAT alpha[2] __attribute__ ((aligned(16))); - - if (inc_x != 1) { - inc_x <<= 1; - - if (da_r == 0.0) { - - BLASLONG n1 = n & -2; - - if (da_i == 0.0) { +static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, + BLASLONG inc_x) { + BLASLONG i; + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_x3 = inc_x2 + inc_x; + FLOAT t0, t1, t2, t3; + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; + + for (i = 0; i < n; i += 4) { + t0 = da_r * x[0] - da_i * x[1]; + t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; + t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; + t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; + + x[1] = da_i * x[0] + da_r * x[1]; + x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; + x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; + x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; + + x[0] = t0; + x[inc_x] = t1; + x[inc_x2] = t2; + x[inc_x3] = t3; + + x += 4 * inc_x; + } +} - while (j < n1) { +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, + FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, + BLASLONG dummy2) { + BLASLONG i = 0, j = 0; + FLOAT temp0; + FLOAT temp1; + FLOAT alpha[2] __attribute__ ((aligned(16))); - x[i] = 0.0; - x[i + 1] = 0.0; - x[i + inc_x] = 0.0; - x[i + 1 + inc_x] = 0.0; - i += 2 * inc_x; - j += 2; + if (inc_x != 1) { + inc_x <<= 1; - } + if (da_r == 0.0) { - while (j < n) { + BLASLONG n1 = n & -2; - x[i] = 0.0; - x[i + 1] = 0.0; - i += inc_x; - j++; + if (da_i == 0.0) { - } + while (j < n1) { - } else { + x[i] = 0.0; + x[i + 1] = 0.0; + x[i + inc_x] = 0.0; + x[i + 1 + inc_x] = 0.0; + i += 2 * inc_x; + j += 2; - while (j < n1) { + } - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - temp1 = -da_i * x[i + 1 + inc_x]; - x[i + 1 + inc_x] = da_i * x[i + inc_x]; - x[i + inc_x] = temp1; - i += 2 * inc_x; - j += 2; + while (j < n) { - } + x[i] = 0.0; + x[i + 1] = 0.0; + i += inc_x; + j++; - while (j < n) { + } - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - i += inc_x; - j++; + } else { - } + while (j < n1) { + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + temp1 = -da_i * x[i + 1 + inc_x]; + x[i + 1 + inc_x] = da_i * x[i + inc_x]; + x[i + inc_x] = 
temp1; + i += 2 * inc_x; + j += 2; + } - } + while (j < n) { - } else { + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; + } - if (da_i == 0.0) { - BLASLONG n1 = n & -2; + } - while (j < n1) { + } else { - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - temp1 = da_r * x[i + inc_x]; - x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; - x[i + inc_x] = temp1; - i += 2 * inc_x; - j += 2; + if (da_i == 0.0) { + BLASLONG n1 = n & -2; - } + while (j < n1) { - while (j < n) { + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + temp1 = da_r * x[i + inc_x]; + x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - i += inc_x; - j++; + } - } + while (j < n) { - } else { + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += inc_x; + j++; - BLASLONG n1 = n & -8; - if (n1 > 0) { - alpha[0] = da_r; - alpha[1] = da_i; - zscal_kernel_inc_8(n1, alpha, x, inc_x); - j = n1; - i = n1 * inc_x; - } + } - while (j < n) { + } else { - temp0 = da_r * x[i] - da_i * x[i + 1]; - x[i + 1] = da_r * x[i + 1] + da_i * x[i]; - x[i] = temp0; - i += inc_x; - j++; + BLASLONG n1 = n & -8; + if (n1 > 0) { + alpha[0] = da_r; + alpha[1] = da_i; + zscal_kernel_inc_8(n1, alpha, x, inc_x); + j = n1; + i = n1 * inc_x; + } - } + while (j < n) { - } + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; } - return (0); - } + } + } - BLASLONG n1 = n & -8; - if (n1 > 0) { + return (0); + } - alpha[0] = da_r; - alpha[1] = da_i; + BLASLONG n1 = n & -8; + if (n1 > 0) { - if (da_r == 0.0) - if (da_i == 0) - zscal_kernel_8_zero(n1, x); - else - zscal_kernel_8_zero_r(n1, alpha, x); - else - if (da_i == 0) - zscal_kernel_8_zero_i(n1, alpha, x); - else - zscal_kernel_8(n1, alpha, x); + alpha[0] = da_r; + alpha[1] = da_i; - i = n1 << 1; - j = n1; - } + if (da_r == 0.0) + if (da_i == 0) + zscal_kernel_8_zero(n1, x); + else + zscal_kernel_8_zero_r(n1, alpha, x); + else if (da_i == 0) + zscal_kernel_8_zero_i(n1, alpha, x); + else + zscal_kernel_8(n1, alpha, x); + i = n1 << 1; + j = n1; + } - if (da_r == 0.0) { + if (da_r == 0.0) { - if (da_i == 0.0) { + if (da_i == 0.0) { - while (j < n) { + while (j < n) { - x[i] = 0.0; - x[i + 1] = 0.0; - i += 2; - j++; + x[i] = 0.0; + x[i + 1] = 0.0; + i += 2; + j++; - } + } - } else { + } else { - while (j < n) { + while (j < n) { - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - i += 2; - j++; + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += 2; + j++; - } + } - } + } - } else { + } else { - if (da_i == 0.0) { + if (da_i == 0.0) { - while (j < n) { + while (j < n) { - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - i += 2; - j++; + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += 2; + j++; - } + } - } else { + } else { - while (j < n) { + while (j < n) { - temp0 = da_r * x[i] - da_i * x[i + 1]; - x[i + 1] = da_r * x[i + 1] + da_i * x[i]; - x[i] = temp0; - i += 2; - j++; + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += 2; + j++; - } - - } + } } - return (0); + } + + return (0); } diff --git a/kernel/zarch/zswap.c b/kernel/zarch/zswap.c index a16b87cdc..0f38103be 100644 --- a/kernel/zarch/zswap.c +++ b/kernel/zarch/zswap.c @@ -1,5 +1,5 @@ 
/*************************************************************************** -Copyright (c) 2013-2017, The OpenBLAS Project +Copyright (c) 2013-2019, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,157 +27,142 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) -{ - __asm__ volatile( - "srlg %%r0,%0,4 \n\t" - "xgr %%r1,%%r1 \n\t" - "0: \n\t" - "pfd 2, 1024(%%r1,%1) \n\t" - "pfd 2, 1024(%%r1,%2) \n\t" - - "vl %%v16, 0(%%r1,%1) \n\t" - "vl %%v17, 16(%%r1,%1) \n\t" - "vl %%v18, 32(%%r1,%1) \n\t" - "vl %%v19, 48(%%r1,%1) \n\t" - "vl %%v20, 64(%%r1,%1) \n\t" - "vl %%v21, 80(%%r1,%1) \n\t" - "vl %%v22, 96(%%r1,%1) \n\t" - "vl %%v23, 112(%%r1,%1) \n\t" - "vl %%v24, 128(%%r1,%1) \n\t" - "vl %%v25, 144(%%r1,%1) \n\t" - "vl %%v26, 160(%%r1,%1) \n\t" - "vl %%v27, 176(%%r1,%1) \n\t" - "vl %%v28, 192(%%r1,%1) \n\t" - "vl %%v29, 208(%%r1,%1) \n\t" - "vl %%v30, 224(%%r1,%1) \n\t" - "vl %%v31, 240(%%r1,%1) \n\t" - - "vl %%v0, 0(%%r1,%2) \n\t" - "vl %%v1, 16(%%r1,%2) \n\t" - "vl %%v2, 32(%%r1,%2) \n\t" - "vl %%v3, 48(%%r1,%2) \n\t" - "vl %%v4, 64(%%r1,%2) \n\t" - "vl %%v5, 80(%%r1,%2) \n\t" - "vl %%v6, 96(%%r1,%2) \n\t" - "vl %%v7, 112(%%r1,%2) \n\t" - "vst %%v0, 0(%%r1,%1) \n\t" - "vst %%v1, 16(%%r1,%1) \n\t" - "vst %%v2, 32(%%r1,%1) \n\t" - "vst %%v3, 48(%%r1,%1) \n\t" - "vst %%v4, 64(%%r1,%1) \n\t" - "vst %%v5, 80(%%r1,%1) \n\t" - "vst %%v6, 96(%%r1,%1) \n\t" - "vst %%v7, 112(%%r1,%1) \n\t" - - "vl %%v0, 128(%%r1,%2) \n\t" - "vl %%v1, 144(%%r1,%2) \n\t" - "vl %%v2, 160(%%r1,%2) \n\t" - "vl %%v3, 176(%%r1,%2) \n\t" - "vl %%v4, 192(%%r1,%2) \n\t" - "vl %%v5, 208(%%r1,%2) \n\t" - "vl %%v6, 224(%%r1,%2) \n\t" - "vl %%v7, 240(%%r1,%2) \n\t" - "vst %%v0, 128(%%r1,%1) \n\t" - "vst %%v1, 144(%%r1,%1) \n\t" - "vst %%v2, 160(%%r1,%1) \n\t" - "vst %%v3, 176(%%r1,%1) \n\t" - "vst %%v4, 192(%%r1,%1) \n\t" - "vst %%v5, 208(%%r1,%1) \n\t" - "vst %%v6, 224(%%r1,%1) \n\t" - "vst %%v7, 240(%%r1,%1) \n\t" - - "vst %%v16, 0(%%r1,%2) \n\t" - "vst %%v17, 16(%%r1,%2) \n\t" - "vst %%v18, 32(%%r1,%2) \n\t" - "vst %%v19, 48(%%r1,%2) \n\t" - "vst %%v20, 64(%%r1,%2) \n\t" - "vst %%v21, 80(%%r1,%2) \n\t" - "vst %%v22, 96(%%r1,%2) \n\t" - "vst %%v23, 112(%%r1,%2) \n\t" - "vst %%v24, 128(%%r1,%2) \n\t" - "vst %%v25, 144(%%r1,%2) \n\t" - "vst %%v26, 160(%%r1,%2) \n\t" - "vst %%v27, 176(%%r1,%2) \n\t" - "vst %%v28, 192(%%r1,%2) \n\t" - "vst %%v29, 208(%%r1,%2) \n\t" - "vst %%v30, 224(%%r1,%2) \n\t" - "vst %%v31, 240(%%r1,%2) \n\t" - - "agfi %%r1,256 \n\t" - "brctg %%r0,0b " - : - :"r"(n),"ZR"((FLOAT (*)[n * 2])x),"ZR"((FLOAT (*)[n * 2])y) - :"memory","cc","r0","r1","v0","v1","v2","v3","v4","v5","v6","v7","v16","v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - ); +static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { + __asm__("srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 
192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) -{ - BLASLONG i=0; - BLASLONG ix=0,iy=0; - FLOAT temp[2]; - BLASLONG inc_x2, inc_y2; - - if ( n <= 0 ) return(0); - - if ( (inc_x == 1) && (inc_y == 1 )) - { - - BLASLONG n1 = n & -16; - if ( n1 > 0 ) - { - zswap_kernel_16(n1, x, y); - i=n1; - ix = 2* n1; - iy = 2* n1; - } - - while(i < n) - { - - temp[0] = x[ix] ; - temp[1] = x[ix+1] ; - x[ix] = y[iy] ; - x[ix+1] = y[iy+1] ; - y[iy] = temp[0] ; - y[iy+1] = temp[1] ; - - ix += 2 ; - iy += 2 ; - i++ ; +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, + FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, + FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; + FLOAT temp[2]; + BLASLONG inc_x2, inc_y2; + + if (n <= 0) + return (0); + + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -16; + if (n1 > 0) { + zswap_kernel_16(n1, x, y); + i = n1; + ix = 2 * n1; + iy = 2 * n1; + } + while (i < n) { - } + temp[0] = x[ix]; + temp[1] = x[ix + 1]; + x[ix] = y[iy]; + x[ix + 1] = y[iy + 1]; + y[iy] = temp[0]; + y[iy + 1] = temp[1]; + ix += 2; + iy += 2; + i++; } - else - { - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; + } else { - while(i < n) - { + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; - temp[0] = x[ix] ; - temp[1] = x[ix+1] ; - x[ix] = y[iy] ; - x[ix+1] = y[iy+1] ; - y[iy] = temp[0] ; - y[iy+1] = temp[1] ; + while (i < n) { - ix += inc_x2 ; - iy += 
inc_y2 ; - i++ ; + temp[0] = x[ix]; + temp[1] = x[ix + 1]; + x[ix] = y[iy]; + x[ix + 1] = y[iy + 1]; + y[iy] = temp[0]; + y[iy + 1] = temp[1]; - } + ix += inc_x2; + iy += inc_y2; + i++; } - return(0); - - -} + } + return (0); +} From 61526480f906c2d9b4c6a5d2d28be21d0f96ca62 Mon Sep 17 00:00:00 2001 From: maamountki Date: Tue, 5 Feb 2019 07:51:19 +0200 Subject: [PATCH 066/133] [ZARCH] Fix copy constraint --- kernel/zarch/ccopy.c | 2 +- kernel/zarch/dcopy.c | 2 +- kernel/zarch/scopy.c | 2 +- kernel/zarch/zcopy.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/zarch/ccopy.c b/kernel/zarch/ccopy.c index 1b93a812e..d17bddcc8 100644 --- a/kernel/zarch/ccopy.c +++ b/kernel/zarch/ccopy.c @@ -36,7 +36,7 @@ static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { "la %[x],256(%[x])\n\t" "la %[y],256(%[y])\n\t" "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y) + : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) : "m"(*(const FLOAT (*)[n * 2]) x) : "cc"); } diff --git a/kernel/zarch/dcopy.c b/kernel/zarch/dcopy.c index f7cbf54b2..b6a740c43 100644 --- a/kernel/zarch/dcopy.c +++ b/kernel/zarch/dcopy.c @@ -36,7 +36,7 @@ static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { "la %[x],256(%[x])\n\t" "la %[y],256(%[y])\n\t" "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y) + : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) : "m"(*(const FLOAT (*)[n]) x) : "cc"); } diff --git a/kernel/zarch/scopy.c b/kernel/zarch/scopy.c index 44d27b062..4e4993737 100644 --- a/kernel/zarch/scopy.c +++ b/kernel/zarch/scopy.c @@ -36,7 +36,7 @@ static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { "la %[x],256(%[x])\n\t" "la %[y],256(%[y])\n\t" "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y) + : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) : "m"(*(const FLOAT (*)[n]) x) : "cc"); } diff --git a/kernel/zarch/zcopy.c b/kernel/zarch/zcopy.c index 2f80cedce..50ff18646 100644 --- a/kernel/zarch/zcopy.c +++ b/kernel/zarch/zcopy.c @@ -36,7 +36,7 @@ static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { "la %[x],256(%[x])\n\t" "la %[y],256(%[y])\n\t" "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y) + : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) : "m"(*(const FLOAT (*)[n * 2]) x) : "cc"); } From f4b82d7bc4c20da29c19b2eece602002bd5fe4af Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Feb 2019 13:30:13 +0100 Subject: [PATCH 067/133] Include complex rather than complex.h in C++ contexts to avoid name clashes e.g. with boost headers that use I as a generic placeholder. Fixes #1992 as suggested by aprokop in that issue ticket. 
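For reference, the guarded include added here reduces to the following standalone pattern (a minimal sketch of the idea, not the verbatim hunk):

    /* complex.h (C99) defines I as a macro for the imaginary unit,
       which breaks C++ code that uses I as an ordinary identifier,
       e.g. some boost headers.  <complex> introduces no such macro,
       so C++ translation units get it instead. */
    #ifndef __cplusplus
    #include <complex.h>   /* C: _Complex types plus the I macro */
    #else
    #include <complex>     /* C++: std::complex, no I macro */
    #endif

The existing #define of lapack_complex_double as double _Complex is left untouched, so only the macro namespace seen by C++ consumers changes.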
--- lapack-netlib/LAPACKE/include/lapacke.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h index 6ded78c8b..11740e113 100644 --- a/lapack-netlib/LAPACKE/include/lapacke.h +++ b/lapack-netlib/LAPACKE/include/lapacke.h @@ -86,7 +86,11 @@ lapack_complex_float lapack_make_complex_float( float re, float im ); /* Complex type (double precision) */ #ifndef lapack_complex_double +#ifndef __cplusplus #include <complex.h> +#else +#include <complex> +#endif #define lapack_complex_double double _Complex #endif From 11a43e81161e5bd3f90e38a1127b1562406e85cd Mon Sep 17 00:00:00 2001 From: maamountki Date: Tue, 5 Feb 2019 19:17:08 +0200 Subject: [PATCH 068/133] [ZARCH] Set alignment hint for vl/vst --- kernel/zarch/damax.c | 34 ++--- kernel/zarch/damax_z13.c | 34 ++--- kernel/zarch/damin.c | 34 ++--- kernel/zarch/damin_z13.c | 34 ++--- kernel/zarch/dasum.c | 32 ++--- kernel/zarch/daxpy.c | 96 +++++++-------- kernel/zarch/ddot.c | 32 ++--- kernel/zarch/dgemv_n_4.c | 260 +++++++++++++++++++-------------------- kernel/zarch/dgemv_t_4.c | 260 +++++++++++++++++++-------------------- kernel/zarch/dmax.c | 34 ++--- kernel/zarch/dmax_z13.c | 34 ++--- kernel/zarch/dmin.c | 34 ++--- kernel/zarch/dmin_z13.c | 34 ++--- kernel/zarch/drot.c | 128 +++++++++---------- kernel/zarch/dscal.c | 48 ++++---- kernel/zarch/dswap.c | 128 +++++++++---------- kernel/zarch/idamax.c | 34 ++--- kernel/zarch/idamin.c | 34 ++--- kernel/zarch/idmax.c | 34 ++--- kernel/zarch/idmin.c | 34 ++--- kernel/zarch/zasum.c | 32 ++--- kernel/zarch/zaxpy.c | 48 ++++---- kernel/zarch/zdot.c | 32 ++--- kernel/zarch/zgemv_n_4.c | 62 +++++----- kernel/zarch/zgemv_t_4.c | 40 +++--- kernel/zarch/zrot.c | 128 +++++++++---------- kernel/zarch/zscal.c | 112 ++++++++--------- kernel/zarch/zswap.c | 128 +++++++++---------- 28 files changed, 987 insertions(+), 987 deletions(-) diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c index 37008f702..2598145c3 100644 --- a/kernel/zarch/damax.c +++ b/kernel/zarch/damax.c @@ -33,27 +33,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amax; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v24,128(%%r1,%[x]),3\n\t" + "vl %%v25,144(%%r1,%[x]),3\n\t" + "vl %%v26,160(%%r1,%[x]),3\n\t" + "vl %%v27,176(%%r1,%[x]),3\n\t" + "vl %%v28,192(%%r1,%[x]),3\n\t" + "vl %%v29,208(%%r1,%[x]),3\n\t" + "vl %%v30,224(%%r1,%[x]),3\n\t" + "vl %%v31,240(%%r1,%[x]),3\n\t" "vfmaxdb %%v16,%%v16,%%v24,8\n\t" "vfmaxdb %%v17,%%v17,%%v25,8\n\t" "vfmaxdb %%v18,%%v18,%%v26,8\n\t" diff --git a/kernel/zarch/damax_z13.c b/kernel/zarch/damax_z13.c index 530d6e5bb..f7e11c3ce 100644 --- a/kernel/zarch/damax_z13.c +++ b/kernel/zarch/damax_z13.c @@ -33,20 +33,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amax; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "vflpdb %%v0,%%v0\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -71,14 +71,14 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v30,%%v28,%%v29,%%v30\n\t" "vfchdb %%v31,%%v30,%%v0\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,192(%%r1,%[x]),3\n\t" + "vl %%v21,208(%%r1,%[x]),3\n\t" + "vl %%v22,224(%%r1,%[x]),3\n\t" + "vl %%v23,240(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c index a01791741..25f018c66 100644 --- a/kernel/zarch/damin.c +++ b/kernel/zarch/damin.c @@ -33,27 +33,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
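As an aside on the unchanged pfd lines threaded through all of these loops: PREFETCH DATA takes an access-intent code as its first operand, and the kernels prefetch 1024 bytes ahead of the running index register, for example:

"pfd 1,1024(%%r1,%[x])\n\t"   /* code 1: prefetch x for reading */
"pfd 2,1024(%%r1,%[y])\n\t"   /* code 2: prefetch y for writing */

which is why the axpy/rot/swap kernels in this patch use code 2 on operands they store back to, while pure reductions such as asum and dot use code 1 throughout.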
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amin; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v24,128(%%r1,%[x]),3\n\t" + "vl %%v25,144(%%r1,%[x]),3\n\t" + "vl %%v26,160(%%r1,%[x]),3\n\t" + "vl %%v27,176(%%r1,%[x]),3\n\t" + "vl %%v28,192(%%r1,%[x]),3\n\t" + "vl %%v29,208(%%r1,%[x]),3\n\t" + "vl %%v30,224(%%r1,%[x]),3\n\t" + "vl %%v31,240(%%r1,%[x]),3\n\t" "vfmindb %%v16,%%v16,%%v24,8\n\t" "vfmindb %%v17,%%v17,%%v25,8\n\t" "vfmindb %%v18,%%v18,%%v26,8\n\t" diff --git a/kernel/zarch/damin_z13.c b/kernel/zarch/damin_z13.c index 2172b6d6f..091aceb37 100644 --- a/kernel/zarch/damin_z13.c +++ b/kernel/zarch/damin_z13.c @@ -33,20 +33,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amin; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "vflpdb %%v0,%%v0\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -71,14 +71,14 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v30,%%v28,%%v29,%%v30\n\t" "vfchdb %%v31,%%v0,%%v30\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,192(%%r1,%[x]),3\n\t" + "vl %%v21,208(%%r1,%[x]),3\n\t" + "vl %%v22,224(%%r1,%[x]),3\n\t" + "vl %%v23,240(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/dasum.c b/kernel/zarch/dasum.c index 9f69a9931..641949963 100644 --- a/kernel/zarch/dasum.c +++ b/kernel/zarch/dasum.c @@ -45,14 +45,14 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl 
%%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x]),3\n\t" + "vl %%v17, 16(%%r1,%[x]),3\n\t" + "vl %%v18, 32(%%r1,%[x]),3\n\t" + "vl %%v19, 48(%%r1,%[x]),3\n\t" + "vl %%v20, 64(%%r1,%[x]),3\n\t" + "vl %%v21, 80(%%r1,%[x]),3\n\t" + "vl %%v22, 96(%%r1,%[x]),3\n\t" + "vl %%v23, 112(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -69,14 +69,14 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { "vfadb %%v29,%%v29,%%v21\n\t" "vfadb %%v30,%%v30,%%v22\n\t" "vfadb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x])\n\t" - "vl %%v17, 144(%%r1,%[x])\n\t" - "vl %%v18, 160(%%r1,%[x])\n\t" - "vl %%v19, 176(%%r1,%[x])\n\t" - "vl %%v20, 192(%%r1,%[x])\n\t" - "vl %%v21, 208(%%r1,%[x])\n\t" - "vl %%v22, 224(%%r1,%[x])\n\t" - "vl %%v23, 240(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[x]),3\n\t" + "vl %%v17, 144(%%r1,%[x]),3\n\t" + "vl %%v18, 160(%%r1,%[x]),3\n\t" + "vl %%v19, 176(%%r1,%[x]),3\n\t" + "vl %%v20, 192(%%r1,%[x]),3\n\t" + "vl %%v21, 208(%%r1,%[x]),3\n\t" + "vl %%v22, 224(%%r1,%[x]),3\n\t" + "vl %%v23, 240(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/daxpy.c b/kernel/zarch/daxpy.c index 179ef8834..c02ad0aac 100644 --- a/kernel/zarch/daxpy.c +++ b/kernel/zarch/daxpy.c @@ -34,22 +34,22 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,0(%%r1,%[y])\n\t" - "vl %%v21,16(%%r1,%[y])\n\t" - "vl %%v22,32(%%r1,%[y])\n\t" - "vl %%v23,48(%%r1,%[y])\n\t" - "vl %%v24,64(%%r1,%[x])\n\t" - "vl %%v25,80(%%r1,%[x])\n\t" - "vl %%v26,96(%%r1,%[x])\n\t" - "vl %%v27,112(%%r1,%[x])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,0(%%r1,%[y]),3\n\t" + "vl %%v21,16(%%r1,%[y]),3\n\t" + "vl %%v22,32(%%r1,%[y]),3\n\t" + "vl %%v23,48(%%r1,%[y]),3\n\t" + "vl %%v24,64(%%r1,%[x]),3\n\t" + "vl %%v25,80(%%r1,%[x]),3\n\t" + "vl %%v26,96(%%r1,%[x]),3\n\t" + "vl %%v27,112(%%r1,%[x]),3\n\t" + "vl %%v28,64(%%r1,%[y]),3\n\t" + "vl %%v29,80(%%r1,%[y]),3\n\t" + "vl %%v30,96(%%r1,%[y]),3\n\t" + "vl %%v31,112(%%r1,%[y]),3\n\t" "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" @@ -58,30 +58,30 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,0(%%r1,%[y])\n\t" - "vst %%v17,16(%%r1,%[y])\n\t" - "vst %%v18,32(%%r1,%[y])\n\t" - "vst %%v19,48(%%r1,%[y])\n\t" - "vst %%v24,64(%%r1,%[y])\n\t" - "vst %%v25,80(%%r1,%[y])\n\t" - "vst %%v26,96(%%r1,%[y])\n\t" - "vst %%v27,112(%%r1,%[y])\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,128(%%r1,%[y])\n\t" - "vl %%v21,144(%%r1,%[y])\n\t" - "vl %%v22,160(%%r1,%[y])\n\t" - "vl %%v23,176(%%r1,%[y])\n\t" - "vl %%v24,192(%%r1,%[x])\n\t" - "vl %%v25,208(%%r1,%[x])\n\t" - "vl 
%%v26,224(%%r1,%[x])\n\t" - "vl %%v27,240(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[y])\n\t" - "vl %%v29,208(%%r1,%[y])\n\t" - "vl %%v30,224(%%r1,%[y])\n\t" - "vl %%v31,240(%%r1,%[y])\n\t" + "vst %%v16,0(%%r1,%[y]),3\n\t" + "vst %%v17,16(%%r1,%[y]),3\n\t" + "vst %%v18,32(%%r1,%[y]),3\n\t" + "vst %%v19,48(%%r1,%[y]),3\n\t" + "vst %%v24,64(%%r1,%[y]),3\n\t" + "vst %%v25,80(%%r1,%[y]),3\n\t" + "vst %%v26,96(%%r1,%[y]),3\n\t" + "vst %%v27,112(%%r1,%[y]),3\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,128(%%r1,%[y]),3\n\t" + "vl %%v21,144(%%r1,%[y]),3\n\t" + "vl %%v22,160(%%r1,%[y]),3\n\t" + "vl %%v23,176(%%r1,%[y]),3\n\t" + "vl %%v24,192(%%r1,%[x]),3\n\t" + "vl %%v25,208(%%r1,%[x]),3\n\t" + "vl %%v26,224(%%r1,%[x]),3\n\t" + "vl %%v27,240(%%r1,%[x]),3\n\t" + "vl %%v28,192(%%r1,%[y]),3\n\t" + "vl %%v29,208(%%r1,%[y]),3\n\t" + "vl %%v30,224(%%r1,%[y]),3\n\t" + "vl %%v31,240(%%r1,%[y]),3\n\t" "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" @@ -90,14 +90,14 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,128(%%r1,%[y])\n\t" - "vst %%v17,144(%%r1,%[y])\n\t" - "vst %%v18,160(%%r1,%[y])\n\t" - "vst %%v19,176(%%r1,%[y])\n\t" - "vst %%v24,192(%%r1,%[y])\n\t" - "vst %%v25,208(%%r1,%[y])\n\t" - "vst %%v26,224(%%r1,%[y])\n\t" - "vst %%v27,240(%%r1,%[y])\n\t" + "vst %%v16,128(%%r1,%[y]),3\n\t" + "vst %%v17,144(%%r1,%[y]),3\n\t" + "vst %%v18,160(%%r1,%[y]),3\n\t" + "vst %%v19,176(%%r1,%[y]),3\n\t" + "vst %%v24,192(%%r1,%[y]),3\n\t" + "vst %%v25,208(%%r1,%[y]),3\n\t" + "vst %%v26,224(%%r1,%[y]),3\n\t" + "vst %%v27,240(%%r1,%[y]),3\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) diff --git a/kernel/zarch/ddot.c b/kernel/zarch/ddot.c index f5f601717..0dd8ed08a 100644 --- a/kernel/zarch/ddot.c +++ b/kernel/zarch/ddot.c @@ -43,22 +43,22 @@ static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 1,1024(%%r1,%[x])\n\t" "pfd 1,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[y])\n\t" - "vl %%v25,16(%%r1,%[y])\n\t" - "vl %%v26,32(%%r1,%[y])\n\t" - "vl %%v27,48(%%r1,%[y])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v24,0(%%r1,%[y]),3\n\t" + "vl %%v25,16(%%r1,%[y]),3\n\t" + "vl %%v26,32(%%r1,%[y]),3\n\t" + "vl %%v27,48(%%r1,%[y]),3\n\t" + "vl %%v28,64(%%r1,%[y]),3\n\t" + "vl %%v29,80(%%r1,%[y]),3\n\t" + "vl %%v30,96(%%r1,%[y]),3\n\t" + "vl %%v31,112(%%r1,%[y]),3\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" diff --git a/kernel/zarch/dgemv_n_4.c b/kernel/zarch/dgemv_n_4.c index c93ff9b54..87ed6ecd1 100644 --- a/kernel/zarch/dgemv_n_4.c +++ b/kernel/zarch/dgemv_n_4.c @@ -52,26 +52,26 @@ static void 
dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "pfd 1,1024(%%r1,%[ap2])\n\t" "pfd 1,1024(%%r1,%[ap3])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,0(%%r1,%[ap2])\n\t" - "vl %%v19,0(%%r1,%[ap3])\n\t" - "vl %%v20,16(%%r1,%[ap0])\n\t" - "vl %%v21,16(%%r1,%[ap1])\n\t" - "vl %%v22,16(%%r1,%[ap2])\n\t" - "vl %%v23,16(%%r1,%[ap3])\n\t" - "vl %%v24,32(%%r1,%[ap0])\n\t" - "vl %%v25,32(%%r1,%[ap1])\n\t" - "vl %%v26,32(%%r1,%[ap2])\n\t" - "vl %%v27,32(%%r1,%[ap3])\n\t" - "vl %%v28,48(%%r1,%[ap0])\n\t" - "vl %%v29,48(%%r1,%[ap1])\n\t" - "vl %%v30,48(%%r1,%[ap2])\n\t" - "vl %%v31,48(%%r1,%[ap3])\n\t" - "vl %%v4,0(%%r1,%[y])\n\t" - "vl %%v5,16(%%r1,%[y])\n\t" - "vl %%v6,32(%%r1,%[y])\n\t" - "vl %%v7,48(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0]),3\n\t" + "vl %%v17,0(%%r1,%[ap1]),3\n\t" + "vl %%v18,0(%%r1,%[ap2]),3\n\t" + "vl %%v19,0(%%r1,%[ap3]),3\n\t" + "vl %%v20,16(%%r1,%[ap0]),3\n\t" + "vl %%v21,16(%%r1,%[ap1]),3\n\t" + "vl %%v22,16(%%r1,%[ap2]),3\n\t" + "vl %%v23,16(%%r1,%[ap3]),3\n\t" + "vl %%v24,32(%%r1,%[ap0]),3\n\t" + "vl %%v25,32(%%r1,%[ap1]),3\n\t" + "vl %%v26,32(%%r1,%[ap2]),3\n\t" + "vl %%v27,32(%%r1,%[ap3]),3\n\t" + "vl %%v28,48(%%r1,%[ap0]),3\n\t" + "vl %%v29,48(%%r1,%[ap1]),3\n\t" + "vl %%v30,48(%%r1,%[ap2]),3\n\t" + "vl %%v31,48(%%r1,%[ap3]),3\n\t" + "vl %%v4,0(%%r1,%[y]),3\n\t" + "vl %%v5,16(%%r1,%[y]),3\n\t" + "vl %%v6,32(%%r1,%[y]),3\n\t" + "vl %%v7,48(%%r1,%[y]),3\n\t" "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" @@ -88,30 +88,30 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,0(%%r1,%[y])\n\t" - "vst %%v5,16(%%r1,%[y])\n\t" - "vst %%v6,32(%%r1,%[y])\n\t" - "vst %%v7,48(%%r1,%[y])\n\t" - "vl %%v16,64(%%r1,%[ap0])\n\t" - "vl %%v17,64(%%r1,%[ap1])\n\t" - "vl %%v18,64(%%r1,%[ap2])\n\t" - "vl %%v19,64(%%r1,%[ap3])\n\t" - "vl %%v20,80(%%r1,%[ap0])\n\t" - "vl %%v21,80(%%r1,%[ap1])\n\t" - "vl %%v22,80(%%r1,%[ap2])\n\t" - "vl %%v23,80(%%r1,%[ap3])\n\t" - "vl %%v24,96(%%r1,%[ap0])\n\t" - "vl %%v25,96(%%r1,%[ap1])\n\t" - "vl %%v26,96(%%r1,%[ap2])\n\t" - "vl %%v27,96(%%r1,%[ap3])\n\t" - "vl %%v28,112(%%r1,%[ap0])\n\t" - "vl %%v29,112(%%r1,%[ap1])\n\t" - "vl %%v30,112(%%r1,%[ap2])\n\t" - "vl %%v31,112(%%r1,%[ap3])\n\t" - "vl %%v4,64(%%r1,%[y])\n\t" - "vl %%v5,80(%%r1,%[y])\n\t" - "vl %%v6,96(%%r1,%[y])\n\t" - "vl %%v7,112(%%r1,%[y])\n\t" + "vst %%v4,0(%%r1,%[y]),3\n\t" + "vst %%v5,16(%%r1,%[y]),3\n\t" + "vst %%v6,32(%%r1,%[y]),3\n\t" + "vst %%v7,48(%%r1,%[y]),3\n\t" + "vl %%v16,64(%%r1,%[ap0]),3\n\t" + "vl %%v17,64(%%r1,%[ap1]),3\n\t" + "vl %%v18,64(%%r1,%[ap2]),3\n\t" + "vl %%v19,64(%%r1,%[ap3]),3\n\t" + "vl %%v20,80(%%r1,%[ap0]),3\n\t" + "vl %%v21,80(%%r1,%[ap1]),3\n\t" + "vl %%v22,80(%%r1,%[ap2]),3\n\t" + "vl %%v23,80(%%r1,%[ap3]),3\n\t" + "vl %%v24,96(%%r1,%[ap0]),3\n\t" + "vl %%v25,96(%%r1,%[ap1]),3\n\t" + "vl %%v26,96(%%r1,%[ap2]),3\n\t" + "vl %%v27,96(%%r1,%[ap3]),3\n\t" + "vl %%v28,112(%%r1,%[ap0]),3\n\t" + "vl %%v29,112(%%r1,%[ap1]),3\n\t" + "vl %%v30,112(%%r1,%[ap2]),3\n\t" + "vl %%v31,112(%%r1,%[ap3]),3\n\t" + "vl %%v4,64(%%r1,%[y]),3\n\t" + "vl %%v5,80(%%r1,%[y]),3\n\t" + "vl %%v6,96(%%r1,%[y]),3\n\t" + "vl %%v7,112(%%r1,%[y]),3\n\t" "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" @@ -128,10 +128,10 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT 
*x, FLOAT *y, "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,64(%%r1,%[y])\n\t" - "vst %%v5,80(%%r1,%[y])\n\t" - "vst %%v6,96(%%r1,%[y])\n\t" - "vst %%v7,112(%%r1,%[y])\n\t" + "vst %%v4,64(%%r1,%[y]),3\n\t" + "vst %%v5,80(%%r1,%[y]),3\n\t" + "vst %%v6,96(%%r1,%[y]),3\n\t" + "vst %%v7,112(%%r1,%[y]),3\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" "1:\n\t" @@ -141,16 +141,16 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,0(%%r1,%[ap2])\n\t" - "vl %%v19,0(%%r1,%[ap3])\n\t" - "vl %%v20,16(%%r1,%[ap0])\n\t" - "vl %%v21,16(%%r1,%[ap1])\n\t" - "vl %%v22,16(%%r1,%[ap2])\n\t" - "vl %%v23,16(%%r1,%[ap3])\n\t" - "vl %%v4,0(%%r1,%[y])\n\t" - "vl %%v5,16(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0]),3\n\t" + "vl %%v17,0(%%r1,%[ap1]),3\n\t" + "vl %%v18,0(%%r1,%[ap2]),3\n\t" + "vl %%v19,0(%%r1,%[ap3]),3\n\t" + "vl %%v20,16(%%r1,%[ap0]),3\n\t" + "vl %%v21,16(%%r1,%[ap1]),3\n\t" + "vl %%v22,16(%%r1,%[ap2]),3\n\t" + "vl %%v23,16(%%r1,%[ap3]),3\n\t" + "vl %%v4,0(%%r1,%[y]),3\n\t" + "vl %%v5,16(%%r1,%[y]),3\n\t" "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" @@ -159,8 +159,8 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" - "vst %%v4,0(%%r1,%[y])\n\t" - "vst %%v5,16(%%r1,%[y])\n\t" + "vst %%v4,0(%%r1,%[y]),3\n\t" + "vst %%v5,16(%%r1,%[y]),3\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" @@ -193,30 +193,30 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "pfd 1,1024(%%r1,%[ap0])\n\t" "pfd 1,1024(%%r1,%[ap1])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,16(%%r1,%[ap0])\n\t" - "vl %%v19,16(%%r1,%[ap1])\n\t" - "vl %%v20,32(%%r1,%[ap0])\n\t" - "vl %%v21,32(%%r1,%[ap1])\n\t" - "vl %%v22,48(%%r1,%[ap0])\n\t" - "vl %%v23,48(%%r1,%[ap1])\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" - "vl %%v26,80(%%r1,%[ap0])\n\t" - "vl %%v27,80(%%r1,%[ap1])\n\t" - "vl %%v28,96(%%r1,%[ap0])\n\t" - "vl %%v29,96(%%r1,%[ap1])\n\t" - "vl %%v30,112(%%r1,%[ap0])\n\t" - "vl %%v31,112(%%r1,%[ap1])\n\t" - "vl %%v2,0(%%r1,%[y])\n\t" - "vl %%v3,16(%%r1,%[y])\n\t" - "vl %%v4,32(%%r1,%[y])\n\t" - "vl %%v5,48(%%r1,%[y])\n\t" - "vl %%v6,64(%%r1,%[y])\n\t" - "vl %%v7,80(%%r1,%[y])\n\t" - "vl %%v8,96(%%r1,%[y])\n\t" - "vl %%v9,112(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0]),3\n\t" + "vl %%v17,0(%%r1,%[ap1]),3\n\t" + "vl %%v18,16(%%r1,%[ap0]),3\n\t" + "vl %%v19,16(%%r1,%[ap1]),3\n\t" + "vl %%v20,32(%%r1,%[ap0]),3\n\t" + "vl %%v21,32(%%r1,%[ap1]),3\n\t" + "vl %%v22,48(%%r1,%[ap0]),3\n\t" + "vl %%v23,48(%%r1,%[ap1]),3\n\t" + "vl %%v24,64(%%r1,%[ap0]),3\n\t" + "vl %%v25,64(%%r1,%[ap1]),3\n\t" + "vl %%v26,80(%%r1,%[ap0]),3\n\t" + "vl %%v27,80(%%r1,%[ap1]),3\n\t" + "vl %%v28,96(%%r1,%[ap0]),3\n\t" + "vl %%v29,96(%%r1,%[ap1]),3\n\t" + "vl %%v30,112(%%r1,%[ap0]),3\n\t" + "vl %%v31,112(%%r1,%[ap1]),3\n\t" + "vl %%v2,0(%%r1,%[y]),3\n\t" + "vl %%v3,16(%%r1,%[y]),3\n\t" + "vl %%v4,32(%%r1,%[y]),3\n\t" + "vl %%v5,48(%%r1,%[y]),3\n\t" + "vl %%v6,64(%%r1,%[y]),3\n\t" + "vl %%v7,80(%%r1,%[y]),3\n\t" + "vl %%v8,96(%%r1,%[y]),3\n\t" + "vl %%v9,112(%%r1,%[y]),3\n\t" "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" "vfmadb 
%%v4,%%v20,%%v0,%%v4\n\t" @@ -233,14 +233,14 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v7,%%v27,%%v1,%%v7\n\t" "vfmadb %%v8,%%v29,%%v1,%%v8\n\t" "vfmadb %%v9,%%v31,%%v1,%%v9\n\t" - "vst %%v2,0(%%r1,%[y])\n\t" - "vst %%v3,16(%%r1,%[y])\n\t" - "vst %%v4,32(%%r1,%[y])\n\t" - "vst %%v5,48(%%r1,%[y])\n\t" - "vst %%v6,64(%%r1,%[y])\n\t" - "vst %%v7,80(%%r1,%[y])\n\t" - "vst %%v8,96(%%r1,%[y])\n\t" - "vst %%v9,112(%%r1,%[y])\n\t" + "vst %%v2,0(%%r1,%[y]),3\n\t" + "vst %%v3,16(%%r1,%[y]),3\n\t" + "vst %%v4,32(%%r1,%[y]),3\n\t" + "vst %%v5,48(%%r1,%[y]),3\n\t" + "vst %%v6,64(%%r1,%[y]),3\n\t" + "vst %%v7,80(%%r1,%[y]),3\n\t" + "vst %%v8,96(%%r1,%[y]),3\n\t" + "vst %%v9,112(%%r1,%[y]),3\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" "1:\n\t" @@ -250,18 +250,18 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,16(%%r1,%[ap0])\n\t" - "vl %%v19,16(%%r1,%[ap1])\n\t" - "vl %%v2,0(%%r1,%[y])\n\t" - "vl %%v3,16(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0]),3\n\t" + "vl %%v17,0(%%r1,%[ap1]),3\n\t" + "vl %%v18,16(%%r1,%[ap0]),3\n\t" + "vl %%v19,16(%%r1,%[ap1]),3\n\t" + "vl %%v2,0(%%r1,%[y]),3\n\t" + "vl %%v3,16(%%r1,%[y]),3\n\t" "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" - "vst %%v2,0(%%r1,%[y])\n\t" - "vst %%v3,16(%%r1,%[y])\n\t" + "vst %%v2,0(%%r1,%[y]),3\n\t" + "vst %%v3,16(%%r1,%[y]),3\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" @@ -289,22 +289,22 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, "0:\n\t" "pfd 1,1024(%%r1,%[a0])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[a0])\n\t" - "vl %%v17,16(%%r1,%[a0])\n\t" - "vl %%v18,32(%%r1,%[a0])\n\t" - "vl %%v19,48(%%r1,%[a0])\n\t" - "vl %%v20,64(%%r1,%[a0])\n\t" - "vl %%v21,80(%%r1,%[a0])\n\t" - "vl %%v22,96(%%r1,%[a0])\n\t" - "vl %%v23,112(%%r1,%[a0])\n\t" - "vl %%v24,0(%%r1,%[y])\n\t" - "vl %%v25,16(%%r1,%[y])\n\t" - "vl %%v26,32(%%r1,%[y])\n\t" - "vl %%v27,48(%%r1,%[y])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[a0]),3\n\t" + "vl %%v17,16(%%r1,%[a0]),3\n\t" + "vl %%v18,32(%%r1,%[a0]),3\n\t" + "vl %%v19,48(%%r1,%[a0]),3\n\t" + "vl %%v20,64(%%r1,%[a0]),3\n\t" + "vl %%v21,80(%%r1,%[a0]),3\n\t" + "vl %%v22,96(%%r1,%[a0]),3\n\t" + "vl %%v23,112(%%r1,%[a0]),3\n\t" + "vl %%v24,0(%%r1,%[y]),3\n\t" + "vl %%v25,16(%%r1,%[y]),3\n\t" + "vl %%v26,32(%%r1,%[y]),3\n\t" + "vl %%v27,48(%%r1,%[y]),3\n\t" + "vl %%v28,64(%%r1,%[y]),3\n\t" + "vl %%v29,80(%%r1,%[y]),3\n\t" + "vl %%v30,96(%%r1,%[y]),3\n\t" + "vl %%v31,112(%%r1,%[y]),3\n\t" "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" @@ -313,14 +313,14 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v24,0(%%r1,%[y])\n\t" - "vst %%v25,16(%%r1,%[y])\n\t" - "vst %%v26,32(%%r1,%[y])\n\t" - "vst %%v27,48(%%r1,%[y])\n\t" - "vst %%v28,64(%%r1,%[y])\n\t" - "vst %%v29,80(%%r1,%[y])\n\t" - "vst %%v30,96(%%r1,%[y])\n\t" - "vst %%v31,112(%%r1,%[y])\n\t" + "vst %%v24,0(%%r1,%[y]),3\n\t" + "vst %%v25,16(%%r1,%[y]),3\n\t" + "vst %%v26,32(%%r1,%[y]),3\n\t" + "vst %%v27,48(%%r1,%[y]),3\n\t" + "vst 
%%v28,64(%%r1,%[y]),3\n\t" + "vst %%v29,80(%%r1,%[y]),3\n\t" + "vst %%v30,96(%%r1,%[y]),3\n\t" + "vst %%v31,112(%%r1,%[y]),3\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" "1:\n\t" @@ -330,14 +330,14 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[a0])\n\t" - "vl %%v17,16(%%r1,%[a0])\n\t" - "vl %%v18,0(%%r1,%[y])\n\t" - "vl %%v19,16(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[a0]),3\n\t" + "vl %%v17,16(%%r1,%[a0]),3\n\t" + "vl %%v18,0(%%r1,%[y]),3\n\t" + "vl %%v19,16(%%r1,%[y]),3\n\t" "vfmadb %%v18,%%v16,%%v0,%%v18\n\t" "vfmadb %%v19,%%v17,%%v0,%%v19\n\t" - "vst %%v18,0(%%r1,%[y])\n\t" - "vst %%v19,16(%%r1,%[y])\n\t" + "vst %%v18,0(%%r1,%[y]),3\n\t" + "vst %%v19,16(%%r1,%[y]),3\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" diff --git a/kernel/zarch/dgemv_t_4.c b/kernel/zarch/dgemv_t_4.c index 24680cf1b..9fd3c09d6 100644 --- a/kernel/zarch/dgemv_t_4.c +++ b/kernel/zarch/dgemv_t_4.c @@ -50,77 +50,77 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "pfd 1,1024(%%r1,%[ap2])\n\t" "pfd 1,1024(%%r1,%[ap3])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v24,0(%%r1,%[ap0]),3\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" + "vl %%v25,0(%%r1,%[ap1]),3\n\t" "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2])\n\t" + "vl %%v26,0(%%r1,%[ap2]),3\n\t" "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3])\n\t" + "vl %%v27,0(%%r1,%[ap3]),3\n\t" "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" - "vl %%v28,16(%%r1,%[ap0])\n\t" + "vl %%v28,16(%%r1,%[ap0]),3\n\t" "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" - "vl %%v29,16(%%r1,%[ap1])\n\t" + "vl %%v29,16(%%r1,%[ap1]),3\n\t" "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" - "vl %%v30,16(%%r1,%[ap2])\n\t" + "vl %%v30,16(%%r1,%[ap2]),3\n\t" "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" - "vl %%v31,16(%%r1,%[ap3])\n\t" + "vl %%v31,16(%%r1,%[ap3]),3\n\t" "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" - "vl %%v24,32(%%r1,%[ap0])\n\t" + "vl %%v24,32(%%r1,%[ap0]),3\n\t" "vfmadb %%v0,%%v18,%%v24,%%v0\n\t" - "vl %%v25,32(%%r1,%[ap1])\n\t" + "vl %%v25,32(%%r1,%[ap1]),3\n\t" "vfmadb %%v1,%%v18,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[ap2])\n\t" + "vl %%v26,32(%%r1,%[ap2]),3\n\t" "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,32(%%r1,%[ap3])\n\t" + "vl %%v27,32(%%r1,%[ap3]),3\n\t" "vfmadb %%v3,%%v18,%%v27,%%v3\n\t" - "vl %%v28,48(%%r1,%[ap0])\n\t" + "vl %%v28,48(%%r1,%[ap0]),3\n\t" "vfmadb %%v4,%%v19,%%v28,%%v4\n\t" - "vl %%v29,48(%%r1,%[ap1])\n\t" + "vl %%v29,48(%%r1,%[ap1]),3\n\t" "vfmadb %%v5,%%v19,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap2])\n\t" + "vl %%v30,48(%%r1,%[ap2]),3\n\t" "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap3])\n\t" + "vl %%v31,48(%%r1,%[ap3]),3\n\t" "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" + "vl %%v24,64(%%r1,%[ap0]),3\n\t" "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" + "vl %%v25,64(%%r1,%[ap1]),3\n\t" "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" - "vl 
%%v26,64(%%r1,%[ap2])\n\t" + "vl %%v26,64(%%r1,%[ap2]),3\n\t" "vfmadb %%v2,%%v20,%%v26,%%v2\n\t" - "vl %%v27,64(%%r1,%[ap3])\n\t" + "vl %%v27,64(%%r1,%[ap3]),3\n\t" "vfmadb %%v3,%%v20,%%v27,%%v3\n\t" - "vl %%v28,80(%%r1,%[ap0])\n\t" + "vl %%v28,80(%%r1,%[ap0]),3\n\t" "vfmadb %%v4,%%v21,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[ap1])\n\t" + "vl %%v29,80(%%r1,%[ap1]),3\n\t" "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,80(%%r1,%[ap2])\n\t" + "vl %%v30,80(%%r1,%[ap2]),3\n\t" "vfmadb %%v6,%%v21,%%v30,%%v6\n\t" - "vl %%v31,80(%%r1,%[ap3])\n\t" + "vl %%v31,80(%%r1,%[ap3]),3\n\t" "vfmadb %%v7,%%v21,%%v31,%%v7\n\t" - "vl %%v24,96(%%r1,%[ap0])\n\t" + "vl %%v24,96(%%r1,%[ap0]),3\n\t" "vfmadb %%v0,%%v22,%%v24,%%v0\n\t" - "vl %%v25,96(%%r1,%[ap1])\n\t" + "vl %%v25,96(%%r1,%[ap1]),3\n\t" "vfmadb %%v1,%%v22,%%v25,%%v1\n\t" - "vl %%v26,96(%%r1,%[ap2])\n\t" + "vl %%v26,96(%%r1,%[ap2]),3\n\t" "vfmadb %%v2,%%v22,%%v26,%%v2\n\t" - "vl %%v27,96(%%r1,%[ap3])\n\t" + "vl %%v27,96(%%r1,%[ap3]),3\n\t" "vfmadb %%v3,%%v22,%%v27,%%v3\n\t" - "vl %%v28,112(%%r1,%[ap0])\n\t" + "vl %%v28,112(%%r1,%[ap0]),3\n\t" "vfmadb %%v4,%%v23,%%v28,%%v4\n\t" - "vl %%v29,112(%%r1,%[ap1])\n\t" + "vl %%v29,112(%%r1,%[ap1]),3\n\t" "vfmadb %%v5,%%v23,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap2])\n\t" + "vl %%v30,112(%%r1,%[ap2]),3\n\t" "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap3])\n\t" + "vl %%v31,112(%%r1,%[ap3]),3\n\t" "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" @@ -131,23 +131,23 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v24,0(%%r1,%[ap0]),3\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" + "vl %%v25,0(%%r1,%[ap1]),3\n\t" "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2])\n\t" + "vl %%v26,0(%%r1,%[ap2]),3\n\t" "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3])\n\t" + "vl %%v27,0(%%r1,%[ap3]),3\n\t" "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" - "vl %%v28,16(%%r1,%[ap0])\n\t" + "vl %%v28,16(%%r1,%[ap0]),3\n\t" "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" - "vl %%v29,16(%%r1,%[ap1])\n\t" + "vl %%v29,16(%%r1,%[ap1]),3\n\t" "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" - "vl %%v30,16(%%r1,%[ap2])\n\t" + "vl %%v30,16(%%r1,%[ap2]),3\n\t" "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" - "vl %%v31,16(%%r1,%[ap3])\n\t" + "vl %%v31,16(%%r1,%[ap3]),3\n\t" "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" @@ -198,45 +198,45 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "pfd 1,1024(%%r1,%[ap0])\n\t" "pfd 1,1024(%%r1,%[ap1])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v24,0(%%r1,%[ap0]),3\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" + "vl %%v25,0(%%r1,%[ap1]),3\n\t" "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,16(%%r1,%[ap0])\n\t" + "vl 
%%v26,16(%%r1,%[ap0]),3\n\t" "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" - "vl %%v27,16(%%r1,%[ap1])\n\t" + "vl %%v27,16(%%r1,%[ap1]),3\n\t" "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" - "vl %%v28,32(%%r1,%[ap0])\n\t" + "vl %%v28,32(%%r1,%[ap0]),3\n\t" "vfmadb %%v4,%%v18,%%v28,%%v4\n\t" - "vl %%v29,32(%%r1,%[ap1])\n\t" + "vl %%v29,32(%%r1,%[ap1]),3\n\t" "vfmadb %%v5,%%v18,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap0])\n\t" + "vl %%v30,48(%%r1,%[ap0]),3\n\t" "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap1])\n\t" + "vl %%v31,48(%%r1,%[ap1]),3\n\t" "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" + "vl %%v24,64(%%r1,%[ap0]),3\n\t" "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" + "vl %%v25,64(%%r1,%[ap1]),3\n\t" "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" - "vl %%v26,80(%%r1,%[ap0])\n\t" + "vl %%v26,80(%%r1,%[ap0]),3\n\t" "vfmadb %%v2,%%v21,%%v26,%%v2\n\t" - "vl %%v27,80(%%r1,%[ap1])\n\t" + "vl %%v27,80(%%r1,%[ap1]),3\n\t" "vfmadb %%v3,%%v21,%%v27,%%v3\n\t" - "vl %%v28,96(%%r1,%[ap0])\n\t" + "vl %%v28,96(%%r1,%[ap0]),3\n\t" "vfmadb %%v4,%%v22,%%v28,%%v4\n\t" - "vl %%v29,96(%%r1,%[ap1])\n\t" + "vl %%v29,96(%%r1,%[ap1]),3\n\t" "vfmadb %%v5,%%v22,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap0])\n\t" + "vl %%v30,112(%%r1,%[ap0]),3\n\t" "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap1])\n\t" + "vl %%v31,112(%%r1,%[ap1]),3\n\t" "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" @@ -247,15 +247,15 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v24,0(%%r1,%[ap0]),3\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" + "vl %%v25,0(%%r1,%[ap1]),3\n\t" "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,16(%%r1,%[ap0])\n\t" + "vl %%v26,16(%%r1,%[ap0]),3\n\t" "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" - "vl %%v27,16(%%r1,%[ap1])\n\t" + "vl %%v27,16(%%r1,%[ap1]),3\n\t" "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" @@ -299,29 +299,29 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 1,1024(%%r1,%[a0])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[a0])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v24,0(%%r1,%[a0]),3\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,16(%%r1,%[a0])\n\t" + "vl %%v25,16(%%r1,%[a0]),3\n\t" "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[a0])\n\t" + "vl %%v26,32(%%r1,%[a0]),3\n\t" "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,48(%%r1,%[a0])\n\t" + "vl %%v27,48(%%r1,%[a0]),3\n\t" "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" - "vl %%v28,64(%%r1,%[a0])\n\t" + "vl %%v28,64(%%r1,%[a0]),3\n\t" "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[a0])\n\t" + "vl %%v29,80(%%r1,%[a0]),3\n\t" "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,96(%%r1,%[a0])\n\t" + "vl %%v30,96(%%r1,%[a0]),3\n\t" "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" - "vl 
%%v31,112(%%r1,%[a0])\n\t" + "vl %%v31,112(%%r1,%[a0]),3\n\t" "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" @@ -332,11 +332,11 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[a0])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v24,0(%%r1,%[a0]),3\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,16(%%r1,%[a0])\n\t" + "vl %%v25,16(%%r1,%[a0]),3\n\t" "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" @@ -378,38 +378,38 @@ static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { "0:\n\t" "pfd 1,1024(%%r1,%[src])\n\t" "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v18,32(%%r1,%[src])\n\t" - "vl %%v19,48(%%r1,%[src])\n\t" - "vl %%v20,64(%%r1,%[src])\n\t" - "vl %%v21,80(%%r1,%[src])\n\t" - "vl %%v22,96(%%r1,%[src])\n\t" - "vl %%v23,112(%%r1,%[src])\n\t" - "vl %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src]),3\n\t" + "vl %%v17,16(%%r1,%[src]),3\n\t" + "vl %%v18,32(%%r1,%[src]),3\n\t" + "vl %%v19,48(%%r1,%[src]),3\n\t" + "vl %%v20,64(%%r1,%[src]),3\n\t" + "vl %%v21,80(%%r1,%[src]),3\n\t" + "vl %%v22,96(%%r1,%[src]),3\n\t" + "vl %%v23,112(%%r1,%[src]),3\n\t" + "vl %%v24, 0(%%r1,%[dest]),3\n\t" "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vst %%v24, 0(%%r1,%[dest])\n\t" - "vl %%v25, 16(%%r1,%[dest])\n\t" + "vst %%v24, 0(%%r1,%[dest]),3\n\t" + "vl %%v25, 16(%%r1,%[dest]),3\n\t" "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" - "vst %%v25, 16(%%r1,%[dest])\n\t" - "vl %%v26, 32(%%r1,%[dest])\n\t" + "vst %%v25, 16(%%r1,%[dest]),3\n\t" + "vl %%v26, 32(%%r1,%[dest]),3\n\t" "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" - "vst %%v26, 32(%%r1,%[dest])\n\t" - "vl %%v27, 48(%%r1,%[dest])\n\t" + "vst %%v26, 32(%%r1,%[dest]),3\n\t" + "vl %%v27, 48(%%r1,%[dest]),3\n\t" "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" - "vst %%v27, 48(%%r1,%[dest])\n\t" - "vl %%v28, 64(%%r1,%[dest])\n\t" + "vst %%v27, 48(%%r1,%[dest]),3\n\t" + "vl %%v28, 64(%%r1,%[dest]),3\n\t" "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" - "vst %%v28, 64(%%r1,%[dest])\n\t" - "vl %%v29, 80(%%r1,%[dest])\n\t" + "vst %%v28, 64(%%r1,%[dest]),3\n\t" + "vl %%v29, 80(%%r1,%[dest]),3\n\t" "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" - "vst %%v29, 80(%%r1,%[dest])\n\t" - "vl %%v30, 96(%%r1,%[dest])\n\t" + "vst %%v29, 80(%%r1,%[dest]),3\n\t" + "vl %%v30, 96(%%r1,%[dest]),3\n\t" "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" - "vst %%v30, 96(%%r1,%[dest])\n\t" - "vl %%v31, 112(%%r1,%[dest])\n\t" + "vst %%v30, 96(%%r1,%[dest]),3\n\t" + "vl %%v31, 112(%%r1,%[dest]),3\n\t" "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v31, 112(%%r1,%[dest])\n\t" + "vst %%v31, 112(%%r1,%[dest]),3\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" "1:\n\t" @@ -419,14 +419,14 @@ static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src]),3\n\t" + "vl %%v17,16(%%r1,%[src]),3\n\t" + "vl %%v24, 0(%%r1,%[dest]),3\n\t" "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vst %%v24, 0(%%r1,%[dest])\n\t" - "vl %%v25, 16(%%r1,%[dest])\n\t" + "vst %%v24, 0(%%r1,%[dest]),3\n\t" + "vl %%v25, 16(%%r1,%[dest]),3\n\t" "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" - "vst %%v25, 16(%%r1,%[dest])\n\t" + "vst %%v25, 16(%%r1,%[dest]),3\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" diff --git 
a/kernel/zarch/dmax.c b/kernel/zarch/dmax.c index 65ed31f01..cc0f23c87 100644 --- a/kernel/zarch/dmax.c +++ b/kernel/zarch/dmax.c @@ -30,27 +30,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT max; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v24,128(%%r1,%[x]),3\n\t" + "vl %%v25,144(%%r1,%[x]),3\n\t" + "vl %%v26,160(%%r1,%[x]),3\n\t" + "vl %%v27,176(%%r1,%[x]),3\n\t" + "vl %%v28,192(%%r1,%[x]),3\n\t" + "vl %%v29,208(%%r1,%[x]),3\n\t" + "vl %%v30,224(%%r1,%[x]),3\n\t" + "vl %%v31,240(%%r1,%[x]),3\n\t" "vfmaxdb %%v16,%%v16,%%v24,0\n\t" "vfmaxdb %%v17,%%v17,%%v25,0\n\t" "vfmaxdb %%v18,%%v18,%%v26,0\n\t" diff --git a/kernel/zarch/dmax_z13.c b/kernel/zarch/dmax_z13.c index 87bccbe55..83d827d35 100644 --- a/kernel/zarch/dmax_z13.c +++ b/kernel/zarch/dmax_z13.c @@ -30,19 +30,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
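Worth noting while reading the max/min kernels in this patch: vfmaxdb/vfmindb take a function code as their final operand, and the signed and absolute-value variants differ only in that code (0 and 8 select maxNum- and maxNumMag-style semantics). A rough per-lane sketch of the intent, not of the full IEEE NaN handling:

#include <math.h>
/* vfmaxdb vt,va,vb,0 -- signed compare, as in dmax.c above */
static double max_signed(double a, double b) { return a > b ? a : b; }
/* vfmaxdb vt,va,vb,8 -- magnitude compare, as in damax.c earlier */
static double max_magnitude(double a, double b) { return fabs(a) > fabs(b) ? a : b; }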
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT max; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vfchdb %%v24,%%v16,%%v17\n\t" "vfchdb %%v25,%%v18,%%v19\n\t" "vfchdb %%v26,%%v20,%%v21\n\t" @@ -59,14 +59,14 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v30,%%v28,%%v29,%%v30\n\t" "vfchdb %%v31,%%v30,%%v0\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,192(%%r1,%[x]),3\n\t" + "vl %%v21,208(%%r1,%[x]),3\n\t" + "vl %%v22,224(%%r1,%[x]),3\n\t" + "vl %%v23,240(%%r1,%[x]),3\n\t" "vfchdb %%v24,%%v16,%%v17\n\t" "vfchdb %%v25,%%v18,%%v19\n\t" "vfchdb %%v26,%%v20,%%v21\n\t" diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c index 518cc262c..754828b7c 100644 --- a/kernel/zarch/dmin.c +++ b/kernel/zarch/dmin.c @@ -30,27 +30,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT min; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v24,128(%%r1,%[x]),3\n\t" + "vl %%v25,144(%%r1,%[x]),3\n\t" + "vl %%v26,160(%%r1,%[x]),3\n\t" + "vl %%v27,176(%%r1,%[x]),3\n\t" + "vl %%v28,192(%%r1,%[x]),3\n\t" + "vl %%v29,208(%%r1,%[x]),3\n\t" + "vl %%v30,224(%%r1,%[x]),3\n\t" + "vl %%v31,240(%%r1,%[x]),3\n\t" "vfmindb %%v16,%%v16,%%v24,0\n\t" "vfmindb %%v17,%%v17,%%v25,0\n\t" "vfmindb %%v18,%%v18,%%v26,0\n\t" diff --git a/kernel/zarch/dmin_z13.c b/kernel/zarch/dmin_z13.c index 91561992f..ff0fca48c 100644 --- a/kernel/zarch/dmin_z13.c +++ b/kernel/zarch/dmin_z13.c @@ -30,19 +30,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT min; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vfchdb %%v24,%%v17,%%v16\n\t" "vfchdb %%v25,%%v19,%%v18\n\t" "vfchdb %%v26,%%v21,%%v20\n\t" @@ -59,14 +59,14 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v30,%%v28,%%v29,%%v30\n\t" "vfchdb %%v31,%%v0,%%v30\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,192(%%r1,%[x]),3\n\t" + "vl %%v21,208(%%r1,%[x]),3\n\t" + "vl %%v22,224(%%r1,%[x]),3\n\t" + "vl %%v23,240(%%r1,%[x]),3\n\t" "vfchdb %%v24,%%v17,%%v16\n\t" "vfchdb %%v25,%%v19,%%v18\n\t" "vfchdb %%v26,%%v21,%%v20\n\t" diff --git a/kernel/zarch/drot.c b/kernel/zarch/drot.c index 8f0197f02..de2207fcd 100644 --- a/kernel/zarch/drot.c +++ b/kernel/zarch/drot.c @@ -35,14 +35,14 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x])\n\t" - "vl %%v25, 16(%%r1,%[x])\n\t" - "vl %%v26, 32(%%r1,%[x])\n\t" - "vl %%v27, 48(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[y])\n\t" - "vl %%v17, 16(%%r1,%[y])\n\t" - "vl %%v18, 32(%%r1,%[y])\n\t" - "vl %%v19, 48(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x]),3\n\t" + "vl %%v25, 16(%%r1,%[x]),3\n\t" + "vl %%v26, 32(%%r1,%[x]),3\n\t" + "vl %%v27, 48(%%r1,%[x]),3\n\t" + "vl %%v16, 0(%%r1,%[y]),3\n\t" + "vl %%v17, 16(%%r1,%[y]),3\n\t" + "vl %%v18, 32(%%r1,%[y]),3\n\t" + "vl %%v19, 48(%%r1,%[y]),3\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -60,22 +60,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x])\n\t" - "vst %%v29, 16(%%r1,%[x])\n\t" - "vst %%v30, 32(%%r1,%[x])\n\t" - "vst %%v31, 48(%%r1,%[x])\n\t" - "vst %%v20, 0(%%r1,%[y])\n\t" - "vst %%v21, 16(%%r1,%[y])\n\t" - "vst %%v22, 32(%%r1,%[y])\n\t" - "vst %%v23, 48(%%r1,%[y])\n\t" - "vl %%v24, 64(%%r1,%[x])\n\t" - "vl %%v25, 80(%%r1,%[x])\n\t" - "vl %%v26, 96(%%r1,%[x])\n\t" - "vl %%v27, 112(%%r1,%[x])\n\t" - "vl %%v16, 64(%%r1,%[y])\n\t" - "vl %%v17, 80(%%r1,%[y])\n\t" - "vl %%v18, 96(%%r1,%[y])\n\t" - "vl %%v19, 112(%%r1,%[y])\n\t" + "vst %%v28, 0(%%r1,%[x]),3\n\t" + "vst %%v29, 16(%%r1,%[x]),3\n\t" + "vst %%v30, 32(%%r1,%[x]),3\n\t" + "vst %%v31, 48(%%r1,%[x]),3\n\t" + "vst %%v20, 0(%%r1,%[y]),3\n\t" + "vst %%v21, 16(%%r1,%[y]),3\n\t" + "vst %%v22, 32(%%r1,%[y]),3\n\t" + "vst 
%%v23, 48(%%r1,%[y]),3\n\t" + "vl %%v24, 64(%%r1,%[x]),3\n\t" + "vl %%v25, 80(%%r1,%[x]),3\n\t" + "vl %%v26, 96(%%r1,%[x]),3\n\t" + "vl %%v27, 112(%%r1,%[x]),3\n\t" + "vl %%v16, 64(%%r1,%[y]),3\n\t" + "vl %%v17, 80(%%r1,%[y]),3\n\t" + "vl %%v18, 96(%%r1,%[y]),3\n\t" + "vl %%v19, 112(%%r1,%[y]),3\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -93,22 +93,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x])\n\t" - "vst %%v29, 80(%%r1,%[x])\n\t" - "vst %%v30, 96(%%r1,%[x])\n\t" - "vst %%v31, 112(%%r1,%[x])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v16, 128(%%r1,%[y])\n\t" - "vl %%v17, 144(%%r1,%[y])\n\t" - "vl %%v18, 160(%%r1,%[y])\n\t" - "vl %%v19, 176(%%r1,%[y])\n\t" + "vst %%v28, 64(%%r1,%[x]),3\n\t" + "vst %%v29, 80(%%r1,%[x]),3\n\t" + "vst %%v30, 96(%%r1,%[x]),3\n\t" + "vst %%v31, 112(%%r1,%[x]),3\n\t" + "vst %%v20, 64(%%r1,%[y]),3\n\t" + "vst %%v21, 80(%%r1,%[y]),3\n\t" + "vst %%v22, 96(%%r1,%[y]),3\n\t" + "vst %%v23, 112(%%r1,%[y]),3\n\t" + "vl %%v24, 128(%%r1,%[x]),3\n\t" + "vl %%v25, 144(%%r1,%[x]),3\n\t" + "vl %%v26, 160(%%r1,%[x]),3\n\t" + "vl %%v27, 176(%%r1,%[x]),3\n\t" + "vl %%v16, 128(%%r1,%[y]),3\n\t" + "vl %%v17, 144(%%r1,%[y]),3\n\t" + "vl %%v18, 160(%%r1,%[y]),3\n\t" + "vl %%v19, 176(%%r1,%[y]),3\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -126,22 +126,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x])\n\t" - "vst %%v29, 144(%%r1,%[x])\n\t" - "vst %%v30, 160(%%r1,%[x])\n\t" - "vst %%v31, 176(%%r1,%[x])\n\t" - "vst %%v20, 128(%%r1,%[y])\n\t" - "vst %%v21, 144(%%r1,%[y])\n\t" - "vst %%v22, 160(%%r1,%[y])\n\t" - "vst %%v23, 176(%%r1,%[y])\n\t" - "vl %%v24, 192(%%r1,%[x])\n\t" - "vl %%v25, 208(%%r1,%[x])\n\t" - "vl %%v26, 224(%%r1,%[x])\n\t" - "vl %%v27, 240(%%r1,%[x])\n\t" - "vl %%v16, 192(%%r1,%[y])\n\t" - "vl %%v17, 208(%%r1,%[y])\n\t" - "vl %%v18, 224(%%r1,%[y])\n\t" - "vl %%v19, 240(%%r1,%[y])\n\t" + "vst %%v28, 128(%%r1,%[x]),3\n\t" + "vst %%v29, 144(%%r1,%[x]),3\n\t" + "vst %%v30, 160(%%r1,%[x]),3\n\t" + "vst %%v31, 176(%%r1,%[x]),3\n\t" + "vst %%v20, 128(%%r1,%[y]),3\n\t" + "vst %%v21, 144(%%r1,%[y]),3\n\t" + "vst %%v22, 160(%%r1,%[y]),3\n\t" + "vst %%v23, 176(%%r1,%[y]),3\n\t" + "vl %%v24, 192(%%r1,%[x]),3\n\t" + "vl %%v25, 208(%%r1,%[x]),3\n\t" + "vl %%v26, 224(%%r1,%[x]),3\n\t" + "vl %%v27, 240(%%r1,%[x]),3\n\t" + "vl %%v16, 192(%%r1,%[y]),3\n\t" + "vl %%v17, 208(%%r1,%[y]),3\n\t" + "vl %%v18, 224(%%r1,%[y]),3\n\t" + "vl %%v19, 240(%%r1,%[y]),3\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -159,14 +159,14 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 
192(%%r1,%[x])\n\t" - "vst %%v29, 208(%%r1,%[x])\n\t" - "vst %%v30, 224(%%r1,%[x])\n\t" - "vst %%v31, 240(%%r1,%[x])\n\t" - "vst %%v20, 192(%%r1,%[y])\n\t" - "vst %%v21, 208(%%r1,%[y])\n\t" - "vst %%v22, 224(%%r1,%[y])\n\t" - "vst %%v23, 240(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[x]),3\n\t" + "vst %%v29, 208(%%r1,%[x]),3\n\t" + "vst %%v30, 224(%%r1,%[x]),3\n\t" + "vst %%v31, 240(%%r1,%[x]),3\n\t" + "vst %%v20, 192(%%r1,%[y]),3\n\t" + "vst %%v21, 208(%%r1,%[y]),3\n\t" + "vst %%v22, 224(%%r1,%[y]),3\n\t" + "vst %%v23, 240(%%r1,%[y]),3\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c index c944990b5..bc58569d5 100644 --- a/kernel/zarch/dscal.c +++ b/kernel/zarch/dscal.c @@ -33,30 +33,30 @@ static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[x]),3\n\t" "vfmdb %%v24,%%v24,%%v0\n\t" - "vst %%v24,0(%%r1,%[x])\n\t" - "vl %%v25,16(%%r1,%[x])\n\t" + "vst %%v24,0(%%r1,%[x]),3\n\t" + "vl %%v25,16(%%r1,%[x]),3\n\t" "vfmdb %%v25,%%v25,%%v0\n\t" - "vst %%v25,16(%%r1,%[x])\n\t" - "vl %%v26,32(%%r1,%[x])\n\t" + "vst %%v25,16(%%r1,%[x]),3\n\t" + "vl %%v26,32(%%r1,%[x]),3\n\t" "vfmdb %%v26,%%v26,%%v0\n\t" - "vst %%v26,32(%%r1,%[x])\n\t" - "vl %%v27,48(%%r1,%[x])\n\t" + "vst %%v26,32(%%r1,%[x]),3\n\t" + "vl %%v27,48(%%r1,%[x]),3\n\t" "vfmdb %%v27,%%v27,%%v0\n\t" - "vst %%v27,48(%%r1,%[x])\n\t" - "vl %%v28,64(%%r1,%[x])\n\t" + "vst %%v27,48(%%r1,%[x]),3\n\t" + "vl %%v28,64(%%r1,%[x]),3\n\t" "vfmdb %%v28,%%v28,%%v0\n\t" - "vst %%v28,64(%%r1,%[x])\n\t" - "vl %%v29,80(%%r1,%[x])\n\t" + "vst %%v28,64(%%r1,%[x]),3\n\t" + "vl %%v29,80(%%r1,%[x]),3\n\t" "vfmdb %%v29,%%v29,%%v0\n\t" - "vst %%v29,80(%%r1,%[x])\n\t" - "vl %%v30,96(%%r1,%[x])\n\t" + "vst %%v29,80(%%r1,%[x]),3\n\t" + "vl %%v30,96(%%r1,%[x]),3\n\t" "vfmdb %%v30,%%v30,%%v0\n\t" - "vst %%v30,96(%%r1,%[x])\n\t" - "vl %%v31,112(%%r1,%[x])\n\t" + "vst %%v30,96(%%r1,%[x]),3\n\t" + "vl %%v31,112(%%r1,%[x]),3\n\t" "vfmdb %%v31,%%v31,%%v0\n\t" - "vst %%v31,112(%%r1,%[x])\n\t" + "vst %%v31,112(%%r1,%[x]),3\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) @@ -71,14 +71,14 @@ static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x])\n\t" - "vst %%v0,16(%%r1,%[x])\n\t" - "vst %%v0,32(%%r1,%[x])\n\t" - "vst %%v0,48(%%r1,%[x])\n\t" - "vst %%v0,64(%%r1,%[x])\n\t" - "vst %%v0,80(%%r1,%[x])\n\t" - "vst %%v0,96(%%r1,%[x])\n\t" - "vst %%v0,112(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x]),3\n\t" + "vst %%v0,16(%%r1,%[x]),3\n\t" + "vst %%v0,32(%%r1,%[x]),3\n\t" + "vst %%v0,48(%%r1,%[x]),3\n\t" + "vst %%v0,64(%%r1,%[x]),3\n\t" + "vst %%v0,80(%%r1,%[x]),3\n\t" + "vst %%v0,96(%%r1,%[x]),3\n\t" + "vst %%v0,112(%%r1,%[x]),3\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) diff --git a/kernel/zarch/dswap.c b/kernel/zarch/dswap.c index 60ba40bd6..f4da46dc1 100644 --- a/kernel/zarch/dswap.c +++ b/kernel/zarch/dswap.c @@ -33,70 +33,70 @@ static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - 
"vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v28, 192(%%r1,%[x])\n\t" - "vl %%v29, 208(%%r1,%[x])\n\t" - "vl %%v30, 224(%%r1,%[x])\n\t" - "vl %%v31, 240(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "vl %%v4, 64(%%r1,%[y])\n\t" - "vl %%v5, 80(%%r1,%[y])\n\t" - "vl %%v6, 96(%%r1,%[y])\n\t" - "vl %%v7, 112(%%r1,%[y])\n\t" - "vst %%v0, 0(%%r1,%[x])\n\t" - "vst %%v1, 16(%%r1,%[x])\n\t" - "vst %%v2, 32(%%r1,%[x])\n\t" - "vst %%v3, 48(%%r1,%[x])\n\t" - "vst %%v4, 64(%%r1,%[x])\n\t" - "vst %%v5, 80(%%r1,%[x])\n\t" - "vst %%v6, 96(%%r1,%[x])\n\t" - "vst %%v7, 112(%%r1,%[x])\n\t" - "vl %%v0, 128(%%r1,%[y])\n\t" - "vl %%v1, 144(%%r1,%[y])\n\t" - "vl %%v2, 160(%%r1,%[y])\n\t" - "vl %%v3, 176(%%r1,%[y])\n\t" - "vl %%v4, 192(%%r1,%[y])\n\t" - "vl %%v5, 208(%%r1,%[y])\n\t" - "vl %%v6, 224(%%r1,%[y])\n\t" - "vl %%v7, 240(%%r1,%[y])\n\t" - "vst %%v0, 128(%%r1,%[x])\n\t" - "vst %%v1, 144(%%r1,%[x])\n\t" - "vst %%v2, 160(%%r1,%[x])\n\t" - "vst %%v3, 176(%%r1,%[x])\n\t" - "vst %%v4, 192(%%r1,%[x])\n\t" - "vst %%v5, 208(%%r1,%[x])\n\t" - "vst %%v6, 224(%%r1,%[x])\n\t" - "vst %%v7, 240(%%r1,%[x])\n\t" - "vst %%v16, 0(%%r1,%[y])\n\t" - "vst %%v17, 16(%%r1,%[y])\n\t" - "vst %%v18, 32(%%r1,%[y])\n\t" - "vst %%v19, 48(%%r1,%[y])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vst %%v24, 128(%%r1,%[y])\n\t" - "vst %%v25, 144(%%r1,%[y])\n\t" - "vst %%v26, 160(%%r1,%[y])\n\t" - "vst %%v27, 176(%%r1,%[y])\n\t" - "vst %%v28, 192(%%r1,%[y])\n\t" - "vst %%v29, 208(%%r1,%[y])\n\t" - "vst %%v30, 224(%%r1,%[y])\n\t" - "vst %%v31, 240(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x]),3\n\t" + "vl %%v17, 16(%%r1,%[x]),3\n\t" + "vl %%v18, 32(%%r1,%[x]),3\n\t" + "vl %%v19, 48(%%r1,%[x]),3\n\t" + "vl %%v20, 64(%%r1,%[x]),3\n\t" + "vl %%v21, 80(%%r1,%[x]),3\n\t" + "vl %%v22, 96(%%r1,%[x]),3\n\t" + "vl %%v23, 112(%%r1,%[x]),3\n\t" + "vl %%v24, 128(%%r1,%[x]),3\n\t" + "vl %%v25, 144(%%r1,%[x]),3\n\t" + "vl %%v26, 160(%%r1,%[x]),3\n\t" + "vl %%v27, 176(%%r1,%[x]),3\n\t" + "vl %%v28, 192(%%r1,%[x]),3\n\t" + "vl %%v29, 208(%%r1,%[x]),3\n\t" + "vl %%v30, 224(%%r1,%[x]),3\n\t" + "vl %%v31, 240(%%r1,%[x]),3\n\t" + "vl %%v0, 0(%%r1,%[y]),3\n\t" + "vl %%v1, 16(%%r1,%[y]),3\n\t" + "vl %%v2, 32(%%r1,%[y]),3\n\t" + "vl %%v3, 48(%%r1,%[y]),3\n\t" + "vl %%v4, 64(%%r1,%[y]),3\n\t" + "vl %%v5, 80(%%r1,%[y]),3\n\t" + "vl %%v6, 96(%%r1,%[y]),3\n\t" + "vl %%v7, 112(%%r1,%[y]),3\n\t" + "vst %%v0, 0(%%r1,%[x]),3\n\t" + "vst %%v1, 16(%%r1,%[x]),3\n\t" + "vst %%v2, 32(%%r1,%[x]),3\n\t" + "vst %%v3, 48(%%r1,%[x]),3\n\t" + "vst %%v4, 64(%%r1,%[x]),3\n\t" + "vst %%v5, 80(%%r1,%[x]),3\n\t" + "vst %%v6, 96(%%r1,%[x]),3\n\t" + "vst %%v7, 112(%%r1,%[x]),3\n\t" + "vl %%v0, 128(%%r1,%[y]),3\n\t" + "vl %%v1, 144(%%r1,%[y]),3\n\t" + "vl %%v2, 160(%%r1,%[y]),3\n\t" + "vl %%v3, 176(%%r1,%[y]),3\n\t" + "vl %%v4, 192(%%r1,%[y]),3\n\t" + "vl %%v5, 208(%%r1,%[y]),3\n\t" + "vl %%v6, 224(%%r1,%[y]),3\n\t" + "vl %%v7, 240(%%r1,%[y]),3\n\t" + "vst %%v0, 128(%%r1,%[x]),3\n\t" + "vst %%v1, 144(%%r1,%[x]),3\n\t" + "vst %%v2, 160(%%r1,%[x]),3\n\t" + "vst %%v3, 176(%%r1,%[x]),3\n\t" + "vst %%v4, 192(%%r1,%[x]),3\n\t" + "vst %%v5, 208(%%r1,%[x]),3\n\t" + "vst %%v6, 224(%%r1,%[x]),3\n\t" + "vst %%v7, 240(%%r1,%[x]),3\n\t" + "vst %%v16, 0(%%r1,%[y]),3\n\t" + "vst %%v17, 16(%%r1,%[y]),3\n\t" + "vst %%v18, 
32(%%r1,%[y]),3\n\t" + "vst %%v19, 48(%%r1,%[y]),3\n\t" + "vst %%v20, 64(%%r1,%[y]),3\n\t" + "vst %%v21, 80(%%r1,%[y]),3\n\t" + "vst %%v22, 96(%%r1,%[y]),3\n\t" + "vst %%v23, 112(%%r1,%[y]),3\n\t" + "vst %%v24, 128(%%r1,%[y]),3\n\t" + "vst %%v25, 144(%%r1,%[y]),3\n\t" + "vst %%v26, 160(%%r1,%[y]),3\n\t" + "vst %%v27, 176(%%r1,%[y]),3\n\t" + "vst %%v28, 192(%%r1,%[y]),3\n\t" + "vst %%v29, 208(%%r1,%[y]),3\n\t" + "vst %%v30, 224(%%r1,%[y]),3\n\t" + "vst %%v31, 240(%%r1,%[y]),3\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index 8434c811f..bd0f18115 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { BLASLONG iamax; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "vflpdb %%v0,%%v0\n\t" "vleig %%v1,0,0\n\t" "vleig %%v1,1,1\n\t" @@ -59,14 +59,14 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -101,14 +101,14 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { "vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t" "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,192(%%r1,%[x]),3\n\t" + "vl %%v21,208(%%r1,%[x]),3\n\t" + "vl %%v22,224(%%r1,%[x]),3\n\t" + "vl %%v23,240(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 80a37e6c2..4884d1e3a 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { BLASLONG iamin; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "vflpdb %%v0,%%v0\n\t" "vleig %%v1,0,0\n\t" "vleig %%v1,1,1\n\t" @@ -59,14 +59,14 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -101,14 +101,14 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { "vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t" "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,192(%%r1,%[x]),3\n\t" + "vl %%v21,208(%%r1,%[x]),3\n\t" + "vl %%v22,224(%%r1,%[x]),3\n\t" + "vl %%v23,240(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 18cdba437..a6b95bf3e 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { BLASLONG imax; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "vleig %%v1,0,0\n\t" "vleig %%v1,1,1\n\t" "vrepig %%v2,16\n\t" @@ -55,14 +55,14 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vfchedb %%v4,%%v16,%%v17\n\t" "vfchedb %%v5,%%v18,%%v19\n\t" "vfchedb %%v6,%%v20,%%v21\n\t" @@ -89,14 +89,14 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { "vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t" "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,192(%%r1,%[x]),3\n\t" + "vl %%v21,208(%%r1,%[x]),3\n\t" + "vl %%v22,224(%%r1,%[x]),3\n\t" + "vl %%v23,240(%%r1,%[x]),3\n\t" "vfchedb %%v4,%%v16,%%v17\n\t" "vfchedb %%v5,%%v18,%%v19\n\t" "vfchedb %%v6,%%v20,%%v21\n\t" diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index 02ca427e4..c3f36d964 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { BLASLONG imin; - __asm__("vl %%v0,0(%[x])\n\t" + __asm__("vl %%v0,0(%[x]),3\n\t" "vleig %%v1,0,0\n\t" "vleig %%v1,1,1\n\t" "vrepig %%v2,16\n\t" @@ -55,14 +55,14 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vfchedb %%v4,%%v17,%%v16\n\t" "vfchedb %%v5,%%v19,%%v18\n\t" "vfchedb %%v6,%%v21,%%v20\n\t" @@ -89,14 +89,14 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { "vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t" "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" + "vl %%v16,128(%%r1,%[x]),3\n\t" + "vl %%v17,144(%%r1,%[x]),3\n\t" + "vl %%v18,160(%%r1,%[x]),3\n\t" + "vl %%v19,176(%%r1,%[x]),3\n\t" + "vl %%v20,192(%%r1,%[x]),3\n\t" + "vl %%v21,208(%%r1,%[x]),3\n\t" + "vl %%v22,224(%%r1,%[x]),3\n\t" + "vl %%v23,240(%%r1,%[x]),3\n\t" "vfchedb %%v4,%%v17,%%v16\n\t" "vfchedb %%v5,%%v19,%%v18\n\t" "vfchedb %%v6,%%v21,%%v20\n\t" diff --git a/kernel/zarch/zasum.c b/kernel/zarch/zasum.c index 43ae8ff8b..83e5e93c9 100644 --- a/kernel/zarch/zasum.c +++ b/kernel/zarch/zasum.c @@ -45,14 +45,14 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x]),3\n\t" + "vl %%v17, 16(%%r1,%[x]),3\n\t" + "vl %%v18, 32(%%r1,%[x]),3\n\t" + "vl %%v19, 48(%%r1,%[x]),3\n\t" + "vl %%v20, 64(%%r1,%[x]),3\n\t" + "vl %%v21, 80(%%r1,%[x]),3\n\t" + "vl %%v22, 96(%%r1,%[x]),3\n\t" + "vl %%v23, 112(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -69,14 +69,14 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { "vfadb %%v29,%%v29,%%v21\n\t" "vfadb %%v30,%%v30,%%v22\n\t" "vfadb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x])\n\t" - "vl %%v17, 144(%%r1,%[x])\n\t" - "vl %%v18, 160(%%r1,%[x])\n\t" - "vl %%v19, 176(%%r1,%[x])\n\t" - "vl %%v20, 192(%%r1,%[x])\n\t" - "vl %%v21, 208(%%r1,%[x])\n\t" - "vl %%v22, 224(%%r1,%[x])\n\t" - "vl %%v23, 240(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[x]),3\n\t" + "vl %%v17, 144(%%r1,%[x]),3\n\t" + "vl %%v18, 160(%%r1,%[x]),3\n\t" + "vl %%v19, 176(%%r1,%[x]),3\n\t" + "vl %%v20, 192(%%r1,%[x]),3\n\t" + "vl %%v21, 208(%%r1,%[x]),3\n\t" + "vl %%v22, 224(%%r1,%[x]),3\n\t" + "vl %%v23, 240(%%r1,%[x]),3\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index 31549849d..77bb09a2e 100644 --- a/kernel/zarch/zaxpy.c +++ 
b/kernel/zarch/zaxpy.c @@ -45,22 +45,22 @@ static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v8,0(%%r1,%[x])\n\t" - "vl %%v9,16(%%r1,%[x])\n\t" - "vl %%v10,32(%%r1,%[x])\n\t" - "vl %%v11,48(%%r1,%[x])\n\t" - "vl %%v12,0(%%r1,%[y])\n\t" - "vl %%v13,16(%%r1,%[y])\n\t" - "vl %%v14,32(%%r1,%[y])\n\t" - "vl %%v15,48(%%r1,%[y])\n\t" - "vl %%v16,64(%%r1,%[x])\n\t" - "vl %%v17,80(%%r1,%[x])\n\t" - "vl %%v18,96(%%r1,%[x])\n\t" - "vl %%v19,112(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[y])\n\t" - "vl %%v21,80(%%r1,%[y])\n\t" - "vl %%v22,96(%%r1,%[y])\n\t" - "vl %%v23,112(%%r1,%[y])\n\t" + "vl %%v8,0(%%r1,%[x]),3\n\t" + "vl %%v9,16(%%r1,%[x]),3\n\t" + "vl %%v10,32(%%r1,%[x]),3\n\t" + "vl %%v11,48(%%r1,%[x]),3\n\t" + "vl %%v12,0(%%r1,%[y]),3\n\t" + "vl %%v13,16(%%r1,%[y]),3\n\t" + "vl %%v14,32(%%r1,%[y]),3\n\t" + "vl %%v15,48(%%r1,%[y]),3\n\t" + "vl %%v16,64(%%r1,%[x]),3\n\t" + "vl %%v17,80(%%r1,%[x]),3\n\t" + "vl %%v18,96(%%r1,%[x]),3\n\t" + "vl %%v19,112(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[y]),3\n\t" + "vl %%v21,80(%%r1,%[y]),3\n\t" + "vl %%v22,96(%%r1,%[y]),3\n\t" + "vl %%v23,112(%%r1,%[y]),3\n\t" "vpdi %%v24,%%v8,%%v8,4\n\t" "vpdi %%v25,%%v9,%%v9,4\n\t" "vpdi %%v26,%%v10,%%v10,4\n\t" @@ -85,14 +85,14 @@ static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vfmadb %%v17,%%v29,%%v1,%%v17\n\t" "vfmadb %%v18,%%v30,%%v1,%%v18\n\t" "vfmadb %%v19,%%v31,%%v1,%%v19\n\t" - "vst %%v8,0(%%r1,%[y])\n\t" - "vst %%v9,16(%%r1,%[y])\n\t" - "vst %%v10,32(%%r1,%[y])\n\t" - "vst %%v11,48(%%r1,%[y])\n\t" - "vst %%v16,64(%%r1,%[y])\n\t" - "vst %%v17,80(%%r1,%[y])\n\t" - "vst %%v18,96(%%r1,%[y])\n\t" - "vst %%v19,112(%%r1,%[y])\n\t" + "vst %%v8,0(%%r1,%[y]),3\n\t" + "vst %%v9,16(%%r1,%[y]),3\n\t" + "vst %%v10,32(%%r1,%[y]),3\n\t" + "vst %%v11,48(%%r1,%[y]),3\n\t" + "vst %%v16,64(%%r1,%[y]),3\n\t" + "vst %%v17,80(%%r1,%[y]),3\n\t" + "vst %%v18,96(%%r1,%[y]),3\n\t" + "vst %%v19,112(%%r1,%[y]),3\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) diff --git a/kernel/zarch/zdot.c b/kernel/zarch/zdot.c index 7a67ef734..8cfbaadb8 100644 --- a/kernel/zarch/zdot.c +++ b/kernel/zarch/zdot.c @@ -41,14 +41,14 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" "pfd 1, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x]),3\n\t" + "vl %%v17, 16(%%r1,%[x]),3\n\t" + "vl %%v18, 32(%%r1,%[x]),3\n\t" + "vl %%v19, 48(%%r1,%[x]),3\n\t" + "vl %%v0, 0(%%r1,%[y]),3\n\t" + "vl %%v1, 16(%%r1,%[y]),3\n\t" + "vl %%v2, 32(%%r1,%[y]),3\n\t" + "vl %%v3, 48(%%r1,%[y]),3\n\t" "vpdi %%v20,%%v16,%%v16,4\n\t" "vpdi %%v21,%%v17,%%v17,4\n\t" "vpdi %%v22,%%v18,%%v18,4\n\t" @@ -61,14 +61,14 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" - "vl %%v16, 64(%%r1,%[x])\n\t" - "vl %%v17, 80(%%r1,%[x])\n\t" - "vl %%v18, 96(%%r1,%[x])\n\t" - "vl %%v19, 112(%%r1,%[x])\n\t" - "vl %%v0, 64(%%r1,%[y])\n\t" - "vl %%v1, 80(%%r1,%[y])\n\t" - "vl %%v2, 96(%%r1,%[y])\n\t" - "vl %%v3, 112(%%r1,%[y])\n\t" + "vl %%v16, 64(%%r1,%[x]),3\n\t" + "vl %%v17, 80(%%r1,%[x]),3\n\t" + "vl %%v18, 
96(%%r1,%[x]),3\n\t" + "vl %%v19, 112(%%r1,%[x]),3\n\t" + "vl %%v0, 64(%%r1,%[y]),3\n\t" + "vl %%v1, 80(%%r1,%[y]),3\n\t" + "vl %%v2, 96(%%r1,%[y]),3\n\t" + "vl %%v3, 112(%%r1,%[y]),3\n\t" "vpdi %%v20,%%v16,%%v16,4\n\t" "vpdi %%v21,%%v17,%%v17,4\n\t" "vpdi %%v22,%%v18,%%v18,4\n\t" diff --git a/kernel/zarch/zgemv_n_4.c b/kernel/zarch/zgemv_n_4.c index 7f21985ec..4b64fc8a5 100644 --- a/kernel/zarch/zgemv_n_4.c +++ b/kernel/zarch/zgemv_n_4.c @@ -30,10 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 1024 static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - __asm__("vl %%v16,0(%[x])\n\t" - "vl %%v17,16(%[x])\n\t" - "vl %%v18,32(%[x])\n\t" - "vl %%v19,48(%[x])\n\t" + __asm__("vl %%v16,0(%[x]),3\n\t" + "vl %%v17,16(%[x]),3\n\t" + "vl %%v18,32(%[x]),3\n\t" + "vl %%v19,48(%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v20,8(%[x]),0\n\t" "wflcdb %%v20,%%v20\n\t" @@ -69,8 +69,8 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "pfd 1,1024(%%r1,%[ap2])\n\t" "pfd 1,1024(%%r1,%[ap3])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vl %%v1,16(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y]),3\n\t" + "vl %%v1,16(%%r1,%[y]),3\n\t" "vlrepg %%v24,0(%%r1,%[ap0])\n\t" "vlrepg %%v25,8(%%r1,%[ap0])\n\t" "vlrepg %%v26,0(%%r1,%[ap1])\n\t" @@ -103,8 +103,8 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vfmadb %%v1,%%v30,%%v19,%%v1\n\t" "vfmadb %%v0,%%v27,%%v23,%%v0\n\t" "vfmadb %%v1,%%v31,%%v23,%%v1\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "vst %%v1,16(%%r1,%[y])\n\t" + "vst %%v0,0(%%r1,%[y]),3\n\t" + "vst %%v1,16(%%r1,%[y]),3\n\t" "agfi %%r1,32\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) @@ -119,8 +119,8 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { } static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - __asm__("vl %%v16,0(%[x])\n\t" - "vl %%v17,16(%[x])\n\t" + __asm__("vl %%v16,0(%[x]),3\n\t" + "vl %%v17,16(%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v18,8(%[x]),0\n\t" "wflcdb %%v18,%%v18\n\t" @@ -142,8 +142,8 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "pfd 1,1024(%%r1,%[ap0])\n\t" "pfd 1,1024(%%r1,%[ap1])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vl %%v1,16(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y]),3\n\t" + "vl %%v1,16(%%r1,%[y]),3\n\t" "vlrepg %%v20,0(%%r1,%[ap0])\n\t" "vlrepg %%v21,8(%%r1,%[ap0])\n\t" "vlrepg %%v22,0(%%r1,%[ap1])\n\t" @@ -160,8 +160,8 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vfmadb %%v1,%%v26,%%v17,%%v1\n\t" "vfmadb %%v0,%%v23,%%v19,%%v0\n\t" "vfmadb %%v1,%%v27,%%v19,%%v1\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "vst %%v1,16(%%r1,%[y])\n\t" + "vst %%v0,0(%%r1,%[y]),3\n\t" + "vst %%v1,16(%%r1,%[y]),3\n\t" "agfi %%r1,32\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) @@ -173,7 +173,7 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { } static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { - __asm__("vl %%v16,0(%[x])\n\t" + __asm__("vl %%v16,0(%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v17,8(%[x]),0\n\t" "wflcdb %%v17,%%v17\n\t" @@ -188,8 +188,8 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 1,1024(%%r1,%[ap])\n\t" "pfd 
2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vl %%v1,16(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y]),3\n\t" + "vl %%v1,16(%%r1,%[y]),3\n\t" "vlrepg %%v18,0(%%r1,%[ap])\n\t" "vlrepg %%v19,8(%%r1,%[ap])\n\t" "vlrepg %%v20,16(%%r1,%[ap])\n\t" @@ -198,8 +198,8 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { "vfmadb %%v1,%%v20,%%v16,%%v1\n\t" "vfmadb %%v0,%%v19,%%v17,%%v0\n\t" "vfmadb %%v1,%%v21,%%v17,%%v1\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "vst %%v1,16(%%r1,%[y])\n\t" + "vst %%v0,0(%%r1,%[y]),3\n\t" + "vst %%v1,16(%%r1,%[y]),3\n\t" "agfi %%r1,32\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) @@ -227,14 +227,14 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, "0:\n\t" "pfd 1,1024(%%r1,%[src])\n\t" "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v18,32(%%r1,%[src])\n\t" - "vl %%v19,48(%%r1,%[src])\n\t" - "vl %%v20,0(%%r1,%[dest])\n\t" - "vl %%v21,16(%%r1,%[dest])\n\t" - "vl %%v22,32(%%r1,%[dest])\n\t" - "vl %%v23,48(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src]),3\n\t" + "vl %%v17,16(%%r1,%[src]),3\n\t" + "vl %%v18,32(%%r1,%[src]),3\n\t" + "vl %%v19,48(%%r1,%[src]),3\n\t" + "vl %%v20,0(%%r1,%[dest]),3\n\t" + "vl %%v21,16(%%r1,%[dest]),3\n\t" + "vl %%v22,32(%%r1,%[dest]),3\n\t" + "vl %%v23,48(%%r1,%[dest]),3\n\t" "vpdi %%v24,%%v16,%%v16,4\n\t" "vpdi %%v25,%%v17,%%v17,4\n\t" "vpdi %%v26,%%v18,%%v18,4\n\t" @@ -247,10 +247,10 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, "vfmadb %%v29,%%v25,%%v1,%%v29\n\t" "vfmadb %%v30,%%v26,%%v1,%%v30\n\t" "vfmadb %%v31,%%v27,%%v1,%%v31\n\t" - "vst %%v28,0(%%r1,%[dest])\n\t" - "vst %%v29,16(%%r1,%[dest])\n\t" - "vst %%v30,32(%%r1,%[dest])\n\t" - "vst %%v31,48(%%r1,%[dest])\n\t" + "vst %%v28,0(%%r1,%[dest]),3\n\t" + "vst %%v29,16(%%r1,%[dest]),3\n\t" + "vst %%v30,32(%%r1,%[dest]),3\n\t" + "vst %%v31,48(%%r1,%[dest]),3\n\t" "agfi %%r1,64\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) dest),[n] "+&r"(n) diff --git a/kernel/zarch/zgemv_t_4.c b/kernel/zarch/zgemv_t_4.c index 7b3e6c1fc..429824dcf 100644 --- a/kernel/zarch/zgemv_t_4.c +++ b/kernel/zarch/zgemv_t_4.c @@ -47,7 +47,7 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "pfd 1,1024(%%r1,%[ap2])\n\t" "pfd 1,1024(%%r1,%[ap3])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x])\n\t" + "vl %%v0,0(%%r1,%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,8(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -73,7 +73,7 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v22,%%v29,%%v1,%%v22\n\t" "vfmadb %%v19,%%v30,%%v0,%%v19\n\t" "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" - "vl %%v0,16(%%r1,%[x])\n\t" + "vl %%v0,16(%%r1,%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,24(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -120,10 +120,10 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vleg %%v24,0(%[alpha]),0\n\t" "vlrepg %%v25,8(%[alpha])\n\t" #endif - "vl %%v26,0(%[y])\n\t" - "vl %%v27,16(%[y])\n\t" - "vl %%v28,32(%[y])\n\t" - "vl %%v29,48(%[y])\n\t" + "vl %%v26,0(%[y]),3\n\t" + "vl %%v27,16(%[y]),3\n\t" + "vl %%v28,32(%[y]),3\n\t" + "vl %%v29,48(%[y]),3\n\t" "vfmadb %%v26,%%v16,%%v24,%%v26\n\t" "vfmadb %%v26,%%v20,%%v25,%%v26\n\t" "vfmadb %%v27,%%v17,%%v24,%%v27\n\t" @@ -132,10 +132,10 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb 
%%v28,%%v22,%%v25,%%v28\n\t" "vfmadb %%v29,%%v19,%%v24,%%v29\n\t" "vfmadb %%v29,%%v23,%%v25,%%v29\n\t" - "vst %%v26,0(%[y])\n\t" - "vst %%v27,16(%[y])\n\t" - "vst %%v28,32(%[y])\n\t" - "vst %%v29,48(%[y])" + "vst %%v26,0(%[y]),3\n\t" + "vst %%v27,16(%[y]),3\n\t" + "vst %%v28,32(%[y]),3\n\t" + "vst %%v29,48(%[y]),3" : "+m"(*(FLOAT (*)[8]) y),[n] "+&r"(n) : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), @@ -160,7 +160,7 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "pfd 1,1024(%%r1,%[ap0])\n\t" "pfd 1,1024(%%r1,%[ap1])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x])\n\t" + "vl %%v0,0(%%r1,%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,8(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -178,7 +178,7 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v18,%%v21,%%v1,%%v18\n\t" "vfmadb %%v17,%%v22,%%v0,%%v17\n\t" "vfmadb %%v19,%%v23,%%v1,%%v19\n\t" - "vl %%v0,16(%%r1,%[x])\n\t" + "vl %%v0,16(%%r1,%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,24(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -213,14 +213,14 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vleg %%v20,0(%[alpha]),0\n\t" "vlrepg %%v21,8(%[alpha])\n\t" #endif - "vl %%v22,0(%[y])\n\t" - "vl %%v23,16(%[y])\n\t" + "vl %%v22,0(%[y]),3\n\t" + "vl %%v23,16(%[y]),3\n\t" "vfmadb %%v22,%%v16,%%v20,%%v22\n\t" "vfmadb %%v22,%%v18,%%v21,%%v22\n\t" "vfmadb %%v23,%%v17,%%v20,%%v23\n\t" "vfmadb %%v23,%%v19,%%v21,%%v23\n\t" - "vst %%v22,0(%[y])\n\t" - "vst %%v23,16(%[y])\n\t" + "vst %%v22,0(%[y]),3\n\t" + "vst %%v23,16(%[y]),3\n\t" : "+m"(*(FLOAT (*)[4]) y),[n] "+&r"(n) : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), @@ -239,7 +239,7 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "0:\n\t" "pfd 1,1024(%%r1,%[ap])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x])\n\t" + "vl %%v0,0(%%r1,%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,8(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -253,7 +253,7 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "vlrepg %%v19,8(%%r1,%[ap])\n\t" "vfmadb %%v16,%%v18,%%v0,%%v16\n\t" "vfmadb %%v17,%%v19,%%v1,%%v17\n\t" - "vl %%v0,16(%%r1,%[x])\n\t" + "vl %%v0,16(%%r1,%[x]),3\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,24(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -282,10 +282,10 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "vleg %%v18,0(%[alpha]),0\n\t" "vlrepg %%v19,8(%[alpha])\n\t" #endif - "vl %%v0,0(%[y])\n\t" + "vl %%v0,0(%[y]),3\n\t" "vfmadb %%v0,%%v16,%%v18,%%v0\n\t" "vfmadb %%v0,%%v17,%%v19,%%v0\n\t" - "vst %%v0,0(%[y])\n\t" + "vst %%v0,0(%[y]),3\n\t" : "+m"(*(FLOAT (*)[2]) y),[n] "+&r"(n) : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), diff --git a/kernel/zarch/zrot.c b/kernel/zarch/zrot.c index aa7f16605..ea81e4741 100644 --- a/kernel/zarch/zrot.c +++ b/kernel/zarch/zrot.c @@ -35,14 +35,14 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x])\n\t" - "vl %%v25, 16(%%r1,%[x])\n\t" - "vl 
%%v26, 32(%%r1,%[x])\n\t" - "vl %%v27, 48(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[y])\n\t" - "vl %%v17, 16(%%r1,%[y])\n\t" - "vl %%v18, 32(%%r1,%[y])\n\t" - "vl %%v19, 48(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x]),3\n\t" + "vl %%v25, 16(%%r1,%[x]),3\n\t" + "vl %%v26, 32(%%r1,%[x]),3\n\t" + "vl %%v27, 48(%%r1,%[x]),3\n\t" + "vl %%v16, 0(%%r1,%[y]),3\n\t" + "vl %%v17, 16(%%r1,%[y]),3\n\t" + "vl %%v18, 32(%%r1,%[y]),3\n\t" + "vl %%v19, 48(%%r1,%[y]),3\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -60,22 +60,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x])\n\t" - "vst %%v29, 16(%%r1,%[x])\n\t" - "vst %%v30, 32(%%r1,%[x])\n\t" - "vst %%v31, 48(%%r1,%[x])\n\t" - "vst %%v20, 0(%%r1,%[y])\n\t" - "vst %%v21, 16(%%r1,%[y])\n\t" - "vst %%v22, 32(%%r1,%[y])\n\t" - "vst %%v23, 48(%%r1,%[y])\n\t" - "vl %%v24, 64(%%r1,%[x])\n\t" - "vl %%v25, 80(%%r1,%[x])\n\t" - "vl %%v26, 96(%%r1,%[x])\n\t" - "vl %%v27, 112(%%r1,%[x])\n\t" - "vl %%v16, 64(%%r1,%[y])\n\t" - "vl %%v17, 80(%%r1,%[y])\n\t" - "vl %%v18, 96(%%r1,%[y])\n\t" - "vl %%v19, 112(%%r1,%[y])\n\t" + "vst %%v28, 0(%%r1,%[x]),3\n\t" + "vst %%v29, 16(%%r1,%[x]),3\n\t" + "vst %%v30, 32(%%r1,%[x]),3\n\t" + "vst %%v31, 48(%%r1,%[x]),3\n\t" + "vst %%v20, 0(%%r1,%[y]),3\n\t" + "vst %%v21, 16(%%r1,%[y]),3\n\t" + "vst %%v22, 32(%%r1,%[y]),3\n\t" + "vst %%v23, 48(%%r1,%[y]),3\n\t" + "vl %%v24, 64(%%r1,%[x]),3\n\t" + "vl %%v25, 80(%%r1,%[x]),3\n\t" + "vl %%v26, 96(%%r1,%[x]),3\n\t" + "vl %%v27, 112(%%r1,%[x]),3\n\t" + "vl %%v16, 64(%%r1,%[y]),3\n\t" + "vl %%v17, 80(%%r1,%[y]),3\n\t" + "vl %%v18, 96(%%r1,%[y]),3\n\t" + "vl %%v19, 112(%%r1,%[y]),3\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -93,22 +93,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x])\n\t" - "vst %%v29, 80(%%r1,%[x])\n\t" - "vst %%v30, 96(%%r1,%[x])\n\t" - "vst %%v31, 112(%%r1,%[x])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v16, 128(%%r1,%[y])\n\t" - "vl %%v17, 144(%%r1,%[y])\n\t" - "vl %%v18, 160(%%r1,%[y])\n\t" - "vl %%v19, 176(%%r1,%[y])\n\t" + "vst %%v28, 64(%%r1,%[x]),3\n\t" + "vst %%v29, 80(%%r1,%[x]),3\n\t" + "vst %%v30, 96(%%r1,%[x]),3\n\t" + "vst %%v31, 112(%%r1,%[x]),3\n\t" + "vst %%v20, 64(%%r1,%[y]),3\n\t" + "vst %%v21, 80(%%r1,%[y]),3\n\t" + "vst %%v22, 96(%%r1,%[y]),3\n\t" + "vst %%v23, 112(%%r1,%[y]),3\n\t" + "vl %%v24, 128(%%r1,%[x]),3\n\t" + "vl %%v25, 144(%%r1,%[x]),3\n\t" + "vl %%v26, 160(%%r1,%[x]),3\n\t" + "vl %%v27, 176(%%r1,%[x]),3\n\t" + "vl %%v16, 128(%%r1,%[y]),3\n\t" + "vl %%v17, 144(%%r1,%[y]),3\n\t" + "vl %%v18, 160(%%r1,%[y]),3\n\t" + "vl %%v19, 176(%%r1,%[y]),3\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -126,22 +126,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn 
*/ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x])\n\t" - "vst %%v29, 144(%%r1,%[x])\n\t" - "vst %%v30, 160(%%r1,%[x])\n\t" - "vst %%v31, 176(%%r1,%[x])\n\t" - "vst %%v20, 128(%%r1,%[y])\n\t" - "vst %%v21, 144(%%r1,%[y])\n\t" - "vst %%v22, 160(%%r1,%[y])\n\t" - "vst %%v23, 176(%%r1,%[y])\n\t" - "vl %%v24, 192(%%r1,%[x])\n\t" - "vl %%v25, 208(%%r1,%[x])\n\t" - "vl %%v26, 224(%%r1,%[x])\n\t" - "vl %%v27, 240(%%r1,%[x])\n\t" - "vl %%v16, 192(%%r1,%[y])\n\t" - "vl %%v17, 208(%%r1,%[y])\n\t" - "vl %%v18, 224(%%r1,%[y])\n\t" - "vl %%v19, 240(%%r1,%[y])\n\t" + "vst %%v28, 128(%%r1,%[x]),3\n\t" + "vst %%v29, 144(%%r1,%[x]),3\n\t" + "vst %%v30, 160(%%r1,%[x]),3\n\t" + "vst %%v31, 176(%%r1,%[x]),3\n\t" + "vst %%v20, 128(%%r1,%[y]),3\n\t" + "vst %%v21, 144(%%r1,%[y]),3\n\t" + "vst %%v22, 160(%%r1,%[y]),3\n\t" + "vst %%v23, 176(%%r1,%[y]),3\n\t" + "vl %%v24, 192(%%r1,%[x]),3\n\t" + "vl %%v25, 208(%%r1,%[x]),3\n\t" + "vl %%v26, 224(%%r1,%[x]),3\n\t" + "vl %%v27, 240(%%r1,%[x]),3\n\t" + "vl %%v16, 192(%%r1,%[y]),3\n\t" + "vl %%v17, 208(%%r1,%[y]),3\n\t" + "vl %%v18, 224(%%r1,%[y]),3\n\t" + "vl %%v19, 240(%%r1,%[y]),3\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -159,14 +159,14 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 192(%%r1,%[x])\n\t" - "vst %%v29, 208(%%r1,%[x])\n\t" - "vst %%v30, 224(%%r1,%[x])\n\t" - "vst %%v31, 240(%%r1,%[x])\n\t" - "vst %%v20, 192(%%r1,%[y])\n\t" - "vst %%v21, 208(%%r1,%[y])\n\t" - "vst %%v22, 224(%%r1,%[y])\n\t" - "vst %%v23, 240(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[x]),3\n\t" + "vst %%v29, 208(%%r1,%[x]),3\n\t" + "vst %%v30, 224(%%r1,%[x]),3\n\t" + "vst %%v31, 240(%%r1,%[x]),3\n\t" + "vst %%v20, 192(%%r1,%[y]),3\n\t" + "vst %%v21, 208(%%r1,%[y]),3\n\t" + "vst %%v22, 224(%%r1,%[y]),3\n\t" + "vst %%v23, 240(%%r1,%[y]),3\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index fbcc0c5b9..7fd62a1ac 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -36,14 +36,14 @@ static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vpdi %%v24,%%v16,%%v16,4\n\t" "vpdi %%v25,%%v17,%%v17,4\n\t" "vpdi %%v26,%%v18,%%v18,4\n\t" @@ -68,14 +68,14 @@ static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vfmadb %%v21,%%v29,%%v1,%%v21\n\t" "vfmadb %%v22,%%v30,%%v1,%%v22\n\t" "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" + "vst 
%%v16,0(%%r1,%[x]),3\n\t" + "vst %%v17,16(%%r1,%[x]),3\n\t" + "vst %%v18,32(%%r1,%[x]),3\n\t" + "vst %%v19,48(%%r1,%[x]),3\n\t" + "vst %%v20,64(%%r1,%[x]),3\n\t" + "vst %%v21,80(%%r1,%[x]),3\n\t" + "vst %%v22,96(%%r1,%[x]),3\n\t" + "vst %%v23,112(%%r1,%[x]),3\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) @@ -93,14 +93,14 @@ static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vpdi %%v16,%%v16,%%v16,4\n\t" "vpdi %%v17,%%v17,%%v17,4\n\t" "vpdi %%v18,%%v18,%%v18,4\n\t" @@ -117,14 +117,14 @@ static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vfmdb %%v21,%%v21,%%v0\n\t" "vfmdb %%v22,%%v22,%%v0\n\t" "vfmdb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" + "vst %%v16,0(%%r1,%[x]),3\n\t" + "vst %%v17,16(%%r1,%[x]),3\n\t" + "vst %%v18,32(%%r1,%[x]),3\n\t" + "vst %%v19,48(%%r1,%[x]),3\n\t" + "vst %%v20,64(%%r1,%[x]),3\n\t" + "vst %%v21,80(%%r1,%[x]),3\n\t" + "vst %%v22,96(%%r1,%[x]),3\n\t" + "vst %%v23,112(%%r1,%[x]),3\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) @@ -139,14 +139,14 @@ static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x]),3\n\t" + "vl %%v17,16(%%r1,%[x]),3\n\t" + "vl %%v18,32(%%r1,%[x]),3\n\t" + "vl %%v19,48(%%r1,%[x]),3\n\t" + "vl %%v20,64(%%r1,%[x]),3\n\t" + "vl %%v21,80(%%r1,%[x]),3\n\t" + "vl %%v22,96(%%r1,%[x]),3\n\t" + "vl %%v23,112(%%r1,%[x]),3\n\t" "vfmdb %%v16,%%v16,%%v0\n\t" "vfmdb %%v17,%%v17,%%v0\n\t" "vfmdb %%v18,%%v18,%%v0\n\t" @@ -155,14 +155,14 @@ static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vfmdb %%v21,%%v21,%%v0\n\t" "vfmdb %%v22,%%v22,%%v0\n\t" "vfmdb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" + "vst %%v16,0(%%r1,%[x]),3\n\t" + "vst %%v17,16(%%r1,%[x]),3\n\t" + "vst %%v18,32(%%r1,%[x]),3\n\t" + "vst %%v19,48(%%r1,%[x]),3\n\t" + "vst %%v20,64(%%r1,%[x]),3\n\t" + "vst %%v21,80(%%r1,%[x]),3\n\t" + "vst %%v22,96(%%r1,%[x]),3\n\t" + "vst %%v23,112(%%r1,%[x]),3\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) @@ -177,14 +177,14 @@ static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 
1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x])\n\t" - "vst %%v0,16(%%r1,%[x])\n\t" - "vst %%v0,32(%%r1,%[x])\n\t" - "vst %%v0,48(%%r1,%[x])\n\t" - "vst %%v0,64(%%r1,%[x])\n\t" - "vst %%v0,80(%%r1,%[x])\n\t" - "vst %%v0,96(%%r1,%[x])\n\t" - "vst %%v0,112(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x]),3\n\t" + "vst %%v0,16(%%r1,%[x]),3\n\t" + "vst %%v0,32(%%r1,%[x]),3\n\t" + "vst %%v0,48(%%r1,%[x]),3\n\t" + "vst %%v0,64(%%r1,%[x]),3\n\t" + "vst %%v0,80(%%r1,%[x]),3\n\t" + "vst %%v0,96(%%r1,%[x]),3\n\t" + "vst %%v0,112(%%r1,%[x]),3\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) diff --git a/kernel/zarch/zswap.c b/kernel/zarch/zswap.c index 0f38103be..0252ab8db 100644 --- a/kernel/zarch/zswap.c +++ b/kernel/zarch/zswap.c @@ -33,70 +33,70 @@ static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v28, 192(%%r1,%[x])\n\t" - "vl %%v29, 208(%%r1,%[x])\n\t" - "vl %%v30, 224(%%r1,%[x])\n\t" - "vl %%v31, 240(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "vl %%v4, 64(%%r1,%[y])\n\t" - "vl %%v5, 80(%%r1,%[y])\n\t" - "vl %%v6, 96(%%r1,%[y])\n\t" - "vl %%v7, 112(%%r1,%[y])\n\t" - "vst %%v0, 0(%%r1,%[x])\n\t" - "vst %%v1, 16(%%r1,%[x])\n\t" - "vst %%v2, 32(%%r1,%[x])\n\t" - "vst %%v3, 48(%%r1,%[x])\n\t" - "vst %%v4, 64(%%r1,%[x])\n\t" - "vst %%v5, 80(%%r1,%[x])\n\t" - "vst %%v6, 96(%%r1,%[x])\n\t" - "vst %%v7, 112(%%r1,%[x])\n\t" - "vl %%v0, 128(%%r1,%[y])\n\t" - "vl %%v1, 144(%%r1,%[y])\n\t" - "vl %%v2, 160(%%r1,%[y])\n\t" - "vl %%v3, 176(%%r1,%[y])\n\t" - "vl %%v4, 192(%%r1,%[y])\n\t" - "vl %%v5, 208(%%r1,%[y])\n\t" - "vl %%v6, 224(%%r1,%[y])\n\t" - "vl %%v7, 240(%%r1,%[y])\n\t" - "vst %%v0, 128(%%r1,%[x])\n\t" - "vst %%v1, 144(%%r1,%[x])\n\t" - "vst %%v2, 160(%%r1,%[x])\n\t" - "vst %%v3, 176(%%r1,%[x])\n\t" - "vst %%v4, 192(%%r1,%[x])\n\t" - "vst %%v5, 208(%%r1,%[x])\n\t" - "vst %%v6, 224(%%r1,%[x])\n\t" - "vst %%v7, 240(%%r1,%[x])\n\t" - "vst %%v16, 0(%%r1,%[y])\n\t" - "vst %%v17, 16(%%r1,%[y])\n\t" - "vst %%v18, 32(%%r1,%[y])\n\t" - "vst %%v19, 48(%%r1,%[y])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vst %%v24, 128(%%r1,%[y])\n\t" - "vst %%v25, 144(%%r1,%[y])\n\t" - "vst %%v26, 160(%%r1,%[y])\n\t" - "vst %%v27, 176(%%r1,%[y])\n\t" - "vst %%v28, 192(%%r1,%[y])\n\t" - "vst %%v29, 208(%%r1,%[y])\n\t" - "vst %%v30, 224(%%r1,%[y])\n\t" - "vst %%v31, 240(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x]),3\n\t" + "vl %%v17, 16(%%r1,%[x]),3\n\t" + "vl %%v18, 32(%%r1,%[x]),3\n\t" + "vl %%v19, 48(%%r1,%[x]),3\n\t" + "vl %%v20, 64(%%r1,%[x]),3\n\t" + "vl %%v21, 80(%%r1,%[x]),3\n\t" + "vl %%v22, 96(%%r1,%[x]),3\n\t" + "vl %%v23, 112(%%r1,%[x]),3\n\t" + "vl %%v24, 128(%%r1,%[x]),3\n\t" + "vl %%v25, 144(%%r1,%[x]),3\n\t" + "vl %%v26, 160(%%r1,%[x]),3\n\t" + "vl %%v27, 176(%%r1,%[x]),3\n\t" + "vl %%v28, 192(%%r1,%[x]),3\n\t" + "vl %%v29, 208(%%r1,%[x]),3\n\t" + "vl %%v30, 224(%%r1,%[x]),3\n\t" + "vl %%v31, 240(%%r1,%[x]),3\n\t" + "vl %%v0, 
0(%%r1,%[y]),3\n\t" + "vl %%v1, 16(%%r1,%[y]),3\n\t" + "vl %%v2, 32(%%r1,%[y]),3\n\t" + "vl %%v3, 48(%%r1,%[y]),3\n\t" + "vl %%v4, 64(%%r1,%[y]),3\n\t" + "vl %%v5, 80(%%r1,%[y]),3\n\t" + "vl %%v6, 96(%%r1,%[y]),3\n\t" + "vl %%v7, 112(%%r1,%[y]),3\n\t" + "vst %%v0, 0(%%r1,%[x]),3\n\t" + "vst %%v1, 16(%%r1,%[x]),3\n\t" + "vst %%v2, 32(%%r1,%[x]),3\n\t" + "vst %%v3, 48(%%r1,%[x]),3\n\t" + "vst %%v4, 64(%%r1,%[x]),3\n\t" + "vst %%v5, 80(%%r1,%[x]),3\n\t" + "vst %%v6, 96(%%r1,%[x]),3\n\t" + "vst %%v7, 112(%%r1,%[x]),3\n\t" + "vl %%v0, 128(%%r1,%[y]),3\n\t" + "vl %%v1, 144(%%r1,%[y]),3\n\t" + "vl %%v2, 160(%%r1,%[y]),3\n\t" + "vl %%v3, 176(%%r1,%[y]),3\n\t" + "vl %%v4, 192(%%r1,%[y]),3\n\t" + "vl %%v5, 208(%%r1,%[y]),3\n\t" + "vl %%v6, 224(%%r1,%[y]),3\n\t" + "vl %%v7, 240(%%r1,%[y]),3\n\t" + "vst %%v0, 128(%%r1,%[x]),3\n\t" + "vst %%v1, 144(%%r1,%[x]),3\n\t" + "vst %%v2, 160(%%r1,%[x]),3\n\t" + "vst %%v3, 176(%%r1,%[x]),3\n\t" + "vst %%v4, 192(%%r1,%[x]),3\n\t" + "vst %%v5, 208(%%r1,%[x]),3\n\t" + "vst %%v6, 224(%%r1,%[x]),3\n\t" + "vst %%v7, 240(%%r1,%[x]),3\n\t" + "vst %%v16, 0(%%r1,%[y]),3\n\t" + "vst %%v17, 16(%%r1,%[y]),3\n\t" + "vst %%v18, 32(%%r1,%[y]),3\n\t" + "vst %%v19, 48(%%r1,%[y]),3\n\t" + "vst %%v20, 64(%%r1,%[y]),3\n\t" + "vst %%v21, 80(%%r1,%[y]),3\n\t" + "vst %%v22, 96(%%r1,%[y]),3\n\t" + "vst %%v23, 112(%%r1,%[y]),3\n\t" + "vst %%v24, 128(%%r1,%[y]),3\n\t" + "vst %%v25, 144(%%r1,%[y]),3\n\t" + "vst %%v26, 160(%%r1,%[y]),3\n\t" + "vst %%v27, 176(%%r1,%[y]),3\n\t" + "vst %%v28, 192(%%r1,%[y]),3\n\t" + "vst %%v29, 208(%%r1,%[y]),3\n\t" + "vst %%v30, 224(%%r1,%[y]),3\n\t" + "vst %%v31, 240(%%r1,%[y]),3\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) From 1391fc46d2c38bb74ed69b7a527ab8865161c915 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Feb 2019 19:29:33 +0100 Subject: [PATCH 069/133] fix second instance of complex.h for c++ as well --- lapack-netlib/LAPACKE/include/lapacke.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h index 6ded78c8b..c5ea465e0 100644 --- a/lapack-netlib/LAPACKE/include/lapacke.h +++ b/lapack-netlib/LAPACKE/include/lapacke.h @@ -70,7 +70,11 @@ /* Complex type (single precision) */ #ifndef lapack_complex_float +#ifndef __cplusplus #include <complex.h> +#else +#include <complex> +#endif #define lapack_complex_float float _Complex #endif @@ -86,7 +90,11 @@ lapack_complex_float lapack_make_complex_float( float re, float im ); /* Complex type (double precision) */ #ifndef lapack_complex_double +#ifndef __cplusplus #include <complex.h> +#else +#include <complex> +#endif #define lapack_complex_double double _Complex #endif
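The guard above lets lapacke.h be pulled in from both C and C++ translation units: C sources keep the C99 complex header, C++ sources get the C++ one. A minimal standalone sketch of the same pattern, for experimentation only (probe.c is a hypothetical file name, not part of the patch):

    /* probe.c: build once as C and once as C++, e.g. "cc -c probe.c" and
       "c++ -x c++ -c probe.c", to exercise both branches of the guard. */
    #ifndef __cplusplus
    #include <complex.h>   /* C99 branch: backs the float _Complex / double _Complex types */
    #else
    #include <complex>     /* C++ branch: complex.h is not guaranteed to be usable from C++ */
    #endif
    int main(void) { return 0; }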
From d70ae3ab433bda46708f02bf74c03c861bfb546f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Feb 2019 20:06:34 +0100 Subject: [PATCH 070/133] Make c_check robust against old or incomplete perl installations by catching and working around failures to load modules, and avoiding object-oriented syntax in tempfile creation. Fixes #1989 --- c_check | 85 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 56 insertions(+), 29 deletions(-) diff --git a/c_check b/c_check index 9dc237beb..38f9170ca 100644 --- a/c_check +++ b/c_check @@ -1,7 +1,7 @@ #!/usr/bin/perl -use File::Basename; -use File::Temp qw(tempfile); +#use File::Basename; +# use File::Temp qw(tempfile); # Checking cross compile $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); @@ -12,7 +12,7 @@ $hostarch = "arm64" if ($hostarch eq "aarch64"); $hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/); $hostarch = "zarch" if ($hostarch eq "s390x"); -$tmpf = new File::Temp( UNLINK => 1 ); +#$tmpf = new File::Temp( UNLINK => 1 ); $binary = $ENV{"BINARY"}; $makefile = shift(@ARGV); @@ -31,12 +31,25 @@ if ($?) { $cross_suffix = ""; -if (dirname($compiler_name) ne ".") { - $cross_suffix .= dirname($compiler_name) . "/"; -} +eval "use File::Basename"; +if ($@){ + warn "could not load PERL module File::Basename, emulating its functionality"; + my $dirnam = rindex($compiler_name, "/") < 0 ? "." : substr($compiler_name, 0, rindex($compiler_name, "/")); + if ($dirnam ne ".") { + $cross_suffix .= $dirnam . "/"; + } + my $basnam = substr($compiler_name, rindex($compiler_name,"/")+1, length($compiler_name)-rindex($compiler_name,"/")-1); + if ($basnam =~ /([^\s]*-)(.*)/) { + $cross_suffix .= $1; + } +} else { + if (dirname($compiler_name) ne ".") { + $cross_suffix .= dirname($compiler_name) . "/"; + } -if (basename($compiler_name) =~ /([^\s]*-)(.*)/) { - $cross_suffix .= $1; + if (basename($compiler_name) =~ /([^\s]*-)(.*)/) { + $cross_suffix .= $1; + } } $compiler = ""; @@ -171,20 +184,26 @@ if ($?) { $have_msa = 0; if (($architecture eq "mips") || ($architecture eq "mips64")) { - $code = '"addvi.b $w0, $w1, 1"'; - $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; - print $tmpf "#include <msa.h>\n\n"; - print $tmpf "void main(void){ __asm__ volatile($code); }\n"; - - $args = "$msa_flags -o $tmpf.o -x c $tmpf"; - my @cmd = ("$compiler_name $args"); - system(@cmd) == 0; - if ($? != 0) { - $have_msa = 0; + eval "use File::Temp qw(tempfile)"; + if ($@){ + warn "could not load PERL module File::Temp, so could not check MSA compatibility"; } else { - $have_msa = 1; + $tmpf = new File::Temp( UNLINK => 1 ); + $code = '"addvi.b $w0, $w1, 1"'; + $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; + print $tmpf "#include <msa.h>\n\n"; + print $tmpf "void main(void){ __asm__ volatile($code); }\n"; + + $args = "$msa_flags -o $tmpf.o -x c $tmpf"; + my @cmd = ("$compiler_name $args"); + system(@cmd) == 0; + if ($? != 0) { + $have_msa = 0; + } else { + $have_msa = 1; + } + unlink("$tmpf.o"); } - unlink("$tmpf.o"); } $architecture = x86 if ($data =~ /ARCH_X86/); @@ -204,17 +223,25 @@ $binformat = bin64 if ($data =~ /BINARY_64/); $no_avx512= 0; if (($architecture eq "x86") || ($architecture eq "x86_64")) { - $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; - print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n"; - $args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf"; - my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); - system(@cmd) == 0; - if ($? != 0) { $no_avx512 = 1; } else { + eval "use File::Temp qw(tempfile)"; + if ($@){ + warn "could not load PERL module File::Temp, so could not check compiler compatibility with AVX512"; $no_avx512 = 0; + } else { +# $tmpf = new File::Temp( UNLINK => 1 ); + ($fh,$tmpf) = tempfile( UNLINK => 1 ); + $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; + print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n"; + $args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf"; + my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); + system(@cmd) == 0; + if ($? != 0) { + $no_avx512 = 1; + } else { + $no_avx512 = 0; + } + unlink("$tmpf.o"); } } $data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;
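For reference, the two throwaway probe sources that c_check writes through the tempfile expand to the following C once Perl interpolates $code; the <msa.h> and <immintrin.h> header names are an assumption inferred from the compile flags, and "void main" in the MSA probe mirrors the script verbatim:

    /* MSA probe, compiled with -mmsa -mfp64 -msched-weight -mload-store-pairs */
    #include <msa.h>
    void main(void){ __asm__ volatile("addvi.b $w0, $w1, 1"); }

    /* AVX-512 probe, compiled with -march=skylake-avx512 */
    #include <immintrin.h>
    int main(void){ __asm__ volatile("vbroadcastss -4 * 4(%rsi), %zmm2"); }

A probe counts as successful when the compile exits with status 0; the generated .o file is unlinked afterwards.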
From 5952e586ceaa7ea68376f1580c6c96edca55804b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Feb 2019 23:51:40 +0100 Subject: [PATCH 071/133] Support DYNAMIC_LIST option in cmake, e.g. cmake -DDYNAMIC_ARCH=1 -DDYNAMIC_LIST="NEHALEM;HASWELL;ZEN"; original issue was #1639 --- cmake/arch.cmake | 3 +++ cmake/system.cmake | 7 +++++++ 2 files changed, 10 insertions(+) diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 63fb86fa2..470ea2a8f 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -74,6 +74,9 @@ if (DYNAMIC_ARCH) if (NOT NO_AVX512) set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) endif () + if (DYNAMIC_LIST) + set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST}) + endif () endif () if (NOT DYNAMIC_CORE) diff --git a/cmake/system.cmake b/cmake/system.cmake index 4cee7bd18..7fda2adb9 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -187,6 +187,13 @@ if (DYNAMIC_ARCH) endif () endif () +if (DYNAMIC_LIST) + set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_LIST") + foreach(DCORE ${DYNAMIC_LIST}) + set(CCOMMON_OPT "${CCOMMON_OPT} -DDYN_${DCORE}") + endforeach () +endif () + if (NO_LAPACK) set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACK") #Disable LAPACK C interface From 70397701652743587a88b20837c3b6e2c1da74f0 Mon Sep 17 00:00:00 2001 From: maamountki Date: Wed, 6 Feb 2019 20:11:44 +0200 Subject: [PATCH 072/133] [ZARCH] Undo the last commit --- kernel/zarch/damax.c | 34 ++--- kernel/zarch/damax_z13.c | 34 ++--- kernel/zarch/damin.c | 34 ++--- kernel/zarch/damin_z13.c | 34 ++--- kernel/zarch/dasum.c | 32 ++--- kernel/zarch/daxpy.c | 96 +++++++-------- kernel/zarch/ddot.c | 32 ++--- kernel/zarch/dgemv_n_4.c | 260 +++++++++++++++++++-------------------- kernel/zarch/dgemv_t_4.c | 260 +++++++++++++++++++-------------------- kernel/zarch/dmax.c | 34 ++--- kernel/zarch/dmax_z13.c | 34 ++--- kernel/zarch/dmin.c | 34 ++--- kernel/zarch/dmin_z13.c | 34 ++--- kernel/zarch/drot.c | 128 +++++++++---------- kernel/zarch/dscal.c | 48 ++++---- kernel/zarch/dswap.c | 128 +++++++++---------- kernel/zarch/idamax.c | 34 ++--- kernel/zarch/idamin.c | 34 ++--- kernel/zarch/idmax.c | 34 ++--- kernel/zarch/idmin.c | 34 ++--- kernel/zarch/zasum.c | 32 ++--- kernel/zarch/zaxpy.c | 48 ++++---- kernel/zarch/zdot.c | 32 ++--- kernel/zarch/zgemv_n_4.c | 62 +++++----- kernel/zarch/zgemv_t_4.c | 40 +++--- kernel/zarch/zrot.c | 128 +++++++++---------- kernel/zarch/zscal.c | 112 ++++++++--------- kernel/zarch/zswap.c | 128 +++++++++---------- 28 files changed, 987 insertions(+), 987 deletions(-) diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c index 2598145c3..37008f702 100644 --- a/kernel/zarch/damax.c +++ b/kernel/zarch/damax.c @@ -33,27 +33,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amax; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" - "vl %%v24,128(%%r1,%[x]),3\n\t" - "vl %%v25,144(%%r1,%[x]),3\n\t" - "vl %%v26,160(%%r1,%[x]),3\n\t" - "vl %%v27,176(%%r1,%[x]),3\n\t" - "vl %%v28,192(%%r1,%[x]),3\n\t" - "vl %%v29,208(%%r1,%[x]),3\n\t" - "vl %%v30,224(%%r1,%[x]),3\n\t" - "vl %%v31,240(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" "vfmaxdb %%v16,%%v16,%%v24,8\n\t" "vfmaxdb %%v17,%%v17,%%v25,8\n\t" "vfmaxdb %%v18,%%v18,%%v26,8\n\t" diff --git a/kernel/zarch/damax_z13.c b/kernel/zarch/damax_z13.c index f7e11c3ce..530d6e5bb 100644 --- a/kernel/zarch/damax_z13.c +++ b/kernel/zarch/damax_z13.c @@ -33,20 +33,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amax; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "vflpdb %%v0,%%v0\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -71,14 +71,14 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v30,%%v28,%%v29,%%v30\n\t" "vfchdb %%v31,%%v30,%%v0\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,192(%%r1,%[x]),3\n\t" - "vl %%v21,208(%%r1,%[x]),3\n\t" - "vl %%v22,224(%%r1,%[x]),3\n\t" - "vl %%v23,240(%%r1,%[x]),3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c index 25f018c66..a01791741 100644 --- a/kernel/zarch/damin.c +++ b/kernel/zarch/damin.c @@ -33,27 +33,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amin; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" - "vl %%v24,128(%%r1,%[x]),3\n\t" - "vl %%v25,144(%%r1,%[x]),3\n\t" - "vl %%v26,160(%%r1,%[x]),3\n\t" - "vl %%v27,176(%%r1,%[x]),3\n\t" - "vl %%v28,192(%%r1,%[x]),3\n\t" - "vl %%v29,208(%%r1,%[x]),3\n\t" - "vl %%v30,224(%%r1,%[x]),3\n\t" - "vl %%v31,240(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" "vfmindb %%v16,%%v16,%%v24,8\n\t" "vfmindb %%v17,%%v17,%%v25,8\n\t" "vfmindb %%v18,%%v18,%%v26,8\n\t" diff --git a/kernel/zarch/damin_z13.c b/kernel/zarch/damin_z13.c index 091aceb37..2172b6d6f 100644 --- a/kernel/zarch/damin_z13.c +++ b/kernel/zarch/damin_z13.c @@ -33,20 +33,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amin; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "vflpdb %%v0,%%v0\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -71,14 +71,14 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v30,%%v28,%%v29,%%v30\n\t" "vfchdb %%v31,%%v0,%%v30\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,192(%%r1,%[x]),3\n\t" - "vl %%v21,208(%%r1,%[x]),3\n\t" - "vl %%v22,224(%%r1,%[x]),3\n\t" - "vl %%v23,240(%%r1,%[x]),3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/dasum.c b/kernel/zarch/dasum.c index 641949963..9f69a9931 100644 --- a/kernel/zarch/dasum.c +++ b/kernel/zarch/dasum.c @@ -45,14 +45,14 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x]),3\n\t" - "vl %%v17, 16(%%r1,%[x]),3\n\t" - 
"vl %%v18, 32(%%r1,%[x]),3\n\t" - "vl %%v19, 48(%%r1,%[x]),3\n\t" - "vl %%v20, 64(%%r1,%[x]),3\n\t" - "vl %%v21, 80(%%r1,%[x]),3\n\t" - "vl %%v22, 96(%%r1,%[x]),3\n\t" - "vl %%v23, 112(%%r1,%[x]),3\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -69,14 +69,14 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { "vfadb %%v29,%%v29,%%v21\n\t" "vfadb %%v30,%%v30,%%v22\n\t" "vfadb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x]),3\n\t" - "vl %%v17, 144(%%r1,%[x]),3\n\t" - "vl %%v18, 160(%%r1,%[x]),3\n\t" - "vl %%v19, 176(%%r1,%[x]),3\n\t" - "vl %%v20, 192(%%r1,%[x]),3\n\t" - "vl %%v21, 208(%%r1,%[x]),3\n\t" - "vl %%v22, 224(%%r1,%[x]),3\n\t" - "vl %%v23, 240(%%r1,%[x]),3\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/daxpy.c b/kernel/zarch/daxpy.c index c02ad0aac..179ef8834 100644 --- a/kernel/zarch/daxpy.c +++ b/kernel/zarch/daxpy.c @@ -34,22 +34,22 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,0(%%r1,%[y]),3\n\t" - "vl %%v21,16(%%r1,%[y]),3\n\t" - "vl %%v22,32(%%r1,%[y]),3\n\t" - "vl %%v23,48(%%r1,%[y]),3\n\t" - "vl %%v24,64(%%r1,%[x]),3\n\t" - "vl %%v25,80(%%r1,%[x]),3\n\t" - "vl %%v26,96(%%r1,%[x]),3\n\t" - "vl %%v27,112(%%r1,%[x]),3\n\t" - "vl %%v28,64(%%r1,%[y]),3\n\t" - "vl %%v29,80(%%r1,%[y]),3\n\t" - "vl %%v30,96(%%r1,%[y]),3\n\t" - "vl %%v31,112(%%r1,%[y]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,0(%%r1,%[y])\n\t" + "vl %%v21,16(%%r1,%[y])\n\t" + "vl %%v22,32(%%r1,%[y])\n\t" + "vl %%v23,48(%%r1,%[y])\n\t" + "vl %%v24,64(%%r1,%[x])\n\t" + "vl %%v25,80(%%r1,%[x])\n\t" + "vl %%v26,96(%%r1,%[x])\n\t" + "vl %%v27,112(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" @@ -58,30 +58,30 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,0(%%r1,%[y]),3\n\t" - "vst %%v17,16(%%r1,%[y]),3\n\t" - "vst %%v18,32(%%r1,%[y]),3\n\t" - "vst %%v19,48(%%r1,%[y]),3\n\t" - "vst %%v24,64(%%r1,%[y]),3\n\t" - "vst %%v25,80(%%r1,%[y]),3\n\t" - "vst %%v26,96(%%r1,%[y]),3\n\t" - "vst %%v27,112(%%r1,%[y]),3\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,128(%%r1,%[y]),3\n\t" - "vl %%v21,144(%%r1,%[y]),3\n\t" - "vl %%v22,160(%%r1,%[y]),3\n\t" - "vl %%v23,176(%%r1,%[y]),3\n\t" - "vl %%v24,192(%%r1,%[x]),3\n\t" - "vl 
%%v25,208(%%r1,%[x]),3\n\t" - "vl %%v26,224(%%r1,%[x]),3\n\t" - "vl %%v27,240(%%r1,%[x]),3\n\t" - "vl %%v28,192(%%r1,%[y]),3\n\t" - "vl %%v29,208(%%r1,%[y]),3\n\t" - "vl %%v30,224(%%r1,%[y]),3\n\t" - "vl %%v31,240(%%r1,%[y]),3\n\t" + "vst %%v16,0(%%r1,%[y])\n\t" + "vst %%v17,16(%%r1,%[y])\n\t" + "vst %%v18,32(%%r1,%[y])\n\t" + "vst %%v19,48(%%r1,%[y])\n\t" + "vst %%v24,64(%%r1,%[y])\n\t" + "vst %%v25,80(%%r1,%[y])\n\t" + "vst %%v26,96(%%r1,%[y])\n\t" + "vst %%v27,112(%%r1,%[y])\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,128(%%r1,%[y])\n\t" + "vl %%v21,144(%%r1,%[y])\n\t" + "vl %%v22,160(%%r1,%[y])\n\t" + "vl %%v23,176(%%r1,%[y])\n\t" + "vl %%v24,192(%%r1,%[x])\n\t" + "vl %%v25,208(%%r1,%[x])\n\t" + "vl %%v26,224(%%r1,%[x])\n\t" + "vl %%v27,240(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[y])\n\t" + "vl %%v29,208(%%r1,%[y])\n\t" + "vl %%v30,224(%%r1,%[y])\n\t" + "vl %%v31,240(%%r1,%[y])\n\t" "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" @@ -90,14 +90,14 @@ static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,128(%%r1,%[y]),3\n\t" - "vst %%v17,144(%%r1,%[y]),3\n\t" - "vst %%v18,160(%%r1,%[y]),3\n\t" - "vst %%v19,176(%%r1,%[y]),3\n\t" - "vst %%v24,192(%%r1,%[y]),3\n\t" - "vst %%v25,208(%%r1,%[y]),3\n\t" - "vst %%v26,224(%%r1,%[y]),3\n\t" - "vst %%v27,240(%%r1,%[y]),3\n\t" + "vst %%v16,128(%%r1,%[y])\n\t" + "vst %%v17,144(%%r1,%[y])\n\t" + "vst %%v18,160(%%r1,%[y])\n\t" + "vst %%v19,176(%%r1,%[y])\n\t" + "vst %%v24,192(%%r1,%[y])\n\t" + "vst %%v25,208(%%r1,%[y])\n\t" + "vst %%v26,224(%%r1,%[y])\n\t" + "vst %%v27,240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) diff --git a/kernel/zarch/ddot.c b/kernel/zarch/ddot.c index 0dd8ed08a..f5f601717 100644 --- a/kernel/zarch/ddot.c +++ b/kernel/zarch/ddot.c @@ -43,22 +43,22 @@ static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 1,1024(%%r1,%[x])\n\t" "pfd 1,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" - "vl %%v24,0(%%r1,%[y]),3\n\t" - "vl %%v25,16(%%r1,%[y]),3\n\t" - "vl %%v26,32(%%r1,%[y]),3\n\t" - "vl %%v27,48(%%r1,%[y]),3\n\t" - "vl %%v28,64(%%r1,%[y]),3\n\t" - "vl %%v29,80(%%r1,%[y]),3\n\t" - "vl %%v30,96(%%r1,%[y]),3\n\t" - "vl %%v31,112(%%r1,%[y]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" diff --git a/kernel/zarch/dgemv_n_4.c b/kernel/zarch/dgemv_n_4.c index 87ed6ecd1..c93ff9b54 100644 --- a/kernel/zarch/dgemv_n_4.c +++ b/kernel/zarch/dgemv_n_4.c @@ -52,26 +52,26 @@ static void 
dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "pfd 1,1024(%%r1,%[ap2])\n\t" "pfd 1,1024(%%r1,%[ap3])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0]),3\n\t" - "vl %%v17,0(%%r1,%[ap1]),3\n\t" - "vl %%v18,0(%%r1,%[ap2]),3\n\t" - "vl %%v19,0(%%r1,%[ap3]),3\n\t" - "vl %%v20,16(%%r1,%[ap0]),3\n\t" - "vl %%v21,16(%%r1,%[ap1]),3\n\t" - "vl %%v22,16(%%r1,%[ap2]),3\n\t" - "vl %%v23,16(%%r1,%[ap3]),3\n\t" - "vl %%v24,32(%%r1,%[ap0]),3\n\t" - "vl %%v25,32(%%r1,%[ap1]),3\n\t" - "vl %%v26,32(%%r1,%[ap2]),3\n\t" - "vl %%v27,32(%%r1,%[ap3]),3\n\t" - "vl %%v28,48(%%r1,%[ap0]),3\n\t" - "vl %%v29,48(%%r1,%[ap1]),3\n\t" - "vl %%v30,48(%%r1,%[ap2]),3\n\t" - "vl %%v31,48(%%r1,%[ap3]),3\n\t" - "vl %%v4,0(%%r1,%[y]),3\n\t" - "vl %%v5,16(%%r1,%[y]),3\n\t" - "vl %%v6,32(%%r1,%[y]),3\n\t" - "vl %%v7,48(%%r1,%[y]),3\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vl %%v6,32(%%r1,%[y])\n\t" + "vl %%v7,48(%%r1,%[y])\n\t" "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" @@ -88,30 +88,30 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,0(%%r1,%[y]),3\n\t" - "vst %%v5,16(%%r1,%[y]),3\n\t" - "vst %%v6,32(%%r1,%[y]),3\n\t" - "vst %%v7,48(%%r1,%[y]),3\n\t" - "vl %%v16,64(%%r1,%[ap0]),3\n\t" - "vl %%v17,64(%%r1,%[ap1]),3\n\t" - "vl %%v18,64(%%r1,%[ap2]),3\n\t" - "vl %%v19,64(%%r1,%[ap3]),3\n\t" - "vl %%v20,80(%%r1,%[ap0]),3\n\t" - "vl %%v21,80(%%r1,%[ap1]),3\n\t" - "vl %%v22,80(%%r1,%[ap2]),3\n\t" - "vl %%v23,80(%%r1,%[ap3]),3\n\t" - "vl %%v24,96(%%r1,%[ap0]),3\n\t" - "vl %%v25,96(%%r1,%[ap1]),3\n\t" - "vl %%v26,96(%%r1,%[ap2]),3\n\t" - "vl %%v27,96(%%r1,%[ap3]),3\n\t" - "vl %%v28,112(%%r1,%[ap0]),3\n\t" - "vl %%v29,112(%%r1,%[ap1]),3\n\t" - "vl %%v30,112(%%r1,%[ap2]),3\n\t" - "vl %%v31,112(%%r1,%[ap3]),3\n\t" - "vl %%v4,64(%%r1,%[y]),3\n\t" - "vl %%v5,80(%%r1,%[y]),3\n\t" - "vl %%v6,96(%%r1,%[y]),3\n\t" - "vl %%v7,112(%%r1,%[y]),3\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "vst %%v6,32(%%r1,%[y])\n\t" + "vst %%v7,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[ap0])\n\t" + "vl %%v17,64(%%r1,%[ap1])\n\t" + "vl %%v18,64(%%r1,%[ap2])\n\t" + "vl %%v19,64(%%r1,%[ap3])\n\t" + "vl %%v20,80(%%r1,%[ap0])\n\t" + "vl %%v21,80(%%r1,%[ap1])\n\t" + "vl %%v22,80(%%r1,%[ap2])\n\t" + "vl %%v23,80(%%r1,%[ap3])\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vl %%v4,64(%%r1,%[y])\n\t" + "vl %%v5,80(%%r1,%[y])\n\t" + "vl %%v6,96(%%r1,%[y])\n\t" + "vl %%v7,112(%%r1,%[y])\n\t" "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" @@ -128,10 +128,10 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT 
*x, FLOAT *y, "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,64(%%r1,%[y]),3\n\t" - "vst %%v5,80(%%r1,%[y]),3\n\t" - "vst %%v6,96(%%r1,%[y]),3\n\t" - "vst %%v7,112(%%r1,%[y]),3\n\t" + "vst %%v4,64(%%r1,%[y])\n\t" + "vst %%v5,80(%%r1,%[y])\n\t" + "vst %%v6,96(%%r1,%[y])\n\t" + "vst %%v7,112(%%r1,%[y])\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" "1:\n\t" @@ -141,16 +141,16 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[ap0]),3\n\t" - "vl %%v17,0(%%r1,%[ap1]),3\n\t" - "vl %%v18,0(%%r1,%[ap2]),3\n\t" - "vl %%v19,0(%%r1,%[ap3]),3\n\t" - "vl %%v20,16(%%r1,%[ap0]),3\n\t" - "vl %%v21,16(%%r1,%[ap1]),3\n\t" - "vl %%v22,16(%%r1,%[ap2]),3\n\t" - "vl %%v23,16(%%r1,%[ap3]),3\n\t" - "vl %%v4,0(%%r1,%[y]),3\n\t" - "vl %%v5,16(%%r1,%[y]),3\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" @@ -159,8 +159,8 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" - "vst %%v4,0(%%r1,%[y]),3\n\t" - "vst %%v5,16(%%r1,%[y]),3\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" @@ -193,30 +193,30 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "pfd 1,1024(%%r1,%[ap0])\n\t" "pfd 1,1024(%%r1,%[ap1])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0]),3\n\t" - "vl %%v17,0(%%r1,%[ap1]),3\n\t" - "vl %%v18,16(%%r1,%[ap0]),3\n\t" - "vl %%v19,16(%%r1,%[ap1]),3\n\t" - "vl %%v20,32(%%r1,%[ap0]),3\n\t" - "vl %%v21,32(%%r1,%[ap1]),3\n\t" - "vl %%v22,48(%%r1,%[ap0]),3\n\t" - "vl %%v23,48(%%r1,%[ap1]),3\n\t" - "vl %%v24,64(%%r1,%[ap0]),3\n\t" - "vl %%v25,64(%%r1,%[ap1]),3\n\t" - "vl %%v26,80(%%r1,%[ap0]),3\n\t" - "vl %%v27,80(%%r1,%[ap1]),3\n\t" - "vl %%v28,96(%%r1,%[ap0]),3\n\t" - "vl %%v29,96(%%r1,%[ap1]),3\n\t" - "vl %%v30,112(%%r1,%[ap0]),3\n\t" - "vl %%v31,112(%%r1,%[ap1]),3\n\t" - "vl %%v2,0(%%r1,%[y]),3\n\t" - "vl %%v3,16(%%r1,%[y]),3\n\t" - "vl %%v4,32(%%r1,%[y]),3\n\t" - "vl %%v5,48(%%r1,%[y]),3\n\t" - "vl %%v6,64(%%r1,%[y]),3\n\t" - "vl %%v7,80(%%r1,%[y]),3\n\t" - "vl %%v8,96(%%r1,%[y]),3\n\t" - "vl %%v9,112(%%r1,%[y]),3\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v20,32(%%r1,%[ap0])\n\t" + "vl %%v21,32(%%r1,%[ap1])\n\t" + "vl %%v22,48(%%r1,%[ap0])\n\t" + "vl %%v23,48(%%r1,%[ap1])\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vl %%v4,32(%%r1,%[y])\n\t" + "vl %%v5,48(%%r1,%[y])\n\t" + "vl %%v6,64(%%r1,%[y])\n\t" + "vl %%v7,80(%%r1,%[y])\n\t" + "vl %%v8,96(%%r1,%[y])\n\t" + "vl %%v9,112(%%r1,%[y])\n\t" "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" "vfmadb 
%%v4,%%v20,%%v0,%%v4\n\t" @@ -233,14 +233,14 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v7,%%v27,%%v1,%%v7\n\t" "vfmadb %%v8,%%v29,%%v1,%%v8\n\t" "vfmadb %%v9,%%v31,%%v1,%%v9\n\t" - "vst %%v2,0(%%r1,%[y]),3\n\t" - "vst %%v3,16(%%r1,%[y]),3\n\t" - "vst %%v4,32(%%r1,%[y]),3\n\t" - "vst %%v5,48(%%r1,%[y]),3\n\t" - "vst %%v6,64(%%r1,%[y]),3\n\t" - "vst %%v7,80(%%r1,%[y]),3\n\t" - "vst %%v8,96(%%r1,%[y]),3\n\t" - "vst %%v9,112(%%r1,%[y]),3\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst %%v3,16(%%r1,%[y])\n\t" + "vst %%v4,32(%%r1,%[y])\n\t" + "vst %%v5,48(%%r1,%[y])\n\t" + "vst %%v6,64(%%r1,%[y])\n\t" + "vst %%v7,80(%%r1,%[y])\n\t" + "vst %%v8,96(%%r1,%[y])\n\t" + "vst %%v9,112(%%r1,%[y])\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" "1:\n\t" @@ -250,18 +250,18 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[ap0]),3\n\t" - "vl %%v17,0(%%r1,%[ap1]),3\n\t" - "vl %%v18,16(%%r1,%[ap0]),3\n\t" - "vl %%v19,16(%%r1,%[ap1]),3\n\t" - "vl %%v2,0(%%r1,%[y]),3\n\t" - "vl %%v3,16(%%r1,%[y]),3\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" - "vst %%v2,0(%%r1,%[y]),3\n\t" - "vst %%v3,16(%%r1,%[y]),3\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst %%v3,16(%%r1,%[y])\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" @@ -289,22 +289,22 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, "0:\n\t" "pfd 1,1024(%%r1,%[a0])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[a0]),3\n\t" - "vl %%v17,16(%%r1,%[a0]),3\n\t" - "vl %%v18,32(%%r1,%[a0]),3\n\t" - "vl %%v19,48(%%r1,%[a0]),3\n\t" - "vl %%v20,64(%%r1,%[a0]),3\n\t" - "vl %%v21,80(%%r1,%[a0]),3\n\t" - "vl %%v22,96(%%r1,%[a0]),3\n\t" - "vl %%v23,112(%%r1,%[a0]),3\n\t" - "vl %%v24,0(%%r1,%[y]),3\n\t" - "vl %%v25,16(%%r1,%[y]),3\n\t" - "vl %%v26,32(%%r1,%[y]),3\n\t" - "vl %%v27,48(%%r1,%[y]),3\n\t" - "vl %%v28,64(%%r1,%[y]),3\n\t" - "vl %%v29,80(%%r1,%[y]),3\n\t" - "vl %%v30,96(%%r1,%[y]),3\n\t" - "vl %%v31,112(%%r1,%[y]),3\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,32(%%r1,%[a0])\n\t" + "vl %%v19,48(%%r1,%[a0])\n\t" + "vl %%v20,64(%%r1,%[a0])\n\t" + "vl %%v21,80(%%r1,%[a0])\n\t" + "vl %%v22,96(%%r1,%[a0])\n\t" + "vl %%v23,112(%%r1,%[a0])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" @@ -313,14 +313,14 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v24,0(%%r1,%[y]),3\n\t" - "vst %%v25,16(%%r1,%[y]),3\n\t" - "vst %%v26,32(%%r1,%[y]),3\n\t" - "vst %%v27,48(%%r1,%[y]),3\n\t" - "vst %%v28,64(%%r1,%[y]),3\n\t" - "vst %%v29,80(%%r1,%[y]),3\n\t" - "vst %%v30,96(%%r1,%[y]),3\n\t" - "vst %%v31,112(%%r1,%[y]),3\n\t" + "vst %%v24,0(%%r1,%[y])\n\t" + "vst %%v25,16(%%r1,%[y])\n\t" + "vst %%v26,32(%%r1,%[y])\n\t" + "vst %%v27,48(%%r1,%[y])\n\t" + "vst 
%%v28,64(%%r1,%[y])\n\t" + "vst %%v29,80(%%r1,%[y])\n\t" + "vst %%v30,96(%%r1,%[y])\n\t" + "vst %%v31,112(%%r1,%[y])\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" "1:\n\t" @@ -330,14 +330,14 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[a0]),3\n\t" - "vl %%v17,16(%%r1,%[a0]),3\n\t" - "vl %%v18,0(%%r1,%[y]),3\n\t" - "vl %%v19,16(%%r1,%[y]),3\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,0(%%r1,%[y])\n\t" + "vl %%v19,16(%%r1,%[y])\n\t" "vfmadb %%v18,%%v16,%%v0,%%v18\n\t" "vfmadb %%v19,%%v17,%%v0,%%v19\n\t" - "vst %%v18,0(%%r1,%[y]),3\n\t" - "vst %%v19,16(%%r1,%[y]),3\n\t" + "vst %%v18,0(%%r1,%[y])\n\t" + "vst %%v19,16(%%r1,%[y])\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" diff --git a/kernel/zarch/dgemv_t_4.c b/kernel/zarch/dgemv_t_4.c index 9fd3c09d6..24680cf1b 100644 --- a/kernel/zarch/dgemv_t_4.c +++ b/kernel/zarch/dgemv_t_4.c @@ -50,77 +50,77 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "pfd 1,1024(%%r1,%[ap2])\n\t" "pfd 1,1024(%%r1,%[ap3])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" - "vl %%v24,0(%%r1,%[ap0]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1]),3\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2]),3\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3]),3\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" - "vl %%v28,16(%%r1,%[ap0]),3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" - "vl %%v29,16(%%r1,%[ap1]),3\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" - "vl %%v30,16(%%r1,%[ap2]),3\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" - "vl %%v31,16(%%r1,%[ap3]),3\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" - "vl %%v24,32(%%r1,%[ap0]),3\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" "vfmadb %%v0,%%v18,%%v24,%%v0\n\t" - "vl %%v25,32(%%r1,%[ap1]),3\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" "vfmadb %%v1,%%v18,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[ap2]),3\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,32(%%r1,%[ap3]),3\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" "vfmadb %%v3,%%v18,%%v27,%%v3\n\t" - "vl %%v28,48(%%r1,%[ap0]),3\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" "vfmadb %%v4,%%v19,%%v28,%%v4\n\t" - "vl %%v29,48(%%r1,%[ap1]),3\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" "vfmadb %%v5,%%v19,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap2]),3\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap3]),3\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0]),3\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1]),3\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" - "vl %%v26,64(%%r1,%[ap2]),3\n\t" + 
"vl %%v26,64(%%r1,%[ap2])\n\t" "vfmadb %%v2,%%v20,%%v26,%%v2\n\t" - "vl %%v27,64(%%r1,%[ap3]),3\n\t" + "vl %%v27,64(%%r1,%[ap3])\n\t" "vfmadb %%v3,%%v20,%%v27,%%v3\n\t" - "vl %%v28,80(%%r1,%[ap0]),3\n\t" + "vl %%v28,80(%%r1,%[ap0])\n\t" "vfmadb %%v4,%%v21,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[ap1]),3\n\t" + "vl %%v29,80(%%r1,%[ap1])\n\t" "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,80(%%r1,%[ap2]),3\n\t" + "vl %%v30,80(%%r1,%[ap2])\n\t" "vfmadb %%v6,%%v21,%%v30,%%v6\n\t" - "vl %%v31,80(%%r1,%[ap3]),3\n\t" + "vl %%v31,80(%%r1,%[ap3])\n\t" "vfmadb %%v7,%%v21,%%v31,%%v7\n\t" - "vl %%v24,96(%%r1,%[ap0]),3\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" "vfmadb %%v0,%%v22,%%v24,%%v0\n\t" - "vl %%v25,96(%%r1,%[ap1]),3\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" "vfmadb %%v1,%%v22,%%v25,%%v1\n\t" - "vl %%v26,96(%%r1,%[ap2]),3\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" "vfmadb %%v2,%%v22,%%v26,%%v2\n\t" - "vl %%v27,96(%%r1,%[ap3]),3\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" "vfmadb %%v3,%%v22,%%v27,%%v3\n\t" - "vl %%v28,112(%%r1,%[ap0]),3\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" "vfmadb %%v4,%%v23,%%v28,%%v4\n\t" - "vl %%v29,112(%%r1,%[ap1]),3\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" "vfmadb %%v5,%%v23,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap2]),3\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap3]),3\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" @@ -131,23 +131,23 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v24,0(%%r1,%[ap0]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1]),3\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2]),3\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3]),3\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" - "vl %%v28,16(%%r1,%[ap0]),3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" - "vl %%v29,16(%%r1,%[ap1]),3\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" - "vl %%v30,16(%%r1,%[ap2]),3\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" - "vl %%v31,16(%%r1,%[ap3]),3\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" @@ -198,45 +198,45 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "pfd 1,1024(%%r1,%[ap0])\n\t" "pfd 1,1024(%%r1,%[ap1])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" - "vl %%v24,0(%%r1,%[ap0]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1]),3\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,16(%%r1,%[ap0]),3\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" "vfmadb 
%%v2,%%v17,%%v26,%%v2\n\t" - "vl %%v27,16(%%r1,%[ap1]),3\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" - "vl %%v28,32(%%r1,%[ap0]),3\n\t" + "vl %%v28,32(%%r1,%[ap0])\n\t" "vfmadb %%v4,%%v18,%%v28,%%v4\n\t" - "vl %%v29,32(%%r1,%[ap1]),3\n\t" + "vl %%v29,32(%%r1,%[ap1])\n\t" "vfmadb %%v5,%%v18,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap0]),3\n\t" + "vl %%v30,48(%%r1,%[ap0])\n\t" "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap1]),3\n\t" + "vl %%v31,48(%%r1,%[ap1])\n\t" "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0]),3\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1]),3\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" - "vl %%v26,80(%%r1,%[ap0]),3\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" "vfmadb %%v2,%%v21,%%v26,%%v2\n\t" - "vl %%v27,80(%%r1,%[ap1]),3\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" "vfmadb %%v3,%%v21,%%v27,%%v3\n\t" - "vl %%v28,96(%%r1,%[ap0]),3\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" "vfmadb %%v4,%%v22,%%v28,%%v4\n\t" - "vl %%v29,96(%%r1,%[ap1]),3\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" "vfmadb %%v5,%%v22,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap0]),3\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap1]),3\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" @@ -247,15 +247,15 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v24,0(%%r1,%[ap0]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1]),3\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,16(%%r1,%[ap0]),3\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" - "vl %%v27,16(%%r1,%[ap1]),3\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" @@ -299,29 +299,29 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 1,1024(%%r1,%[a0])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" - "vl %%v24,0(%%r1,%[a0]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,16(%%r1,%[a0]),3\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[a0]),3\n\t" + "vl %%v26,32(%%r1,%[a0])\n\t" "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,48(%%r1,%[a0]),3\n\t" + "vl %%v27,48(%%r1,%[a0])\n\t" "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" - "vl %%v28,64(%%r1,%[a0]),3\n\t" + "vl %%v28,64(%%r1,%[a0])\n\t" "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[a0]),3\n\t" + "vl %%v29,80(%%r1,%[a0])\n\t" "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,96(%%r1,%[a0]),3\n\t" + "vl %%v30,96(%%r1,%[a0])\n\t" "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[a0]),3\n\t" + "vl 
%%v31,112(%%r1,%[a0])\n\t" "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" @@ -332,11 +332,11 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v24,0(%%r1,%[a0]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,16(%%r1,%[a0]),3\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" @@ -378,38 +378,38 @@ static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { "0:\n\t" "pfd 1,1024(%%r1,%[src])\n\t" "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src]),3\n\t" - "vl %%v17,16(%%r1,%[src]),3\n\t" - "vl %%v18,32(%%r1,%[src]),3\n\t" - "vl %%v19,48(%%r1,%[src]),3\n\t" - "vl %%v20,64(%%r1,%[src]),3\n\t" - "vl %%v21,80(%%r1,%[src]),3\n\t" - "vl %%v22,96(%%r1,%[src]),3\n\t" - "vl %%v23,112(%%r1,%[src]),3\n\t" - "vl %%v24, 0(%%r1,%[dest]),3\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,64(%%r1,%[src])\n\t" + "vl %%v21,80(%%r1,%[src])\n\t" + "vl %%v22,96(%%r1,%[src])\n\t" + "vl %%v23,112(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vst %%v24, 0(%%r1,%[dest]),3\n\t" - "vl %%v25, 16(%%r1,%[dest]),3\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" - "vst %%v25, 16(%%r1,%[dest]),3\n\t" - "vl %%v26, 32(%%r1,%[dest]),3\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" + "vl %%v26, 32(%%r1,%[dest])\n\t" "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" - "vst %%v26, 32(%%r1,%[dest]),3\n\t" - "vl %%v27, 48(%%r1,%[dest]),3\n\t" + "vst %%v26, 32(%%r1,%[dest])\n\t" + "vl %%v27, 48(%%r1,%[dest])\n\t" "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" - "vst %%v27, 48(%%r1,%[dest]),3\n\t" - "vl %%v28, 64(%%r1,%[dest]),3\n\t" + "vst %%v27, 48(%%r1,%[dest])\n\t" + "vl %%v28, 64(%%r1,%[dest])\n\t" "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" - "vst %%v28, 64(%%r1,%[dest]),3\n\t" - "vl %%v29, 80(%%r1,%[dest]),3\n\t" + "vst %%v28, 64(%%r1,%[dest])\n\t" + "vl %%v29, 80(%%r1,%[dest])\n\t" "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" - "vst %%v29, 80(%%r1,%[dest]),3\n\t" - "vl %%v30, 96(%%r1,%[dest]),3\n\t" + "vst %%v29, 80(%%r1,%[dest])\n\t" + "vl %%v30, 96(%%r1,%[dest])\n\t" "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" - "vst %%v30, 96(%%r1,%[dest]),3\n\t" - "vl %%v31, 112(%%r1,%[dest]),3\n\t" + "vst %%v30, 96(%%r1,%[dest])\n\t" + "vl %%v31, 112(%%r1,%[dest])\n\t" "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v31, 112(%%r1,%[dest]),3\n\t" + "vst %%v31, 112(%%r1,%[dest])\n\t" "agfi %%r1,128\n\t" "brctg %%r0,0b\n\t" "1:\n\t" @@ -419,14 +419,14 @@ static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { "jz 3f\n\t" "srlg %%r0,%%r0,2\n\t" "2:\n\t" - "vl %%v16,0(%%r1,%[src]),3\n\t" - "vl %%v17,16(%%r1,%[src]),3\n\t" - "vl %%v24, 0(%%r1,%[dest]),3\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vst %%v24, 0(%%r1,%[dest]),3\n\t" - "vl %%v25, 16(%%r1,%[dest]),3\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" - "vst %%v25, 16(%%r1,%[dest]),3\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" "agfi %%r1,32\n\t" "brctg %%r0,2b\n\t" "3:\n\t" diff --git a/kernel/zarch/dmax.c 
b/kernel/zarch/dmax.c index cc0f23c87..65ed31f01 100644 --- a/kernel/zarch/dmax.c +++ b/kernel/zarch/dmax.c @@ -30,27 +30,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT max; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" - "vl %%v24,128(%%r1,%[x]),3\n\t" - "vl %%v25,144(%%r1,%[x]),3\n\t" - "vl %%v26,160(%%r1,%[x]),3\n\t" - "vl %%v27,176(%%r1,%[x]),3\n\t" - "vl %%v28,192(%%r1,%[x]),3\n\t" - "vl %%v29,208(%%r1,%[x]),3\n\t" - "vl %%v30,224(%%r1,%[x]),3\n\t" - "vl %%v31,240(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" "vfmaxdb %%v16,%%v16,%%v24,0\n\t" "vfmaxdb %%v17,%%v17,%%v25,0\n\t" "vfmaxdb %%v18,%%v18,%%v26,0\n\t" diff --git a/kernel/zarch/dmax_z13.c b/kernel/zarch/dmax_z13.c index 83d827d35..87bccbe55 100644 --- a/kernel/zarch/dmax_z13.c +++ b/kernel/zarch/dmax_z13.c @@ -30,19 +30,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT max; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vfchdb %%v24,%%v16,%%v17\n\t" "vfchdb %%v25,%%v18,%%v19\n\t" "vfchdb %%v26,%%v20,%%v21\n\t" @@ -59,14 +59,14 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v30,%%v28,%%v29,%%v30\n\t" "vfchdb %%v31,%%v30,%%v0\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,192(%%r1,%[x]),3\n\t" - "vl %%v21,208(%%r1,%[x]),3\n\t" - "vl %%v22,224(%%r1,%[x]),3\n\t" - "vl %%v23,240(%%r1,%[x]),3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" "vfchdb %%v24,%%v16,%%v17\n\t" "vfchdb %%v25,%%v18,%%v19\n\t" "vfchdb %%v26,%%v20,%%v21\n\t" diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c index 754828b7c..518cc262c 100644 --- a/kernel/zarch/dmin.c +++ b/kernel/zarch/dmin.c @@ -30,27 +30,27 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT min; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" - "vl %%v24,128(%%r1,%[x]),3\n\t" - "vl %%v25,144(%%r1,%[x]),3\n\t" - "vl %%v26,160(%%r1,%[x]),3\n\t" - "vl %%v27,176(%%r1,%[x]),3\n\t" - "vl %%v28,192(%%r1,%[x]),3\n\t" - "vl %%v29,208(%%r1,%[x]),3\n\t" - "vl %%v30,224(%%r1,%[x]),3\n\t" - "vl %%v31,240(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" "vfmindb %%v16,%%v16,%%v24,0\n\t" "vfmindb %%v17,%%v17,%%v25,0\n\t" "vfmindb %%v18,%%v18,%%v26,0\n\t" diff --git a/kernel/zarch/dmin_z13.c b/kernel/zarch/dmin_z13.c index ff0fca48c..91561992f 100644 --- a/kernel/zarch/dmin_z13.c +++ b/kernel/zarch/dmin_z13.c @@ -30,19 +30,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT min; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "srlg %[n],%[n],5\n\t" "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vfchdb %%v24,%%v17,%%v16\n\t" "vfchdb %%v25,%%v19,%%v18\n\t" "vfchdb %%v26,%%v21,%%v20\n\t" @@ -59,14 +59,14 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { "vsel %%v30,%%v28,%%v29,%%v30\n\t" "vfchdb %%v31,%%v0,%%v30\n\t" "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,192(%%r1,%[x]),3\n\t" - "vl %%v21,208(%%r1,%[x]),3\n\t" - "vl %%v22,224(%%r1,%[x]),3\n\t" - "vl %%v23,240(%%r1,%[x]),3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" "vfchdb %%v24,%%v17,%%v16\n\t" "vfchdb %%v25,%%v19,%%v18\n\t" "vfchdb %%v26,%%v21,%%v20\n\t" diff --git a/kernel/zarch/drot.c b/kernel/zarch/drot.c index de2207fcd..8f0197f02 100644 --- a/kernel/zarch/drot.c +++ b/kernel/zarch/drot.c @@ -35,14 +35,14 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x]),3\n\t" - "vl %%v25, 16(%%r1,%[x]),3\n\t" - "vl %%v26, 32(%%r1,%[x]),3\n\t" - "vl %%v27, 48(%%r1,%[x]),3\n\t" - "vl %%v16, 0(%%r1,%[y]),3\n\t" - "vl %%v17, 16(%%r1,%[y]),3\n\t" - "vl %%v18, 32(%%r1,%[y]),3\n\t" - "vl %%v19, 48(%%r1,%[y]),3\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -60,22 +60,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x]),3\n\t" - "vst %%v29, 16(%%r1,%[x]),3\n\t" - "vst %%v30, 32(%%r1,%[x]),3\n\t" - "vst %%v31, 48(%%r1,%[x]),3\n\t" - "vst %%v20, 0(%%r1,%[y]),3\n\t" - "vst %%v21, 16(%%r1,%[y]),3\n\t" - "vst %%v22, 32(%%r1,%[y]),3\n\t" - "vst %%v23, 48(%%r1,%[y]),3\n\t" - "vl %%v24, 64(%%r1,%[x]),3\n\t" - "vl %%v25, 80(%%r1,%[x]),3\n\t" - "vl %%v26, 96(%%r1,%[x]),3\n\t" - "vl %%v27, 112(%%r1,%[x]),3\n\t" - "vl %%v16, 64(%%r1,%[y]),3\n\t" - "vl %%v17, 80(%%r1,%[y]),3\n\t" - "vl %%v18, 96(%%r1,%[y]),3\n\t" - "vl %%v19, 112(%%r1,%[y]),3\n\t" + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 
32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -93,22 +93,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x]),3\n\t" - "vst %%v29, 80(%%r1,%[x]),3\n\t" - "vst %%v30, 96(%%r1,%[x]),3\n\t" - "vst %%v31, 112(%%r1,%[x]),3\n\t" - "vst %%v20, 64(%%r1,%[y]),3\n\t" - "vst %%v21, 80(%%r1,%[y]),3\n\t" - "vst %%v22, 96(%%r1,%[y]),3\n\t" - "vst %%v23, 112(%%r1,%[y]),3\n\t" - "vl %%v24, 128(%%r1,%[x]),3\n\t" - "vl %%v25, 144(%%r1,%[x]),3\n\t" - "vl %%v26, 160(%%r1,%[x]),3\n\t" - "vl %%v27, 176(%%r1,%[x]),3\n\t" - "vl %%v16, 128(%%r1,%[y]),3\n\t" - "vl %%v17, 144(%%r1,%[y]),3\n\t" - "vl %%v18, 160(%%r1,%[y]),3\n\t" - "vl %%v19, 176(%%r1,%[y]),3\n\t" + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -126,22 +126,22 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x]),3\n\t" - "vst %%v29, 144(%%r1,%[x]),3\n\t" - "vst %%v30, 160(%%r1,%[x]),3\n\t" - "vst %%v31, 176(%%r1,%[x]),3\n\t" - "vst %%v20, 128(%%r1,%[y]),3\n\t" - "vst %%v21, 144(%%r1,%[y]),3\n\t" - "vst %%v22, 160(%%r1,%[y]),3\n\t" - "vst %%v23, 176(%%r1,%[y]),3\n\t" - "vl %%v24, 192(%%r1,%[x]),3\n\t" - "vl %%v25, 208(%%r1,%[x]),3\n\t" - "vl %%v26, 224(%%r1,%[x]),3\n\t" - "vl %%v27, 240(%%r1,%[x]),3\n\t" - "vl %%v16, 192(%%r1,%[y]),3\n\t" - "vl %%v17, 208(%%r1,%[y]),3\n\t" - "vl %%v18, 224(%%r1,%[y]),3\n\t" - "vl %%v19, 240(%%r1,%[y]),3\n\t" + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -159,14 +159,14 @@ static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 
192(%%r1,%[x]),3\n\t" - "vst %%v29, 208(%%r1,%[x]),3\n\t" - "vst %%v30, 224(%%r1,%[x]),3\n\t" - "vst %%v31, 240(%%r1,%[x]),3\n\t" - "vst %%v20, 192(%%r1,%[y]),3\n\t" - "vst %%v21, 208(%%r1,%[y]),3\n\t" - "vst %%v22, 224(%%r1,%[y]),3\n\t" - "vst %%v23, 240(%%r1,%[y]),3\n\t" + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c index bc58569d5..c944990b5 100644 --- a/kernel/zarch/dscal.c +++ b/kernel/zarch/dscal.c @@ -33,30 +33,30 @@ static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[x]),3\n\t" + "vl %%v24,0(%%r1,%[x])\n\t" "vfmdb %%v24,%%v24,%%v0\n\t" - "vst %%v24,0(%%r1,%[x]),3\n\t" - "vl %%v25,16(%%r1,%[x]),3\n\t" + "vst %%v24,0(%%r1,%[x])\n\t" + "vl %%v25,16(%%r1,%[x])\n\t" "vfmdb %%v25,%%v25,%%v0\n\t" - "vst %%v25,16(%%r1,%[x]),3\n\t" - "vl %%v26,32(%%r1,%[x]),3\n\t" + "vst %%v25,16(%%r1,%[x])\n\t" + "vl %%v26,32(%%r1,%[x])\n\t" "vfmdb %%v26,%%v26,%%v0\n\t" - "vst %%v26,32(%%r1,%[x]),3\n\t" - "vl %%v27,48(%%r1,%[x]),3\n\t" + "vst %%v26,32(%%r1,%[x])\n\t" + "vl %%v27,48(%%r1,%[x])\n\t" "vfmdb %%v27,%%v27,%%v0\n\t" - "vst %%v27,48(%%r1,%[x]),3\n\t" - "vl %%v28,64(%%r1,%[x]),3\n\t" + "vst %%v27,48(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[x])\n\t" "vfmdb %%v28,%%v28,%%v0\n\t" - "vst %%v28,64(%%r1,%[x]),3\n\t" - "vl %%v29,80(%%r1,%[x]),3\n\t" + "vst %%v28,64(%%r1,%[x])\n\t" + "vl %%v29,80(%%r1,%[x])\n\t" "vfmdb %%v29,%%v29,%%v0\n\t" - "vst %%v29,80(%%r1,%[x]),3\n\t" - "vl %%v30,96(%%r1,%[x]),3\n\t" + "vst %%v29,80(%%r1,%[x])\n\t" + "vl %%v30,96(%%r1,%[x])\n\t" "vfmdb %%v30,%%v30,%%v0\n\t" - "vst %%v30,96(%%r1,%[x]),3\n\t" - "vl %%v31,112(%%r1,%[x]),3\n\t" + "vst %%v30,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" "vfmdb %%v31,%%v31,%%v0\n\t" - "vst %%v31,112(%%r1,%[x]),3\n\t" + "vst %%v31,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) @@ -71,14 +71,14 @@ static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x]),3\n\t" - "vst %%v0,16(%%r1,%[x]),3\n\t" - "vst %%v0,32(%%r1,%[x]),3\n\t" - "vst %%v0,48(%%r1,%[x]),3\n\t" - "vst %%v0,64(%%r1,%[x]),3\n\t" - "vst %%v0,80(%%r1,%[x]),3\n\t" - "vst %%v0,96(%%r1,%[x]),3\n\t" - "vst %%v0,112(%%r1,%[x]),3\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) diff --git a/kernel/zarch/dswap.c b/kernel/zarch/dswap.c index f4da46dc1..60ba40bd6 100644 --- a/kernel/zarch/dswap.c +++ b/kernel/zarch/dswap.c @@ -33,70 +33,70 @@ static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x]),3\n\t" - "vl %%v17, 16(%%r1,%[x]),3\n\t" - "vl %%v18, 32(%%r1,%[x]),3\n\t" - "vl %%v19, 48(%%r1,%[x]),3\n\t" - "vl %%v20, 64(%%r1,%[x]),3\n\t" - "vl %%v21, 80(%%r1,%[x]),3\n\t" - "vl %%v22, 96(%%r1,%[x]),3\n\t" - "vl %%v23, 
112(%%r1,%[x]),3\n\t" - "vl %%v24, 128(%%r1,%[x]),3\n\t" - "vl %%v25, 144(%%r1,%[x]),3\n\t" - "vl %%v26, 160(%%r1,%[x]),3\n\t" - "vl %%v27, 176(%%r1,%[x]),3\n\t" - "vl %%v28, 192(%%r1,%[x]),3\n\t" - "vl %%v29, 208(%%r1,%[x]),3\n\t" - "vl %%v30, 224(%%r1,%[x]),3\n\t" - "vl %%v31, 240(%%r1,%[x]),3\n\t" - "vl %%v0, 0(%%r1,%[y]),3\n\t" - "vl %%v1, 16(%%r1,%[y]),3\n\t" - "vl %%v2, 32(%%r1,%[y]),3\n\t" - "vl %%v3, 48(%%r1,%[y]),3\n\t" - "vl %%v4, 64(%%r1,%[y]),3\n\t" - "vl %%v5, 80(%%r1,%[y]),3\n\t" - "vl %%v6, 96(%%r1,%[y]),3\n\t" - "vl %%v7, 112(%%r1,%[y]),3\n\t" - "vst %%v0, 0(%%r1,%[x]),3\n\t" - "vst %%v1, 16(%%r1,%[x]),3\n\t" - "vst %%v2, 32(%%r1,%[x]),3\n\t" - "vst %%v3, 48(%%r1,%[x]),3\n\t" - "vst %%v4, 64(%%r1,%[x]),3\n\t" - "vst %%v5, 80(%%r1,%[x]),3\n\t" - "vst %%v6, 96(%%r1,%[x]),3\n\t" - "vst %%v7, 112(%%r1,%[x]),3\n\t" - "vl %%v0, 128(%%r1,%[y]),3\n\t" - "vl %%v1, 144(%%r1,%[y]),3\n\t" - "vl %%v2, 160(%%r1,%[y]),3\n\t" - "vl %%v3, 176(%%r1,%[y]),3\n\t" - "vl %%v4, 192(%%r1,%[y]),3\n\t" - "vl %%v5, 208(%%r1,%[y]),3\n\t" - "vl %%v6, 224(%%r1,%[y]),3\n\t" - "vl %%v7, 240(%%r1,%[y]),3\n\t" - "vst %%v0, 128(%%r1,%[x]),3\n\t" - "vst %%v1, 144(%%r1,%[x]),3\n\t" - "vst %%v2, 160(%%r1,%[x]),3\n\t" - "vst %%v3, 176(%%r1,%[x]),3\n\t" - "vst %%v4, 192(%%r1,%[x]),3\n\t" - "vst %%v5, 208(%%r1,%[x]),3\n\t" - "vst %%v6, 224(%%r1,%[x]),3\n\t" - "vst %%v7, 240(%%r1,%[x]),3\n\t" - "vst %%v16, 0(%%r1,%[y]),3\n\t" - "vst %%v17, 16(%%r1,%[y]),3\n\t" - "vst %%v18, 32(%%r1,%[y]),3\n\t" - "vst %%v19, 48(%%r1,%[y]),3\n\t" - "vst %%v20, 64(%%r1,%[y]),3\n\t" - "vst %%v21, 80(%%r1,%[y]),3\n\t" - "vst %%v22, 96(%%r1,%[y]),3\n\t" - "vst %%v23, 112(%%r1,%[y]),3\n\t" - "vst %%v24, 128(%%r1,%[y]),3\n\t" - "vst %%v25, 144(%%r1,%[y]),3\n\t" - "vst %%v26, 160(%%r1,%[y]),3\n\t" - "vst %%v27, 176(%%r1,%[y]),3\n\t" - "vst %%v28, 192(%%r1,%[y]),3\n\t" - "vst %%v29, 208(%%r1,%[y]),3\n\t" - "vst %%v30, 224(%%r1,%[y]),3\n\t" - "vst %%v31, 240(%%r1,%[y]),3\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 
16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index bd0f18115..8434c811f 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { BLASLONG iamax; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "vflpdb %%v0,%%v0\n\t" "vleig %%v1,0,0\n\t" "vleig %%v1,1,1\n\t" @@ -59,14 +59,14 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -101,14 +101,14 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { "vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t" "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,192(%%r1,%[x]),3\n\t" - "vl %%v21,208(%%r1,%[x]),3\n\t" - "vl %%v22,224(%%r1,%[x]),3\n\t" - "vl %%v23,240(%%r1,%[x]),3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 4884d1e3a..80a37e6c2 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -33,7 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { BLASLONG iamin; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "vflpdb %%v0,%%v0\n\t" "vleig %%v1,0,0\n\t" "vleig %%v1,1,1\n\t" @@ -59,14 +59,14 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -101,14 +101,14 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { "vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t" "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,192(%%r1,%[x]),3\n\t" - "vl %%v21,208(%%r1,%[x]),3\n\t" - "vl %%v22,224(%%r1,%[x]),3\n\t" - "vl %%v23,240(%%r1,%[x]),3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index a6b95bf3e..18cdba437 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { BLASLONG imax; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "vleig %%v1,0,0\n\t" "vleig %%v1,1,1\n\t" "vrepig %%v2,16\n\t" @@ -55,14 +55,14 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vfchedb %%v4,%%v16,%%v17\n\t" "vfchedb %%v5,%%v18,%%v19\n\t" "vfchedb %%v6,%%v20,%%v21\n\t" @@ -89,14 +89,14 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { "vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t" "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,192(%%r1,%[x]),3\n\t" - "vl %%v21,208(%%r1,%[x]),3\n\t" - "vl %%v22,224(%%r1,%[x]),3\n\t" - "vl %%v23,240(%%r1,%[x]),3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" "vfchedb %%v4,%%v16,%%v17\n\t" "vfchedb %%v5,%%v18,%%v19\n\t" "vfchedb %%v6,%%v20,%%v21\n\t" diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index c3f36d964..02ca427e4 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { BLASLONG imin; - __asm__("vl %%v0,0(%[x]),3\n\t" + __asm__("vl %%v0,0(%[x])\n\t" "vleig %%v1,0,0\n\t" "vleig %%v1,1,1\n\t" "vrepig %%v2,16\n\t" @@ -55,14 +55,14 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vfchedb %%v4,%%v17,%%v16\n\t" "vfchedb %%v5,%%v19,%%v18\n\t" "vfchedb %%v6,%%v21,%%v20\n\t" @@ -89,14 +89,14 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { "vsel %%v0,%%v0,%%v16,%%v5\n\t" "vsel %%v1,%%v1,%%v4,%%v5\n\t" "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x]),3\n\t" - "vl %%v17,144(%%r1,%[x]),3\n\t" - "vl %%v18,160(%%r1,%[x]),3\n\t" - "vl %%v19,176(%%r1,%[x]),3\n\t" - "vl %%v20,192(%%r1,%[x]),3\n\t" - "vl %%v21,208(%%r1,%[x]),3\n\t" - "vl %%v22,224(%%r1,%[x]),3\n\t" - "vl %%v23,240(%%r1,%[x]),3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" "vfchedb %%v4,%%v17,%%v16\n\t" "vfchedb %%v5,%%v19,%%v18\n\t" "vfchedb %%v6,%%v21,%%v20\n\t" diff --git a/kernel/zarch/zasum.c b/kernel/zarch/zasum.c index 83e5e93c9..43ae8ff8b 100644 --- a/kernel/zarch/zasum.c +++ b/kernel/zarch/zasum.c @@ -45,14 +45,14 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x]),3\n\t" - "vl %%v17, 16(%%r1,%[x]),3\n\t" - "vl %%v18, 32(%%r1,%[x]),3\n\t" - "vl %%v19, 48(%%r1,%[x]),3\n\t" - "vl %%v20, 64(%%r1,%[x]),3\n\t" - "vl %%v21, 80(%%r1,%[x]),3\n\t" - "vl %%v22, 96(%%r1,%[x]),3\n\t" - "vl %%v23, 112(%%r1,%[x]),3\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" @@ -69,14 +69,14 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { "vfadb %%v29,%%v29,%%v21\n\t" "vfadb %%v30,%%v30,%%v22\n\t" "vfadb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x]),3\n\t" - "vl %%v17, 144(%%r1,%[x]),3\n\t" - "vl %%v18, 160(%%r1,%[x]),3\n\t" - "vl %%v19, 176(%%r1,%[x]),3\n\t" - "vl %%v20, 192(%%r1,%[x]),3\n\t" - "vl %%v21, 208(%%r1,%[x]),3\n\t" - "vl %%v22, 224(%%r1,%[x]),3\n\t" - "vl %%v23, 240(%%r1,%[x]),3\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" "vflpdb %%v16, %%v16\n\t" "vflpdb %%v17, %%v17\n\t" "vflpdb %%v18, %%v18\n\t" diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index 77bb09a2e..31549849d 100644 --- a/kernel/zarch/zaxpy.c +++ 
b/kernel/zarch/zaxpy.c @@ -45,22 +45,22 @@ static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v8,0(%%r1,%[x]),3\n\t" - "vl %%v9,16(%%r1,%[x]),3\n\t" - "vl %%v10,32(%%r1,%[x]),3\n\t" - "vl %%v11,48(%%r1,%[x]),3\n\t" - "vl %%v12,0(%%r1,%[y]),3\n\t" - "vl %%v13,16(%%r1,%[y]),3\n\t" - "vl %%v14,32(%%r1,%[y]),3\n\t" - "vl %%v15,48(%%r1,%[y]),3\n\t" - "vl %%v16,64(%%r1,%[x]),3\n\t" - "vl %%v17,80(%%r1,%[x]),3\n\t" - "vl %%v18,96(%%r1,%[x]),3\n\t" - "vl %%v19,112(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[y]),3\n\t" - "vl %%v21,80(%%r1,%[y]),3\n\t" - "vl %%v22,96(%%r1,%[y]),3\n\t" - "vl %%v23,112(%%r1,%[y]),3\n\t" + "vl %%v8,0(%%r1,%[x])\n\t" + "vl %%v9,16(%%r1,%[x])\n\t" + "vl %%v10,32(%%r1,%[x])\n\t" + "vl %%v11,48(%%r1,%[x])\n\t" + "vl %%v12,0(%%r1,%[y])\n\t" + "vl %%v13,16(%%r1,%[y])\n\t" + "vl %%v14,32(%%r1,%[y])\n\t" + "vl %%v15,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[x])\n\t" + "vl %%v17,80(%%r1,%[x])\n\t" + "vl %%v18,96(%%r1,%[x])\n\t" + "vl %%v19,112(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[y])\n\t" + "vl %%v21,80(%%r1,%[y])\n\t" + "vl %%v22,96(%%r1,%[y])\n\t" + "vl %%v23,112(%%r1,%[y])\n\t" "vpdi %%v24,%%v8,%%v8,4\n\t" "vpdi %%v25,%%v9,%%v9,4\n\t" "vpdi %%v26,%%v10,%%v10,4\n\t" @@ -85,14 +85,14 @@ static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vfmadb %%v17,%%v29,%%v1,%%v17\n\t" "vfmadb %%v18,%%v30,%%v1,%%v18\n\t" "vfmadb %%v19,%%v31,%%v1,%%v19\n\t" - "vst %%v8,0(%%r1,%[y]),3\n\t" - "vst %%v9,16(%%r1,%[y]),3\n\t" - "vst %%v10,32(%%r1,%[y]),3\n\t" - "vst %%v11,48(%%r1,%[y]),3\n\t" - "vst %%v16,64(%%r1,%[y]),3\n\t" - "vst %%v17,80(%%r1,%[y]),3\n\t" - "vst %%v18,96(%%r1,%[y]),3\n\t" - "vst %%v19,112(%%r1,%[y]),3\n\t" + "vst %%v8,0(%%r1,%[y])\n\t" + "vst %%v9,16(%%r1,%[y])\n\t" + "vst %%v10,32(%%r1,%[y])\n\t" + "vst %%v11,48(%%r1,%[y])\n\t" + "vst %%v16,64(%%r1,%[y])\n\t" + "vst %%v17,80(%%r1,%[y])\n\t" + "vst %%v18,96(%%r1,%[y])\n\t" + "vst %%v19,112(%%r1,%[y])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) diff --git a/kernel/zarch/zdot.c b/kernel/zarch/zdot.c index 8cfbaadb8..7a67ef734 100644 --- a/kernel/zarch/zdot.c +++ b/kernel/zarch/zdot.c @@ -41,14 +41,14 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { "0:\n\t" "pfd 1, 1024(%%r1,%[x])\n\t" "pfd 1, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x]),3\n\t" - "vl %%v17, 16(%%r1,%[x]),3\n\t" - "vl %%v18, 32(%%r1,%[x]),3\n\t" - "vl %%v19, 48(%%r1,%[x]),3\n\t" - "vl %%v0, 0(%%r1,%[y]),3\n\t" - "vl %%v1, 16(%%r1,%[y]),3\n\t" - "vl %%v2, 32(%%r1,%[y]),3\n\t" - "vl %%v3, 48(%%r1,%[y]),3\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" "vpdi %%v20,%%v16,%%v16,4\n\t" "vpdi %%v21,%%v17,%%v17,4\n\t" "vpdi %%v22,%%v18,%%v18,4\n\t" @@ -61,14 +61,14 @@ static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" - "vl %%v16, 64(%%r1,%[x]),3\n\t" - "vl %%v17, 80(%%r1,%[x]),3\n\t" - "vl %%v18, 96(%%r1,%[x]),3\n\t" - "vl %%v19, 112(%%r1,%[x]),3\n\t" - "vl %%v0, 64(%%r1,%[y]),3\n\t" - "vl %%v1, 80(%%r1,%[y]),3\n\t" - "vl %%v2, 96(%%r1,%[y]),3\n\t" - "vl %%v3, 112(%%r1,%[y]),3\n\t" + "vl %%v16, 64(%%r1,%[x])\n\t" + "vl %%v17, 80(%%r1,%[x])\n\t" + "vl %%v18, 
96(%%r1,%[x])\n\t" + "vl %%v19, 112(%%r1,%[x])\n\t" + "vl %%v0, 64(%%r1,%[y])\n\t" + "vl %%v1, 80(%%r1,%[y])\n\t" + "vl %%v2, 96(%%r1,%[y])\n\t" + "vl %%v3, 112(%%r1,%[y])\n\t" "vpdi %%v20,%%v16,%%v16,4\n\t" "vpdi %%v21,%%v17,%%v17,4\n\t" "vpdi %%v22,%%v18,%%v18,4\n\t" diff --git a/kernel/zarch/zgemv_n_4.c b/kernel/zarch/zgemv_n_4.c index 4b64fc8a5..7f21985ec 100644 --- a/kernel/zarch/zgemv_n_4.c +++ b/kernel/zarch/zgemv_n_4.c @@ -30,10 +30,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 1024 static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - __asm__("vl %%v16,0(%[x]),3\n\t" - "vl %%v17,16(%[x]),3\n\t" - "vl %%v18,32(%[x]),3\n\t" - "vl %%v19,48(%[x]),3\n\t" + __asm__("vl %%v16,0(%[x])\n\t" + "vl %%v17,16(%[x])\n\t" + "vl %%v18,32(%[x])\n\t" + "vl %%v19,48(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v20,8(%[x]),0\n\t" "wflcdb %%v20,%%v20\n\t" @@ -69,8 +69,8 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "pfd 1,1024(%%r1,%[ap2])\n\t" "pfd 1,1024(%%r1,%[ap3])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y]),3\n\t" - "vl %%v1,16(%%r1,%[y]),3\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" "vlrepg %%v24,0(%%r1,%[ap0])\n\t" "vlrepg %%v25,8(%%r1,%[ap0])\n\t" "vlrepg %%v26,0(%%r1,%[ap1])\n\t" @@ -103,8 +103,8 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vfmadb %%v1,%%v30,%%v19,%%v1\n\t" "vfmadb %%v0,%%v27,%%v23,%%v0\n\t" "vfmadb %%v1,%%v31,%%v23,%%v1\n\t" - "vst %%v0,0(%%r1,%[y]),3\n\t" - "vst %%v1,16(%%r1,%[y]),3\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" "agfi %%r1,32\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) @@ -119,8 +119,8 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { } static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - __asm__("vl %%v16,0(%[x]),3\n\t" - "vl %%v17,16(%[x]),3\n\t" + __asm__("vl %%v16,0(%[x])\n\t" + "vl %%v17,16(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v18,8(%[x]),0\n\t" "wflcdb %%v18,%%v18\n\t" @@ -142,8 +142,8 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "pfd 1,1024(%%r1,%[ap0])\n\t" "pfd 1,1024(%%r1,%[ap1])\n\t" "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y]),3\n\t" - "vl %%v1,16(%%r1,%[y]),3\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" "vlrepg %%v20,0(%%r1,%[ap0])\n\t" "vlrepg %%v21,8(%%r1,%[ap0])\n\t" "vlrepg %%v22,0(%%r1,%[ap1])\n\t" @@ -160,8 +160,8 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { "vfmadb %%v1,%%v26,%%v17,%%v1\n\t" "vfmadb %%v0,%%v23,%%v19,%%v0\n\t" "vfmadb %%v1,%%v27,%%v19,%%v1\n\t" - "vst %%v0,0(%%r1,%[y]),3\n\t" - "vst %%v1,16(%%r1,%[y]),3\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" "agfi %%r1,32\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) @@ -173,7 +173,7 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { } static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { - __asm__("vl %%v16,0(%[x]),3\n\t" + __asm__("vl %%v16,0(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v17,8(%[x]),0\n\t" "wflcdb %%v17,%%v17\n\t" @@ -188,8 +188,8 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 1,1024(%%r1,%[ap])\n\t" "pfd 
2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y]),3\n\t" - "vl %%v1,16(%%r1,%[y]),3\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" "vlrepg %%v18,0(%%r1,%[ap])\n\t" "vlrepg %%v19,8(%%r1,%[ap])\n\t" "vlrepg %%v20,16(%%r1,%[ap])\n\t" @@ -198,8 +198,8 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { "vfmadb %%v1,%%v20,%%v16,%%v1\n\t" "vfmadb %%v0,%%v19,%%v17,%%v0\n\t" "vfmadb %%v1,%%v21,%%v17,%%v1\n\t" - "vst %%v0,0(%%r1,%[y]),3\n\t" - "vst %%v1,16(%%r1,%[y]),3\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" "agfi %%r1,32\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) @@ -227,14 +227,14 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, "0:\n\t" "pfd 1,1024(%%r1,%[src])\n\t" "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src]),3\n\t" - "vl %%v17,16(%%r1,%[src]),3\n\t" - "vl %%v18,32(%%r1,%[src]),3\n\t" - "vl %%v19,48(%%r1,%[src]),3\n\t" - "vl %%v20,0(%%r1,%[dest]),3\n\t" - "vl %%v21,16(%%r1,%[dest]),3\n\t" - "vl %%v22,32(%%r1,%[dest]),3\n\t" - "vl %%v23,48(%%r1,%[dest]),3\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,0(%%r1,%[dest])\n\t" + "vl %%v21,16(%%r1,%[dest])\n\t" + "vl %%v22,32(%%r1,%[dest])\n\t" + "vl %%v23,48(%%r1,%[dest])\n\t" "vpdi %%v24,%%v16,%%v16,4\n\t" "vpdi %%v25,%%v17,%%v17,4\n\t" "vpdi %%v26,%%v18,%%v18,4\n\t" @@ -247,10 +247,10 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, "vfmadb %%v29,%%v25,%%v1,%%v29\n\t" "vfmadb %%v30,%%v26,%%v1,%%v30\n\t" "vfmadb %%v31,%%v27,%%v1,%%v31\n\t" - "vst %%v28,0(%%r1,%[dest]),3\n\t" - "vst %%v29,16(%%r1,%[dest]),3\n\t" - "vst %%v30,32(%%r1,%[dest]),3\n\t" - "vst %%v31,48(%%r1,%[dest]),3\n\t" + "vst %%v28,0(%%r1,%[dest])\n\t" + "vst %%v29,16(%%r1,%[dest])\n\t" + "vst %%v30,32(%%r1,%[dest])\n\t" + "vst %%v31,48(%%r1,%[dest])\n\t" "agfi %%r1,64\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) dest),[n] "+&r"(n) diff --git a/kernel/zarch/zgemv_t_4.c b/kernel/zarch/zgemv_t_4.c index 429824dcf..7b3e6c1fc 100644 --- a/kernel/zarch/zgemv_t_4.c +++ b/kernel/zarch/zgemv_t_4.c @@ -47,7 +47,7 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "pfd 1,1024(%%r1,%[ap2])\n\t" "pfd 1,1024(%%r1,%[ap3])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x]),3\n\t" + "vl %%v0,0(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,8(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -73,7 +73,7 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v22,%%v29,%%v1,%%v22\n\t" "vfmadb %%v19,%%v30,%%v0,%%v19\n\t" "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" - "vl %%v0,16(%%r1,%[x]),3\n\t" + "vl %%v0,16(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,24(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -120,10 +120,10 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vleg %%v24,0(%[alpha]),0\n\t" "vlrepg %%v25,8(%[alpha])\n\t" #endif - "vl %%v26,0(%[y]),3\n\t" - "vl %%v27,16(%[y]),3\n\t" - "vl %%v28,32(%[y]),3\n\t" - "vl %%v29,48(%[y]),3\n\t" + "vl %%v26,0(%[y])\n\t" + "vl %%v27,16(%[y])\n\t" + "vl %%v28,32(%[y])\n\t" + "vl %%v29,48(%[y])\n\t" "vfmadb %%v26,%%v16,%%v24,%%v26\n\t" "vfmadb %%v26,%%v20,%%v25,%%v26\n\t" "vfmadb %%v27,%%v17,%%v24,%%v27\n\t" @@ -132,10 +132,10 @@ static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb 
%%v28,%%v22,%%v25,%%v28\n\t" "vfmadb %%v29,%%v19,%%v24,%%v29\n\t" "vfmadb %%v29,%%v23,%%v25,%%v29\n\t" - "vst %%v26,0(%[y]),3\n\t" - "vst %%v27,16(%[y]),3\n\t" - "vst %%v28,32(%[y]),3\n\t" - "vst %%v29,48(%[y]),3" + "vst %%v26,0(%[y])\n\t" + "vst %%v27,16(%[y])\n\t" + "vst %%v28,32(%[y])\n\t" + "vst %%v29,48(%[y])" : "+m"(*(FLOAT (*)[8]) y),[n] "+&r"(n) : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), @@ -160,7 +160,7 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "pfd 1,1024(%%r1,%[ap0])\n\t" "pfd 1,1024(%%r1,%[ap1])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x]),3\n\t" + "vl %%v0,0(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,8(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -178,7 +178,7 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmadb %%v18,%%v21,%%v1,%%v18\n\t" "vfmadb %%v17,%%v22,%%v0,%%v17\n\t" "vfmadb %%v19,%%v23,%%v1,%%v19\n\t" - "vl %%v0,16(%%r1,%[x]),3\n\t" + "vl %%v0,16(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,24(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -213,14 +213,14 @@ static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vleg %%v20,0(%[alpha]),0\n\t" "vlrepg %%v21,8(%[alpha])\n\t" #endif - "vl %%v22,0(%[y]),3\n\t" - "vl %%v23,16(%[y]),3\n\t" + "vl %%v22,0(%[y])\n\t" + "vl %%v23,16(%[y])\n\t" "vfmadb %%v22,%%v16,%%v20,%%v22\n\t" "vfmadb %%v22,%%v18,%%v21,%%v22\n\t" "vfmadb %%v23,%%v17,%%v20,%%v23\n\t" "vfmadb %%v23,%%v19,%%v21,%%v23\n\t" - "vst %%v22,0(%[y]),3\n\t" - "vst %%v23,16(%[y]),3\n\t" + "vst %%v22,0(%[y])\n\t" + "vst %%v23,16(%[y])\n\t" : "+m"(*(FLOAT (*)[4]) y),[n] "+&r"(n) : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), @@ -239,7 +239,7 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "0:\n\t" "pfd 1,1024(%%r1,%[ap])\n\t" "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x]),3\n\t" + "vl %%v0,0(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,8(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -253,7 +253,7 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "vlrepg %%v19,8(%%r1,%[ap])\n\t" "vfmadb %%v16,%%v18,%%v0,%%v16\n\t" "vfmadb %%v17,%%v19,%%v1,%%v17\n\t" - "vl %%v0,16(%%r1,%[x]),3\n\t" + "vl %%v0,16(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) "vleg %%v1,24(%%r1,%[x]),0\n\t" "wflcdb %%v1,%%v1\n\t" @@ -282,10 +282,10 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "vleg %%v18,0(%[alpha]),0\n\t" "vlrepg %%v19,8(%[alpha])\n\t" #endif - "vl %%v0,0(%[y]),3\n\t" + "vl %%v0,0(%[y])\n\t" "vfmadb %%v0,%%v16,%%v18,%%v0\n\t" "vfmadb %%v0,%%v17,%%v19,%%v0\n\t" - "vst %%v0,0(%[y]),3\n\t" + "vst %%v0,0(%[y])\n\t" : "+m"(*(FLOAT (*)[2]) y),[n] "+&r"(n) : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), diff --git a/kernel/zarch/zrot.c b/kernel/zarch/zrot.c index ea81e4741..aa7f16605 100644 --- a/kernel/zarch/zrot.c +++ b/kernel/zarch/zrot.c @@ -35,14 +35,14 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x]),3\n\t" - "vl %%v25, 16(%%r1,%[x]),3\n\t" - 
"vl %%v26, 32(%%r1,%[x]),3\n\t" - "vl %%v27, 48(%%r1,%[x]),3\n\t" - "vl %%v16, 0(%%r1,%[y]),3\n\t" - "vl %%v17, 16(%%r1,%[y]),3\n\t" - "vl %%v18, 32(%%r1,%[y]),3\n\t" - "vl %%v19, 48(%%r1,%[y]),3\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -60,22 +60,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x]),3\n\t" - "vst %%v29, 16(%%r1,%[x]),3\n\t" - "vst %%v30, 32(%%r1,%[x]),3\n\t" - "vst %%v31, 48(%%r1,%[x]),3\n\t" - "vst %%v20, 0(%%r1,%[y]),3\n\t" - "vst %%v21, 16(%%r1,%[y]),3\n\t" - "vst %%v22, 32(%%r1,%[y]),3\n\t" - "vst %%v23, 48(%%r1,%[y]),3\n\t" - "vl %%v24, 64(%%r1,%[x]),3\n\t" - "vl %%v25, 80(%%r1,%[x]),3\n\t" - "vl %%v26, 96(%%r1,%[x]),3\n\t" - "vl %%v27, 112(%%r1,%[x]),3\n\t" - "vl %%v16, 64(%%r1,%[y]),3\n\t" - "vl %%v17, 80(%%r1,%[y]),3\n\t" - "vl %%v18, 96(%%r1,%[y]),3\n\t" - "vl %%v19, 112(%%r1,%[y]),3\n\t" + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -93,22 +93,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x]),3\n\t" - "vst %%v29, 80(%%r1,%[x]),3\n\t" - "vst %%v30, 96(%%r1,%[x]),3\n\t" - "vst %%v31, 112(%%r1,%[x]),3\n\t" - "vst %%v20, 64(%%r1,%[y]),3\n\t" - "vst %%v21, 80(%%r1,%[y]),3\n\t" - "vst %%v22, 96(%%r1,%[y]),3\n\t" - "vst %%v23, 112(%%r1,%[y]),3\n\t" - "vl %%v24, 128(%%r1,%[x]),3\n\t" - "vl %%v25, 144(%%r1,%[x]),3\n\t" - "vl %%v26, 160(%%r1,%[x]),3\n\t" - "vl %%v27, 176(%%r1,%[x]),3\n\t" - "vl %%v16, 128(%%r1,%[y]),3\n\t" - "vl %%v17, 144(%%r1,%[y]),3\n\t" - "vl %%v18, 160(%%r1,%[y]),3\n\t" - "vl %%v19, 176(%%r1,%[y]),3\n\t" + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -126,22 +126,22 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn 
*/ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x]),3\n\t" - "vst %%v29, 144(%%r1,%[x]),3\n\t" - "vst %%v30, 160(%%r1,%[x]),3\n\t" - "vst %%v31, 176(%%r1,%[x]),3\n\t" - "vst %%v20, 128(%%r1,%[y]),3\n\t" - "vst %%v21, 144(%%r1,%[y]),3\n\t" - "vst %%v22, 160(%%r1,%[y]),3\n\t" - "vst %%v23, 176(%%r1,%[y]),3\n\t" - "vl %%v24, 192(%%r1,%[x]),3\n\t" - "vl %%v25, 208(%%r1,%[x]),3\n\t" - "vl %%v26, 224(%%r1,%[x]),3\n\t" - "vl %%v27, 240(%%r1,%[x]),3\n\t" - "vl %%v16, 192(%%r1,%[y]),3\n\t" - "vl %%v17, 208(%%r1,%[y]),3\n\t" - "vl %%v18, 224(%%r1,%[y]),3\n\t" - "vl %%v19, 240(%%r1,%[y]),3\n\t" + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" "vfmdb %%v28,%%v24,%%v0\n\t" "vfmdb %%v29,%%v25,%%v0\n\t" "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ @@ -159,14 +159,14 @@ static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 192(%%r1,%[x]),3\n\t" - "vst %%v29, 208(%%r1,%[x]),3\n\t" - "vst %%v30, 224(%%r1,%[x]),3\n\t" - "vst %%v31, 240(%%r1,%[x]),3\n\t" - "vst %%v20, 192(%%r1,%[y]),3\n\t" - "vst %%v21, 208(%%r1,%[y]),3\n\t" - "vst %%v22, 224(%%r1,%[y]),3\n\t" - "vst %%v23, 240(%%r1,%[y]),3\n\t" + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index 7fd62a1ac..fbcc0c5b9 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -36,14 +36,14 @@ static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vpdi %%v24,%%v16,%%v16,4\n\t" "vpdi %%v25,%%v17,%%v17,4\n\t" "vpdi %%v26,%%v18,%%v18,4\n\t" @@ -68,14 +68,14 @@ static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vfmadb %%v21,%%v29,%%v1,%%v21\n\t" "vfmadb %%v22,%%v30,%%v1,%%v22\n\t" "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" - "vst %%v16,0(%%r1,%[x]),3\n\t" - "vst %%v17,16(%%r1,%[x]),3\n\t" - "vst %%v18,32(%%r1,%[x]),3\n\t" - "vst %%v19,48(%%r1,%[x]),3\n\t" - "vst %%v20,64(%%r1,%[x]),3\n\t" - "vst %%v21,80(%%r1,%[x]),3\n\t" - "vst %%v22,96(%%r1,%[x]),3\n\t" - "vst 
%%v23,112(%%r1,%[x]),3\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) @@ -93,14 +93,14 @@ static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vpdi %%v16,%%v16,%%v16,4\n\t" "vpdi %%v17,%%v17,%%v17,4\n\t" "vpdi %%v18,%%v18,%%v18,4\n\t" @@ -117,14 +117,14 @@ static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vfmdb %%v21,%%v21,%%v0\n\t" "vfmdb %%v22,%%v22,%%v0\n\t" "vfmdb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x]),3\n\t" - "vst %%v17,16(%%r1,%[x]),3\n\t" - "vst %%v18,32(%%r1,%[x]),3\n\t" - "vst %%v19,48(%%r1,%[x]),3\n\t" - "vst %%v20,64(%%r1,%[x]),3\n\t" - "vst %%v21,80(%%r1,%[x]),3\n\t" - "vst %%v22,96(%%r1,%[x]),3\n\t" - "vst %%v23,112(%%r1,%[x]),3\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) @@ -139,14 +139,14 @@ static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x]),3\n\t" - "vl %%v17,16(%%r1,%[x]),3\n\t" - "vl %%v18,32(%%r1,%[x]),3\n\t" - "vl %%v19,48(%%r1,%[x]),3\n\t" - "vl %%v20,64(%%r1,%[x]),3\n\t" - "vl %%v21,80(%%r1,%[x]),3\n\t" - "vl %%v22,96(%%r1,%[x]),3\n\t" - "vl %%v23,112(%%r1,%[x]),3\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" "vfmdb %%v16,%%v16,%%v0\n\t" "vfmdb %%v17,%%v17,%%v0\n\t" "vfmdb %%v18,%%v18,%%v0\n\t" @@ -155,14 +155,14 @@ static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { "vfmdb %%v21,%%v21,%%v0\n\t" "vfmdb %%v22,%%v22,%%v0\n\t" "vfmdb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x]),3\n\t" - "vst %%v17,16(%%r1,%[x]),3\n\t" - "vst %%v18,32(%%r1,%[x]),3\n\t" - "vst %%v19,48(%%r1,%[x]),3\n\t" - "vst %%v20,64(%%r1,%[x]),3\n\t" - "vst %%v21,80(%%r1,%[x]),3\n\t" - "vst %%v22,96(%%r1,%[x]),3\n\t" - "vst %%v23,112(%%r1,%[x]),3\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) @@ -177,14 +177,14 @@ static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { "xgr %%r1,%%r1\n\t" "0:\n\t" 
"pfd 2, 1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x]),3\n\t" - "vst %%v0,16(%%r1,%[x]),3\n\t" - "vst %%v0,32(%%r1,%[x]),3\n\t" - "vst %%v0,48(%%r1,%[x]),3\n\t" - "vst %%v0,64(%%r1,%[x]),3\n\t" - "vst %%v0,80(%%r1,%[x]),3\n\t" - "vst %%v0,96(%%r1,%[x]),3\n\t" - "vst %%v0,112(%%r1,%[x]),3\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" "agfi %%r1,128\n\t" "brctg %[n],0b" : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) diff --git a/kernel/zarch/zswap.c b/kernel/zarch/zswap.c index 0252ab8db..0f38103be 100644 --- a/kernel/zarch/zswap.c +++ b/kernel/zarch/zswap.c @@ -33,70 +33,70 @@ static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { "0:\n\t" "pfd 2, 1024(%%r1,%[x])\n\t" "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x]),3\n\t" - "vl %%v17, 16(%%r1,%[x]),3\n\t" - "vl %%v18, 32(%%r1,%[x]),3\n\t" - "vl %%v19, 48(%%r1,%[x]),3\n\t" - "vl %%v20, 64(%%r1,%[x]),3\n\t" - "vl %%v21, 80(%%r1,%[x]),3\n\t" - "vl %%v22, 96(%%r1,%[x]),3\n\t" - "vl %%v23, 112(%%r1,%[x]),3\n\t" - "vl %%v24, 128(%%r1,%[x]),3\n\t" - "vl %%v25, 144(%%r1,%[x]),3\n\t" - "vl %%v26, 160(%%r1,%[x]),3\n\t" - "vl %%v27, 176(%%r1,%[x]),3\n\t" - "vl %%v28, 192(%%r1,%[x]),3\n\t" - "vl %%v29, 208(%%r1,%[x]),3\n\t" - "vl %%v30, 224(%%r1,%[x]),3\n\t" - "vl %%v31, 240(%%r1,%[x]),3\n\t" - "vl %%v0, 0(%%r1,%[y]),3\n\t" - "vl %%v1, 16(%%r1,%[y]),3\n\t" - "vl %%v2, 32(%%r1,%[y]),3\n\t" - "vl %%v3, 48(%%r1,%[y]),3\n\t" - "vl %%v4, 64(%%r1,%[y]),3\n\t" - "vl %%v5, 80(%%r1,%[y]),3\n\t" - "vl %%v6, 96(%%r1,%[y]),3\n\t" - "vl %%v7, 112(%%r1,%[y]),3\n\t" - "vst %%v0, 0(%%r1,%[x]),3\n\t" - "vst %%v1, 16(%%r1,%[x]),3\n\t" - "vst %%v2, 32(%%r1,%[x]),3\n\t" - "vst %%v3, 48(%%r1,%[x]),3\n\t" - "vst %%v4, 64(%%r1,%[x]),3\n\t" - "vst %%v5, 80(%%r1,%[x]),3\n\t" - "vst %%v6, 96(%%r1,%[x]),3\n\t" - "vst %%v7, 112(%%r1,%[x]),3\n\t" - "vl %%v0, 128(%%r1,%[y]),3\n\t" - "vl %%v1, 144(%%r1,%[y]),3\n\t" - "vl %%v2, 160(%%r1,%[y]),3\n\t" - "vl %%v3, 176(%%r1,%[y]),3\n\t" - "vl %%v4, 192(%%r1,%[y]),3\n\t" - "vl %%v5, 208(%%r1,%[y]),3\n\t" - "vl %%v6, 224(%%r1,%[y]),3\n\t" - "vl %%v7, 240(%%r1,%[y]),3\n\t" - "vst %%v0, 128(%%r1,%[x]),3\n\t" - "vst %%v1, 144(%%r1,%[x]),3\n\t" - "vst %%v2, 160(%%r1,%[x]),3\n\t" - "vst %%v3, 176(%%r1,%[x]),3\n\t" - "vst %%v4, 192(%%r1,%[x]),3\n\t" - "vst %%v5, 208(%%r1,%[x]),3\n\t" - "vst %%v6, 224(%%r1,%[x]),3\n\t" - "vst %%v7, 240(%%r1,%[x]),3\n\t" - "vst %%v16, 0(%%r1,%[y]),3\n\t" - "vst %%v17, 16(%%r1,%[y]),3\n\t" - "vst %%v18, 32(%%r1,%[y]),3\n\t" - "vst %%v19, 48(%%r1,%[y]),3\n\t" - "vst %%v20, 64(%%r1,%[y]),3\n\t" - "vst %%v21, 80(%%r1,%[y]),3\n\t" - "vst %%v22, 96(%%r1,%[y]),3\n\t" - "vst %%v23, 112(%%r1,%[y]),3\n\t" - "vst %%v24, 128(%%r1,%[y]),3\n\t" - "vst %%v25, 144(%%r1,%[y]),3\n\t" - "vst %%v26, 160(%%r1,%[y]),3\n\t" - "vst %%v27, 176(%%r1,%[y]),3\n\t" - "vst %%v28, 192(%%r1,%[y]),3\n\t" - "vst %%v29, 208(%%r1,%[y]),3\n\t" - "vst %%v30, 224(%%r1,%[y]),3\n\t" - "vst %%v31, 240(%%r1,%[y]),3\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl 
%%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" "agfi %%r1,256\n\t" "brctg %[n],0b" : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) From 69edc5bbe79af88710666aa909e7b39c89558b9c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 7 Feb 2019 20:06:13 +0100 Subject: [PATCH 073/133] Restore dropped patches in the non-TLS branch of memory.c (#2004) * Restore dropped patches in the non-TLS branch of memory.c As discovered in #2002, the reintroduction of the "original" non-TLS version of memory.c as an alternate branch had inadvertently used ba1f91f rather than a8002e2 , thereby dropping the commits for #1450, #1468, #1501, #1504 and #1520. 
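For context, most of the restored hunks follow one pattern: the global buffer-table lock is only taken in threaded non-OpenMP builds, presumably because OpenMP builds serialize this path differently. A minimal sketch of that shape (names LOCK_COMMAND, UNLOCK_COMMAND, alloc_lock, release_info, release_pos taken from the hunks below; this is an outline, not the verbatim code):

/* Sketch only: guard the shared release bookkeeping with alloc_lock,
   but compile the locking out when it is not needed. */
#if defined(SMP) && !defined(USE_OPENMP)
  LOCK_COMMAND(&alloc_lock);
#endif
  release_info[release_pos].address = map_address;
  release_info[release_pos].func = alloc_mmap_free;
  release_pos ++;
#if defined(SMP) && !defined(USE_OPENMP)
  UNLOCK_COMMAND(&alloc_lock);
#endif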
--- driver/others/memory.c | 77 ++++++++++++++++++++++++++++++------------ 1 file changed, 55 insertions(+), 22 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 72d3e173c..2e185593e 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1603,9 +1603,11 @@ void gotoblas_dummy_for_PGI(void) { #endif #else +/* USE_TLS / COMPILE_TLS not set */ + #include -#ifdef OS_WINDOWS +#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) #define ALLOC_WINDOWS #ifndef MEM_LARGE_PAGES #define MEM_LARGE_PAGES 0x20000000 @@ -1619,7 +1621,7 @@ void gotoblas_dummy_for_PGI(void) { #include #include -#ifndef OS_WINDOWS +#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT) #include #ifndef NO_SYSV_IPC #include @@ -1639,7 +1641,7 @@ void gotoblas_dummy_for_PGI(void) { #include #endif -#if defined(OS_FREEBSD) || defined(OS_DARWIN) +#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) #include #include #endif @@ -1678,9 +1680,12 @@ void gotoblas_dummy_for_PGI(void) { #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) #define CONSTRUCTOR __attribute__ ((constructor)) #define DESTRUCTOR __attribute__ ((destructor)) -#else +#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900)) #define CONSTRUCTOR __attribute__ ((constructor(101))) #define DESTRUCTOR __attribute__ ((destructor(101))) +#else +#define CONSTRUCTOR __attribute__ ((constructor)) +#define DESTRUCTOR __attribute__ ((destructor)) #endif #ifdef DYNAMIC_ARCH @@ -1740,7 +1745,8 @@ int i,n; size = CPU_ALLOC_SIZE(nums); ret = sched_getaffinity(0,size,cpusetp); if (ret!=0) return nums; - nums = CPU_COUNT_S(size,cpusetp); + ret = CPU_COUNT_S(size,cpusetp); + if (ret > 0 && ret < nums) nums = ret; CPU_FREE(cpusetp); return nums; #endif @@ -1756,7 +1762,7 @@ int get_num_procs(void) { return nums; } #endif - + #ifdef OS_HAIKU int get_num_procs(void) { static int nums = 0; @@ -1793,7 +1799,7 @@ int get_num_procs(void) { #endif -#if defined(OS_FREEBSD) +#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) int get_num_procs(void) { @@ -1870,7 +1876,7 @@ void openblas_fork_handler() // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035 // In the mean time build with USE_OPENMP=0 or link against another // implementation of OpenMP. 
-#if !(defined(OS_WINDOWS) || defined(OS_ANDROID)) && defined(SMP_SERVER) +#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER) int err; err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL); if(err != 0) @@ -1883,7 +1889,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) int max_num; #endif int blas_goto_num = 0; @@ -1891,11 +1897,11 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) max_num = get_num_procs(); #endif - blas_goto_num = 0; + // blas_goto_num = 0; #ifndef USE_OPENMP blas_goto_num=openblas_num_threads_env(); if (blas_goto_num < 0) blas_goto_num = 0; @@ -1907,7 +1913,7 @@ int blas_get_cpu_number(void){ #endif - blas_omp_num = 0; + // blas_omp_num = 0; blas_omp_num=openblas_omp_num_threads_env(); if (blas_omp_num < 0) blas_omp_num = 0; @@ -1915,7 +1921,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif @@ -2002,11 +2008,15 @@ static void *alloc_mmap(void *address){ } if (map_address != (void *)-1) { +#if defined(SMP) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); +#endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; +#if defined(SMP) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); +#endif } #ifdef OS_LINUX @@ -2148,14 +2158,18 @@ static void *alloc_mmap(void *address){ #if defined(OS_LINUX) && !defined(NO_WARMUP) } #endif - LOCK_COMMAND(&alloc_lock); if (map_address != (void *)-1) { +#if defined(SMP) && !defined(USE_OPENMP) + LOCK_COMMAND(&alloc_lock); +#endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; +#if defined(SMP) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif } - UNLOCK_COMMAND(&alloc_lock); return map_address; } @@ -2554,6 +2568,11 @@ void *blas_memory_alloc(int procpos){ NULL, }; void *(**func)(void *address); + +#if defined(USE_OPENMP) + if (!memory_initialized) { +#endif + LOCK_COMMAND(&alloc_lock); if (!memory_initialized) { @@ -2589,6 +2608,9 @@ void *blas_memory_alloc(int procpos){ } UNLOCK_COMMAND(&alloc_lock); +#if defined(USE_OPENMP) + } +#endif #ifdef DEBUG printf("Alloc Start ...\n"); @@ -2603,13 +2625,17 @@ void *blas_memory_alloc(int procpos){ do { if (!memory[position].used && (memory[position].pos == mypos)) { +#if defined(SMP) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); -// blas_lock(&memory[position].lock); - +#else + 
blas_lock(&memory[position].lock); +#endif if (!memory[position].used) goto allocation; - +#if defined(SMP) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); -// blas_unlock(&memory[position].lock); +#else + blas_unlock(&memory[position].lock); +#endif } position ++; @@ -2647,7 +2673,6 @@ void *blas_memory_alloc(int procpos){ memory[position].used = 1; UNLOCK_COMMAND(&alloc_lock); -/* blas_unlock(&memory[position].lock);*/ if (!memory[position].addr) { do { @@ -2693,9 +2718,13 @@ void *blas_memory_alloc(int procpos){ } while ((BLASLONG)map_address == -1); +#if defined(SMP) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); +#endif memory[position].addr = map_address; +#if defined(SMP) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); +#endif #ifdef DEBUG printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); @@ -2749,8 +2778,9 @@ void blas_memory_free(void *free_area){ #endif position = 0; +#if defined(SMP) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); - +#endif while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) position++; @@ -2764,7 +2794,9 @@ void blas_memory_free(void *free_area){ WMB; memory[position].used = 0; +#if defined(SMP) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); +#endif #ifdef DEBUG printf("Unmap Succeeded.\n\n"); @@ -2779,8 +2811,9 @@ void blas_memory_free(void *free_area){ for (position = 0; position < NUM_BUFFERS; position++) printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); #endif +#if defined(SMP) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); - +#endif return; } From 03a2bf2602714360fdf7096a4fc362ecfc700823 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 10 Feb 2019 23:24:45 +0100 Subject: [PATCH 074/133] Fix potential memory leak in cpu enumeration on Linux (#2008) * Fix potential memory leak in cpu enumeration with glibc An early return after a failed call to sched_getaffinity would leak the previously allocated cpu_set_t. Wrong calculation of the size argument in that call increased the likelihood of that failure.
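In rough outline, the corrected logic looks like the sketch below (a simplified illustration, not the verbatim code: the real function is get_num_procs() in driver/others/memory.c, and count_usable_cpus is an illustrative stand-in; the glibc calls CPU_ALLOC, CPU_ALLOC_SIZE, CPU_COUNT_S, CPU_FREE and sched_getaffinity are the ones used by the diff):

#define _GNU_SOURCE
#include <sched.h>
#include <unistd.h>

/* Sketch: size the affinity mask correctly and free it on every path.
   The small-system case uses a stack cpu_set_t, so nothing can leak;
   the large-system case must CPU_FREE() even when sched_getaffinity()
   fails, which is the leak this commit closes. */
static int count_usable_cpus(void) {
  int nums = sysconf(_SC_NPROCESSORS_CONF);
  if (nums < CPU_SETSIZE) {
    cpu_set_t cpuset;
    if (sched_getaffinity(0, sizeof(cpuset), &cpuset) != 0)
      return nums;
    int ret = CPU_COUNT(&cpuset);
    return (ret > 0 && ret < nums) ? ret : nums;
  }
  cpu_set_t *cpusetp = CPU_ALLOC(nums);
  if (cpusetp == NULL)
    return nums;
  size_t size = CPU_ALLOC_SIZE(nums);       /* not sizeof(cpu_set_t) */
  if (sched_getaffinity(0, size, cpusetp) != 0) {
    CPU_FREE(cpusetp);                      /* previously leaked here */
    return nums;
  }
  int ret = CPU_COUNT_S(size, cpusetp);
  CPU_FREE(cpusetp);
  return (ret > 0 && ret < nums) ? ret : nums;
}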
Fixes #2003 --- driver/others/memory.c | 123 ++++++++++++++++++++++++++++------------- 1 file changed, 85 insertions(+), 38 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 2e185593e..09851f15c 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -198,45 +198,68 @@ int get_num_procs(void); #else int get_num_procs(void) { static int nums = 0; -cpu_set_t *cpusetp; -size_t size; -int ret; -int i,n; + cpu_set_t cpuset,*cpusetp; + size_t size; + int ret; + +#if defined(__GLIBC_PREREQ) +#if !__GLIBC_PREREQ(2, 7) + int i; +#if !__GLIBC_PREREQ(2, 6) + int n; +#endif +#endif +#endif if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); #if !defined(OS_LINUX) - return nums; + return nums; #endif #if !defined(__GLIBC_PREREQ) - return nums; + return nums; #else #if !__GLIBC_PREREQ(2, 3) - return nums; + return nums; #endif #if !__GLIBC_PREREQ(2, 7) - ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); + ret = sched_getaffinity(0,sizeof(cpuset), &cpuset); if (ret!=0) return nums; n=0; #if !__GLIBC_PREREQ(2, 6) for (i=0;i 0 && ret < nums) nums = ret; - CPU_FREE(cpusetp); - return nums; + if (nums >= CPU_SETSIZE) { + cpusetp = CPU_ALLOC(nums); + if (cpusetp == NULL) { + return nums; + } + size = CPU_ALLOC_SIZE(nums); + ret = sched_getaffinity(0,size,cpusetp); + if (ret!=0) { + CPU_FREE(cpusetp); + return nums; + } + ret = CPU_COUNT_S(size,cpusetp); + if (ret > 0 && ret < nums) nums = ret; + CPU_FREE(cpusetp); + return nums; + } else { + ret = sched_getaffinity(0,sizeof(cpuset),&cpuset); + if (ret!=0) { + return nums; + } + ret = CPU_COUNT(&cpuset); + if (ret > 0 && ret < nums) nums = ret; + return nums; + } #endif #endif } @@ -1709,46 +1732,70 @@ void goto_set_num_threads(int num_threads) {}; int get_num_procs(void); #else int get_num_procs(void) { + static int nums = 0; -cpu_set_t *cpusetp; -size_t size; -int ret; -int i,n; + cpu_set_t cpuset,*cpusetp; + size_t size; + int ret; + +#if defined(__GLIBC_PREREQ) +#if !__GLIBC_PREREQ(2, 7) + int i; +#if !__GLIBC_PREREQ(2, 6) + int n; +#endif +#endif +#endif if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); #if !defined(OS_LINUX) - return nums; + return nums; #endif #if !defined(__GLIBC_PREREQ) - return nums; + return nums; #else #if !__GLIBC_PREREQ(2, 3) - return nums; + return nums; #endif #if !__GLIBC_PREREQ(2, 7) - ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); + ret = sched_getaffinity(0,sizeof(cpuset), &cpuset); if (ret!=0) return nums; n=0; #if !__GLIBC_PREREQ(2, 6) for (i=0;i 0 && ret < nums) nums = ret; - CPU_FREE(cpusetp); - return nums; + if (nums >= CPU_SETSIZE) { + cpusetp = CPU_ALLOC(nums); + if (cpusetp == NULL) { + return nums; + } + size = CPU_ALLOC_SIZE(nums); + ret = sched_getaffinity(0,size,cpusetp); + if (ret!=0) { + CPU_FREE(cpusetp); + return nums; + } + ret = CPU_COUNT_S(size,cpusetp); + if (ret > 0 && ret < nums) nums = ret; + CPU_FREE(cpusetp); + return nums; + } else { + ret = sched_getaffinity(0,sizeof(cpuset),&cpuset); + if (ret!=0) { + return nums; + } + ret = CPU_COUNT(&cpuset); + if (ret > 0 && ret < nums) nums = ret; + return nums; + } #endif #endif } From 77fe70019f0fb4064eec2a5b26a6057acef29b58 Mon Sep 17 00:00:00 2001 From: maamountki Date: Mon, 11 Feb 2019 16:01:13 +0200 Subject: [PATCH 075/133] [ZARCH] Fix constraints and source code formatting --- kernel/zarch/camax.c | 212 +++++------ kernel/zarch/camin.c | 212 +++++------ kernel/zarch/casum.c | 154 ++++---- kernel/zarch/caxpy.c | 130 +++---- kernel/zarch/ccopy.c | 21 +- kernel/zarch/cdot.c | 148 ++++---- 
kernel/zarch/cgemv_n_4.c | 590 ++++++++++++++--------------- kernel/zarch/cgemv_t_4.c | 52 +-- kernel/zarch/crot.c | 291 +++++++-------- kernel/zarch/cscal.c | 309 ++++++++-------- kernel/zarch/cswap.c | 151 ++++---- kernel/zarch/damax.c | 90 ++--- kernel/zarch/damax_z13.c | 158 ++++---- kernel/zarch/damin.c | 90 ++--- kernel/zarch/damin_z13.c | 158 ++++---- kernel/zarch/dasum.c | 150 ++++---- kernel/zarch/daxpy.c | 152 ++++---- kernel/zarch/dcopy.c | 20 +- kernel/zarch/ddot.c | 108 +++--- kernel/zarch/dgemv_n_4.c | 624 ++++++++++++++++--------------- kernel/zarch/dgemv_t_4.c | 780 ++++++++++++++++++++------------------- kernel/zarch/dmax.c | 90 ++--- kernel/zarch/dmax_z13.c | 124 +++---- kernel/zarch/dmin.c | 90 ++--- kernel/zarch/dmin_z13.c | 124 +++---- kernel/zarch/drot.c | 291 +++++++-------- kernel/zarch/dscal.c | 102 ++--- kernel/zarch/dsdot.c | 171 ++++----- kernel/zarch/dswap.c | 151 ++++---- kernel/zarch/icamax.c | 370 +++++++++---------- kernel/zarch/icamin.c | 370 +++++++++---------- kernel/zarch/idamax.c | 264 ++++++------- kernel/zarch/idamin.c | 264 ++++++------- kernel/zarch/idmax.c | 230 ++++++------ kernel/zarch/idmin.c | 230 ++++++------ kernel/zarch/isamax.c | 352 +++++++++--------- kernel/zarch/isamin.c | 352 +++++++++--------- kernel/zarch/ismax.c | 318 ++++++++-------- kernel/zarch/ismin.c | 318 ++++++++-------- kernel/zarch/izamax.c | 256 ++++++------- kernel/zarch/izamin.c | 256 ++++++------- kernel/zarch/samax.c | 94 ++--- kernel/zarch/samin.c | 94 ++--- kernel/zarch/sasum.c | 154 ++++---- kernel/zarch/saxpy.c | 152 ++++---- kernel/zarch/scopy.c | 20 +- kernel/zarch/sdot.c | 116 +++--- kernel/zarch/sgemv_n_4.c | 584 +++++++++++++++-------------- kernel/zarch/sgemv_t_4.c | 766 +++++++++++++++++++------------------- kernel/zarch/smax.c | 94 ++--- kernel/zarch/smin.c | 94 ++--- kernel/zarch/srot.c | 291 +++++++-------- kernel/zarch/sscal.c | 102 ++--- kernel/zarch/sswap.c | 151 ++++---- kernel/zarch/zamax.c | 166 ++++----- kernel/zarch/zamax_z13.c | 184 ++++----- kernel/zarch/zamin.c | 166 ++++----- kernel/zarch/zamin_z13.c | 184 ++++----- kernel/zarch/zasum.c | 150 ++++---- kernel/zarch/zaxpy.c | 138 +++---- kernel/zarch/zcopy.c | 21 +- kernel/zarch/zdot.c | 140 +++---- kernel/zarch/zgemv_n_4.c | 414 +++++++++++---------- kernel/zarch/zgemv_t_4.c | 452 ++++++++++++----------- kernel/zarch/zrot.c | 291 +++++++-------- kernel/zarch/zscal.c | 301 +++++++-------- kernel/zarch/zswap.c | 151 ++++---- 67 files changed, 7439 insertions(+), 7354 deletions(-) diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c index 40a9903e9..b10ca4752 100644 --- a/kernel/zarch/camax.c +++ b/kernel/zarch/camax.c @@ -34,112 +34,112 @@ static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amax; __asm__("vlef %%v0,0(%[x]),0\n\t" - "vlef %%v16,4(%[x]),0\n\t" - "vlef %%v0,8(%[x]),1\n\t" - "vlef %%v16,12(%[x]),1\n\t" - "vlef %%v0,16(%[x]),2\n\t" - "vlef %%v16,20(%[x]),2\n\t" - "vlef %%v0,24(%[x]),3\n\t" - "vlef %%v16,28(%[x]),3\n\t" - "vflpsb %%v0,%%v0\n\t" - "vflpsb %%v16,%%v16\n\t" - "vfasb %%v0,%%v0,%%v16\n\t" - "vleib %%v1,0,0\n\t" - "vleib %%v1,1,1\n\t" - "vleib %%v1,2,2\n\t" - "vleib %%v1,3,3\n\t" - "vleib %%v1,8,4\n\t" - "vleib %%v1,9,5\n\t" - "vleib %%v1,10,6\n\t" - "vleib %%v1,11,7\n\t" - "vleib %%v1,16,8\n\t" - "vleib %%v1,17,9\n\t" - "vleib %%v1,18,10\n\t" - "vleib %%v1,19,11\n\t" - "vleib %%v1,24,12\n\t" - "vleib %%v1,25,13\n\t" - "vleib %%v1,26,14\n\t" - "vleib %%v1,27,15\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl 
%%v16,0(%%r1,%[x])\n\t" - "vl %%v2,16(%%r1,%[x])\n\t" - "vpkg %%v17,%%v16,%%v2\n\t" - "vperm %%v16,%%v16,%%v2,%%v1\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v2,48(%%r1,%[x])\n\t" - "vpkg %%v19,%%v18,%%v2\n\t" - "vperm %%v18,%%v18,%%v2,%%v1\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v2,80(%%r1,%[x])\n\t" - "vpkg %%v21,%%v20,%%v2\n\t" - "vperm %%v20,%%v20,%%v2,%%v1\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v2,112(%%r1,%[x])\n\t" - "vpkg %%v23,%%v22,%%v2\n\t" - "vperm %%v22,%%v22,%%v2,%%v1\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v2,144(%%r1,%[x])\n\t" - "vpkg %%v25,%%v24,%%v2\n\t" - "vperm %%v24,%%v24,%%v2,%%v1\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v2,176(%%r1,%[x])\n\t" - "vpkg %%v27,%%v26,%%v2\n\t" - "vperm %%v26,%%v26,%%v2,%%v1\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v2,208(%%r1,%[x])\n\t" - "vpkg %%v29,%%v28,%%v2\n\t" - "vperm %%v28,%%v28,%%v2,%%v1\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v2,240(%%r1,%[x])\n\t" - "vpkg %%v31,%%v30,%%v2\n\t" - "vperm %%v30,%%v30,%%v2,%%v1\n\t" - "vflpsb %%v16,%%v16\n\t" - "vflpsb %%v17,%%v17\n\t" - "vflpsb %%v18,%%v18\n\t" - "vflpsb %%v19,%%v19\n\t" - "vflpsb %%v20,%%v20\n\t" - "vflpsb %%v21,%%v21\n\t" - "vflpsb %%v22,%%v22\n\t" - "vflpsb %%v23,%%v23\n\t" - "vflpsb %%v24,%%v24\n\t" - "vflpsb %%v25,%%v25\n\t" - "vflpsb %%v26,%%v26\n\t" - "vflpsb %%v27,%%v27\n\t" - "vflpsb %%v28,%%v28\n\t" - "vflpsb %%v29,%%v29\n\t" - "vflpsb %%v30,%%v30\n\t" - "vflpsb %%v31,%%v31\n\t" - "vfasb %%v16,%%v16,%%v17\n\t" - "vfasb %%v18,%%v18,%%v19\n\t" - "vfasb %%v20,%%v20,%%v21\n\t" - "vfasb %%v22,%%v22,%%v23\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vfasb %%v26,%%v26,%%v27\n\t" - "vfasb %%v28,%%v28,%%v29\n\t" - "vfasb %%v30,%%v30,%%v31\n\t" - "vfmaxsb %%v16,%%v16,%%v24,0\n\t" - "vfmaxsb %%v18,%%v18,%%v26,0\n\t" - "vfmaxsb %%v20,%%v20,%%v28,0\n\t" - "vfmaxsb %%v22,%%v22,%%v30,0\n\t" - "vfmaxsb %%v16,%%v16,%%v20,0\n\t" - "vfmaxsb %%v18,%%v18,%%v22,0\n\t" - "vfmaxsb %%v16,%%v16,%%v18,0\n\t" - "vfmaxsb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v16,%%v0,32\n\t" - "vfmaxsb %%v0,%%v0,%%v16,0\n\t" - "vrepf %%v16,%%v0,2\n\t" - "wfmaxsb %%v0,%%v0,%%v16,0\n\t" - "ler %[amax],%%f0" - : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vlef %%v16,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v16,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v16,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v16,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v16,%%v16\n\t" + "vfasb %%v0,%%v0,%%v16\n\t" + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,8,4\n\t" + "vleib %%v1,9,5\n\t" + "vleib %%v1,10,6\n\t" + "vleib %%v1,11,7\n\t" + "vleib %%v1,16,8\n\t" + "vleib %%v1,17,9\n\t" + "vleib %%v1,18,10\n\t" + "vleib %%v1,19,11\n\t" + "vleib %%v1,24,12\n\t" + "vleib %%v1,25,13\n\t" + "vleib %%v1,26,14\n\t" + "vleib %%v1,27,15\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v2,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v2\n\t" + "vperm %%v16,%%v16,%%v2,%%v1\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v2,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v2\n\t" + "vperm %%v18,%%v18,%%v2,%%v1\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v2,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v2\n\t" + "vperm 
%%v20,%%v20,%%v2,%%v1\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v2,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v2\n\t" + "vperm %%v22,%%v22,%%v2,%%v1\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v2,144(%%r1,%[x])\n\t" + "vpkg %%v25,%%v24,%%v2\n\t" + "vperm %%v24,%%v24,%%v2,%%v1\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v2,176(%%r1,%[x])\n\t" + "vpkg %%v27,%%v26,%%v2\n\t" + "vperm %%v26,%%v26,%%v2,%%v1\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v2,208(%%r1,%[x])\n\t" + "vpkg %%v29,%%v28,%%v2\n\t" + "vperm %%v28,%%v28,%%v2,%%v1\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v2,240(%%r1,%[x])\n\t" + "vpkg %%v31,%%v30,%%v2\n\t" + "vperm %%v30,%%v30,%%v2,%%v1\n\t" + "vflpsb %%v16,%%v16\n\t" + "vflpsb %%v17,%%v17\n\t" + "vflpsb %%v18,%%v18\n\t" + "vflpsb %%v19,%%v19\n\t" + "vflpsb %%v20,%%v20\n\t" + "vflpsb %%v21,%%v21\n\t" + "vflpsb %%v22,%%v22\n\t" + "vflpsb %%v23,%%v23\n\t" + "vflpsb %%v24,%%v24\n\t" + "vflpsb %%v25,%%v25\n\t" + "vflpsb %%v26,%%v26\n\t" + "vflpsb %%v27,%%v27\n\t" + "vflpsb %%v28,%%v28\n\t" + "vflpsb %%v29,%%v29\n\t" + "vflpsb %%v30,%%v30\n\t" + "vflpsb %%v31,%%v31\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v18,%%v18,%%v19\n\t" + "vfasb %%v20,%%v20,%%v21\n\t" + "vfasb %%v22,%%v22,%%v23\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v26,%%v26,%%v27\n\t" + "vfasb %%v28,%%v28,%%v29\n\t" + "vfasb %%v30,%%v30,%%v31\n\t" + "vfmaxsb %%v16,%%v16,%%v24,0\n\t" + "vfmaxsb %%v18,%%v18,%%v26,0\n\t" + "vfmaxsb %%v20,%%v20,%%v28,0\n\t" + "vfmaxsb %%v22,%%v22,%%v30,0\n\t" + "vfmaxsb %%v16,%%v16,%%v20,0\n\t" + "vfmaxsb %%v18,%%v18,%%v22,0\n\t" + "vfmaxsb %%v16,%%v16,%%v18,0\n\t" + "vfmaxsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfmaxsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfmaxsb %%v0,%%v0,%%v16,0\n\t" + "ler %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); return amax; } diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c index 842635afc..40945fae8 100644 --- a/kernel/zarch/camin.c +++ b/kernel/zarch/camin.c @@ -34,112 +34,112 @@ static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amin; __asm__("vlef %%v0,0(%[x]),0\n\t" - "vlef %%v16,4(%[x]),0\n\t" - "vlef %%v0,8(%[x]),1\n\t" - "vlef %%v16,12(%[x]),1\n\t" - "vlef %%v0,16(%[x]),2\n\t" - "vlef %%v16,20(%[x]),2\n\t" - "vlef %%v0,24(%[x]),3\n\t" - "vlef %%v16,28(%[x]),3\n\t" - "vflpsb %%v0,%%v0\n\t" - "vflpsb %%v16,%%v16\n\t" - "vfasb %%v0,%%v0,%%v16\n\t" - "vleib %%v1,0,0\n\t" - "vleib %%v1,1,1\n\t" - "vleib %%v1,2,2\n\t" - "vleib %%v1,3,3\n\t" - "vleib %%v1,8,4\n\t" - "vleib %%v1,9,5\n\t" - "vleib %%v1,10,6\n\t" - "vleib %%v1,11,7\n\t" - "vleib %%v1,16,8\n\t" - "vleib %%v1,17,9\n\t" - "vleib %%v1,18,10\n\t" - "vleib %%v1,19,11\n\t" - "vleib %%v1,24,12\n\t" - "vleib %%v1,25,13\n\t" - "vleib %%v1,26,14\n\t" - "vleib %%v1,27,15\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v2,16(%%r1,%[x])\n\t" - "vpkg %%v17,%%v16,%%v2\n\t" - "vperm %%v16,%%v16,%%v2,%%v1\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v2,48(%%r1,%[x])\n\t" - "vpkg %%v19,%%v18,%%v2\n\t" - "vperm %%v18,%%v18,%%v2,%%v1\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v2,80(%%r1,%[x])\n\t" - "vpkg %%v21,%%v20,%%v2\n\t" - "vperm %%v20,%%v20,%%v2,%%v1\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl 
%%v2,112(%%r1,%[x])\n\t" - "vpkg %%v23,%%v22,%%v2\n\t" - "vperm %%v22,%%v22,%%v2,%%v1\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v2,144(%%r1,%[x])\n\t" - "vpkg %%v25,%%v24,%%v2\n\t" - "vperm %%v24,%%v24,%%v2,%%v1\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v2,176(%%r1,%[x])\n\t" - "vpkg %%v27,%%v26,%%v2\n\t" - "vperm %%v26,%%v26,%%v2,%%v1\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v2,208(%%r1,%[x])\n\t" - "vpkg %%v29,%%v28,%%v2\n\t" - "vperm %%v28,%%v28,%%v2,%%v1\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v2,240(%%r1,%[x])\n\t" - "vpkg %%v31,%%v30,%%v2\n\t" - "vperm %%v30,%%v30,%%v2,%%v1\n\t" - "vflpsb %%v16,%%v16\n\t" - "vflpsb %%v17,%%v17\n\t" - "vflpsb %%v18,%%v18\n\t" - "vflpsb %%v19,%%v19\n\t" - "vflpsb %%v20,%%v20\n\t" - "vflpsb %%v21,%%v21\n\t" - "vflpsb %%v22,%%v22\n\t" - "vflpsb %%v23,%%v23\n\t" - "vflpsb %%v24,%%v24\n\t" - "vflpsb %%v25,%%v25\n\t" - "vflpsb %%v26,%%v26\n\t" - "vflpsb %%v27,%%v27\n\t" - "vflpsb %%v28,%%v28\n\t" - "vflpsb %%v29,%%v29\n\t" - "vflpsb %%v30,%%v30\n\t" - "vflpsb %%v31,%%v31\n\t" - "vfasb %%v16,%%v16,%%v17\n\t" - "vfasb %%v18,%%v18,%%v19\n\t" - "vfasb %%v20,%%v20,%%v21\n\t" - "vfasb %%v22,%%v22,%%v23\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vfasb %%v26,%%v26,%%v27\n\t" - "vfasb %%v28,%%v28,%%v29\n\t" - "vfasb %%v30,%%v30,%%v31\n\t" - "vfminsb %%v16,%%v16,%%v24,0\n\t" - "vfminsb %%v18,%%v18,%%v26,0\n\t" - "vfminsb %%v20,%%v20,%%v28,0\n\t" - "vfminsb %%v22,%%v22,%%v30,0\n\t" - "vfminsb %%v16,%%v16,%%v20,0\n\t" - "vfminsb %%v18,%%v18,%%v22,0\n\t" - "vfminsb %%v16,%%v16,%%v18,0\n\t" - "vfminsb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v16,%%v0,32\n\t" - "vfminsb %%v0,%%v0,%%v16,0\n\t" - "vrepf %%v16,%%v0,2\n\t" - "wfminsb %%v0,%%v0,%%v16,0\n\t" - "ler %[amin],%%f0" - : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vlef %%v16,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v16,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v16,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v16,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v16,%%v16\n\t" + "vfasb %%v0,%%v0,%%v16\n\t" + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,8,4\n\t" + "vleib %%v1,9,5\n\t" + "vleib %%v1,10,6\n\t" + "vleib %%v1,11,7\n\t" + "vleib %%v1,16,8\n\t" + "vleib %%v1,17,9\n\t" + "vleib %%v1,18,10\n\t" + "vleib %%v1,19,11\n\t" + "vleib %%v1,24,12\n\t" + "vleib %%v1,25,13\n\t" + "vleib %%v1,26,14\n\t" + "vleib %%v1,27,15\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v2,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v2\n\t" + "vperm %%v16,%%v16,%%v2,%%v1\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v2,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v2\n\t" + "vperm %%v18,%%v18,%%v2,%%v1\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v2,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v2\n\t" + "vperm %%v20,%%v20,%%v2,%%v1\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v2,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v2\n\t" + "vperm %%v22,%%v22,%%v2,%%v1\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v2,144(%%r1,%[x])\n\t" + "vpkg %%v25,%%v24,%%v2\n\t" + "vperm %%v24,%%v24,%%v2,%%v1\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v2,176(%%r1,%[x])\n\t" + "vpkg %%v27,%%v26,%%v2\n\t" + "vperm %%v26,%%v26,%%v2,%%v1\n\t" + "vl 
%%v28,192(%%r1,%[x])\n\t" + "vl %%v2,208(%%r1,%[x])\n\t" + "vpkg %%v29,%%v28,%%v2\n\t" + "vperm %%v28,%%v28,%%v2,%%v1\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v2,240(%%r1,%[x])\n\t" + "vpkg %%v31,%%v30,%%v2\n\t" + "vperm %%v30,%%v30,%%v2,%%v1\n\t" + "vflpsb %%v16,%%v16\n\t" + "vflpsb %%v17,%%v17\n\t" + "vflpsb %%v18,%%v18\n\t" + "vflpsb %%v19,%%v19\n\t" + "vflpsb %%v20,%%v20\n\t" + "vflpsb %%v21,%%v21\n\t" + "vflpsb %%v22,%%v22\n\t" + "vflpsb %%v23,%%v23\n\t" + "vflpsb %%v24,%%v24\n\t" + "vflpsb %%v25,%%v25\n\t" + "vflpsb %%v26,%%v26\n\t" + "vflpsb %%v27,%%v27\n\t" + "vflpsb %%v28,%%v28\n\t" + "vflpsb %%v29,%%v29\n\t" + "vflpsb %%v30,%%v30\n\t" + "vflpsb %%v31,%%v31\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v18,%%v18,%%v19\n\t" + "vfasb %%v20,%%v20,%%v21\n\t" + "vfasb %%v22,%%v22,%%v23\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v26,%%v26,%%v27\n\t" + "vfasb %%v28,%%v28,%%v29\n\t" + "vfasb %%v30,%%v30,%%v31\n\t" + "vfminsb %%v16,%%v16,%%v24,0\n\t" + "vfminsb %%v18,%%v18,%%v26,0\n\t" + "vfminsb %%v20,%%v20,%%v28,0\n\t" + "vfminsb %%v22,%%v22,%%v30,0\n\t" + "vfminsb %%v16,%%v16,%%v20,0\n\t" + "vfminsb %%v18,%%v18,%%v22,0\n\t" + "vfminsb %%v16,%%v16,%%v18,0\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfminsb %%v0,%%v0,%%v16,0\n\t" + "ler %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); return amin; } diff --git a/kernel/zarch/casum.c b/kernel/zarch/casum.c index f59e5a20b..e28f2018c 100644 --- a/kernel/zarch/casum.c +++ b/kernel/zarch/casum.c @@ -34,83 +34,83 @@ static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) { FLOAT asum; __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v24,%%v24,%%v16\n\t" - "vfasb %%v25,%%v25,%%v17\n\t" - "vfasb %%v26,%%v26,%%v18\n\t" - "vfasb %%v27,%%v27,%%v19\n\t" - "vfasb %%v28,%%v28,%%v20\n\t" - "vfasb %%v29,%%v29,%%v21\n\t" - "vfasb %%v30,%%v30,%%v22\n\t" - "vfasb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x])\n\t" - "vl %%v17, 144(%%r1,%[x])\n\t" - "vl %%v18, 160(%%r1,%[x])\n\t" - "vl %%v19, 176(%%r1,%[x])\n\t" - "vl %%v20, 192(%%r1,%[x])\n\t" - "vl %%v21, 208(%%r1,%[x])\n\t" - "vl %%v22, 224(%%r1,%[x])\n\t" - "vl %%v23, 240(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v24,%%v24,%%v16\n\t" - "vfasb %%v25,%%v25,%%v17\n\t" - "vfasb %%v26,%%v26,%%v18\n\t" - "vfasb %%v27,%%v27,%%v19\n\t" - "vfasb %%v28,%%v28,%%v20\n\t" - "vfasb 
%%v29,%%v29,%%v21\n\t" - "vfasb %%v30,%%v30,%%v22\n\t" - "vfasb %%v31,%%v31,%%v23\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vfasb %%v24,%%v24,%%v26\n\t" - "vfasb %%v24,%%v24,%%v27\n\t" - "vfasb %%v24,%%v24,%%v28\n\t" - "vfasb %%v24,%%v24,%%v29\n\t" - "vfasb %%v24,%%v24,%%v30\n\t" - "vfasb %%v24,%%v24,%%v31\n\t" - "veslg %%v25,%%v24,32\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vrepf %%v25,%%v24,2\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vstef %%v24,%[asum],0" - : [asum] "=m"(asum),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v27\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v29\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vfasb %%v24,%%v24,%%v31\n\t" + "veslg %%v25,%%v24,32\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vrepf %%v25,%%v24,2\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vstef %%v24,%[asum],0" + : [asum] "=Q"(asum),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return asum; } diff --git a/kernel/zarch/caxpy.c b/kernel/zarch/caxpy.c index d86342bd0..e4b484ab7 100644 --- a/kernel/zarch/caxpy.c +++ b/kernel/zarch/caxpy.c @@ -30,73 +30,73 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
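The recurring change across these hunks is the spelling of the dummy memory operands: the old "FLOAT (*)[n * 2]" array casts become casts to an anonymous struct wrapping a variable-length array, presumably to accommodate compilers that mishandle VLA-typed lvalues in asm operands, and the scalar asum result moves from "=m" to "=Q", a short-displacement memory operand that vstef can address directly. Both spellings do the same job: they tell GCC exactly which bytes each asm statement reads or writes, so surrounding accesses are ordered correctly without a blanket "memory" clobber. A minimal sketch of the idiom, modeled on the ccopy hunk further down (the helper name and fixed length are illustrative; FLOAT is assumed to come from common.h):

#include "common.h"

/* Copy 64 floats (256 bytes) with one mvc; the dummy "m" operands
 * describe the full arrays, so GCC knows y is written and x is read. */
static void copy_64(FLOAT *x, FLOAT *y) {
  __asm__("mvc 0(256,%[y]),0(%[x])"
          : "=m"(*(struct { FLOAT x[64]; } *) y)
          : [y] "a"(y), "m"(*(const struct { FLOAT x[64]; } *) x),
            [x] "a"(x));
}
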
static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { __asm__( #if !defined(CONJ) - "vlrepf %%v0,0(%[alpha])\n\t" - "vlef %%v1,4(%[alpha]),0\n\t" - "vlef %%v1,4(%[alpha]),2\n\t" - "vflcsb %%v1,%%v1\n\t" - "vlef %%v1,4(%[alpha]),1\n\t" - "vlef %%v1,4(%[alpha]),3\n\t" + "vlrepf %%v0,0(%[alpha])\n\t" + "vlef %%v1,4(%[alpha]),0\n\t" + "vlef %%v1,4(%[alpha]),2\n\t" + "vflcsb %%v1,%%v1\n\t" + "vlef %%v1,4(%[alpha]),1\n\t" + "vlef %%v1,4(%[alpha]),3\n\t" #else - "vlef %%v0,0(%[alpha]),1\n\t" - "vlef %%v0,0(%[alpha]),3\n\t" - "vflcsb %%v0,%%v0\n\t" - "vlef %%v0,0(%[alpha]),0\n\t" - "vlef %%v0,0(%[alpha]),2\n\t" - "vlrepf %%v1,4(%[alpha])\n\t" + "vlef %%v0,0(%[alpha]),1\n\t" + "vlef %%v0,0(%[alpha]),3\n\t" + "vflcsb %%v0,%%v0\n\t" + "vlef %%v0,0(%[alpha]),0\n\t" + "vlef %%v0,0(%[alpha]),2\n\t" + "vlrepf %%v1,4(%[alpha])\n\t" #endif - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v8,0(%%r1,%[x])\n\t" - "vl %%v9,16(%%r1,%[x])\n\t" - "vl %%v10,32(%%r1,%[x])\n\t" - "vl %%v11,48(%%r1,%[x])\n\t" - "vl %%v12,0(%%r1,%[y])\n\t" - "vl %%v13,16(%%r1,%[y])\n\t" - "vl %%v14,32(%%r1,%[y])\n\t" - "vl %%v15,48(%%r1,%[y])\n\t" - "vl %%v16,64(%%r1,%[x])\n\t" - "vl %%v17,80(%%r1,%[x])\n\t" - "vl %%v18,96(%%r1,%[x])\n\t" - "vl %%v19,112(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[y])\n\t" - "vl %%v21,80(%%r1,%[y])\n\t" - "vl %%v22,96(%%r1,%[y])\n\t" - "vl %%v23,112(%%r1,%[y])\n\t" - "vfmasb %%v8,%%v8,%%v0,%%v12\n\t" - "vfmasb %%v9,%%v9,%%v0,%%v13\n\t" - "vfmasb %%v10,%%v10,%%v0,%%v14\n\t" - "vfmasb %%v11,%%v11,%%v0,%%v15\n\t" - "vfmasb %%v16,%%v16,%%v0,%%v20\n\t" - "vfmasb %%v17,%%v17,%%v0,%%v21\n\t" - "vfmasb %%v18,%%v18,%%v0,%%v22\n\t" - "vfmasb %%v19,%%v19,%%v0,%%v23\n\t" - "vfmasb %%v8,%%v24,%%v1,%%v8\n\t" - "vfmasb %%v9,%%v25,%%v1,%%v9\n\t" - "vfmasb %%v10,%%v26,%%v1,%%v10\n\t" - "vfmasb %%v11,%%v27,%%v1,%%v11\n\t" - "vfmasb %%v16,%%v28,%%v1,%%v16\n\t" - "vfmasb %%v17,%%v29,%%v1,%%v17\n\t" - "vfmasb %%v18,%%v30,%%v1,%%v18\n\t" - "vfmasb %%v19,%%v31,%%v1,%%v19\n\t" - "vst %%v8,0(%%r1,%[y])\n\t" - "vst %%v9,16(%%r1,%[y])\n\t" - "vst %%v10,32(%%r1,%[y])\n\t" - "vst %%v11,48(%%r1,%[y])\n\t" - "vst %%v16,64(%%r1,%[y])\n\t" - "vst %%v17,80(%%r1,%[y])\n\t" - "vst %%v18,96(%%r1,%[y])\n\t" - "vst %%v19,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", - "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v8,0(%%r1,%[x])\n\t" + "vl %%v9,16(%%r1,%[x])\n\t" + "vl %%v10,32(%%r1,%[x])\n\t" + "vl %%v11,48(%%r1,%[x])\n\t" + "vl %%v12,0(%%r1,%[y])\n\t" + "vl %%v13,16(%%r1,%[y])\n\t" + "vl %%v14,32(%%r1,%[y])\n\t" + "vl %%v15,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[x])\n\t" + "vl %%v17,80(%%r1,%[x])\n\t" + "vl %%v18,96(%%r1,%[x])\n\t" + "vl %%v19,112(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[y])\n\t" + "vl %%v21,80(%%r1,%[y])\n\t" + "vl %%v22,96(%%r1,%[y])\n\t" + "vl %%v23,112(%%r1,%[y])\n\t" + "vfmasb %%v8,%%v8,%%v0,%%v12\n\t" + "vfmasb %%v9,%%v9,%%v0,%%v13\n\t" + "vfmasb %%v10,%%v10,%%v0,%%v14\n\t" + "vfmasb %%v11,%%v11,%%v0,%%v15\n\t" + "vfmasb %%v16,%%v16,%%v0,%%v20\n\t" + "vfmasb 
%%v17,%%v17,%%v0,%%v21\n\t" + "vfmasb %%v18,%%v18,%%v0,%%v22\n\t" + "vfmasb %%v19,%%v19,%%v0,%%v23\n\t" + "vfmasb %%v8,%%v24,%%v1,%%v8\n\t" + "vfmasb %%v9,%%v25,%%v1,%%v9\n\t" + "vfmasb %%v10,%%v26,%%v1,%%v10\n\t" + "vfmasb %%v11,%%v27,%%v1,%%v11\n\t" + "vfmasb %%v16,%%v28,%%v1,%%v16\n\t" + "vfmasb %%v17,%%v29,%%v1,%%v17\n\t" + "vfmasb %%v18,%%v30,%%v1,%%v18\n\t" + "vfmasb %%v19,%%v31,%%v1,%%v19\n\t" + "vst %%v8,0(%%r1,%[y])\n\t" + "vst %%v9,16(%%r1,%[y])\n\t" + "vst %%v10,32(%%r1,%[y])\n\t" + "vst %%v11,48(%%r1,%[y])\n\t" + "vst %%v16,64(%%r1,%[y])\n\t" + "vst %%v17,80(%%r1,%[y])\n\t" + "vst %%v18,96(%%r1,%[y])\n\t" + "vst %%v19,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, diff --git a/kernel/zarch/ccopy.c b/kernel/zarch/ccopy.c index d17bddcc8..0a5e03992 100644 --- a/kernel/zarch/ccopy.c +++ b/kernel/zarch/ccopy.c @@ -29,16 +29,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__("srlg %[n],%[n],5\n\t" - "0:\n\t" - "pfd 1, 1024(%[x])\n\t" - "pfd 2, 1024(%[y])\n\t" - "mvc 0(256,%[y]),0(%[x])\n\t" - "la %[x],256(%[x])\n\t" - "la %[y],256(%[y])\n\t" - "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x) - : "cc"); + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n * 2]; } *) y),[x] "+&a"(x),[y] "+&a"(y), + [n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x) + : "cc"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { diff --git a/kernel/zarch/cdot.c b/kernel/zarch/cdot.c index 64d81ae5c..d90f9c871 100644 --- a/kernel/zarch/cdot.c +++ b/kernel/zarch/cdot.c @@ -29,80 +29,80 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
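One trick in the kernel below is worth spelling out: verllg rotates each 64-bit doubleword left by 32 bits, swapping the two singles of a complex element, so the even accumulators (v24, v26, v28, v30) collect lane products of x against y while the odd ones collect products of the swapped x against y. After the folds at the end, the four floats stored through %[d] are partial sums that the caller combines into the result, real = d[0] - d[1] and imag = d[2] + d[3], with the signs of d[1] and d[3] flipped under CONJ. A scalar model of just those sums (reference code, not part of the kernel):

static void cdot_ref(BLASLONG n, const FLOAT *x, const FLOAT *y, FLOAT d[4]) {
  d[0] = d[1] = d[2] = d[3] = 0.0f;
  for (BLASLONG i = 0; i < n; i++) {
    FLOAT xr = x[2 * i], xi = x[2 * i + 1];
    FLOAT yr = y[2 * i], yi = y[2 * i + 1];
    d[0] += xr * yr;   /* vstef %%v24,0(%[d]),0  */
    d[1] += xi * yi;   /* vstef %%v24,4(%[d]),1  */
    d[2] += xr * yi;   /* vstef %%v25,8(%[d]),1  */
    d[3] += xi * yr;   /* vstef %%v25,12(%[d]),0 */
  }
}
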
static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "pfd 1, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "verllg %%v20,%%v16,32\n\t" - "verllg %%v21,%%v17,32\n\t" - "verllg %%v22,%%v18,32\n\t" - "verllg %%v23,%%v19,32\n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" - "vfmasb %%v25,%%v20,%%v0,%%v25\n\t" - "vfmasb %%v26,%%v17,%%v1,%%v26\n\t" - "vfmasb %%v27,%%v21,%%v1,%%v27\n\t" - "vfmasb %%v28,%%v18,%%v2,%%v28\n\t" - "vfmasb %%v29,%%v22,%%v2,%%v29\n\t" - "vfmasb %%v30,%%v19,%%v3,%%v30\n\t" - "vfmasb %%v31,%%v23,%%v3,%%v31\n\t" - "vl %%v16, 64(%%r1,%[x])\n\t" - "vl %%v17, 80(%%r1,%[x])\n\t" - "vl %%v18, 96(%%r1,%[x])\n\t" - "vl %%v19, 112(%%r1,%[x])\n\t" - "vl %%v0, 64(%%r1,%[y])\n\t" - "vl %%v1, 80(%%r1,%[y])\n\t" - "vl %%v2, 96(%%r1,%[y])\n\t" - "vl %%v3, 112(%%r1,%[y])\n\t" - "verllg %%v20,%%v16,32\n\t" - "verllg %%v21,%%v17,32\n\t" - "verllg %%v22,%%v18,32\n\t" - "verllg %%v23,%%v19,32\n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" - "vfmasb %%v25,%%v20,%%v0,%%v25\n\t" - "vfmasb %%v26,%%v17,%%v1,%%v26\n\t" - "vfmasb %%v27,%%v21,%%v1,%%v27\n\t" - "vfmasb %%v28,%%v18,%%v2,%%v28\n\t" - "vfmasb %%v29,%%v22,%%v2,%%v29\n\t" - "vfmasb %%v30,%%v19,%%v3,%%v30\n\t" - "vfmasb %%v31,%%v23,%%v3,%%v31\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b\n\t" - "vfasb %%v24,%%v24,%%v26\n\t" - "vfasb %%v24,%%v24,%%v28\n\t" - "vfasb %%v24,%%v24,%%v30\n\t" - "vrepg %%v26,%%v24,1\n\t" - "vfasb %%v24,%%v24,%%v26\n\t" - "vfasb %%v25,%%v25,%%v27\n\t" - "vfasb %%v25,%%v25,%%v29\n\t" - "vfasb %%v25,%%v25,%%v31\n\t" - "vrepg %%v27,%%v25,1\n\t" - "vfasb %%v25,%%v25,%%v27\n\t" - "vstef %%v24,0(%[d]),0\n\t" - "vstef %%v24,4(%[d]),1\n\t" - "vstef %%v25,8(%[d]),1\n\t" - "vstef %%v25,12(%[d]),0" - : "=m"(*(FLOAT (*)[4]) d),[n] "+&r"(n) - : [d] "a"(d), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[n * 2]) y),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 1, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "verllg %%v20,%%v16,32\n\t" + "verllg %%v21,%%v17,32\n\t" + "verllg %%v22,%%v18,32\n\t" + "verllg %%v23,%%v19,32\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmasb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmasb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmasb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmasb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmasb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmasb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmasb %%v31,%%v23,%%v3,%%v31\n\t" + "vl %%v16, 64(%%r1,%[x])\n\t" + "vl %%v17, 80(%%r1,%[x])\n\t" + "vl %%v18, 96(%%r1,%[x])\n\t" + "vl %%v19, 112(%%r1,%[x])\n\t" + 
"vl %%v0, 64(%%r1,%[y])\n\t" + "vl %%v1, 80(%%r1,%[y])\n\t" + "vl %%v2, 96(%%r1,%[y])\n\t" + "vl %%v3, 112(%%r1,%[y])\n\t" + "verllg %%v20,%%v16,32\n\t" + "verllg %%v21,%%v17,32\n\t" + "verllg %%v22,%%v18,32\n\t" + "verllg %%v23,%%v19,32\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmasb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmasb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmasb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmasb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmasb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmasb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmasb %%v31,%%v23,%%v3,%%v31\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vrepg %%v26,%%v24,1\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v25,%%v25,%%v27\n\t" + "vfasb %%v25,%%v25,%%v29\n\t" + "vfasb %%v25,%%v25,%%v31\n\t" + "vrepg %%v27,%%v25,1\n\t" + "vfasb %%v25,%%v25,%%v27\n\t" + "vstef %%v24,0(%[d]),0\n\t" + "vstef %%v24,4(%[d]),1\n\t" + "vstef %%v25,8(%[d]),1\n\t" + "vstef %%v25,12(%[d]),0" + : "=m"(*(struct { FLOAT x[4]; } *) d),[n] "+&r"(n) + : [d] "a"(d), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[n * 2]; } *) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index db91d9063..adba05d47 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -30,323 +30,331 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 2048 static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + __asm__("vlrepg %%v16,0(%[x])\n\t" - "vlrepg %%v17,8(%[x])\n\t" - "vlrepg %%v18,16(%[x])\n\t" - "vlrepg %%v19,24(%[x])\n\t" + "vlrepg %%v17,8(%[x])\n\t" + "vlrepg %%v18,16(%[x])\n\t" + "vlrepg %%v19,24(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v20,4(%[x]),0\n\t" - "vlef %%v20,4(%[x]),2\n\t" - "vflcsb %%v20,%%v20\n\t" - "vlef %%v20,0(%[x]),1\n\t" - "vlef %%v20,0(%[x]),3\n\t" - "vlef %%v21,12(%[x]),0\n\t" - "vlef %%v21,12(%[x]),2\n\t" - "vflcsb %%v21,%%v21\n\t" - "vlef %%v21,8(%[x]),1\n\t" - "vlef %%v21,8(%[x]),3\n\t" - "vlef %%v22,20(%[x]),0\n\t" - "vlef %%v22,20(%[x]),2\n\t" - "vflcsb %%v22,%%v22\n\t" - "vlef %%v22,16(%[x]),1\n\t" - "vlef %%v22,16(%[x]),3\n\t" - "vlef %%v23,28(%[x]),0\n\t" - "vlef %%v23,28(%[x]),2\n\t" - "vflcsb %%v23,%%v23\n\t" - "vlef %%v23,24(%[x]),1\n\t" - "vlef %%v23,24(%[x]),3\n\t" + "vlef %%v20,4(%[x]),0\n\t" + "vlef %%v20,4(%[x]),2\n\t" + "vflcsb %%v20,%%v20\n\t" + "vlef %%v20,0(%[x]),1\n\t" + "vlef %%v20,0(%[x]),3\n\t" + "vlef %%v21,12(%[x]),0\n\t" + "vlef %%v21,12(%[x]),2\n\t" + "vflcsb %%v21,%%v21\n\t" + "vlef %%v21,8(%[x]),1\n\t" + "vlef %%v21,8(%[x]),3\n\t" + "vlef %%v22,20(%[x]),0\n\t" + "vlef %%v22,20(%[x]),2\n\t" + "vflcsb %%v22,%%v22\n\t" + "vlef %%v22,16(%[x]),1\n\t" + "vlef %%v22,16(%[x]),3\n\t" + "vlef %%v23,28(%[x]),0\n\t" + "vlef %%v23,28(%[x]),2\n\t" + "vflcsb %%v23,%%v23\n\t" + "vlef %%v23,24(%[x]),1\n\t" + "vlef %%v23,24(%[x]),3\n\t" #else - "vlef %%v20,0(%[x]),1\n\t" - "vlef %%v20,0(%[x]),3\n\t" - "vflcsb %%v20,%%v20\n\t" - "vlef %%v20,4(%[x]),0\n\t" - "vlef %%v20,4(%[x]),2\n\t" - "vlef %%v21,8(%[x]),1\n\t" - 
"vlef %%v21,8(%[x]),3\n\t" - "vflcsb %%v21,%%v21\n\t" - "vlef %%v21,12(%[x]),0\n\t" - "vlef %%v21,12(%[x]),2\n\t" - "vlef %%v22,16(%[x]),1\n\t" - "vlef %%v22,16(%[x]),3\n\t" - "vflcsb %%v22,%%v22\n\t" - "vlef %%v22,20(%[x]),0\n\t" - "vlef %%v22,20(%[x]),2\n\t" - "vlef %%v23,24(%[x]),1\n\t" - "vlef %%v23,24(%[x]),3\n\t" - "vflcsb %%v23,%%v23\n\t" - "vlef %%v23,28(%[x]),0\n\t" - "vlef %%v23,28(%[x]),2\n\t" + "vlef %%v20,0(%[x]),1\n\t" + "vlef %%v20,0(%[x]),3\n\t" + "vflcsb %%v20,%%v20\n\t" + "vlef %%v20,4(%[x]),0\n\t" + "vlef %%v20,4(%[x]),2\n\t" + "vlef %%v21,8(%[x]),1\n\t" + "vlef %%v21,8(%[x]),3\n\t" + "vflcsb %%v21,%%v21\n\t" + "vlef %%v21,12(%[x]),0\n\t" + "vlef %%v21,12(%[x]),2\n\t" + "vlef %%v22,16(%[x]),1\n\t" + "vlef %%v22,16(%[x]),3\n\t" + "vflcsb %%v22,%%v22\n\t" + "vlef %%v22,20(%[x]),0\n\t" + "vlef %%v22,20(%[x]),2\n\t" + "vlef %%v23,24(%[x]),1\n\t" + "vlef %%v23,24(%[x]),3\n\t" + "vflcsb %%v23,%%v23\n\t" + "vlef %%v23,28(%[x]),0\n\t" + "vlef %%v23,28(%[x]),2\n\t" #endif - "vleib %%v1,0,0\n\t" - "vleib %%v1,1,1\n\t" - "vleib %%v1,2,2\n\t" - "vleib %%v1,3,3\n\t" - "vleib %%v1,0,4\n\t" - "vleib %%v1,1,5\n\t" - "vleib %%v1,2,6\n\t" - "vleib %%v1,3,7\n\t" - "vleib %%v1,8,8\n\t" - "vleib %%v1,9,9\n\t" - "vleib %%v1,10,10\n\t" - "vleib %%v1,11,11\n\t" - "vleib %%v1,8,12\n\t" - "vleib %%v1,9,13\n\t" - "vleib %%v1,10,14\n\t" - "vleib %%v1,11,15\n\t" - "vleib %%v2,4,0\n\t" - "vleib %%v2,5,1\n\t" - "vleib %%v2,6,2\n\t" - "vleib %%v2,7,3\n\t" - "vleib %%v2,4,4\n\t" - "vleib %%v2,5,5\n\t" - "vleib %%v2,6,6\n\t" - "vleib %%v2,7,7\n\t" - "vleib %%v2,12,8\n\t" - "vleib %%v2,13,9\n\t" - "vleib %%v2,14,10\n\t" - "vleib %%v2,15,11\n\t" - "vleib %%v2,12,12\n\t" - "vleib %%v2,13,13\n\t" - "vleib %%v2,14,14\n\t" - "vleib %%v2,15,15\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vperm %%v25,%%v24,%%v24,%%v2\n\t" - "vperm %%v24,%%v24,%%v24,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap1])\n\t" - "vperm %%v27,%%v26,%%v26,%%v2\n\t" - "vperm %%v26,%%v26,%%v26,%%v1\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vfmasb %%v0,%%v24,%%v16,%%v0\n\t" - "vfmasb %%v0,%%v25,%%v20,%%v0\n\t" - "vfmasb %%v0,%%v26,%%v17,%%v0\n\t" - "vfmasb %%v0,%%v27,%%v21,%%v0\n\t" - "vl %%v28,0(%%r1,%[ap2])\n\t" - "vperm %%v29,%%v28,%%v28,%%v2\n\t" - "vperm %%v28,%%v28,%%v28,%%v1\n\t" - "vl %%v30,0(%%r1,%[ap3])\n\t" - "vperm %%v31,%%v30,%%v30,%%v2\n\t" - "vperm %%v30,%%v30,%%v30,%%v1\n\t" - "vfmasb %%v0,%%v28,%%v18,%%v0\n\t" - "vfmasb %%v0,%%v29,%%v22,%%v0\n\t" - "vfmasb %%v0,%%v30,%%v19,%%v0\n\t" - "vfmasb %%v0,%%v31,%%v23,%%v0\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "agfi %%r1,16\n\t" - "brctg %[n],0b\n\t" - : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n * 2]) ap[2]),[ap2] "a"(ap[2]), - "m"(*(const FLOAT (*)[n * 2]) ap[3]),[ap3] "a"(ap[3]), - "m"(*(const FLOAT (*)[8]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,0,4\n\t" + "vleib %%v1,1,5\n\t" + "vleib %%v1,2,6\n\t" + "vleib %%v1,3,7\n\t" + "vleib %%v1,8,8\n\t" + "vleib %%v1,9,9\n\t" + "vleib %%v1,10,10\n\t" + "vleib %%v1,11,11\n\t" 
+ "vleib %%v1,8,12\n\t" + "vleib %%v1,9,13\n\t" + "vleib %%v1,10,14\n\t" + "vleib %%v1,11,15\n\t" + "vleib %%v2,4,0\n\t" + "vleib %%v2,5,1\n\t" + "vleib %%v2,6,2\n\t" + "vleib %%v2,7,3\n\t" + "vleib %%v2,4,4\n\t" + "vleib %%v2,5,5\n\t" + "vleib %%v2,6,6\n\t" + "vleib %%v2,7,7\n\t" + "vleib %%v2,12,8\n\t" + "vleib %%v2,13,9\n\t" + "vleib %%v2,14,10\n\t" + "vleib %%v2,15,11\n\t" + "vleib %%v2,12,12\n\t" + "vleib %%v2,13,13\n\t" + "vleib %%v2,14,14\n\t" + "vleib %%v2,15,15\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vperm %%v25,%%v24,%%v24,%%v2\n\t" + "vperm %%v24,%%v24,%%v24,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap1])\n\t" + "vperm %%v27,%%v26,%%v26,%%v2\n\t" + "vperm %%v26,%%v26,%%v26,%%v1\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v24,%%v16,%%v0\n\t" + "vfmasb %%v0,%%v25,%%v20,%%v0\n\t" + "vfmasb %%v0,%%v26,%%v17,%%v0\n\t" + "vfmasb %%v0,%%v27,%%v21,%%v0\n\t" + "vl %%v28,0(%%r1,%[ap2])\n\t" + "vperm %%v29,%%v28,%%v28,%%v2\n\t" + "vperm %%v28,%%v28,%%v28,%%v1\n\t" + "vl %%v30,0(%%r1,%[ap3])\n\t" + "vperm %%v31,%%v30,%%v30,%%v2\n\t" + "vperm %%v30,%%v30,%%v30,%%v1\n\t" + "vfmasb %%v0,%%v28,%%v18,%%v0\n\t" + "vfmasb %%v0,%%v29,%%v22,%%v0\n\t" + "vfmasb %%v0,%%v30,%%v19,%%v0\n\t" + "vfmasb %%v0,%%v31,%%v23,%%v0\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %[n],0b\n\t" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[8]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + __asm__("vlrepg %%v16,0(%[x])\n\t" - "vlrepg %%v17,8(%[x])\n\t" + "vlrepg %%v17,8(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v18,4(%[x]),0\n\t" - "vlef %%v18,4(%[x]),2\n\t" - "vflcsb %%v18,%%v18\n\t" - "vlef %%v18,0(%[x]),1\n\t" - "vlef %%v18,0(%[x]),3\n\t" - "vlef %%v19,12(%[x]),0\n\t" - "vlef %%v19,12(%[x]),2\n\t" - "vflcsb %%v19,%%v19\n\t" - "vlef %%v19,8(%[x]),1\n\t" - "vlef %%v19,8(%[x]),3\n\t" + "vlef %%v18,4(%[x]),0\n\t" + "vlef %%v18,4(%[x]),2\n\t" + "vflcsb %%v18,%%v18\n\t" + "vlef %%v18,0(%[x]),1\n\t" + "vlef %%v18,0(%[x]),3\n\t" + "vlef %%v19,12(%[x]),0\n\t" + "vlef %%v19,12(%[x]),2\n\t" + "vflcsb %%v19,%%v19\n\t" + "vlef %%v19,8(%[x]),1\n\t" + "vlef %%v19,8(%[x]),3\n\t" #else - "vlef %%v18,0(%[x]),1\n\t" - "vlef %%v18,0(%[x]),3\n\t" - "vflcsb %%v18,%%v18\n\t" - "vlef %%v18,4(%[x]),0\n\t" - "vlef %%v18,4(%[x]),2\n\t" - "vlef %%v19,8(%[x]),1\n\t" - "vlef %%v19,8(%[x]),3\n\t" - "vflcsb %%v19,%%v19\n\t" - "vlef %%v19,12(%[x]),0\n\t" - "vlef %%v19,12(%[x]),2\n\t" + "vlef %%v18,0(%[x]),1\n\t" + "vlef %%v18,0(%[x]),3\n\t" + "vflcsb %%v18,%%v18\n\t" + "vlef %%v18,4(%[x]),0\n\t" + "vlef %%v18,4(%[x]),2\n\t" + "vlef %%v19,8(%[x]),1\n\t" + "vlef %%v19,8(%[x]),3\n\t" + "vflcsb %%v19,%%v19\n\t" + "vlef %%v19,12(%[x]),0\n\t" + "vlef %%v19,12(%[x]),2\n\t" #endif - "vleib 
%%v1,0,0\n\t" - "vleib %%v1,1,1\n\t" - "vleib %%v1,2,2\n\t" - "vleib %%v1,3,3\n\t" - "vleib %%v1,0,4\n\t" - "vleib %%v1,1,5\n\t" - "vleib %%v1,2,6\n\t" - "vleib %%v1,3,7\n\t" - "vleib %%v1,8,8\n\t" - "vleib %%v1,9,9\n\t" - "vleib %%v1,10,10\n\t" - "vleib %%v1,11,11\n\t" - "vleib %%v1,8,12\n\t" - "vleib %%v1,9,13\n\t" - "vleib %%v1,10,14\n\t" - "vleib %%v1,11,15\n\t" - "vleib %%v2,4,0\n\t" - "vleib %%v2,5,1\n\t" - "vleib %%v2,6,2\n\t" - "vleib %%v2,7,3\n\t" - "vleib %%v2,4,4\n\t" - "vleib %%v2,5,5\n\t" - "vleib %%v2,6,6\n\t" - "vleib %%v2,7,7\n\t" - "vleib %%v2,12,8\n\t" - "vleib %%v2,13,9\n\t" - "vleib %%v2,14,10\n\t" - "vleib %%v2,15,11\n\t" - "vleib %%v2,12,12\n\t" - "vleib %%v2,13,13\n\t" - "vleib %%v2,14,14\n\t" - "vleib %%v2,15,15\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v20,0(%%r1,%[ap0])\n\t" - "vperm %%v21,%%v20,%%v20,%%v2\n\t" - "vperm %%v20,%%v20,%%v20,%%v1\n\t" - "vl %%v22,0(%%r1,%[ap1])\n\t" - "vperm %%v23,%%v22,%%v22,%%v2\n\t" - "vperm %%v22,%%v22,%%v22,%%v1\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vfmasb %%v0,%%v20,%%v16,%%v0\n\t" - "vfmasb %%v0,%%v21,%%v18,%%v0\n\t" - "vfmasb %%v0,%%v22,%%v17,%%v0\n\t" - "vfmasb %%v0,%%v23,%%v19,%%v0\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "agfi %%r1,16\n\t" - "brctg %[n],0b\n\t" - : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[4]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23"); + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,0,4\n\t" + "vleib %%v1,1,5\n\t" + "vleib %%v1,2,6\n\t" + "vleib %%v1,3,7\n\t" + "vleib %%v1,8,8\n\t" + "vleib %%v1,9,9\n\t" + "vleib %%v1,10,10\n\t" + "vleib %%v1,11,11\n\t" + "vleib %%v1,8,12\n\t" + "vleib %%v1,9,13\n\t" + "vleib %%v1,10,14\n\t" + "vleib %%v1,11,15\n\t" + "vleib %%v2,4,0\n\t" + "vleib %%v2,5,1\n\t" + "vleib %%v2,6,2\n\t" + "vleib %%v2,7,3\n\t" + "vleib %%v2,4,4\n\t" + "vleib %%v2,5,5\n\t" + "vleib %%v2,6,6\n\t" + "vleib %%v2,7,7\n\t" + "vleib %%v2,12,8\n\t" + "vleib %%v2,13,9\n\t" + "vleib %%v2,14,10\n\t" + "vleib %%v2,15,11\n\t" + "vleib %%v2,12,12\n\t" + "vleib %%v2,13,13\n\t" + "vleib %%v2,14,14\n\t" + "vleib %%v2,15,15\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v20,0(%%r1,%[ap0])\n\t" + "vperm %%v21,%%v20,%%v20,%%v2\n\t" + "vperm %%v20,%%v20,%%v20,%%v1\n\t" + "vl %%v22,0(%%r1,%[ap1])\n\t" + "vperm %%v23,%%v22,%%v22,%%v2\n\t" + "vperm %%v22,%%v22,%%v22,%%v1\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v20,%%v16,%%v0\n\t" + "vfmasb %%v0,%%v21,%%v18,%%v0\n\t" + "vfmasb %%v0,%%v22,%%v17,%%v0\n\t" + "vfmasb %%v0,%%v23,%%v19,%%v0\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %[n],0b\n\t" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23"); } static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { __asm__("vlrepg %%v16,0(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( 
defined(CONJ) && defined(XCONJ) ) - "vlef %%v17,4(%[x]),0\n\t" - "vlef %%v17,4(%[x]),2\n\t" - "vflcsb %%v17,%%v17\n\t" - "vlef %%v17,0(%[x]),1\n\t" - "vlef %%v17,0(%[x]),3\n\t" + "vlef %%v17,4(%[x]),0\n\t" + "vlef %%v17,4(%[x]),2\n\t" + "vflcsb %%v17,%%v17\n\t" + "vlef %%v17,0(%[x]),1\n\t" + "vlef %%v17,0(%[x]),3\n\t" #else - "vlef %%v17,0(%[x]),1\n\t" - "vlef %%v17,0(%[x]),3\n\t" - "vflcsb %%v17,%%v17\n\t" - "vlef %%v17,4(%[x]),0\n\t" - "vlef %%v17,4(%[x]),2\n\t" + "vlef %%v17,0(%[x]),1\n\t" + "vlef %%v17,0(%[x]),3\n\t" + "vflcsb %%v17,%%v17\n\t" + "vlef %%v17,4(%[x]),0\n\t" + "vlef %%v17,4(%[x]),2\n\t" #endif - "vleib %%v1,0,0\n\t" - "vleib %%v1,1,1\n\t" - "vleib %%v1,2,2\n\t" - "vleib %%v1,3,3\n\t" - "vleib %%v1,0,4\n\t" - "vleib %%v1,1,5\n\t" - "vleib %%v1,2,6\n\t" - "vleib %%v1,3,7\n\t" - "vleib %%v1,8,8\n\t" - "vleib %%v1,9,9\n\t" - "vleib %%v1,10,10\n\t" - "vleib %%v1,11,11\n\t" - "vleib %%v1,8,12\n\t" - "vleib %%v1,9,13\n\t" - "vleib %%v1,10,14\n\t" - "vleib %%v1,11,15\n\t" - "vleib %%v2,4,0\n\t" - "vleib %%v2,5,1\n\t" - "vleib %%v2,6,2\n\t" - "vleib %%v2,7,3\n\t" - "vleib %%v2,4,4\n\t" - "vleib %%v2,5,5\n\t" - "vleib %%v2,6,6\n\t" - "vleib %%v2,7,7\n\t" - "vleib %%v2,12,8\n\t" - "vleib %%v2,13,9\n\t" - "vleib %%v2,14,10\n\t" - "vleib %%v2,15,11\n\t" - "vleib %%v2,12,12\n\t" - "vleib %%v2,13,13\n\t" - "vleib %%v2,14,14\n\t" - "vleib %%v2,15,15\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v18,0(%%r1,%[ap])\n\t" - "vperm %%v19,%%v18,%%v18,%%v2\n\t" - "vperm %%v18,%%v18,%%v18,%%v1\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vfmasb %%v0,%%v18,%%v16,%%v0\n\t" - "vfmasb %%v0,%%v19,%%v17,%%v0\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "agfi %%r1,16\n\t" - "brctg %[n],0b\n\t" - : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), - "m"(*(const FLOAT (*)[2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19"); + "vleib %%v1,0,0\n\t" + "vleib %%v1,1,1\n\t" + "vleib %%v1,2,2\n\t" + "vleib %%v1,3,3\n\t" + "vleib %%v1,0,4\n\t" + "vleib %%v1,1,5\n\t" + "vleib %%v1,2,6\n\t" + "vleib %%v1,3,7\n\t" + "vleib %%v1,8,8\n\t" + "vleib %%v1,9,9\n\t" + "vleib %%v1,10,10\n\t" + "vleib %%v1,11,11\n\t" + "vleib %%v1,8,12\n\t" + "vleib %%v1,9,13\n\t" + "vleib %%v1,10,14\n\t" + "vleib %%v1,11,15\n\t" + "vleib %%v2,4,0\n\t" + "vleib %%v2,5,1\n\t" + "vleib %%v2,6,2\n\t" + "vleib %%v2,7,3\n\t" + "vleib %%v2,4,4\n\t" + "vleib %%v2,5,5\n\t" + "vleib %%v2,6,6\n\t" + "vleib %%v2,7,7\n\t" + "vleib %%v2,12,8\n\t" + "vleib %%v2,13,9\n\t" + "vleib %%v2,14,10\n\t" + "vleib %%v2,15,11\n\t" + "vleib %%v2,12,12\n\t" + "vleib %%v2,13,13\n\t" + "vleib %%v2,14,14\n\t" + "vleib %%v2,15,15\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v18,0(%%r1,%[ap])\n\t" + "vperm %%v19,%%v18,%%v18,%%v2\n\t" + "vperm %%v18,%%v18,%%v18,%%v1\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v18,%%v16,%%v0\n\t" + "vfmasb %%v0,%%v19,%%v17,%%v0\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %[n],0b\n\t" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap), + "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19"); } static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i) { __asm__( #if !defined(XCONJ) - "vlrepf %%v0,%[alpha_r]\n\t" 
- "vlef %%v1,%[alpha_i],0\n\t" - "vlef %%v1,%[alpha_i],2\n\t" - "vflcsb %%v1,%%v1\n\t" - "vlef %%v1,%[alpha_i],1\n\t" - "vlef %%v1,%[alpha_i],3\n\t" + "vlrepf %%v0,%[alpha_r]\n\t" + "vlef %%v1,%[alpha_i],0\n\t" + "vlef %%v1,%[alpha_i],2\n\t" + "vflcsb %%v1,%%v1\n\t" + "vlef %%v1,%[alpha_i],1\n\t" + "vlef %%v1,%[alpha_i],3\n\t" #else - "vlef %%v0,%[alpha_r],1\n\t" - "vlef %%v0,%[alpha_r],3\n\t" - "vflcsb %%v0,%%v0\n\t" - "vlef %%v0,%[alpha_r],0\n\t" - "vlef %%v0,%[alpha_r],2\n\t" - "vlrepf %%v1,%[alpha_i]\n\t" + "vlef %%v0,%[alpha_r],1\n\t" + "vlef %%v0,%[alpha_r],3\n\t" + "vflcsb %%v0,%%v0\n\t" + "vlef %%v0,%[alpha_r],0\n\t" + "vlef %%v0,%[alpha_r],2\n\t" + "vlrepf %%v1,%[alpha_i]\n\t" #endif - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],2\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[src])\n\t" - "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v18,0(%%r1,%[dest])\n\t" - "vl %%v19,16(%%r1,%[dest])\n\t" - "verllg %%v20,%%v16,32\n\t" - "verllg %%v21,%%v17,32\n\t" - "vfmasb %%v22,%%v16,%%v0,%%v18\n\t" - "vfmasb %%v23,%%v17,%%v0,%%v19\n\t" - "vfmasb %%v22,%%v20,%%v1,%%v22\n\t" - "vfmasb %%v23,%%v21,%%v1,%%v23\n\t" - "vst %%v22,0(%%r1,%[dest])\n\t" - "vst %%v23,16(%%r1,%[dest])\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) dest),[n] "+&r"(n) - : [dest] "a"(dest), "m"(*(const FLOAT (*)[n * 2]) src),[src] "a"(src), - [alpha_r] "m"(alpha_r),[alpha_i] "m"(alpha_i) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23"); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],2\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,0(%%r1,%[dest])\n\t" + "vl %%v19,16(%%r1,%[dest])\n\t" + "verllg %%v20,%%v16,32\n\t" + "verllg %%v21,%%v17,32\n\t" + "vfmasb %%v22,%%v16,%%v0,%%v18\n\t" + "vfmasb %%v23,%%v17,%%v0,%%v19\n\t" + "vfmasb %%v22,%%v20,%%v1,%%v22\n\t" + "vfmasb %%v23,%%v21,%%v1,%%v23\n\t" + "vst %%v22,0(%%r1,%[dest])\n\t" + "vst %%v23,16(%%r1,%[dest])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n) + : [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src), + [src] "a"(src),[alpha_r] "m"(alpha_r),[alpha_i] "m"(alpha_i) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23"); } static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, diff --git a/kernel/zarch/cgemv_t_4.c b/kernel/zarch/cgemv_t_4.c index 9e65c5fb5..91ea1c10c 100644 --- a/kernel/zarch/cgemv_t_4.c +++ b/kernel/zarch/cgemv_t_4.c @@ -31,6 +31,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + __asm__("vzero %%v16\n\t" "vzero %%v17\n\t" "vzero %%v18\n\t" @@ -154,20 +159,23 @@ static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmasb %%v23,%%v19,%%v21,%%v23\n\t" "vst %%v22,0(%[y])\n\t" "vst %%v23,16(%[y])" - : "+m"(*(FLOAT (*)[8]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n * 2]) ap[2]),[ap2] "a"(ap[2]), - "m"(*(const FLOAT (*)[n * 2]) ap[3]),[ap3] "a"(ap[3]), - "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + : "+m"(*(struct { FLOAT x[8]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + __asm__("vzero %%v16\n\t" "vzero %%v17\n\t" "vzero %%v18\n\t" @@ -263,13 +271,13 @@ static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmasb %%v20,%%v16,%%v18,%%v20\n\t" "vfmasb %%v20,%%v17,%%v19,%%v20\n\t" "vst %%v20,0(%[y])" - : "+m"(*(FLOAT (*)[4]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23"); + : "+m"(*(struct { FLOAT x[4]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23"); } static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, @@ -353,11 +361,11 @@ static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "vfmasb %%v0,%%v16,%%v18,%%v0\n\t" "vfmasb %%v0,%%v17,%%v19,%%v0\n\t" "vsteg %%v0,0(%[y]),0" - : "+m"(*(FLOAT (*)[2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), - "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); + : "+m"(*(struct { FLOAT x[2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap), + "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT 
x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); } static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { diff --git a/kernel/zarch/crot.c b/kernel/zarch/crot.c index 669d78a9d..aab155f8b 100644 --- a/kernel/zarch/crot.c +++ b/kernel/zarch/crot.c @@ -29,151 +29,152 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { __asm__("vlrepf %%v0,%[c]\n\t" - "vlrepf %%v1,%[s]\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x])\n\t" - "vl %%v25, 16(%%r1,%[x])\n\t" - "vl %%v26, 32(%%r1,%[x])\n\t" - "vl %%v27, 48(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[y])\n\t" - "vl %%v17, 16(%%r1,%[y])\n\t" - "vl %%v18, 32(%%r1,%[y])\n\t" - "vl %%v19, 48(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x])\n\t" - "vst %%v29, 16(%%r1,%[x])\n\t" - "vst %%v30, 32(%%r1,%[x])\n\t" - "vst %%v31, 48(%%r1,%[x])\n\t" - "vst %%v20, 0(%%r1,%[y])\n\t" - "vst %%v21, 16(%%r1,%[y])\n\t" - "vst %%v22, 32(%%r1,%[y])\n\t" - "vst %%v23, 48(%%r1,%[y])\n\t" - "vl %%v24, 64(%%r1,%[x])\n\t" - "vl %%v25, 80(%%r1,%[x])\n\t" - "vl %%v26, 96(%%r1,%[x])\n\t" - "vl %%v27, 112(%%r1,%[x])\n\t" - "vl %%v16, 64(%%r1,%[y])\n\t" - "vl %%v17, 80(%%r1,%[y])\n\t" - "vl %%v18, 96(%%r1,%[y])\n\t" - "vl %%v19, 112(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x])\n\t" - "vst %%v29, 80(%%r1,%[x])\n\t" - "vst %%v30, 96(%%r1,%[x])\n\t" - "vst %%v31, 112(%%r1,%[x])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v16, 128(%%r1,%[y])\n\t" - "vl %%v17, 144(%%r1,%[y])\n\t" - "vl %%v18, 160(%%r1,%[y])\n\t" - "vl %%v19, 176(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb 
%%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x])\n\t" - "vst %%v29, 144(%%r1,%[x])\n\t" - "vst %%v30, 160(%%r1,%[x])\n\t" - "vst %%v31, 176(%%r1,%[x])\n\t" - "vst %%v20, 128(%%r1,%[y])\n\t" - "vst %%v21, 144(%%r1,%[y])\n\t" - "vst %%v22, 160(%%r1,%[y])\n\t" - "vst %%v23, 176(%%r1,%[y])\n\t" - "vl %%v24, 192(%%r1,%[x])\n\t" - "vl %%v25, 208(%%r1,%[x])\n\t" - "vl %%v26, 224(%%r1,%[x])\n\t" - "vl %%v27, 240(%%r1,%[x])\n\t" - "vl %%v16, 192(%%r1,%[y])\n\t" - "vl %%v17, 208(%%r1,%[y])\n\t" - "vl %%v18, 224(%%r1,%[y])\n\t" - "vl %%v19, 240(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 192(%%r1,%[x])\n\t" - "vst %%v29, 208(%%r1,%[x])\n\t" - "vst %%v30, 224(%%r1,%[x])\n\t" - "vst %%v31, 240(%%r1,%[x])\n\t" - "vst %%v20, 192(%%r1,%[y])\n\t" - "vst %%v21, 208(%%r1,%[y])\n\t" - "vst %%v22, 224(%%r1,%[y])\n\t" - "vst %%v23, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vlrepf %%v1,%[s]\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst 
%%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst 
%%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x), + "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, diff --git a/kernel/zarch/cscal.c b/kernel/zarch/cscal.c index a2d5bf223..9fc54cf29 100644 --- a/kernel/zarch/cscal.c +++ b/kernel/zarch/cscal.c @@ -29,171 +29,174 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__("vlrepf %%v0,0(%[alpha])\n\t" - "vlef %%v1,4(%[alpha]),0\n\t" - "vlef %%v1,4(%[alpha]),2\n\t" - "vflcsb %%v1,%%v1\n\t" - "vlef %%v1,4(%[alpha]),1\n\t" - "vlef %%v1,4(%[alpha]),3\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "verllg %%v24,%%v16,32\n\t" - "verllg %%v25,%%v17,32\n\t" - "verllg %%v26,%%v18,32\n\t" - "verllg %%v27,%%v19,32\n\t" - "verllg %%v28,%%v20,32\n\t" - "verllg %%v29,%%v21,32\n\t" - "verllg %%v30,%%v22,32\n\t" - "verllg %%v31,%%v23,32\n\t" - "vfmsb %%v16,%%v16,%%v0\n\t" - "vfmsb %%v17,%%v17,%%v0\n\t" - "vfmsb %%v18,%%v18,%%v0\n\t" - "vfmsb %%v19,%%v19,%%v0\n\t" - "vfmsb %%v20,%%v20,%%v0\n\t" - "vfmsb %%v21,%%v21,%%v0\n\t" - "vfmsb %%v22,%%v22,%%v0\n\t" - "vfmsb %%v23,%%v23,%%v0\n\t" - "vfmasb %%v16,%%v24,%%v1,%%v16\n\t" - "vfmasb %%v17,%%v25,%%v1,%%v17\n\t" - "vfmasb %%v18,%%v26,%%v1,%%v18\n\t" - "vfmasb %%v19,%%v27,%%v1,%%v19\n\t" - "vfmasb %%v20,%%v28,%%v1,%%v20\n\t" - "vfmasb %%v21,%%v29,%%v1,%%v21\n\t" - "vfmasb %%v22,%%v30,%%v1,%%v22\n\t" - "vfmasb %%v23,%%v31,%%v1,%%v23\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vlef %%v1,4(%[alpha]),0\n\t" + "vlef %%v1,4(%[alpha]),2\n\t" + "vflcsb %%v1,%%v1\n\t" + "vlef %%v1,4(%[alpha]),1\n\t" + "vlef %%v1,4(%[alpha]),3\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "verllg %%v24,%%v16,32\n\t" + "verllg %%v25,%%v17,32\n\t" + "verllg %%v26,%%v18,32\n\t" + "verllg %%v27,%%v19,32\n\t" + "verllg %%v28,%%v20,32\n\t" + "verllg %%v29,%%v21,32\n\t" + "verllg %%v30,%%v22,32\n\t" + "verllg %%v31,%%v23,32\n\t" + "vfmsb %%v16,%%v16,%%v0\n\t" + 
"vfmsb %%v17,%%v17,%%v0\n\t" + "vfmsb %%v18,%%v18,%%v0\n\t" + "vfmsb %%v19,%%v19,%%v0\n\t" + "vfmsb %%v20,%%v20,%%v0\n\t" + "vfmsb %%v21,%%v21,%%v0\n\t" + "vfmsb %%v22,%%v22,%%v0\n\t" + "vfmsb %%v23,%%v23,%%v0\n\t" + "vfmasb %%v16,%%v24,%%v1,%%v16\n\t" + "vfmasb %%v17,%%v25,%%v1,%%v17\n\t" + "vfmasb %%v18,%%v26,%%v1,%%v18\n\t" + "vfmasb %%v19,%%v27,%%v1,%%v19\n\t" + "vfmasb %%v20,%%v28,%%v1,%%v20\n\t" + "vfmasb %%v21,%%v29,%%v1,%%v21\n\t" + "vfmasb %%v22,%%v30,%%v1,%%v22\n\t" + "vfmasb %%v23,%%v31,%%v1,%%v23\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + [alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__("vlef %%v0,4(%[alpha]),0\n\t" - "vlef %%v0,4(%[alpha]),2\n\t" - "vflcsb %%v0,%%v0\n\t" - "vlef %%v0,4(%[alpha]),1\n\t" - "vlef %%v0,4(%[alpha]),3\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "verllg %%v16,%%v16,32\n\t" - "verllg %%v17,%%v17,32\n\t" - "verllg %%v18,%%v18,32\n\t" - "verllg %%v19,%%v19,32\n\t" - "verllg %%v20,%%v20,32\n\t" - "verllg %%v21,%%v21,32\n\t" - "verllg %%v22,%%v22,32\n\t" - "verllg %%v23,%%v23,32\n\t" - "vfmsb %%v16,%%v16,%%v0\n\t" - "vfmsb %%v17,%%v17,%%v0\n\t" - "vfmsb %%v18,%%v18,%%v0\n\t" - "vfmsb %%v19,%%v19,%%v0\n\t" - "vfmsb %%v20,%%v20,%%v0\n\t" - "vfmsb %%v21,%%v21,%%v0\n\t" - "vfmsb %%v22,%%v22,%%v0\n\t" - "vfmsb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23"); + "vlef %%v0,4(%[alpha]),2\n\t" + "vflcsb %%v0,%%v0\n\t" + "vlef %%v0,4(%[alpha]),1\n\t" + "vlef %%v0,4(%[alpha]),3\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "verllg %%v16,%%v16,32\n\t" + "verllg %%v17,%%v17,32\n\t" + "verllg %%v18,%%v18,32\n\t" + "verllg %%v19,%%v19,32\n\t" + "verllg %%v20,%%v20,32\n\t" + "verllg %%v21,%%v21,32\n\t" + "verllg %%v22,%%v22,32\n\t" + "verllg %%v23,%%v23,32\n\t" + "vfmsb %%v16,%%v16,%%v0\n\t" + "vfmsb %%v17,%%v17,%%v0\n\t" + "vfmsb %%v18,%%v18,%%v0\n\t" + "vfmsb %%v19,%%v19,%%v0\n\t" + "vfmsb %%v20,%%v20,%%v0\n\t" + "vfmsb %%v21,%%v21,%%v0\n\t" + "vfmsb 
%%v22,%%v22,%%v0\n\t" + "vfmsb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + [alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__("vlrepf %%v0,0(%[alpha])\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfmsb %%v16,%%v16,%%v0\n\t" - "vfmsb %%v17,%%v17,%%v0\n\t" - "vfmsb %%v18,%%v18,%%v0\n\t" - "vfmsb %%v19,%%v19,%%v0\n\t" - "vfmsb %%v20,%%v20,%%v0\n\t" - "vfmsb %%v21,%%v21,%%v0\n\t" - "vfmsb %%v22,%%v22,%%v0\n\t" - "vfmsb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23"); + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfmsb %%v16,%%v16,%%v0\n\t" + "vfmsb %%v17,%%v17,%%v0\n\t" + "vfmsb %%v18,%%v18,%%v0\n\t" + "vfmsb %%v19,%%v19,%%v0\n\t" + "vfmsb %%v20,%%v20,%%v0\n\t" + "vfmsb %%v21,%%v21,%%v0\n\t" + "vfmsb %%v22,%%v22,%%v0\n\t" + "vfmsb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + [alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) { __asm__("vzero %%v0\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x])\n\t" - "vst %%v0,16(%%r1,%[x])\n\t" - "vst %%v0,32(%%r1,%[x])\n\t" - "vst %%v0,48(%%r1,%[x])\n\t" - "vst %%v0,64(%%r1,%[x])\n\t" - "vst %%v0,80(%%r1,%[x])\n\t" - "vst %%v0,96(%%r1,%[x])\n\t" - "vst %%v0,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x) - : "cc", "r1", "v0"); + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" 
+ "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x) + : "cc", "r1", "v0"); } static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, diff --git a/kernel/zarch/cswap.c b/kernel/zarch/cswap.c index 92a81591f..198994e18 100644 --- a/kernel/zarch/cswap.c +++ b/kernel/zarch/cswap.c @@ -29,81 +29,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__("srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v28, 192(%%r1,%[x])\n\t" - "vl %%v29, 208(%%r1,%[x])\n\t" - "vl %%v30, 224(%%r1,%[x])\n\t" - "vl %%v31, 240(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "vl %%v4, 64(%%r1,%[y])\n\t" - "vl %%v5, 80(%%r1,%[y])\n\t" - "vl %%v6, 96(%%r1,%[y])\n\t" - "vl %%v7, 112(%%r1,%[y])\n\t" - "vst %%v0, 0(%%r1,%[x])\n\t" - "vst %%v1, 16(%%r1,%[x])\n\t" - "vst %%v2, 32(%%r1,%[x])\n\t" - "vst %%v3, 48(%%r1,%[x])\n\t" - "vst %%v4, 64(%%r1,%[x])\n\t" - "vst %%v5, 80(%%r1,%[x])\n\t" - "vst %%v6, 96(%%r1,%[x])\n\t" - "vst %%v7, 112(%%r1,%[x])\n\t" - "vl %%v0, 128(%%r1,%[y])\n\t" - "vl %%v1, 144(%%r1,%[y])\n\t" - "vl %%v2, 160(%%r1,%[y])\n\t" - "vl %%v3, 176(%%r1,%[y])\n\t" - "vl %%v4, 192(%%r1,%[y])\n\t" - "vl %%v5, 208(%%r1,%[y])\n\t" - "vl %%v6, 224(%%r1,%[y])\n\t" - "vl %%v7, 240(%%r1,%[y])\n\t" - "vst %%v0, 128(%%r1,%[x])\n\t" - "vst %%v1, 144(%%r1,%[x])\n\t" - "vst %%v2, 160(%%r1,%[x])\n\t" - "vst %%v3, 176(%%r1,%[x])\n\t" - "vst %%v4, 192(%%r1,%[x])\n\t" - "vst %%v5, 208(%%r1,%[x])\n\t" - "vst %%v6, 224(%%r1,%[x])\n\t" - "vst %%v7, 240(%%r1,%[x])\n\t" - "vst %%v16, 0(%%r1,%[y])\n\t" - "vst %%v17, 16(%%r1,%[y])\n\t" - "vst %%v18, 32(%%r1,%[y])\n\t" - "vst %%v19, 48(%%r1,%[y])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vst %%v24, 128(%%r1,%[y])\n\t" - "vst %%v25, 144(%%r1,%[y])\n\t" - "vst %%v26, 160(%%r1,%[y])\n\t" - "vst %%v27, 176(%%r1,%[y])\n\t" - "vst %%v28, 192(%%r1,%[y])\n\t" - "vst %%v29, 208(%%r1,%[y])\n\t" - "vst %%v30, 224(%%r1,%[y])\n\t" - "vst %%v31, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 
112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x), + "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c index 37008f702..caacb50dc 100644 --- a/kernel/zarch/damax.c +++ b/kernel/zarch/damax.c @@ -34,51 +34,51 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amax; __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfmaxdb %%v16,%%v16,%%v24,8\n\t" - "vfmaxdb %%v17,%%v17,%%v25,8\n\t" - "vfmaxdb %%v18,%%v18,%%v26,8\n\t" - "vfmaxdb %%v19,%%v19,%%v27,8\n\t" - "vfmaxdb %%v20,%%v20,%%v28,8\n\t" - "vfmaxdb %%v21,%%v21,%%v29,8\n\t" - "vfmaxdb %%v22,%%v22,%%v30,8\n\t" - "vfmaxdb %%v23,%%v23,%%v31,8\n\t" - "vfmaxdb %%v16,%%v16,%%v20,8\n\t" - "vfmaxdb %%v17,%%v17,%%v21,8\n\t" - "vfmaxdb %%v18,%%v18,%%v22,8\n\t" - "vfmaxdb %%v19,%%v19,%%v23,8\n\t" - 
"vfmaxdb %%v16,%%v16,%%v18,8\n\t" - "vfmaxdb %%v17,%%v17,%%v19,8\n\t" - "vfmaxdb %%v16,%%v16,%%v17,8\n\t" - "vfmaxdb %%v0,%%v0,%%v16,8\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfmaxdb %%v0,%%v0,%%v16,8\n\t" - "lpdr %[amax],%%f0" - : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmaxdb %%v16,%%v16,%%v24,8\n\t" + "vfmaxdb %%v17,%%v17,%%v25,8\n\t" + "vfmaxdb %%v18,%%v18,%%v26,8\n\t" + "vfmaxdb %%v19,%%v19,%%v27,8\n\t" + "vfmaxdb %%v20,%%v20,%%v28,8\n\t" + "vfmaxdb %%v21,%%v21,%%v29,8\n\t" + "vfmaxdb %%v22,%%v22,%%v30,8\n\t" + "vfmaxdb %%v23,%%v23,%%v31,8\n\t" + "vfmaxdb %%v16,%%v16,%%v20,8\n\t" + "vfmaxdb %%v17,%%v17,%%v21,8\n\t" + "vfmaxdb %%v18,%%v18,%%v22,8\n\t" + "vfmaxdb %%v19,%%v19,%%v23,8\n\t" + "vfmaxdb %%v16,%%v16,%%v18,8\n\t" + "vfmaxdb %%v17,%%v17,%%v19,8\n\t" + "vfmaxdb %%v16,%%v16,%%v17,8\n\t" + "vfmaxdb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmaxdb %%v0,%%v0,%%v16,8\n\t" + "lpdr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return amax; } diff --git a/kernel/zarch/damax_z13.c b/kernel/zarch/damax_z13.c index 530d6e5bb..f3db4c108 100644 --- a/kernel/zarch/damax_z13.c +++ b/kernel/zarch/damax_z13.c @@ -34,85 +34,85 @@ static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amax; __asm__("vl %%v0,0(%[x])\n\t" - "vflpdb %%v0,%%v0\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchdb %%v24,%%v16,%%v17\n\t" - "vfchdb %%v25,%%v18,%%v19\n\t" - "vfchdb %%v26,%%v20,%%v21\n\t" - "vfchdb %%v27,%%v22,%%v23\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v24,%%v25\n\t" - "vfchdb %%v29,%%v26,%%v27\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v28,%%v29\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v30,%%v0\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl 
%%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchdb %%v24,%%v16,%%v17\n\t" - "vfchdb %%v25,%%v18,%%v19\n\t" - "vfchdb %%v26,%%v20,%%v21\n\t" - "vfchdb %%v27,%%v22,%%v23\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v24,%%v25\n\t" - "vfchdb %%v29,%%v26,%%v27\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v28,%%v29\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v30,%%v0\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfchdb %%v17,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v17\n\t" - "ldr %[amax],%%f0" - : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vflpdb %%v0,%%v0\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg 
%%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return amax; } diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c index a01791741..0163a144b 100644 --- a/kernel/zarch/damin.c +++ b/kernel/zarch/damin.c @@ -34,51 +34,51 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amin; __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfmindb %%v16,%%v16,%%v24,8\n\t" - "vfmindb %%v17,%%v17,%%v25,8\n\t" - "vfmindb %%v18,%%v18,%%v26,8\n\t" - "vfmindb %%v19,%%v19,%%v27,8\n\t" - "vfmindb %%v20,%%v20,%%v28,8\n\t" - "vfmindb %%v21,%%v21,%%v29,8\n\t" - "vfmindb %%v22,%%v22,%%v30,8\n\t" - "vfmindb %%v23,%%v23,%%v31,8\n\t" - "vfmindb %%v16,%%v16,%%v20,8\n\t" - "vfmindb %%v17,%%v17,%%v21,8\n\t" - "vfmindb %%v18,%%v18,%%v22,8\n\t" - "vfmindb %%v19,%%v19,%%v23,8\n\t" - "vfmindb %%v16,%%v16,%%v18,8\n\t" - "vfmindb %%v17,%%v17,%%v19,8\n\t" - "vfmindb %%v16,%%v16,%%v17,8\n\t" - "vfmindb %%v0,%%v0,%%v16,8\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfmindb %%v0,%%v0,%%v16,8\n\t" - "lpdr %[amin],%%f0" - : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmindb %%v16,%%v16,%%v24,8\n\t" + "vfmindb %%v17,%%v17,%%v25,8\n\t" + "vfmindb %%v18,%%v18,%%v26,8\n\t" + "vfmindb %%v19,%%v19,%%v27,8\n\t" + "vfmindb %%v20,%%v20,%%v28,8\n\t" + "vfmindb %%v21,%%v21,%%v29,8\n\t" + "vfmindb %%v22,%%v22,%%v30,8\n\t" + "vfmindb %%v23,%%v23,%%v31,8\n\t" + "vfmindb %%v16,%%v16,%%v20,8\n\t" + "vfmindb %%v17,%%v17,%%v21,8\n\t" + "vfmindb %%v18,%%v18,%%v22,8\n\t" + "vfmindb %%v19,%%v19,%%v23,8\n\t" + "vfmindb %%v16,%%v16,%%v18,8\n\t" + "vfmindb %%v17,%%v17,%%v19,8\n\t" + "vfmindb %%v16,%%v16,%%v17,8\n\t" + "vfmindb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmindb %%v0,%%v0,%%v16,8\n\t" + "lpdr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", 
"v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return amin; } diff --git a/kernel/zarch/damin_z13.c b/kernel/zarch/damin_z13.c index 2172b6d6f..4196b2e15 100644 --- a/kernel/zarch/damin_z13.c +++ b/kernel/zarch/damin_z13.c @@ -34,85 +34,85 @@ static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT amin; __asm__("vl %%v0,0(%[x])\n\t" - "vflpdb %%v0,%%v0\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchdb %%v24,%%v17,%%v16\n\t" - "vfchdb %%v25,%%v19,%%v18\n\t" - "vfchdb %%v26,%%v21,%%v20\n\t" - "vfchdb %%v27,%%v23,%%v22\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v25,%%v24\n\t" - "vfchdb %%v29,%%v27,%%v26\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v29,%%v28\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v0,%%v30\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchdb %%v24,%%v17,%%v16\n\t" - "vfchdb %%v25,%%v19,%%v18\n\t" - "vfchdb %%v26,%%v21,%%v20\n\t" - "vfchdb %%v27,%%v23,%%v22\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v25,%%v24\n\t" - "vfchdb %%v29,%%v27,%%v26\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v29,%%v28\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v0,%%v30\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfchdb %%v17,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v17\n\t" - "ldr %[amin],%%f0" - : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vflpdb %%v0,%%v0\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb 
%%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return amin; } diff --git a/kernel/zarch/dasum.c b/kernel/zarch/dasum.c index 9f69a9931..aa1382b10 100644 --- a/kernel/zarch/dasum.c +++ b/kernel/zarch/dasum.c @@ -34,81 +34,81 @@ static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { FLOAT asum; __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v24,%%v24,%%v16\n\t" - "vfadb %%v25,%%v25,%%v17\n\t" - "vfadb %%v26,%%v26,%%v18\n\t" - "vfadb %%v27,%%v27,%%v19\n\t" - "vfadb %%v28,%%v28,%%v20\n\t" - "vfadb %%v29,%%v29,%%v21\n\t" - "vfadb %%v30,%%v30,%%v22\n\t" - "vfadb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x])\n\t" - "vl %%v17, 144(%%r1,%[x])\n\t" - "vl %%v18, 160(%%r1,%[x])\n\t" - "vl %%v19, 176(%%r1,%[x])\n\t" - "vl %%v20, 192(%%r1,%[x])\n\t" - "vl %%v21, 208(%%r1,%[x])\n\t" - "vl %%v22, 224(%%r1,%[x])\n\t" - "vl %%v23, 240(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, 
%%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v24,%%v24,%%v16\n\t" - "vfadb %%v25,%%v25,%%v17\n\t" - "vfadb %%v26,%%v26,%%v18\n\t" - "vfadb %%v27,%%v27,%%v19\n\t" - "vfadb %%v28,%%v28,%%v20\n\t" - "vfadb %%v29,%%v29,%%v21\n\t" - "vfadb %%v30,%%v30,%%v22\n\t" - "vfadb %%v31,%%v31,%%v23\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vfadb %%v24,%%v24,%%v26\n\t" - "vfadb %%v24,%%v24,%%v27\n\t" - "vfadb %%v24,%%v24,%%v28\n\t" - "vfadb %%v24,%%v24,%%v29\n\t" - "vfadb %%v24,%%v24,%%v30\n\t" - "vfadb %%v24,%%v24,%%v31\n\t" - "vrepg %%v25,%%v24,1\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vsteg %%v24,%[asum],0" - : [asum] "=m"(asum),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v27\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v29\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v24,%%v24,%%v31\n\t" + "vrepg %%v25,%%v24,1\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vsteg %%v24,%[asum],0" + : [asum] "=Q"(asum),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return asum; } diff --git a/kernel/zarch/daxpy.c b/kernel/zarch/daxpy.c index 179ef8834..5b0208c20 100644 --- a/kernel/zarch/daxpy.c +++ b/kernel/zarch/daxpy.c @@ -29,82 +29,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { __asm__("vlrepg %%v0,%[alpha]\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,0(%%r1,%[y])\n\t" - "vl %%v21,16(%%r1,%[y])\n\t" - "vl %%v22,32(%%r1,%[y])\n\t" - "vl %%v23,48(%%r1,%[y])\n\t" - "vl %%v24,64(%%r1,%[x])\n\t" - "vl %%v25,80(%%r1,%[x])\n\t" - "vl %%v26,96(%%r1,%[x])\n\t" - "vl %%v27,112(%%r1,%[x])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" - "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" - "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" - "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" - "vfmadb %%v19,%%v0,%%v19,%%v23\n\t" - "vfmadb %%v24,%%v0,%%v24,%%v28\n\t" - "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" - "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" - "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,0(%%r1,%[y])\n\t" - "vst %%v17,16(%%r1,%[y])\n\t" - "vst %%v18,32(%%r1,%[y])\n\t" - "vst %%v19,48(%%r1,%[y])\n\t" - "vst %%v24,64(%%r1,%[y])\n\t" - "vst %%v25,80(%%r1,%[y])\n\t" - "vst %%v26,96(%%r1,%[y])\n\t" - "vst %%v27,112(%%r1,%[y])\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,128(%%r1,%[y])\n\t" - "vl %%v21,144(%%r1,%[y])\n\t" - "vl %%v22,160(%%r1,%[y])\n\t" - "vl %%v23,176(%%r1,%[y])\n\t" - "vl %%v24,192(%%r1,%[x])\n\t" - "vl %%v25,208(%%r1,%[x])\n\t" - "vl %%v26,224(%%r1,%[x])\n\t" - "vl %%v27,240(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[y])\n\t" - "vl %%v29,208(%%r1,%[y])\n\t" - "vl %%v30,224(%%r1,%[y])\n\t" - "vl %%v31,240(%%r1,%[y])\n\t" - "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" - "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" - "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" - "vfmadb %%v19,%%v0,%%v19,%%v23\n\t" - "vfmadb %%v24,%%v0,%%v24,%%v28\n\t" - "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" - "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" - "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,128(%%r1,%[y])\n\t" - "vst %%v17,144(%%r1,%[y])\n\t" - "vst %%v18,160(%%r1,%[y])\n\t" - "vst %%v19,176(%%r1,%[y])\n\t" - "vst %%v24,192(%%r1,%[y])\n\t" - "vst %%v25,208(%%r1,%[y])\n\t" - "vst %%v26,224(%%r1,%[y])\n\t" - "vst %%v27,240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), - [alpha] "m"(*alpha) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,0(%%r1,%[y])\n\t" + "vl %%v21,16(%%r1,%[y])\n\t" + "vl %%v22,32(%%r1,%[y])\n\t" + "vl %%v23,48(%%r1,%[y])\n\t" + "vl %%v24,64(%%r1,%[x])\n\t" + "vl %%v25,80(%%r1,%[x])\n\t" + "vl %%v26,96(%%r1,%[x])\n\t" + "vl %%v27,112(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmadb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmadb %%v24,%%v0,%%v24,%%v28\n\t" + "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" + 
"vfmadb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,0(%%r1,%[y])\n\t" + "vst %%v17,16(%%r1,%[y])\n\t" + "vst %%v18,32(%%r1,%[y])\n\t" + "vst %%v19,48(%%r1,%[y])\n\t" + "vst %%v24,64(%%r1,%[y])\n\t" + "vst %%v25,80(%%r1,%[y])\n\t" + "vst %%v26,96(%%r1,%[y])\n\t" + "vst %%v27,112(%%r1,%[y])\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,128(%%r1,%[y])\n\t" + "vl %%v21,144(%%r1,%[y])\n\t" + "vl %%v22,160(%%r1,%[y])\n\t" + "vl %%v23,176(%%r1,%[y])\n\t" + "vl %%v24,192(%%r1,%[x])\n\t" + "vl %%v25,208(%%r1,%[x])\n\t" + "vl %%v26,224(%%r1,%[x])\n\t" + "vl %%v27,240(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[y])\n\t" + "vl %%v29,208(%%r1,%[y])\n\t" + "vl %%v30,224(%%r1,%[y])\n\t" + "vl %%v31,240(%%r1,%[y])\n\t" + "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmadb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmadb %%v24,%%v0,%%v24,%%v28\n\t" + "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" + "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,128(%%r1,%[y])\n\t" + "vst %%v17,144(%%r1,%[y])\n\t" + "vst %%v18,160(%%r1,%[y])\n\t" + "vst %%v19,176(%%r1,%[y])\n\t" + "vst %%v24,192(%%r1,%[y])\n\t" + "vst %%v25,208(%%r1,%[y])\n\t" + "vst %%v26,224(%%r1,%[y])\n\t" + "vst %%v27,240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), + [alpha] "Q"(*alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, diff --git a/kernel/zarch/dcopy.c b/kernel/zarch/dcopy.c index b6a740c43..691b90c64 100644 --- a/kernel/zarch/dcopy.c +++ b/kernel/zarch/dcopy.c @@ -29,16 +29,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void dcopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__("srlg %[n],%[n],5\n\t" - "0:\n\t" - "pfd 1, 1024(%[x])\n\t" - "pfd 2, 1024(%[y])\n\t" - "mvc 0(256,%[y]),0(%[x])\n\t" - "la %[x],256(%[x])\n\t" - "la %[y],256(%[y])\n\t" - "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x) - : "cc"); + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n]; } *) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x) + : "cc"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { diff --git a/kernel/zarch/ddot.c b/kernel/zarch/ddot.c index f5f601717..9cad68f4b 100644 --- a/kernel/zarch/ddot.c +++ b/kernel/zarch/ddot.c @@ -31,60 +31,60 @@ static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { FLOAT dot; __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "pfd 1,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[y])\n\t" - "vl %%v25,16(%%r1,%[y])\n\t" - "vl %%v26,32(%%r1,%[y])\n\t" - "vl %%v27,48(%%r1,%[y])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" - "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" - "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" - "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" - "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v0,%%v0,%%v1\n\t" - "vfadb %%v0,%%v0,%%v2\n\t" - "vfadb %%v0,%%v0,%%v3\n\t" - "vfadb %%v0,%%v0,%%v4\n\t" - "vfadb %%v0,%%v0,%%v5\n\t" - "vfadb %%v0,%%v0,%%v6\n\t" - "vfadb %%v0,%%v0,%%v7\n\t" - "vrepg %%v1,%%v0,1\n\t" - "adbr %%f0,%%f1\n\t" - "ldr %[dot],%%f0" - : [dot] "=f"(dot),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y), - [y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "pfd 1,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vfmadb 
%%v3,%%v19,%%v27,%%v3\n\t" + "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v3\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v5\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v0,%%v0,%%v7\n\t" + "vrepg %%v1,%%v0,1\n\t" + "adbr %%f0,%%f1\n\t" + "ldr %[dot],%%f0" + : [dot] "=f"(dot),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return dot; } diff --git a/kernel/zarch/dgemv_n_4.c b/kernel/zarch/dgemv_n_4.c index c93ff9b54..502ba837e 100644 --- a/kernel/zarch/dgemv_n_4.c +++ b/kernel/zarch/dgemv_n_4.c @@ -31,324 +31,334 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + __asm__("vlrepg %%v0,0(%[x])\n\t" - "vlrepg %%v1,8(%[x])\n\t" - "vlrepg %%v2,16(%[x])\n\t" - "vlrepg %%v3,24(%[x])\n\t" - "vlrepg %%v4,%[alpha]\n\t" - "vfmdb %%v0,%%v0,%%v4\n\t" - "vfmdb %%v1,%%v1,%%v4\n\t" - "vfmdb %%v2,%%v2,%%v4\n\t" - "vfmdb %%v3,%%v3,%%v4\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-16\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,4\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,0(%%r1,%[ap2])\n\t" - "vl %%v19,0(%%r1,%[ap3])\n\t" - "vl %%v20,16(%%r1,%[ap0])\n\t" - "vl %%v21,16(%%r1,%[ap1])\n\t" - "vl %%v22,16(%%r1,%[ap2])\n\t" - "vl %%v23,16(%%r1,%[ap3])\n\t" - "vl %%v24,32(%%r1,%[ap0])\n\t" - "vl %%v25,32(%%r1,%[ap1])\n\t" - "vl %%v26,32(%%r1,%[ap2])\n\t" - "vl %%v27,32(%%r1,%[ap3])\n\t" - "vl %%v28,48(%%r1,%[ap0])\n\t" - "vl %%v29,48(%%r1,%[ap1])\n\t" - "vl %%v30,48(%%r1,%[ap2])\n\t" - "vl %%v31,48(%%r1,%[ap3])\n\t" - "vl %%v4,0(%%r1,%[y])\n\t" - "vl %%v5,16(%%r1,%[y])\n\t" - "vl %%v6,32(%%r1,%[y])\n\t" - "vl %%v7,48(%%r1,%[y])\n\t" - "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" - "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" - "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" - "vfmadb %%v7,%%v28,%%v0,%%v7\n\t" - "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" - "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" - "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" - "vfmadb %%v7,%%v29,%%v1,%%v7\n\t" - "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" - "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" - "vfmadb %%v6,%%v26,%%v2,%%v6\n\t" - "vfmadb %%v7,%%v30,%%v2,%%v7\n\t" - "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" - "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" - "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" - "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,0(%%r1,%[y])\n\t" - "vst %%v5,16(%%r1,%[y])\n\t" - "vst %%v6,32(%%r1,%[y])\n\t" - "vst %%v7,48(%%r1,%[y])\n\t" - "vl %%v16,64(%%r1,%[ap0])\n\t" - "vl %%v17,64(%%r1,%[ap1])\n\t" - "vl %%v18,64(%%r1,%[ap2])\n\t" - "vl %%v19,64(%%r1,%[ap3])\n\t" - "vl %%v20,80(%%r1,%[ap0])\n\t" - "vl %%v21,80(%%r1,%[ap1])\n\t" - "vl %%v22,80(%%r1,%[ap2])\n\t" - "vl %%v23,80(%%r1,%[ap3])\n\t" - "vl %%v24,96(%%r1,%[ap0])\n\t" - "vl %%v25,96(%%r1,%[ap1])\n\t" - "vl 
%%v26,96(%%r1,%[ap2])\n\t" - "vl %%v27,96(%%r1,%[ap3])\n\t" - "vl %%v28,112(%%r1,%[ap0])\n\t" - "vl %%v29,112(%%r1,%[ap1])\n\t" - "vl %%v30,112(%%r1,%[ap2])\n\t" - "vl %%v31,112(%%r1,%[ap3])\n\t" - "vl %%v4,64(%%r1,%[y])\n\t" - "vl %%v5,80(%%r1,%[y])\n\t" - "vl %%v6,96(%%r1,%[y])\n\t" - "vl %%v7,112(%%r1,%[y])\n\t" - "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" - "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" - "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" - "vfmadb %%v7,%%v28,%%v0,%%v7\n\t" - "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" - "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" - "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" - "vfmadb %%v7,%%v29,%%v1,%%v7\n\t" - "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" - "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" - "vfmadb %%v6,%%v26,%%v2,%%v6\n\t" - "vfmadb %%v7,%%v30,%%v2,%%v7\n\t" - "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" - "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" - "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" - "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,64(%%r1,%[y])\n\t" - "vst %%v5,80(%%r1,%[y])\n\t" - "vst %%v6,96(%%r1,%[y])\n\t" - "vst %%v7,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,12\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,0(%%r1,%[ap2])\n\t" - "vl %%v19,0(%%r1,%[ap3])\n\t" - "vl %%v20,16(%%r1,%[ap0])\n\t" - "vl %%v21,16(%%r1,%[ap1])\n\t" - "vl %%v22,16(%%r1,%[ap2])\n\t" - "vl %%v23,16(%%r1,%[ap3])\n\t" - "vl %%v4,0(%%r1,%[y])\n\t" - "vl %%v5,16(%%r1,%[y])\n\t" - "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" - "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" - "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" - "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" - "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" - "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" - "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" - "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" - "vst %%v4,0(%%r1,%[y])\n\t" - "vst %%v5,16(%%r1,%[y])\n\t" - "agfi %%r1,32\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(FLOAT (*)[n]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), - "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), - "m"(*(const FLOAT (*)[4]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); + "vlrepg %%v1,8(%[x])\n\t" + "vlrepg %%v2,16(%[x])\n\t" + "vlrepg %%v3,24(%[x])\n\t" + "vlrepg %%v4,%[alpha]\n\t" + "vfmdb %%v0,%%v0,%%v4\n\t" + "vfmdb %%v1,%%v1,%%v4\n\t" + "vfmdb %%v2,%%v2,%%v4\n\t" + "vfmdb %%v3,%%v3,%%v4\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vl 
%%v6,32(%%r1,%[y])\n\t" + "vl %%v7,48(%%r1,%[y])\n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmadb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmadb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmadb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmadb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "vst %%v6,32(%%r1,%[y])\n\t" + "vst %%v7,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[ap0])\n\t" + "vl %%v17,64(%%r1,%[ap1])\n\t" + "vl %%v18,64(%%r1,%[ap2])\n\t" + "vl %%v19,64(%%r1,%[ap3])\n\t" + "vl %%v20,80(%%r1,%[ap0])\n\t" + "vl %%v21,80(%%r1,%[ap1])\n\t" + "vl %%v22,80(%%r1,%[ap2])\n\t" + "vl %%v23,80(%%r1,%[ap3])\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vl %%v4,64(%%r1,%[y])\n\t" + "vl %%v5,80(%%r1,%[y])\n\t" + "vl %%v6,96(%%r1,%[y])\n\t" + "vl %%v7,112(%%r1,%[y])\n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmadb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmadb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmadb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmadb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,64(%%r1,%[y])\n\t" + "vst %%v5,80(%%r1,%[y])\n\t" + "vst %%v6,96(%%r1,%[y])\n\t" + "vst %%v7,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", 
"v28", "v29", "v30", "v31"); } static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + __asm__("vlrepg %%v0,0(%[x])\n\t" - "vlrepg %%v1,8(%[x])\n\t" - "vlrepg %%v2,%[alpha]\n\t" - "vfmdb %%v0,%%v0,%%v2\n\t" - "vfmdb %%v1,%%v1,%%v2\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-16\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,4\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,16(%%r1,%[ap0])\n\t" - "vl %%v19,16(%%r1,%[ap1])\n\t" - "vl %%v20,32(%%r1,%[ap0])\n\t" - "vl %%v21,32(%%r1,%[ap1])\n\t" - "vl %%v22,48(%%r1,%[ap0])\n\t" - "vl %%v23,48(%%r1,%[ap1])\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" - "vl %%v26,80(%%r1,%[ap0])\n\t" - "vl %%v27,80(%%r1,%[ap1])\n\t" - "vl %%v28,96(%%r1,%[ap0])\n\t" - "vl %%v29,96(%%r1,%[ap1])\n\t" - "vl %%v30,112(%%r1,%[ap0])\n\t" - "vl %%v31,112(%%r1,%[ap1])\n\t" - "vl %%v2,0(%%r1,%[y])\n\t" - "vl %%v3,16(%%r1,%[y])\n\t" - "vl %%v4,32(%%r1,%[y])\n\t" - "vl %%v5,48(%%r1,%[y])\n\t" - "vl %%v6,64(%%r1,%[y])\n\t" - "vl %%v7,80(%%r1,%[y])\n\t" - "vl %%v8,96(%%r1,%[y])\n\t" - "vl %%v9,112(%%r1,%[y])\n\t" - "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" - "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" - "vfmadb %%v4,%%v20,%%v0,%%v4\n\t" - "vfmadb %%v5,%%v22,%%v0,%%v5\n\t" - "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" - "vfmadb %%v7,%%v26,%%v0,%%v7\n\t" - "vfmadb %%v8,%%v28,%%v0,%%v8\n\t" - "vfmadb %%v9,%%v30,%%v0,%%v9\n\t" - "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" - "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" - "vfmadb %%v4,%%v21,%%v1,%%v4\n\t" - "vfmadb %%v5,%%v23,%%v1,%%v5\n\t" - "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" - "vfmadb %%v7,%%v27,%%v1,%%v7\n\t" - "vfmadb %%v8,%%v29,%%v1,%%v8\n\t" - "vfmadb %%v9,%%v31,%%v1,%%v9\n\t" - "vst %%v2,0(%%r1,%[y])\n\t" - "vst %%v3,16(%%r1,%[y])\n\t" - "vst %%v4,32(%%r1,%[y])\n\t" - "vst %%v5,48(%%r1,%[y])\n\t" - "vst %%v6,64(%%r1,%[y])\n\t" - "vst %%v7,80(%%r1,%[y])\n\t" - "vst %%v8,96(%%r1,%[y])\n\t" - "vst %%v9,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,12\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,16(%%r1,%[ap0])\n\t" - "vl %%v19,16(%%r1,%[ap1])\n\t" - "vl %%v2,0(%%r1,%[y])\n\t" - "vl %%v3,16(%%r1,%[y])\n\t" - "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" - "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" - "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" - "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" - "vst %%v2,0(%%r1,%[y])\n\t" - "vst %%v3,16(%%r1,%[y])\n\t" - "agfi %%r1,32\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(FLOAT (*)[n]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[2]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vlrepg %%v1,8(%[x])\n\t" + "vlrepg %%v2,%[alpha]\n\t" + "vfmdb %%v0,%%v0,%%v2\n\t" + "vfmdb %%v1,%%v1,%%v2\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + 
"vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v20,32(%%r1,%[ap0])\n\t" + "vl %%v21,32(%%r1,%[ap1])\n\t" + "vl %%v22,48(%%r1,%[ap0])\n\t" + "vl %%v23,48(%%r1,%[ap1])\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vl %%v4,32(%%r1,%[y])\n\t" + "vl %%v5,48(%%r1,%[y])\n\t" + "vl %%v6,64(%%r1,%[y])\n\t" + "vl %%v7,80(%%r1,%[y])\n\t" + "vl %%v8,96(%%r1,%[y])\n\t" + "vl %%v9,112(%%r1,%[y])\n\t" + "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" + "vfmadb %%v4,%%v20,%%v0,%%v4\n\t" + "vfmadb %%v5,%%v22,%%v0,%%v5\n\t" + "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmadb %%v7,%%v26,%%v0,%%v7\n\t" + "vfmadb %%v8,%%v28,%%v0,%%v8\n\t" + "vfmadb %%v9,%%v30,%%v0,%%v9\n\t" + "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" + "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" + "vfmadb %%v4,%%v21,%%v1,%%v4\n\t" + "vfmadb %%v5,%%v23,%%v1,%%v5\n\t" + "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmadb %%v7,%%v27,%%v1,%%v7\n\t" + "vfmadb %%v8,%%v29,%%v1,%%v8\n\t" + "vfmadb %%v9,%%v31,%%v1,%%v9\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst %%v3,16(%%r1,%[y])\n\t" + "vst %%v4,32(%%r1,%[y])\n\t" + "vst %%v5,48(%%r1,%[y])\n\t" + "vst %%v6,64(%%r1,%[y])\n\t" + "vst %%v7,80(%%r1,%[y])\n\t" + "vst %%v8,96(%%r1,%[y])\n\t" + "vst %%v9,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" + "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" + "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst %%v3,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, FLOAT *alpha) { __asm__("vlrepg %%v0,0(%[x])\n\t" - "vlrepg %%v16,%[alpha]\n\t" - "vfmdb %%v0,%%v0,%%v16\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-16\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,4\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[a0])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[a0])\n\t" - "vl %%v17,16(%%r1,%[a0])\n\t" - "vl %%v18,32(%%r1,%[a0])\n\t" - "vl %%v19,48(%%r1,%[a0])\n\t" - "vl %%v20,64(%%r1,%[a0])\n\t" - "vl %%v21,80(%%r1,%[a0])\n\t" - "vl %%v22,96(%%r1,%[a0])\n\t" - "vl %%v23,112(%%r1,%[a0])\n\t" - "vl %%v24,0(%%r1,%[y])\n\t" - "vl %%v25,16(%%r1,%[y])\n\t" - "vl %%v26,32(%%r1,%[y])\n\t" - "vl %%v27,48(%%r1,%[y])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl 
%%v31,112(%%r1,%[y])\n\t" - "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" - "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" - "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" - "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" - "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" - "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" - "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v24,0(%%r1,%[y])\n\t" - "vst %%v25,16(%%r1,%[y])\n\t" - "vst %%v26,32(%%r1,%[y])\n\t" - "vst %%v27,48(%%r1,%[y])\n\t" - "vst %%v28,64(%%r1,%[y])\n\t" - "vst %%v29,80(%%r1,%[y])\n\t" - "vst %%v30,96(%%r1,%[y])\n\t" - "vst %%v31,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,12\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[a0])\n\t" - "vl %%v17,16(%%r1,%[a0])\n\t" - "vl %%v18,0(%%r1,%[y])\n\t" - "vl %%v19,16(%%r1,%[y])\n\t" - "vfmadb %%v18,%%v16,%%v0,%%v18\n\t" - "vfmadb %%v19,%%v17,%%v0,%%v19\n\t" - "vst %%v18,0(%%r1,%[y])\n\t" - "vst %%v19,16(%%r1,%[y])\n\t" - "agfi %%r1,32\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(FLOAT (*)[n]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), - "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "m"(*alpha), - [n] "r"(n) - : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vlrepg %%v16,%[alpha]\n\t" + "vfmdb %%v0,%%v0,%%v16\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,32(%%r1,%[a0])\n\t" + "vl %%v19,48(%%r1,%[a0])\n\t" + "vl %%v20,64(%%r1,%[a0])\n\t" + "vl %%v21,80(%%r1,%[a0])\n\t" + "vl %%v22,96(%%r1,%[a0])\n\t" + "vl %%v23,112(%%r1,%[a0])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" + "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" + "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" + "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" + "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" + "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" + "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v24,0(%%r1,%[y])\n\t" + "vst %%v25,16(%%r1,%[y])\n\t" + "vst %%v26,32(%%r1,%[y])\n\t" + "vst %%v27,48(%%r1,%[y])\n\t" + "vst %%v28,64(%%r1,%[y])\n\t" + "vst %%v29,80(%%r1,%[y])\n\t" + "vst %%v30,96(%%r1,%[y])\n\t" + "vst %%v31,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,0(%%r1,%[y])\n\t" + "vl %%v19,16(%%r1,%[y])\n\t" + "vfmadb %%v18,%%v16,%%v0,%%v18\n\t" + "vfmadb %%v19,%%v17,%%v0,%%v19\n\t" + "vst %%v18,0(%%r1,%[y])\n\t" + "vst %%v19,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), + "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "Q"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static 
void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { diff --git a/kernel/zarch/dgemv_t_4.c b/kernel/zarch/dgemv_t_4.c index 24680cf1b..de72a1798 100644 --- a/kernel/zarch/dgemv_t_4.c +++ b/kernel/zarch/dgemv_t_4.c @@ -30,333 +30,341 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 2048 static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-16\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,4\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2])\n\t" - "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3])\n\t" - "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" - "vl %%v28,16(%%r1,%[ap0])\n\t" - "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" - "vl %%v29,16(%%r1,%[ap1])\n\t" - "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" - "vl %%v30,16(%%r1,%[ap2])\n\t" - "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" - "vl %%v31,16(%%r1,%[ap3])\n\t" - "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" - "vl %%v24,32(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v18,%%v24,%%v0\n\t" - "vl %%v25,32(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v18,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[ap2])\n\t" - "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,32(%%r1,%[ap3])\n\t" - "vfmadb %%v3,%%v18,%%v27,%%v3\n\t" - "vl %%v28,48(%%r1,%[ap0])\n\t" - "vfmadb %%v4,%%v19,%%v28,%%v4\n\t" - "vl %%v29,48(%%r1,%[ap1])\n\t" - "vfmadb %%v5,%%v19,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap2])\n\t" - "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap3])\n\t" - "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" - "vl %%v26,64(%%r1,%[ap2])\n\t" - "vfmadb %%v2,%%v20,%%v26,%%v2\n\t" - "vl %%v27,64(%%r1,%[ap3])\n\t" - "vfmadb %%v3,%%v20,%%v27,%%v3\n\t" - "vl %%v28,80(%%r1,%[ap0])\n\t" - "vfmadb %%v4,%%v21,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[ap1])\n\t" - "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,80(%%r1,%[ap2])\n\t" - "vfmadb %%v6,%%v21,%%v30,%%v6\n\t" - "vl %%v31,80(%%r1,%[ap3])\n\t" - "vfmadb %%v7,%%v21,%%v31,%%v7\n\t" - "vl %%v24,96(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v22,%%v24,%%v0\n\t" - "vl %%v25,96(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v22,%%v25,%%v1\n\t" - "vl %%v26,96(%%r1,%[ap2])\n\t" - "vfmadb %%v2,%%v22,%%v26,%%v2\n\t" - "vl %%v27,96(%%r1,%[ap3])\n\t" - "vfmadb %%v3,%%v22,%%v27,%%v3\n\t" - "vl %%v28,112(%%r1,%[ap0])\n\t" - "vfmadb %%v4,%%v23,%%v28,%%v4\n\t" - "vl %%v29,112(%%r1,%[ap1])\n\t" - "vfmadb %%v5,%%v23,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap2])\n\t" - "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap3])\n\t" - "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi 
%%r0,12\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2])\n\t" - "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3])\n\t" - "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" - "vl %%v28,16(%%r1,%[ap0])\n\t" - "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" - "vl %%v29,16(%%r1,%[ap1])\n\t" - "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" - "vl %%v30,16(%%r1,%[ap2])\n\t" - "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" - "vl %%v31,16(%%r1,%[ap3])\n\t" - "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" - "agfi %%r1,32\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "vfadb %%v0,%%v0,%%v4\n\t" - "vfadb %%v1,%%v1,%%v5\n\t" - "vfadb %%v2,%%v2,%%v6\n\t" - "vfadb %%v3,%%v3,%%v7\n\t" - "vrepg %%v4,%%v0,1\n\t" - "adbr %%f0,%%f4\n\t" - "std %%f0,0(%[y])\n\t" - "vrepg %%v4,%%v1,1\n\t" - "adbr %%f1,%%f4\n\t" - "std %%f1,8(%[y])\n\t" - "vrepg %%v4,%%v2,1\n\t" - "adbr %%f2,%%f4\n\t" - "std %%f2,16(%[y])\n\t" - "vrepg %%v4,%%v3,1\n\t" - "adbr %%f3,%%f4\n\t" - "std %%f3,24(%[y])" - : "=m"(*(FLOAT (*)[4]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), - "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), - "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v18,%%v24,%%v0\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v18,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v18,%%v27,%%v3\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v19,%%v28,%%v4\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v19,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" + "vl 
%%v25,64(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,64(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v20,%%v26,%%v2\n\t" + "vl %%v27,64(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v20,%%v27,%%v3\n\t" + "vl %%v28,80(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v21,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,80(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v21,%%v30,%%v6\n\t" + "vl %%v31,80(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v21,%%v31,%%v7\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v22,%%v24,%%v0\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v22,%%v25,%%v1\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v22,%%v26,%%v2\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v22,%%v27,%%v3\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v23,%%v28,%%v4\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v23,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" + "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" + "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v1,%%v1,%%v5\n\t" + "vfadb %%v2,%%v2,%%v6\n\t" + "vfadb %%v3,%%v3,%%v7\n\t" + "vrepg %%v4,%%v0,1\n\t" + "adbr %%f0,%%f4\n\t" + "std %%f0,0(%[y])\n\t" + "vrepg %%v4,%%v1,1\n\t" + "adbr %%f1,%%f4\n\t" + "std %%f1,8(%[y])\n\t" + "vrepg %%v4,%%v2,1\n\t" + "adbr %%f2,%%f4\n\t" + "std %%f2,16(%[y])\n\t" + "vrepg %%v4,%%v3,1\n\t" + "adbr %%f3,%%f4\n\t" + "std %%f3,24(%[y])" + : "=m"(*(struct { FLOAT x[4]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-16\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,4\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl 
%%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,16(%%r1,%[ap0])\n\t" - "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" - "vl %%v27,16(%%r1,%[ap1])\n\t" - "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" - "vl %%v28,32(%%r1,%[ap0])\n\t" - "vfmadb %%v4,%%v18,%%v28,%%v4\n\t" - "vl %%v29,32(%%r1,%[ap1])\n\t" - "vfmadb %%v5,%%v18,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap0])\n\t" - "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap1])\n\t" - "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" - "vl %%v26,80(%%r1,%[ap0])\n\t" - "vfmadb %%v2,%%v21,%%v26,%%v2\n\t" - "vl %%v27,80(%%r1,%[ap1])\n\t" - "vfmadb %%v3,%%v21,%%v27,%%v3\n\t" - "vl %%v28,96(%%r1,%[ap0])\n\t" - "vfmadb %%v4,%%v22,%%v28,%%v4\n\t" - "vl %%v29,96(%%r1,%[ap1])\n\t" - "vfmadb %%v5,%%v22,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap0])\n\t" - "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap1])\n\t" - "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,12\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,16(%%r1,%[ap0])\n\t" - "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" - "vl %%v27,16(%%r1,%[ap1])\n\t" - "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" - "agfi %%r1,32\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "vfadb %%v0,%%v0,%%v2\n\t" - "vfadb %%v0,%%v0,%%v4\n\t" - "vfadb %%v0,%%v0,%%v6\n\t" - "vfadb %%v1,%%v1,%%v3\n\t" - "vfadb %%v1,%%v1,%%v5\n\t" - "vfadb %%v1,%%v1,%%v7\n\t" - "vrepg %%v2,%%v0,1\n\t" - "adbr %%f0,%%f2\n\t" - "std %%f0,0(%[y])\n\t" - "vrepg %%v2,%%v1,1\n\t" - "adbr %%f1,%%f2\n\t" - "std %%f1,8(%[y])" - : "=m"(*(FLOAT (*)[2]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" + "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" + "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" + "vl %%v28,32(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v18,%%v28,%%v4\n\t" + "vl %%v29,32(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v18,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap0])\n\t" + "vfmadb 
%%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap1])\n\t" + "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vfmadb %%v2,%%v21,%%v26,%%v2\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vfmadb %%v3,%%v21,%%v27,%%v3\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vfmadb %%v4,%%v22,%%v28,%%v4\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vfmadb %%v5,%%v22,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" + "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" + "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v1,%%v1,%%v3\n\t" + "vfadb %%v1,%%v1,%%v5\n\t" + "vfadb %%v1,%%v1,%%v7\n\t" + "vrepg %%v2,%%v0,1\n\t" + "adbr %%f0,%%f2\n\t" + "std %%f0,0(%[y])\n\t" + "vrepg %%v2,%%v1,1\n\t" + "adbr %%f1,%%f2\n\t" + "std %%f1,8(%[y])" + : "=m"(*(struct { FLOAT x[2]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-16\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,4\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[a0])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[a0])\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,16(%%r1,%[a0])\n\t" - "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[a0])\n\t" - "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,48(%%r1,%[a0])\n\t" - "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" - "vl %%v28,64(%%r1,%[a0])\n\t" - "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[a0])\n\t" - "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,96(%%r1,%[a0])\n\t" - "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[a0])\n\t" - "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,12\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[a0])\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,16(%%r1,%[a0])\n\t" - "vfmadb 
%%v1,%%v17,%%v25,%%v1\n\t" - "agfi %%r1,32\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "vfadb %%v0,%%v0,%%v1\n\t" - "vfadb %%v0,%%v0,%%v2\n\t" - "vfadb %%v0,%%v0,%%v3\n\t" - "vfadb %%v0,%%v0,%%v4\n\t" - "vfadb %%v0,%%v0,%%v5\n\t" - "vfadb %%v0,%%v0,%%v6\n\t" - "vfadb %%v0,%%v0,%%v7\n\t" - "vrepg %%v1,%%v0,1\n\t" - "adbr %%f0,%%f1\n\t" - "std %%f0,0(%[y])" - : "=m"(*(FLOAT (*)[1]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), - "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[a0])\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,48(%%r1,%[a0])\n\t" + "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" + "vl %%v28,64(%%r1,%[a0])\n\t" + "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[a0])\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,96(%%r1,%[a0])\n\t" + "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[a0])\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v3\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v5\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v0,%%v0,%%v7\n\t" + "vrepg %%v1,%%v0,1\n\t" + "adbr %%f0,%%f1\n\t" + "std %%f0,0(%[y])" + : "=m"(*(FLOAT (*)[1]) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), + "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { @@ -369,74 +377,74 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { __asm__("vlrepg %%v0,%[da]\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-16\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,4\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[src])\n\t" - "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v18,32(%%r1,%[src])\n\t" - "vl %%v19,48(%%r1,%[src])\n\t" - "vl %%v20,64(%%r1,%[src])\n\t" - "vl %%v21,80(%%r1,%[src])\n\t" - "vl 
%%v22,96(%%r1,%[src])\n\t" - "vl %%v23,112(%%r1,%[src])\n\t" - "vl %%v24, 0(%%r1,%[dest])\n\t" - "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vst %%v24, 0(%%r1,%[dest])\n\t" - "vl %%v25, 16(%%r1,%[dest])\n\t" - "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" - "vst %%v25, 16(%%r1,%[dest])\n\t" - "vl %%v26, 32(%%r1,%[dest])\n\t" - "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" - "vst %%v26, 32(%%r1,%[dest])\n\t" - "vl %%v27, 48(%%r1,%[dest])\n\t" - "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" - "vst %%v27, 48(%%r1,%[dest])\n\t" - "vl %%v28, 64(%%r1,%[dest])\n\t" - "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" - "vst %%v28, 64(%%r1,%[dest])\n\t" - "vl %%v29, 80(%%r1,%[dest])\n\t" - "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" - "vst %%v29, 80(%%r1,%[dest])\n\t" - "vl %%v30, 96(%%r1,%[dest])\n\t" - "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" - "vst %%v30, 96(%%r1,%[dest])\n\t" - "vl %%v31, 112(%%r1,%[dest])\n\t" - "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v31, 112(%%r1,%[dest])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,12\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v24, 0(%%r1,%[dest])\n\t" - "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vst %%v24, 0(%%r1,%[dest])\n\t" - "vl %%v25, 16(%%r1,%[dest])\n\t" - "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" - "vst %%v25, 16(%%r1,%[dest])\n\t" - "agfi %%r1,32\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(FLOAT (*)[n]) dest) - : [dest] "a"(dest),[da] "m"(da), "m"(*(const FLOAT (*)[n]) src), - [src] "a"(src),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-16\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,4\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,64(%%r1,%[src])\n\t" + "vl %%v21,80(%%r1,%[src])\n\t" + "vl %%v22,96(%%r1,%[src])\n\t" + "vl %%v23,112(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" + "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" + "vl %%v26, 32(%%r1,%[dest])\n\t" + "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" + "vst %%v26, 32(%%r1,%[dest])\n\t" + "vl %%v27, 48(%%r1,%[dest])\n\t" + "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" + "vst %%v27, 48(%%r1,%[dest])\n\t" + "vl %%v28, 64(%%r1,%[dest])\n\t" + "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" + "vst %%v28, 64(%%r1,%[dest])\n\t" + "vl %%v29, 80(%%r1,%[dest])\n\t" + "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" + "vst %%v29, 80(%%r1,%[dest])\n\t" + "vl %%v30, 96(%%r1,%[dest])\n\t" + "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" + "vst %%v30, 96(%%r1,%[dest])\n\t" + "vl %%v31, 112(%%r1,%[dest])\n\t" + "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v31, 112(%%r1,%[dest])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,12\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" + "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" + "agfi %%r1,32\n\t" + "brctg %%r0,2b\n\t" + 
"3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) dest) + : [dest] "a"(dest),[da] "Q"(da), "m"(*(const struct { FLOAT x[n]; } *) src), + [src] "a"(src),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { diff --git a/kernel/zarch/dmax.c b/kernel/zarch/dmax.c index 65ed31f01..cdc8d5d08 100644 --- a/kernel/zarch/dmax.c +++ b/kernel/zarch/dmax.c @@ -31,51 +31,51 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT max; __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfmaxdb %%v16,%%v16,%%v24,0\n\t" - "vfmaxdb %%v17,%%v17,%%v25,0\n\t" - "vfmaxdb %%v18,%%v18,%%v26,0\n\t" - "vfmaxdb %%v19,%%v19,%%v27,0\n\t" - "vfmaxdb %%v20,%%v20,%%v28,0\n\t" - "vfmaxdb %%v21,%%v21,%%v29,0\n\t" - "vfmaxdb %%v22,%%v22,%%v30,0\n\t" - "vfmaxdb %%v23,%%v23,%%v31,0\n\t" - "vfmaxdb %%v16,%%v16,%%v20,0\n\t" - "vfmaxdb %%v17,%%v17,%%v21,0\n\t" - "vfmaxdb %%v18,%%v18,%%v22,0\n\t" - "vfmaxdb %%v19,%%v19,%%v23,0\n\t" - "vfmaxdb %%v16,%%v16,%%v18,0\n\t" - "vfmaxdb %%v17,%%v17,%%v19,0\n\t" - "vfmaxdb %%v16,%%v16,%%v17,0\n\t" - "vfmaxdb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfmaxdb %%v0,%%v0,%%v16,0\n\t" - "ldr %[max],%%f0" - : [max] "=f"(max),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmaxdb %%v16,%%v16,%%v24,0\n\t" + "vfmaxdb %%v17,%%v17,%%v25,0\n\t" + "vfmaxdb %%v18,%%v18,%%v26,0\n\t" + "vfmaxdb %%v19,%%v19,%%v27,0\n\t" + "vfmaxdb %%v20,%%v20,%%v28,0\n\t" + "vfmaxdb %%v21,%%v21,%%v29,0\n\t" + "vfmaxdb %%v22,%%v22,%%v30,0\n\t" + "vfmaxdb %%v23,%%v23,%%v31,0\n\t" + "vfmaxdb %%v16,%%v16,%%v20,0\n\t" + "vfmaxdb %%v17,%%v17,%%v21,0\n\t" + "vfmaxdb %%v18,%%v18,%%v22,0\n\t" + "vfmaxdb %%v19,%%v19,%%v23,0\n\t" + "vfmaxdb %%v16,%%v16,%%v18,0\n\t" + "vfmaxdb %%v17,%%v17,%%v19,0\n\t" + "vfmaxdb %%v16,%%v16,%%v17,0\n\t" + "vfmaxdb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmaxdb %%v0,%%v0,%%v16,0\n\t" + "ldr %[max],%%f0" + : [max] "=f"(max),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", 
"r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return max; } diff --git a/kernel/zarch/dmax_z13.c b/kernel/zarch/dmax_z13.c index 87bccbe55..c4e8d91f8 100644 --- a/kernel/zarch/dmax_z13.c +++ b/kernel/zarch/dmax_z13.c @@ -31,68 +31,68 @@ static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { FLOAT max; __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfchdb %%v24,%%v16,%%v17\n\t" - "vfchdb %%v25,%%v18,%%v19\n\t" - "vfchdb %%v26,%%v20,%%v21\n\t" - "vfchdb %%v27,%%v22,%%v23\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v24,%%v25\n\t" - "vfchdb %%v29,%%v26,%%v27\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v28,%%v29\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v30,%%v0\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vfchdb %%v24,%%v16,%%v17\n\t" - "vfchdb %%v25,%%v18,%%v19\n\t" - "vfchdb %%v26,%%v20,%%v21\n\t" - "vfchdb %%v27,%%v22,%%v23\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v24,%%v25\n\t" - "vfchdb %%v29,%%v26,%%v27\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v28,%%v29\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v30,%%v0\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfchdb %%v17,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v17\n\t" - "ldr %[max],%%f0" - : [max] "=f"(max),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl 
%%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vfchdb %%v26,%%v20,%%v21\n\t" + "vfchdb %%v27,%%v22,%%v23\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v24,%%v25\n\t" + "vfchdb %%v29,%%v26,%%v27\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v28,%%v29\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v30,%%v0\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[max],%%f0" + : [max] "=f"(max),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return max; } diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c index 518cc262c..f9b129cbd 100644 --- a/kernel/zarch/dmin.c +++ b/kernel/zarch/dmin.c @@ -31,51 +31,51 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT min; __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfmindb %%v16,%%v16,%%v24,0\n\t" - "vfmindb %%v17,%%v17,%%v25,0\n\t" - "vfmindb %%v18,%%v18,%%v26,0\n\t" - "vfmindb %%v19,%%v19,%%v27,0\n\t" - "vfmindb %%v20,%%v20,%%v28,0\n\t" - "vfmindb %%v21,%%v21,%%v29,0\n\t" - "vfmindb %%v22,%%v22,%%v30,0\n\t" - "vfmindb %%v23,%%v23,%%v31,0\n\t" - "vfmindb %%v16,%%v16,%%v20,0\n\t" - "vfmindb %%v17,%%v17,%%v21,0\n\t" - "vfmindb %%v18,%%v18,%%v22,0\n\t" - "vfmindb %%v19,%%v19,%%v23,0\n\t" - "vfmindb %%v16,%%v16,%%v18,0\n\t" - "vfmindb %%v17,%%v17,%%v19,0\n\t" - "vfmindb %%v16,%%v16,%%v17,0\n\t" - "vfmindb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfmindb %%v0,%%v0,%%v16,0\n\t" - "ldr %[min],%%f0" - : [min] "=f"(min),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmindb %%v16,%%v16,%%v24,0\n\t" + "vfmindb %%v17,%%v17,%%v25,0\n\t" + "vfmindb %%v18,%%v18,%%v26,0\n\t" + "vfmindb %%v19,%%v19,%%v27,0\n\t" + 
"vfmindb %%v20,%%v20,%%v28,0\n\t" + "vfmindb %%v21,%%v21,%%v29,0\n\t" + "vfmindb %%v22,%%v22,%%v30,0\n\t" + "vfmindb %%v23,%%v23,%%v31,0\n\t" + "vfmindb %%v16,%%v16,%%v20,0\n\t" + "vfmindb %%v17,%%v17,%%v21,0\n\t" + "vfmindb %%v18,%%v18,%%v22,0\n\t" + "vfmindb %%v19,%%v19,%%v23,0\n\t" + "vfmindb %%v16,%%v16,%%v18,0\n\t" + "vfmindb %%v17,%%v17,%%v19,0\n\t" + "vfmindb %%v16,%%v16,%%v17,0\n\t" + "vfmindb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmindb %%v0,%%v0,%%v16,0\n\t" + "ldr %[min],%%f0" + : [min] "=f"(min),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return min; } diff --git a/kernel/zarch/dmin_z13.c b/kernel/zarch/dmin_z13.c index 91561992f..77f021c1d 100644 --- a/kernel/zarch/dmin_z13.c +++ b/kernel/zarch/dmin_z13.c @@ -31,68 +31,68 @@ static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { FLOAT min; __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfchdb %%v24,%%v17,%%v16\n\t" - "vfchdb %%v25,%%v19,%%v18\n\t" - "vfchdb %%v26,%%v21,%%v20\n\t" - "vfchdb %%v27,%%v23,%%v22\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v25,%%v24\n\t" - "vfchdb %%v29,%%v27,%%v26\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v29,%%v28\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v0,%%v30\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vfchdb %%v24,%%v17,%%v16\n\t" - "vfchdb %%v25,%%v19,%%v18\n\t" - "vfchdb %%v26,%%v21,%%v20\n\t" - "vfchdb %%v27,%%v23,%%v22\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v25,%%v24\n\t" - "vfchdb %%v29,%%v27,%%v26\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v29,%%v28\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v0,%%v30\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfchdb %%v17,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v17\n\t" - "ldr %[min],%%f0" - : [min] "=f"(min),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + 
"vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vfchdb %%v26,%%v21,%%v20\n\t" + "vfchdb %%v27,%%v23,%%v22\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vsel %%v26,%%v20,%%v21,%%v26\n\t" + "vsel %%v27,%%v22,%%v23,%%v27\n\t" + "vfchdb %%v28,%%v25,%%v24\n\t" + "vfchdb %%v29,%%v27,%%v26\n\t" + "vsel %%v28,%%v24,%%v25,%%v28\n\t" + "vsel %%v29,%%v26,%%v27,%%v29\n\t" + "vfchdb %%v30,%%v29,%%v28\n\t" + "vsel %%v30,%%v28,%%v29,%%v30\n\t" + "vfchdb %%v31,%%v0,%%v30\n\t" + "vsel %%v0,%%v30,%%v0,%%v31\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[min],%%f0" + : [min] "=f"(min),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return min; } diff --git a/kernel/zarch/drot.c b/kernel/zarch/drot.c index 8f0197f02..11fbe15b6 100644 --- a/kernel/zarch/drot.c +++ b/kernel/zarch/drot.c @@ -29,151 +29,152 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { __asm__("vlrepg %%v0,%[c]\n\t" - "vlrepg %%v1,%[s]\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x])\n\t" - "vl %%v25, 16(%%r1,%[x])\n\t" - "vl %%v26, 32(%%r1,%[x])\n\t" - "vl %%v27, 48(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[y])\n\t" - "vl %%v17, 16(%%r1,%[y])\n\t" - "vl %%v18, 32(%%r1,%[y])\n\t" - "vl %%v19, 48(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x])\n\t" - "vst %%v29, 16(%%r1,%[x])\n\t" - "vst %%v30, 32(%%r1,%[x])\n\t" - "vst %%v31, 48(%%r1,%[x])\n\t" - "vst %%v20, 0(%%r1,%[y])\n\t" - "vst %%v21, 16(%%r1,%[y])\n\t" - "vst %%v22, 32(%%r1,%[y])\n\t" - "vst %%v23, 48(%%r1,%[y])\n\t" - "vl %%v24, 64(%%r1,%[x])\n\t" - "vl %%v25, 80(%%r1,%[x])\n\t" - "vl %%v26, 96(%%r1,%[x])\n\t" - "vl %%v27, 112(%%r1,%[x])\n\t" - "vl %%v16, 64(%%r1,%[y])\n\t" - "vl %%v17, 80(%%r1,%[y])\n\t" - "vl %%v18, 96(%%r1,%[y])\n\t" - "vl %%v19, 112(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x])\n\t" - "vst %%v29, 80(%%r1,%[x])\n\t" - "vst %%v30, 96(%%r1,%[x])\n\t" - "vst %%v31, 112(%%r1,%[x])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v16, 128(%%r1,%[y])\n\t" - "vl %%v17, 144(%%r1,%[y])\n\t" - "vl %%v18, 160(%%r1,%[y])\n\t" - "vl %%v19, 176(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb 
%%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x])\n\t" - "vst %%v29, 144(%%r1,%[x])\n\t" - "vst %%v30, 160(%%r1,%[x])\n\t" - "vst %%v31, 176(%%r1,%[x])\n\t" - "vst %%v20, 128(%%r1,%[y])\n\t" - "vst %%v21, 144(%%r1,%[y])\n\t" - "vst %%v22, 160(%%r1,%[y])\n\t" - "vst %%v23, 176(%%r1,%[y])\n\t" - "vl %%v24, 192(%%r1,%[x])\n\t" - "vl %%v25, 208(%%r1,%[x])\n\t" - "vl %%v26, 224(%%r1,%[x])\n\t" - "vl %%v27, 240(%%r1,%[x])\n\t" - "vl %%v16, 192(%%r1,%[y])\n\t" - "vl %%v17, 208(%%r1,%[y])\n\t" - "vl %%v18, 224(%%r1,%[y])\n\t" - "vl %%v19, 240(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 192(%%r1,%[x])\n\t" - "vst %%v29, 208(%%r1,%[x])\n\t" - "vst %%v30, 224(%%r1,%[x])\n\t" - "vst %%v31, 240(%%r1,%[x])\n\t" - "vst %%v20, 192(%%r1,%[y])\n\t" - "vst %%v21, 208(%%r1,%[y])\n\t" - "vst %%v22, 224(%%r1,%[y])\n\t" - "vst %%v23, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vlrepg %%v1,%[s]\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb 
%%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), + [n] "+&r"(n) + : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", 
"v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c index c944990b5..2961eff20 100644 --- a/kernel/zarch/dscal.c +++ b/kernel/zarch/dscal.c @@ -29,61 +29,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) { __asm__("vlrepg %%v0,%[da]\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[x])\n\t" - "vfmdb %%v24,%%v24,%%v0\n\t" - "vst %%v24,0(%%r1,%[x])\n\t" - "vl %%v25,16(%%r1,%[x])\n\t" - "vfmdb %%v25,%%v25,%%v0\n\t" - "vst %%v25,16(%%r1,%[x])\n\t" - "vl %%v26,32(%%r1,%[x])\n\t" - "vfmdb %%v26,%%v26,%%v0\n\t" - "vst %%v26,32(%%r1,%[x])\n\t" - "vl %%v27,48(%%r1,%[x])\n\t" - "vfmdb %%v27,%%v27,%%v0\n\t" - "vst %%v27,48(%%r1,%[x])\n\t" - "vl %%v28,64(%%r1,%[x])\n\t" - "vfmdb %%v28,%%v28,%%v0\n\t" - "vst %%v28,64(%%r1,%[x])\n\t" - "vl %%v29,80(%%r1,%[x])\n\t" - "vfmdb %%v29,%%v29,%%v0\n\t" - "vst %%v29,80(%%r1,%[x])\n\t" - "vl %%v30,96(%%r1,%[x])\n\t" - "vfmdb %%v30,%%v30,%%v0\n\t" - "vst %%v30,96(%%r1,%[x])\n\t" - "vl %%v31,112(%%r1,%[x])\n\t" - "vfmdb %%v31,%%v31,%%v0\n\t" - "vst %%v31,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) - : [x] "a"(x),[da] "m"(da) - : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[x])\n\t" + "vfmdb %%v24,%%v24,%%v0\n\t" + "vst %%v24,0(%%r1,%[x])\n\t" + "vl %%v25,16(%%r1,%[x])\n\t" + "vfmdb %%v25,%%v25,%%v0\n\t" + "vst %%v25,16(%%r1,%[x])\n\t" + "vl %%v26,32(%%r1,%[x])\n\t" + "vfmdb %%v26,%%v26,%%v0\n\t" + "vst %%v26,32(%%r1,%[x])\n\t" + "vl %%v27,48(%%r1,%[x])\n\t" + "vfmdb %%v27,%%v27,%%v0\n\t" + "vst %%v27,48(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[x])\n\t" + "vfmdb %%v28,%%v28,%%v0\n\t" + "vst %%v28,64(%%r1,%[x])\n\t" + "vl %%v29,80(%%r1,%[x])\n\t" + "vfmdb %%v29,%%v29,%%v0\n\t" + "vst %%v29,80(%%r1,%[x])\n\t" + "vl %%v30,96(%%r1,%[x])\n\t" + "vfmdb %%v30,%%v30,%%v0\n\t" + "vst %%v30,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vfmdb %%v31,%%v31,%%v0\n\t" + "vst %%v31,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n) + : [x] "a"(x),[da] "Q"(da) + : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) { __asm__("vzero %%v0\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x])\n\t" - "vst %%v0,16(%%r1,%[x])\n\t" - "vst %%v0,32(%%r1,%[x])\n\t" - "vst %%v0,48(%%r1,%[x])\n\t" - "vst %%v0,64(%%r1,%[x])\n\t" - "vst %%v0,80(%%r1,%[x])\n\t" - "vst %%v0,96(%%r1,%[x])\n\t" - "vst %%v0,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) - : [x] "a"(x) - : "cc", "r1", "v0"); + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n) 
+ : [x] "a"(x) + : "cc", "r1", "v0"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, diff --git a/kernel/zarch/dsdot.c b/kernel/zarch/dsdot.c index 1ac02d4b9..5fa88c3b9 100644 --- a/kernel/zarch/dsdot.c +++ b/kernel/zarch/dsdot.c @@ -31,91 +31,92 @@ static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { double dot; __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "pfd 1,1024(%%r1,%[y])\n\t" - "vlef %%v16,0(%%r1,%[x]),0\n\t" - "vlef %%v16,4(%%r1,%[x]),2\n\t" - "vlef %%v17,8(%%r1,%[x]),0\n\t" - "vlef %%v17,12(%%r1,%[x]),2\n\t" - "vlef %%v18,16(%%r1,%[x]),0\n\t" - "vlef %%v18,20(%%r1,%[x]),2\n\t" - "vlef %%v19,24(%%r1,%[x]),0\n\t" - "vlef %%v19,28(%%r1,%[x]),2\n\t" - "vlef %%v20,32(%%r1,%[x]),0\n\t" - "vlef %%v20,36(%%r1,%[x]),2\n\t" - "vlef %%v21,40(%%r1,%[x]),0\n\t" - "vlef %%v21,44(%%r1,%[x]),2\n\t" - "vlef %%v22,48(%%r1,%[x]),0\n\t" - "vlef %%v22,52(%%r1,%[x]),2\n\t" - "vlef %%v23,56(%%r1,%[x]),0\n\t" - "vlef %%v23,60(%%r1,%[x]),2\n\t" - "vflls %%v16,%%v16\n\t" - "vflls %%v17,%%v17\n\t" - "vflls %%v18,%%v18\n\t" - "vflls %%v19,%%v19\n\t" - "vflls %%v20,%%v20\n\t" - "vflls %%v21,%%v21\n\t" - "vflls %%v22,%%v22\n\t" - "vflls %%v23,%%v23\n\t" - "vlef %%v24,0(%%r1,%[y]),0\n\t" - "vlef %%v24,4(%%r1,%[y]),2\n\t" - "vflls %%v24,%%v24\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vlef %%v25,8(%%r1,%[y]),0\n\t" - "vlef %%v25,12(%%r1,%[y]),2\n\t" - "vflls %%v25,%%v25\n\t" - "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" - "vlef %%v26,16(%%r1,%[y]),0\n\t" - "vlef %%v26,20(%%r1,%[y]),2\n\t" - "vflls %%v26,%%v26\n\t" - "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vlef %%v27,24(%%r1,%[y]),0\n\t" - "vlef %%v27,28(%%r1,%[y]),2\n\t" - "vflls %%v27,%%v27\n\t" - "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" - "vlef %%v28,32(%%r1,%[y]),0\n\t" - "vlef %%v28,36(%%r1,%[y]),2\n\t" - "vflls %%v28,%%v28\n\t" - "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" - "vlef %%v29,40(%%r1,%[y]),0\n\t" - "vlef %%v29,44(%%r1,%[y]),2\n\t" - "vflls %%v29,%%v29\n\t" - "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vlef %%v30,48(%%r1,%[y]),0\n\t" - "vlef %%v30,52(%%r1,%[y]),2\n\t" - "vflls %%v30,%%v30\n\t" - "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" - "vlef %%v31,56(%%r1,%[y]),0\n\t" - "vlef %%v31,60(%%r1,%[y]),2\n\t" - "vflls %%v31,%%v31\n\t" - "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,64\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v0,%%v0,%%v1\n\t" - "vfadb %%v0,%%v0,%%v2\n\t" - "vfadb %%v0,%%v0,%%v3\n\t" - "vfadb %%v0,%%v0,%%v4\n\t" - "vfadb %%v0,%%v0,%%v5\n\t" - "vfadb %%v0,%%v0,%%v6\n\t" - "vfadb %%v0,%%v0,%%v7\n\t" - "vrepg %%v1,%%v0,1\n\t" - "adbr %%f0,%%f1\n\t" - "ldr %[dot],%%f0" - : [dot] "=f"(dot),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "pfd 1,1024(%%r1,%[y])\n\t" + "vlef %%v16,0(%%r1,%[x]),0\n\t" + "vlef %%v16,4(%%r1,%[x]),2\n\t" + "vlef %%v17,8(%%r1,%[x]),0\n\t" + "vlef %%v17,12(%%r1,%[x]),2\n\t" + "vlef %%v18,16(%%r1,%[x]),0\n\t" + "vlef %%v18,20(%%r1,%[x]),2\n\t" + "vlef 
%%v19,24(%%r1,%[x]),0\n\t" + "vlef %%v19,28(%%r1,%[x]),2\n\t" + "vlef %%v20,32(%%r1,%[x]),0\n\t" + "vlef %%v20,36(%%r1,%[x]),2\n\t" + "vlef %%v21,40(%%r1,%[x]),0\n\t" + "vlef %%v21,44(%%r1,%[x]),2\n\t" + "vlef %%v22,48(%%r1,%[x]),0\n\t" + "vlef %%v22,52(%%r1,%[x]),2\n\t" + "vlef %%v23,56(%%r1,%[x]),0\n\t" + "vlef %%v23,60(%%r1,%[x]),2\n\t" + "vflls %%v16,%%v16\n\t" + "vflls %%v17,%%v17\n\t" + "vflls %%v18,%%v18\n\t" + "vflls %%v19,%%v19\n\t" + "vflls %%v20,%%v20\n\t" + "vflls %%v21,%%v21\n\t" + "vflls %%v22,%%v22\n\t" + "vflls %%v23,%%v23\n\t" + "vlef %%v24,0(%%r1,%[y]),0\n\t" + "vlef %%v24,4(%%r1,%[y]),2\n\t" + "vflls %%v24,%%v24\n\t" + "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" + "vlef %%v25,8(%%r1,%[y]),0\n\t" + "vlef %%v25,12(%%r1,%[y]),2\n\t" + "vflls %%v25,%%v25\n\t" + "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" + "vlef %%v26,16(%%r1,%[y]),0\n\t" + "vlef %%v26,20(%%r1,%[y]),2\n\t" + "vflls %%v26,%%v26\n\t" + "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" + "vlef %%v27,24(%%r1,%[y]),0\n\t" + "vlef %%v27,28(%%r1,%[y]),2\n\t" + "vflls %%v27,%%v27\n\t" + "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" + "vlef %%v28,32(%%r1,%[y]),0\n\t" + "vlef %%v28,36(%%r1,%[y]),2\n\t" + "vflls %%v28,%%v28\n\t" + "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" + "vlef %%v29,40(%%r1,%[y]),0\n\t" + "vlef %%v29,44(%%r1,%[y]),2\n\t" + "vflls %%v29,%%v29\n\t" + "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" + "vlef %%v30,48(%%r1,%[y]),0\n\t" + "vlef %%v30,52(%%r1,%[y]),2\n\t" + "vflls %%v30,%%v30\n\t" + "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" + "vlef %%v31,56(%%r1,%[y]),0\n\t" + "vlef %%v31,60(%%r1,%[y]),2\n\t" + "vflls %%v31,%%v31\n\t" + "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,64\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vfadb %%v0,%%v0,%%v2\n\t" + "vfadb %%v0,%%v0,%%v3\n\t" + "vfadb %%v0,%%v0,%%v4\n\t" + "vfadb %%v0,%%v0,%%v5\n\t" + "vfadb %%v0,%%v0,%%v6\n\t" + "vfadb %%v0,%%v0,%%v7\n\t" + "vrepg %%v1,%%v0,1\n\t" + "adbr %%f0,%%f1\n\t" + "ldr %[dot],%%f0" + : [dot] "=f"(dot),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return dot; } diff --git a/kernel/zarch/dswap.c b/kernel/zarch/dswap.c index 60ba40bd6..f0c9ded51 100644 --- a/kernel/zarch/dswap.c +++ b/kernel/zarch/dswap.c @@ -29,81 +29,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
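 /* A note on the operand rewrites running through these hunks (rationale
  * inferred from the GCC s390 constraint documentation, not stated in the
  * patch itself): scalar memory operands move from "m" to "Q" because "Q"
  * only matches a base register plus a short 12-bit displacement, the one
  * address form that VLREP/VSTEF/STE/STD as used here can encode, whereas
  * plain "m" may choose an indexed or long-displacement address those
  * encodings cannot accept. A minimal stand-alone illustration; the
  * load_splat() helper below is hypothetical and not part of this patch:
  *
  *   static double load_splat(const double *p) {
  *     double out;
  *     __asm__("vlrepg %%v0,%[src]\n\t"   // replicate *p into both lanes
  *             "vsteg  %%v0,%[dst],0"     // store lane 0 back out
  *             : [dst] "=Q"(out)
  *             : [src] "Q"(*p)
  *             : "v0");
  *     return out;
  *   }
  */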
static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__("srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v28, 192(%%r1,%[x])\n\t" - "vl %%v29, 208(%%r1,%[x])\n\t" - "vl %%v30, 224(%%r1,%[x])\n\t" - "vl %%v31, 240(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "vl %%v4, 64(%%r1,%[y])\n\t" - "vl %%v5, 80(%%r1,%[y])\n\t" - "vl %%v6, 96(%%r1,%[y])\n\t" - "vl %%v7, 112(%%r1,%[y])\n\t" - "vst %%v0, 0(%%r1,%[x])\n\t" - "vst %%v1, 16(%%r1,%[x])\n\t" - "vst %%v2, 32(%%r1,%[x])\n\t" - "vst %%v3, 48(%%r1,%[x])\n\t" - "vst %%v4, 64(%%r1,%[x])\n\t" - "vst %%v5, 80(%%r1,%[x])\n\t" - "vst %%v6, 96(%%r1,%[x])\n\t" - "vst %%v7, 112(%%r1,%[x])\n\t" - "vl %%v0, 128(%%r1,%[y])\n\t" - "vl %%v1, 144(%%r1,%[y])\n\t" - "vl %%v2, 160(%%r1,%[y])\n\t" - "vl %%v3, 176(%%r1,%[y])\n\t" - "vl %%v4, 192(%%r1,%[y])\n\t" - "vl %%v5, 208(%%r1,%[y])\n\t" - "vl %%v6, 224(%%r1,%[y])\n\t" - "vl %%v7, 240(%%r1,%[y])\n\t" - "vst %%v0, 128(%%r1,%[x])\n\t" - "vst %%v1, 144(%%r1,%[x])\n\t" - "vst %%v2, 160(%%r1,%[x])\n\t" - "vst %%v3, 176(%%r1,%[x])\n\t" - "vst %%v4, 192(%%r1,%[x])\n\t" - "vst %%v5, 208(%%r1,%[x])\n\t" - "vst %%v6, 224(%%r1,%[x])\n\t" - "vst %%v7, 240(%%r1,%[x])\n\t" - "vst %%v16, 0(%%r1,%[y])\n\t" - "vst %%v17, 16(%%r1,%[y])\n\t" - "vst %%v18, 32(%%r1,%[y])\n\t" - "vst %%v19, 48(%%r1,%[y])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vst %%v24, 128(%%r1,%[y])\n\t" - "vst %%v25, 144(%%r1,%[y])\n\t" - "vst %%v26, 160(%%r1,%[y])\n\t" - "vst %%v27, 176(%%r1,%[y])\n\t" - "vst %%v28, 192(%%r1,%[y])\n\t" - "vst %%v29, 208(%%r1,%[y])\n\t" - "vst %%v30, 224(%%r1,%[y])\n\t" - "vst %%v31, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 
48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), + [n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c index 1e1040a6e..a2546b812 100644 --- a/kernel/zarch/icamax.c +++ b/kernel/zarch/icamax.c @@ -34,191 +34,191 @@ static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { BLASLONG iamax; __asm__("vlef %%v0,0(%[x]),0\n\t" - "vlef %%v1,4(%[x]),0\n\t" - "vlef %%v0,8(%[x]),1\n\t" - "vlef %%v1,12(%[x]),1\n\t" - "vlef %%v0,16(%[x]),2\n\t" - "vlef %%v1,20(%[x]),2\n\t" - "vlef %%v0,24(%[x]),3\n\t" - "vlef %%v1,28(%[x]),3\n\t" - "vflpsb %%v0,%%v0\n\t" - "vflpsb %%v1,%%v1\n\t" - "vfasb %%v0,%%v0,%%v1\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,2,1\n\t" - "vleig %%v2,1,0\n\t" - "vleig %%v2,3,1\n\t" - "vrepig %%v3,16\n\t" - "vzero %%v4\n\t" - "vleib %%v9,0,0\n\t" - "vleib %%v9,1,1\n\t" - "vleib %%v9,2,2\n\t" - "vleib %%v9,3,3\n\t" - "vleib %%v9,8,4\n\t" - "vleib %%v9,9,5\n\t" - "vleib %%v9,10,6\n\t" - "vleib %%v9,11,7\n\t" - "vleib %%v9,16,8\n\t" - "vleib %%v9,17,9\n\t" - "vleib %%v9,18,10\n\t" - "vleib %%v9,19,11\n\t" - "vleib %%v9,24,12\n\t" - "vleib %%v9,25,13\n\t" - "vleib %%v9,26,14\n\t" - "vleib %%v9,27,15\n\t" - "vleif %%v24,0,0\n\t" - "vleif %%v24,1,1\n\t" - "vleif %%v24,2,2\n\t" - "vleif %%v24,3,3\n\t" - "vleif %%v25,4,0\n\t" - "vleif %%v25,5,1\n\t" - "vleif %%v25,6,2\n\t" - "vleif %%v25,7,3\n\t" - "vleif %%v26,8,0\n\t" - "vleif %%v26,9,1\n\t" - "vleif %%v26,10,2\n\t" - "vleif %%v26,11,3\n\t" - "vleif %%v27,12,0\n\t" - "vleif %%v27,13,1\n\t" - "vleif %%v27,14,2\n\t" - "vleif %%v27,15,3\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v28,16(%%r1,%[x])\n\t" - "vpkg %%v17,%%v16,%%v28\n\t" - "vperm %%v16,%%v16,%%v28,%%v9\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v29,48(%%r1,%[x])\n\t" - "vpkg %%v19,%%v18,%%v29\n\t" - "vperm %%v18,%%v18,%%v29,%%v9\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v30,80(%%r1,%[x])\n\t" - "vpkg %%v21,%%v20,%%v30\n\t" - "vperm 
%%v20,%%v20,%%v30,%%v9\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v31,112(%%r1,%[x])\n\t" - "vpkg %%v23,%%v22,%%v31\n\t" - "vperm %%v22,%%v22,%%v31,%%v9\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v16,%%v16,%%v17\n\t" - "vfasb %%v17,%%v18,%%v19\n\t" - "vfasb %%v18,%%v20,%%v21\n\t" - "vfasb %%v19,%%v22,%%v23\n\t" - "vfchesb %%v5,%%v16,%%v17\n\t" - "vfchesb %%v6,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vfchesb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v28,144(%%r1,%[x])\n\t" - "vpkg %%v17,%%v16,%%v28\n\t" - "vperm %%v16,%%v16,%%v28,%%v9\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v29,176(%%r1,%[x])\n\t" - "vpkg %%v19,%%v18,%%v29\n\t" - "vperm %%v18,%%v18,%%v29,%%v9\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v30,208(%%r1,%[x])\n\t" - "vpkg %%v21,%%v20,%%v30\n\t" - "vperm %%v20,%%v20,%%v30,%%v9\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vpkg %%v23,%%v22,%%v31\n\t" - "vperm %%v22,%%v22,%%v31,%%v9\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v16,%%v16,%%v17\n\t" - "vfasb %%v17,%%v18,%%v19\n\t" - "vfasb %%v18,%%v20,%%v21\n\t" - "vfasb %%v19,%%v22,%%v23\n\t" - "vfchesb %%v5,%%v16,%%v17\n\t" - "vfchesb %%v6,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vfchesb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v3,%%v0,32\n\t" - "vfchsb %%v4,%%v0,%%v3\n\t" - "vchlg %%v5,%%v2,%%v1\n\t" - "vfcesb %%v6,%%v0,%%v3\n\t" - "vn %%v5,%%v5,%%v6\n\t" - "vo %%v4,%%v4,%%v5\n\t" - "vsel %%v0,%%v0,%%v3,%%v4\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v1,%%v2,%%v4\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcsb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vstef %%v0,%[amax],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamax],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchsb %%v4,%%v2,%%v0\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "ste %%f0,%[amax]\n\t" - "vlgvg %[iamax],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] 
"a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", - "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", - "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vlef %%v1,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v1,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v1,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v1,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v1,%%v1\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,16\n\t" + "vzero %%v4\n\t" + "vleib %%v9,0,0\n\t" + "vleib %%v9,1,1\n\t" + "vleib %%v9,2,2\n\t" + "vleib %%v9,3,3\n\t" + "vleib %%v9,8,4\n\t" + "vleib %%v9,9,5\n\t" + "vleib %%v9,10,6\n\t" + "vleib %%v9,11,7\n\t" + "vleib %%v9,16,8\n\t" + "vleib %%v9,17,9\n\t" + "vleib %%v9,18,10\n\t" + "vleib %%v9,19,11\n\t" + "vleib %%v9,24,12\n\t" + "vleib %%v9,25,13\n\t" + "vleib %%v9,26,14\n\t" + "vleib %%v9,27,15\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v28,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm %%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v29,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v30,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm %%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v28,144(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm %%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v29,176(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v30,208(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm %%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, 
%%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v0,%%v3\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v2,%%v0\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return iamax; } diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c index d1c0e32a1..09654b742 100644 --- a/kernel/zarch/icamin.c +++ b/kernel/zarch/icamin.c @@ -34,191 +34,191 @@ static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { BLASLONG iamin; __asm__("vlef %%v0,0(%[x]),0\n\t" - "vlef %%v1,4(%[x]),0\n\t" - "vlef %%v0,8(%[x]),1\n\t" - "vlef %%v1,12(%[x]),1\n\t" - "vlef %%v0,16(%[x]),2\n\t" - "vlef %%v1,20(%[x]),2\n\t" - "vlef %%v0,24(%[x]),3\n\t" - "vlef %%v1,28(%[x]),3\n\t" - "vflpsb %%v0,%%v0\n\t" - "vflpsb %%v1,%%v1\n\t" - "vfasb %%v0,%%v0,%%v1\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,2,1\n\t" - "vleig %%v2,1,0\n\t" - "vleig %%v2,3,1\n\t" - "vrepig %%v3,16\n\t" - "vzero %%v4\n\t" - "vleib %%v9,0,0\n\t" - "vleib %%v9,1,1\n\t" - "vleib %%v9,2,2\n\t" - "vleib %%v9,3,3\n\t" - "vleib %%v9,8,4\n\t" - "vleib %%v9,9,5\n\t" - "vleib %%v9,10,6\n\t" - "vleib %%v9,11,7\n\t" - "vleib %%v9,16,8\n\t" - "vleib %%v9,17,9\n\t" - "vleib %%v9,18,10\n\t" - "vleib %%v9,19,11\n\t" - "vleib %%v9,24,12\n\t" - "vleib %%v9,25,13\n\t" - "vleib %%v9,26,14\n\t" - "vleib %%v9,27,15\n\t" - "vleif %%v24,0,0\n\t" - "vleif %%v24,1,1\n\t" - "vleif %%v24,2,2\n\t" - "vleif %%v24,3,3\n\t" - "vleif %%v25,4,0\n\t" - "vleif %%v25,5,1\n\t" - "vleif %%v25,6,2\n\t" - "vleif %%v25,7,3\n\t" - "vleif %%v26,8,0\n\t" - "vleif %%v26,9,1\n\t" - "vleif %%v26,10,2\n\t" - "vleif %%v26,11,3\n\t" - "vleif %%v27,12,0\n\t" - "vleif %%v27,13,1\n\t" - "vleif %%v27,14,2\n\t" - "vleif %%v27,15,3\n\t" - 
"srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v28,16(%%r1,%[x])\n\t" - "vpkg %%v17,%%v16,%%v28\n\t" - "vperm %%v16,%%v16,%%v28,%%v9\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v29,48(%%r1,%[x])\n\t" - "vpkg %%v19,%%v18,%%v29\n\t" - "vperm %%v18,%%v18,%%v29,%%v9\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v30,80(%%r1,%[x])\n\t" - "vpkg %%v21,%%v20,%%v30\n\t" - "vperm %%v20,%%v20,%%v30,%%v9\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v31,112(%%r1,%[x])\n\t" - "vpkg %%v23,%%v22,%%v31\n\t" - "vperm %%v22,%%v22,%%v31,%%v9\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v16,%%v16,%%v17\n\t" - "vfasb %%v17,%%v18,%%v19\n\t" - "vfasb %%v18,%%v20,%%v21\n\t" - "vfasb %%v19,%%v22,%%v23\n\t" - "vfchesb %%v5,%%v17,%%v16\n\t" - "vfchesb %%v6,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vfchesb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v28,144(%%r1,%[x])\n\t" - "vpkg %%v17,%%v16,%%v28\n\t" - "vperm %%v16,%%v16,%%v28,%%v9\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v29,176(%%r1,%[x])\n\t" - "vpkg %%v19,%%v18,%%v29\n\t" - "vperm %%v18,%%v18,%%v29,%%v9\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v30,208(%%r1,%[x])\n\t" - "vpkg %%v21,%%v20,%%v30\n\t" - "vperm %%v20,%%v20,%%v30,%%v9\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vpkg %%v23,%%v22,%%v31\n\t" - "vperm %%v22,%%v22,%%v31,%%v9\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v16,%%v16,%%v17\n\t" - "vfasb %%v17,%%v18,%%v19\n\t" - "vfasb %%v18,%%v20,%%v21\n\t" - "vfasb %%v19,%%v22,%%v23\n\t" - "vfchesb %%v5,%%v17,%%v16\n\t" - "vfchesb %%v6,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vfchesb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v3,%%v0,32\n\t" - "vfchsb %%v4,%%v3,%%v0\n\t" - "vchlg %%v5,%%v2,%%v1\n\t" - "vfcesb %%v6,%%v0,%%v3\n\t" - "vn %%v5,%%v5,%%v6\n\t" - "vo %%v4,%%v4,%%v5\n\t" - "vsel %%v0,%%v0,%%v3,%%v4\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v1,%%v2,%%v4\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcsb %%v2,%%v0\n\t" 
- "jne 1f\n\t" - "vstef %%v0,%[amin],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamin],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchsb %%v4,%%v0,%%v2\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "ste %%f0,%[amin]\n\t" - "vlgvg %[iamin],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", - "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", - "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vlef %%v1,4(%[x]),0\n\t" + "vlef %%v0,8(%[x]),1\n\t" + "vlef %%v1,12(%[x]),1\n\t" + "vlef %%v0,16(%[x]),2\n\t" + "vlef %%v1,20(%[x]),2\n\t" + "vlef %%v0,24(%[x]),3\n\t" + "vlef %%v1,28(%[x]),3\n\t" + "vflpsb %%v0,%%v0\n\t" + "vflpsb %%v1,%%v1\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,16\n\t" + "vzero %%v4\n\t" + "vleib %%v9,0,0\n\t" + "vleib %%v9,1,1\n\t" + "vleib %%v9,2,2\n\t" + "vleib %%v9,3,3\n\t" + "vleib %%v9,8,4\n\t" + "vleib %%v9,9,5\n\t" + "vleib %%v9,10,6\n\t" + "vleib %%v9,11,7\n\t" + "vleib %%v9,16,8\n\t" + "vleib %%v9,17,9\n\t" + "vleib %%v9,18,10\n\t" + "vleib %%v9,19,11\n\t" + "vleib %%v9,24,12\n\t" + "vleib %%v9,25,13\n\t" + "vleib %%v9,26,14\n\t" + "vleib %%v9,27,15\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v28,16(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm %%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v29,48(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v30,80(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm %%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v28,144(%%r1,%[x])\n\t" + "vpkg %%v17,%%v16,%%v28\n\t" + "vperm 
%%v16,%%v16,%%v28,%%v9\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v29,176(%%r1,%[x])\n\t" + "vpkg %%v19,%%v18,%%v29\n\t" + "vperm %%v18,%%v18,%%v29,%%v9\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v30,208(%%r1,%[x])\n\t" + "vpkg %%v21,%%v20,%%v30\n\t" + "vperm %%v20,%%v20,%%v30,%%v9\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vpkg %%v23,%%v22,%%v31\n\t" + "vperm %%v22,%%v22,%%v31,%%v9\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" + "vfasb %%v17,%%v18,%%v19\n\t" + "vfasb %%v18,%%v20,%%v21\n\t" + "vfasb %%v19,%%v22,%%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v3,%%v0\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v0,%%v2\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", + "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", + "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return iamin; } diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index 8434c811f..b292c1d15 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -34,138 +34,138 @@ static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { BLASLONG iamax; __asm__("vl %%v0,0(%[x])\n\t" - "vflpdb %%v0,%%v0\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,1,1\n\t" - "vrepig %%v2,16\n\t" - "vzero %%v3\n\t" - "vleig %%v24,0,0\n\t" - "vleig %%v24,1,1\n\t" - "vleig %%v25,2,0\n\t" - "vleig %%v25,3,1\n\t" - "vleig %%v26,4,0\n\t" - "vleig %%v26,5,1\n\t" - "vleig %%v27,6,0\n\t" - "vleig %%v27,7,1\n\t" - "vleig %%v28,8,0\n\t" - "vleig %%v28,9,1\n\t" - "vleig %%v29,10,0\n\t" - "vleig %%v29,11,1\n\t" - "vleig %%v30,12,0\n\t" - "vleig %%v30,13,1\n\t" - "vleig %%v31,14,0\n\t" - "vleig %%v31,15,1\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl 
%%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchedb %%v4,%%v16,%%v17\n\t" - "vfchedb %%v5,%%v18,%%v19\n\t" - "vfchedb %%v6,%%v20,%%v21\n\t" - "vfchedb %%v7,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v16,%%v17\n\t" - "vfchedb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchedb %%v4,%%v16,%%v17\n\t" - "vfchedb %%v5,%%v18,%%v19\n\t" - "vfchedb %%v6,%%v20,%%v21\n\t" - "vfchedb %%v7,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v16,%%v17\n\t" - "vfchedb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v2,%%v0,1\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcdb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vsteg %%v0,%[amax],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamax],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchdb %%v4,%%v2,%%v0\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "std %%f0,%[amax]\n\t" - "vlgvg %[iamax],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vflpdb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig 
%%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v2,%%v0\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + 
: "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return iamax; } diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index 80a37e6c2..f9a8119e1 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -34,138 +34,138 @@ static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { BLASLONG iamin; __asm__("vl %%v0,0(%[x])\n\t" - "vflpdb %%v0,%%v0\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,1,1\n\t" - "vrepig %%v2,16\n\t" - "vzero %%v3\n\t" - "vleig %%v24,0,0\n\t" - "vleig %%v24,1,1\n\t" - "vleig %%v25,2,0\n\t" - "vleig %%v25,3,1\n\t" - "vleig %%v26,4,0\n\t" - "vleig %%v26,5,1\n\t" - "vleig %%v27,6,0\n\t" - "vleig %%v27,7,1\n\t" - "vleig %%v28,8,0\n\t" - "vleig %%v28,9,1\n\t" - "vleig %%v29,10,0\n\t" - "vleig %%v29,11,1\n\t" - "vleig %%v30,12,0\n\t" - "vleig %%v30,13,1\n\t" - "vleig %%v31,14,0\n\t" - "vleig %%v31,15,1\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchedb %%v4,%%v17,%%v16\n\t" - "vfchedb %%v5,%%v19,%%v18\n\t" - "vfchedb %%v6,%%v21,%%v20\n\t" - "vfchedb %%v7,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v17,%%v16\n\t" - "vfchedb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchedb %%v4,%%v17,%%v16\n\t" - "vfchedb %%v5,%%v19,%%v18\n\t" - "vfchedb %%v6,%%v21,%%v20\n\t" - "vfchedb %%v7,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v17,%%v16\n\t" - "vfchedb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb 
%%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v2,%%v0,1\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcdb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vsteg %%v0,%[amin],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamin],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchdb %%v4,%%v0,%%v2\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "std %%f0,%[amin]\n\t" - "vlgvg %[iamin],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vflpdb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel 
%%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v0,%%v2\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return iamin; } diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c index 18cdba437..8f283bc17 100644 --- a/kernel/zarch/idmax.c +++ b/kernel/zarch/idmax.c @@ -31,121 +31,121 @@ static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { BLASLONG imax; __asm__("vl %%v0,0(%[x])\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,1,1\n\t" - "vrepig %%v2,16\n\t" - "vzero %%v3\n\t" - "vleig %%v24,0,0\n\t" - "vleig %%v24,1,1\n\t" - "vleig %%v25,2,0\n\t" - "vleig %%v25,3,1\n\t" - "vleig %%v26,4,0\n\t" - "vleig %%v26,5,1\n\t" - "vleig %%v27,6,0\n\t" - "vleig %%v27,7,1\n\t" - "vleig %%v28,8,0\n\t" - "vleig %%v28,9,1\n\t" - "vleig %%v29,10,0\n\t" - "vleig %%v29,11,1\n\t" - "vleig %%v30,12,0\n\t" - "vleig %%v30,13,1\n\t" - "vleig %%v31,14,0\n\t" - "vleig %%v31,15,1\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfchedb %%v4,%%v16,%%v17\n\t" - "vfchedb %%v5,%%v18,%%v19\n\t" - "vfchedb %%v6,%%v20,%%v21\n\t" - "vfchedb %%v7,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v16,%%v17\n\t" - "vfchedb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl 
%%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vfchedb %%v4,%%v16,%%v17\n\t" - "vfchedb %%v5,%%v18,%%v19\n\t" - "vfchedb %%v6,%%v20,%%v21\n\t" - "vfchedb %%v7,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v16,%%v17\n\t" - "vfchedb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v2,%%v0,1\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcdb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vsteg %%v0,%[max],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[imax],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchdb %%v4,%%v2,%%v0\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "std %%f0,%[max]\n\t" - "vlgvg %[imax],%%v1,0\n\t" - "2:\n\t" - "nop" - : [imax] "=r"(imax),[max] "=m"(*max),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl 
%%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vfchedb %%v6,%%v20,%%v21\n\t" + "vfchedb %%v7,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v16,%%v17\n\t" + "vfchedb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[max],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v2,%%v0\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[max]\n\t" + "vlgvg %[imax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return imax; } diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c index 02ca427e4..e4b7bb4fe 100644 --- a/kernel/zarch/idmin.c +++ b/kernel/zarch/idmin.c @@ -31,121 +31,121 @@ static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { BLASLONG imin; __asm__("vl %%v0,0(%[x])\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,1,1\n\t" - "vrepig %%v2,16\n\t" - "vzero %%v3\n\t" - "vleig %%v24,0,0\n\t" - "vleig %%v24,1,1\n\t" - "vleig %%v25,2,0\n\t" - "vleig %%v25,3,1\n\t" - "vleig %%v26,4,0\n\t" - "vleig %%v26,5,1\n\t" - "vleig %%v27,6,0\n\t" - "vleig %%v27,7,1\n\t" - "vleig %%v28,8,0\n\t" - "vleig %%v28,9,1\n\t" - "vleig %%v29,10,0\n\t" - "vleig %%v29,11,1\n\t" - "vleig %%v30,12,0\n\t" - "vleig %%v30,13,1\n\t" - "vleig %%v31,14,0\n\t" - "vleig %%v31,15,1\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfchedb %%v4,%%v17,%%v16\n\t" - "vfchedb %%v5,%%v19,%%v18\n\t" - "vfchedb %%v6,%%v21,%%v20\n\t" - "vfchedb %%v7,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v17,%%v16\n\t" - "vfchedb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - 
"vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vfchedb %%v4,%%v17,%%v16\n\t" - "vfchedb %%v5,%%v19,%%v18\n\t" - "vfchedb %%v6,%%v21,%%v20\n\t" - "vfchedb %%v7,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v17,%%v16\n\t" - "vfchedb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v2,%%v0,1\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcdb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vsteg %%v0,%[min],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[imin],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchdb %%v4,%%v0,%%v2\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "std %%f0,%[min]\n\t" - "vlgvg %[imin],%%v1,0\n\t" - "2:\n\t" - "nop" - : [imin] "=r"(imin),[min] "=m"(*min),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,16\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "vleig %%v28,8,0\n\t" + "vleig %%v28,9,1\n\t" + "vleig %%v29,10,0\n\t" + "vleig %%v29,11,1\n\t" + "vleig %%v30,12,0\n\t" + "vleig %%v30,13,1\n\t" + "vleig %%v31,14,0\n\t" + "vleig %%v31,15,1\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag 
%%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vfchedb %%v6,%%v21,%%v20\n\t" + "vfchedb %%v7,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vsel %%v18,%%v20,%%v21,%%v6\n\t" + "vsel %%v6,%%v28,%%v29,%%v6\n\t" + "vsel %%v19,%%v22,%%v23,%%v7\n\t" + "vsel %%v7,%%v30,%%v31,%%v7\n\t" + "vfchedb %%v20,%%v17,%%v16\n\t" + "vfchedb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v4,%%v4,%%v5,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v5,%%v6,%%v7,%%v21\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[min],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v0,%%v2\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[min]\n\t" + "vlgvg %[imin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return imin; } diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c index bbb4012aa..ac86435d7 100644 --- a/kernel/zarch/isamax.c +++ b/kernel/zarch/isamax.c @@ -34,182 +34,182 @@ static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) { BLASLONG iamax; __asm__("vl %%v0,0(%[x])\n\t" - "vflpsb %%v0,%%v0\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,2,1\n\t" - "vleig %%v2,1,0\n\t" - "vleig %%v2,3,1\n\t" - "vrepig %%v3,32\n\t" - "vzero %%v4\n\t" - "vleif %%v24,0,0\n\t" - "vleif %%v24,1,1\n\t" - "vleif %%v24,2,2\n\t" - "vleif %%v24,3,3\n\t" - "vleif %%v25,4,0\n\t" - "vleif %%v25,5,1\n\t" - "vleif %%v25,6,2\n\t" - "vleif %%v25,7,3\n\t" - "vleif %%v26,8,0\n\t" - "vleif %%v26,9,1\n\t" - "vleif %%v26,10,2\n\t" - "vleif %%v26,11,3\n\t" - "vleif %%v27,12,0\n\t" - "vleif %%v27,13,1\n\t" - "vleif %%v27,14,2\n\t" - "vleif %%v27,15,3\n\t" - "vleif %%v28,16,0\n\t" - "vleif %%v28,17,1\n\t" - "vleif %%v28,18,2\n\t" - "vleif %%v28,19,3\n\t" - "vleif %%v29,20,0\n\t" - "vleif %%v29,21,1\n\t" - "vleif %%v29,22,2\n\t" - "vleif %%v29,23,3\n\t" - "vleif %%v30,24,0\n\t" - "vleif %%v30,25,1\n\t" - "vleif %%v30,26,2\n\t" - "vleif %%v30,27,3\n\t" - "vleif %%v31,28,0\n\t" - "vleif %%v31,29,1\n\t" - "vleif %%v31,30,2\n\t" - "vleif %%v31,31,3\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl 
%%v23,112(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfchesb %%v5,%%v16,%%v17\n\t" - "vfchesb %%v6,%%v18,%%v19\n\t" - "vfchesb %%v7,%%v20,%%v21\n\t" - "vfchesb %%v8,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v16,%%v17\n\t" - "vfchesb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfchesb %%v5,%%v16,%%v17\n\t" - "vfchesb %%v6,%%v18,%%v19\n\t" - "vfchesb %%v7,%%v20,%%v21\n\t" - "vfchesb %%v8,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v16,%%v17\n\t" - "vfchesb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v3,%%v0,32\n\t" - "vfchsb %%v4,%%v0,%%v3\n\t" - "vchlg %%v5,%%v2,%%v1\n\t" - "vfcesb %%v6,%%v0,%%v3\n\t" - "vn %%v5,%%v5,%%v6\n\t" - "vo %%v4,%%v4,%%v5\n\t" - "vsel %%v0,%%v0,%%v3,%%v4\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v1,%%v2,%%v4\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcsb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vstef %%v0,%[amax],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamax],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchsb %%v4,%%v2,%%v0\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel 
%%v0,%%v2,%%v0,%%v4\n\t" - "ste %%f0,%[amax]\n\t" - "vlgvg %[iamax],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vflpsb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb 
%%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v0,%%v3\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v2,%%v0\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return iamax; } diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c index e8b34b934..3f2d039eb 100644 --- a/kernel/zarch/isamin.c +++ b/kernel/zarch/isamin.c @@ -34,182 +34,182 @@ static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) { BLASLONG iamin; __asm__("vl %%v0,0(%[x])\n\t" - "vflpsb %%v0,%%v0\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,2,1\n\t" - "vleig %%v2,1,0\n\t" - "vleig %%v2,3,1\n\t" - "vrepig %%v3,32\n\t" - "vzero %%v4\n\t" - "vleif %%v24,0,0\n\t" - "vleif %%v24,1,1\n\t" - "vleif %%v24,2,2\n\t" - "vleif %%v24,3,3\n\t" - "vleif %%v25,4,0\n\t" - "vleif %%v25,5,1\n\t" - "vleif %%v25,6,2\n\t" - "vleif %%v25,7,3\n\t" - "vleif %%v26,8,0\n\t" - "vleif %%v26,9,1\n\t" - "vleif %%v26,10,2\n\t" - "vleif %%v26,11,3\n\t" - "vleif %%v27,12,0\n\t" - "vleif %%v27,13,1\n\t" - "vleif %%v27,14,2\n\t" - "vleif %%v27,15,3\n\t" - "vleif %%v28,16,0\n\t" - "vleif %%v28,17,1\n\t" - "vleif %%v28,18,2\n\t" - "vleif %%v28,19,3\n\t" - "vleif %%v29,20,0\n\t" - "vleif %%v29,21,1\n\t" - "vleif %%v29,22,2\n\t" - "vleif %%v29,23,3\n\t" - "vleif %%v30,24,0\n\t" - "vleif %%v30,25,1\n\t" - "vleif %%v30,26,2\n\t" - "vleif %%v30,27,3\n\t" - "vleif %%v31,28,0\n\t" - "vleif %%v31,29,1\n\t" - "vleif %%v31,30,2\n\t" - "vleif %%v31,31,3\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl 
%%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfchesb %%v5,%%v17,%%v16\n\t" - "vfchesb %%v6,%%v19,%%v18\n\t" - "vfchesb %%v7,%%v21,%%v20\n\t" - "vfchesb %%v8,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v17,%%v16\n\t" - "vfchesb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfchesb %%v5,%%v17,%%v16\n\t" - "vfchesb %%v6,%%v19,%%v18\n\t" - "vfchesb %%v7,%%v21,%%v20\n\t" - "vfchesb %%v8,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v17,%%v16\n\t" - "vfchesb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v3,%%v0,32\n\t" - "vfchsb %%v4,%%v3,%%v0\n\t" - "vchlg %%v5,%%v2,%%v1\n\t" - "vfcesb %%v6,%%v0,%%v3\n\t" - "vn %%v5,%%v5,%%v6\n\t" - "vo %%v4,%%v4,%%v5\n\t" - "vsel %%v0,%%v0,%%v3,%%v4\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v1,%%v2,%%v4\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcsb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vstef %%v0,%[amin],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamin],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchsb %%v4,%%v0,%%v2\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" 
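For review purposes, the index-tracking scheme shared by these i*amin/i*amax kernels is worth spelling out: v0 carries the current per-lane minima of |x|, their 64-bit indices are split across v1 and v2, v24-v31 hold the lane offsets 0 through 31, v3 holds the constant stride 32, and v4 accumulates the running block offset. Each vfchesb/vsel cascade halves the candidate set while carrying the matching index vector along, and the scalar epilogue breaks ties toward the smaller index with vmnlg. A scalar model of the contract this kernel implements (the name isamin_model and its exact signature are illustrative assumptions, not code from the patch):

    #include <stddef.h>
    #include <math.h>

    /* Illustrative scalar model of the vectorized isamin kernel:
     * return the 0-based index of the smallest |x[i]| and store the
     * value. Replacing only on strictly smaller magnitudes keeps the
     * earliest index on ties, matching the vfchesb/vsel cascade. */
    static size_t isamin_model(size_t n, const float *x, float *amin) {
        size_t imin = 0;
        float v = fabsf(x[0]);
        for (size_t i = 1; i < n; i++) {
            float a = fabsf(x[i]);
            if (a < v) {
                v = a;
                imin = i;
            }
        }
        *amin = v;
        return imin;
    }

The sibling kernels differ only in flavor: the amax/amin variants take vflpsb/vflpdb absolute values before comparing, the plain max/min variants compare signed values directly, and the max-flavored kernels swap the vfchesb operand order.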
- "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "ste %%f0,%[amin]\n\t" - "vlgvg %[iamin],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vflpsb %%v0,%%v0\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + 
"vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v3,%%v0\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v0,%%v2\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return iamin; } diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c index a565df503..41172c1bd 100644 --- a/kernel/zarch/ismax.c +++ b/kernel/zarch/ismax.c @@ -31,165 +31,165 @@ static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) { BLASLONG imax; __asm__("vl %%v0,0(%[x])\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,2,1\n\t" - "vleig %%v2,1,0\n\t" - "vleig %%v2,3,1\n\t" - "vrepig %%v3,32\n\t" - "vzero %%v4\n\t" - "vleif %%v24,0,0\n\t" - "vleif %%v24,1,1\n\t" - "vleif %%v24,2,2\n\t" - "vleif %%v24,3,3\n\t" - "vleif %%v25,4,0\n\t" - "vleif %%v25,5,1\n\t" - "vleif %%v25,6,2\n\t" - "vleif %%v25,7,3\n\t" - "vleif %%v26,8,0\n\t" - "vleif %%v26,9,1\n\t" - "vleif %%v26,10,2\n\t" - "vleif %%v26,11,3\n\t" - "vleif %%v27,12,0\n\t" - "vleif %%v27,13,1\n\t" - "vleif %%v27,14,2\n\t" - "vleif %%v27,15,3\n\t" - "vleif %%v28,16,0\n\t" - "vleif %%v28,17,1\n\t" - "vleif %%v28,18,2\n\t" - "vleif %%v28,19,3\n\t" - "vleif %%v29,20,0\n\t" - "vleif %%v29,21,1\n\t" - "vleif %%v29,22,2\n\t" - "vleif %%v29,23,3\n\t" - "vleif %%v30,24,0\n\t" - "vleif %%v30,25,1\n\t" - "vleif %%v30,26,2\n\t" - "vleif %%v30,27,3\n\t" - "vleif %%v31,28,0\n\t" - "vleif %%v31,29,1\n\t" - "vleif %%v31,30,2\n\t" - "vleif %%v31,31,3\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl 
%%v23,112(%%r1,%[x])\n\t" - "vfchesb %%v5,%%v16,%%v17\n\t" - "vfchesb %%v6,%%v18,%%v19\n\t" - "vfchesb %%v7,%%v20,%%v21\n\t" - "vfchesb %%v8,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v16,%%v17\n\t" - "vfchesb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vfchesb %%v5,%%v16,%%v17\n\t" - "vfchesb %%v6,%%v18,%%v19\n\t" - "vfchesb %%v7,%%v20,%%v21\n\t" - "vfchesb %%v8,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v16,%%v17\n\t" - "vfchesb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v3,%%v0,32\n\t" - "vfchsb %%v4,%%v0,%%v3\n\t" - "vchlg %%v5,%%v2,%%v1\n\t" - "vfcesb %%v6,%%v0,%%v3\n\t" - "vn %%v5,%%v5,%%v6\n\t" - "vo %%v4,%%v4,%%v5\n\t" - "vsel %%v0,%%v0,%%v3,%%v4\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v1,%%v2,%%v4\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcsb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vstef %%v0,%[max],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[imax],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchsb %%v4,%%v2,%%v0\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "ste %%f0,%[max]\n\t" - "vlgvg %[imax],%%v1,0\n\t" - "2:\n\t" - "nop" - : [imax] "=r"(imax),[max] "=m"(*max),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + 
"vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v16,%%v17\n\t" + "vfchesb %%v6,%%v18,%%v19\n\t" + "vfchesb %%v7,%%v20,%%v21\n\t" + "vfchesb %%v8,%%v22,%%v23\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v16,%%v17\n\t" + "vfchesb %%v21,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag 
%%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v0,%%v3\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[max],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v2,%%v0\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[max]\n\t" + "vlgvg %[imax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return imax; } diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c index ff72b2c64..e2684df41 100644 --- a/kernel/zarch/ismin.c +++ b/kernel/zarch/ismin.c @@ -31,165 +31,165 @@ static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) { BLASLONG imin; __asm__("vl %%v0,0(%[x])\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,2,1\n\t" - "vleig %%v2,1,0\n\t" - "vleig %%v2,3,1\n\t" - "vrepig %%v3,32\n\t" - "vzero %%v4\n\t" - "vleif %%v24,0,0\n\t" - "vleif %%v24,1,1\n\t" - "vleif %%v24,2,2\n\t" - "vleif %%v24,3,3\n\t" - "vleif %%v25,4,0\n\t" - "vleif %%v25,5,1\n\t" - "vleif %%v25,6,2\n\t" - "vleif %%v25,7,3\n\t" - "vleif %%v26,8,0\n\t" - "vleif %%v26,9,1\n\t" - "vleif %%v26,10,2\n\t" - "vleif %%v26,11,3\n\t" - "vleif %%v27,12,0\n\t" - "vleif %%v27,13,1\n\t" - "vleif %%v27,14,2\n\t" - "vleif %%v27,15,3\n\t" - "vleif %%v28,16,0\n\t" - "vleif %%v28,17,1\n\t" - "vleif %%v28,18,2\n\t" - "vleif %%v28,19,3\n\t" - "vleif %%v29,20,0\n\t" - "vleif %%v29,21,1\n\t" - "vleif %%v29,22,2\n\t" - "vleif %%v29,23,3\n\t" - "vleif %%v30,24,0\n\t" - "vleif %%v30,25,1\n\t" - "vleif %%v30,26,2\n\t" - "vleif %%v30,27,3\n\t" - "vleif %%v31,28,0\n\t" - "vleif %%v31,29,1\n\t" - "vleif %%v31,30,2\n\t" - "vleif %%v31,31,3\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfchesb %%v5,%%v17,%%v16\n\t" - "vfchesb %%v6,%%v19,%%v18\n\t" - "vfchesb %%v7,%%v21,%%v20\n\t" - "vfchesb %%v8,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v17,%%v16\n\t" - "vfchesb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg 
%%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vfchesb %%v5,%%v17,%%v16\n\t" - "vfchesb %%v6,%%v19,%%v18\n\t" - "vfchesb %%v7,%%v21,%%v20\n\t" - "vfchesb %%v8,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v17,%%v16\n\t" - "vfchesb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v3,%%v0,32\n\t" - "vfchsb %%v4,%%v3,%%v0\n\t" - "vchlg %%v5,%%v2,%%v1\n\t" - "vfcesb %%v6,%%v0,%%v3\n\t" - "vn %%v5,%%v5,%%v6\n\t" - "vo %%v4,%%v4,%%v5\n\t" - "vsel %%v0,%%v0,%%v3,%%v4\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v1,%%v2,%%v4\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcsb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vstef %%v0,%[min],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[imin],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchsb %%v4,%%v0,%%v2\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "ste %%f0,%[min]\n\t" - "vlgvg %[imin],%%v1,0\n\t" - "2:\n\t" - "nop" - : [imin] "=r"(imin),[min] "=m"(*min),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vleig %%v1,0,0\n\t" + "vleig %%v1,2,1\n\t" + "vleig %%v2,1,0\n\t" + "vleig %%v2,3,1\n\t" + "vrepig %%v3,32\n\t" + "vzero %%v4\n\t" + "vleif %%v24,0,0\n\t" + "vleif %%v24,1,1\n\t" + "vleif %%v24,2,2\n\t" + "vleif %%v24,3,3\n\t" + "vleif %%v25,4,0\n\t" + "vleif %%v25,5,1\n\t" + "vleif %%v25,6,2\n\t" + "vleif %%v25,7,3\n\t" + "vleif %%v26,8,0\n\t" + "vleif %%v26,9,1\n\t" + "vleif %%v26,10,2\n\t" + "vleif %%v26,11,3\n\t" + "vleif %%v27,12,0\n\t" + "vleif %%v27,13,1\n\t" + "vleif %%v27,14,2\n\t" + "vleif %%v27,15,3\n\t" + "vleif %%v28,16,0\n\t" + "vleif %%v28,17,1\n\t" + "vleif %%v28,18,2\n\t" + "vleif %%v28,19,3\n\t" + "vleif %%v29,20,0\n\t" + "vleif %%v29,21,1\n\t" + "vleif %%v29,22,2\n\t" + "vleif %%v29,23,3\n\t" + "vleif %%v30,24,0\n\t" + "vleif %%v30,25,1\n\t" + "vleif %%v30,26,2\n\t" + "vleif %%v30,27,3\n\t" + "vleif %%v31,28,0\n\t" + "vleif %%v31,29,1\n\t" + "vleif %%v31,30,2\n\t" + "vleif %%v31,31,3\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl 
%%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,192(%%r1,%[x])\n\t" + "vl %%v21,208(%%r1,%[x])\n\t" + "vl %%v22,224(%%r1,%[x])\n\t" + "vl %%v23,240(%%r1,%[x])\n\t" + "vfchesb %%v5,%%v17,%%v16\n\t" + "vfchesb %%v6,%%v19,%%v18\n\t" + "vfchesb %%v7,%%v21,%%v20\n\t" + "vfchesb %%v8,%%v23,%%v22\n\t" + "vsel %%v16,%%v16,%%v17,%%v5\n\t" + "vsel %%v5,%%v24,%%v25,%%v5\n\t" + "vsel %%v17,%%v18,%%v19,%%v6\n\t" + "vsel %%v6,%%v26,%%v27,%%v6\n\t" + "vsel %%v18,%%v20,%%v21,%%v7\n\t" + "vsel %%v7,%%v28,%%v29,%%v7\n\t" + "vsel %%v19,%%v22,%%v23,%%v8\n\t" + "vsel %%v8,%%v30,%%v31,%%v8\n\t" + "vfchesb %%v20,%%v17,%%v16\n\t" + "vfchesb %%v21,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v20\n\t" + "vsel %%v5,%%v5,%%v6,%%v20\n\t" + "vsel %%v17,%%v18,%%v19,%%v21\n\t" + "vsel %%v6,%%v7,%%v8,%%v21\n\t" + "vfchesb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v5,%%v5,%%v6,%%v18\n\t" + "vsegf %%v6,%%v5\n\t" + "vesrlg %%v5,%%v5,32\n\t" + "vag %%v5,%%v5,%%v4\n\t" + "vag %%v6,%%v6,%%v4\n\t" + "vfchesb %%v7,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v7\n\t" + "vsegf %%v8,%%v7\n\t" + "vesrlg %%v7,%%v7,32\n\t" + "vsegf %%v7,%%v7\n\t" + "vsel %%v1,%%v1,%%v5,%%v7\n\t" + "vsel %%v2,%%v2,%%v6,%%v8\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v3,%%v0,32\n\t" + "vfchsb %%v4,%%v3,%%v0\n\t" + "vchlg %%v5,%%v2,%%v1\n\t" + "vfcesb %%v6,%%v0,%%v3\n\t" + "vn %%v5,%%v5,%%v6\n\t" + "vo %%v4,%%v4,%%v5\n\t" + "vsel %%v0,%%v0,%%v3,%%v4\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v1,%%v2,%%v4\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcsb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vstef %%v0,%[min],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[imin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchsb %%v4,%%v0,%%v2\n\t" + "vesrlg %%v4,%%v4,32\n\t" + "vsegf %%v4,%%v4\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "ste %%f0,%[min]\n\t" + "vlgvg %[imin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", + "v17", 
"v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return imin; } diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index 48afb8215..daca1d6f7 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -34,134 +34,134 @@ static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) { BLASLONG iamax; __asm__("vleg %%v0,0(%[x]),0\n\t" - "vleg %%v1,8(%[x]),0\n\t" - "vleg %%v0,16(%[x]),1\n\t" - "vleg %%v1,24(%[x]),1\n\t" - "vflpdb %%v0,%%v0\n\t" - "vflpdb %%v1,%%v1\n\t" - "vfadb %%v0,%%v0,%%v1\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,1,1\n\t" - "vrepig %%v2,8\n\t" - "vzero %%v3\n\t" - "vleig %%v24,0,0\n\t" - "vleig %%v24,1,1\n\t" - "vleig %%v25,2,0\n\t" - "vleig %%v25,3,1\n\t" - "vleig %%v26,4,0\n\t" - "vleig %%v26,5,1\n\t" - "vleig %%v27,6,0\n\t" - "vleig %%v27,7,1\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vleg %%v16,0(%%r1,%[x]),0\n\t" - "vleg %%v17,8(%%r1,%[x]),0\n\t" - "vleg %%v16,16(%%r1,%[x]),1\n\t" - "vleg %%v17,24(%%r1,%[x]),1\n\t" - "vleg %%v18,32(%%r1,%[x]),0\n\t" - "vleg %%v19,40(%%r1,%[x]),0\n\t" - "vleg %%v18,48(%%r1,%[x]),1\n\t" - "vleg %%v19,56(%%r1,%[x]),1\n\t" - "vleg %%v20,64(%%r1,%[x]),0\n\t" - "vleg %%v21,72(%%r1,%[x]),0\n\t" - "vleg %%v20,80(%%r1,%[x]),1\n\t" - "vleg %%v21,88(%%r1,%[x]),1\n\t" - "vleg %%v22,96(%%r1,%[x]),0\n\t" - "vleg %%v23,104(%%r1,%[x]),0\n\t" - "vleg %%v22,112(%%r1,%[x]),1\n\t" - "vleg %%v23,120(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchedb %%v4,%%v16,%%v17\n\t" - "vfchedb %%v5,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vfchedb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "vleg %%v16,128(%%r1,%[x]),0\n\t" - "vleg %%v17,136(%%r1,%[x]),0\n\t" - "vleg %%v16,144(%%r1,%[x]),1\n\t" - "vleg %%v17,152(%%r1,%[x]),1\n\t" - "vleg %%v18,160(%%r1,%[x]),0\n\t" - "vleg %%v19,168(%%r1,%[x]),0\n\t" - "vleg %%v18,176(%%r1,%[x]),1\n\t" - "vleg %%v19,184(%%r1,%[x]),1\n\t" - "vleg %%v20,192(%%r1,%[x]),0\n\t" - "vleg %%v21,200(%%r1,%[x]),0\n\t" - "vleg %%v20,208(%%r1,%[x]),1\n\t" - "vleg %%v21,216(%%r1,%[x]),1\n\t" - "vleg %%v22,224(%%r1,%[x]),0\n\t" - "vleg %%v23,232(%%r1,%[x]),0\n\t" - "vleg %%v22,240(%%r1,%[x]),1\n\t" - "vleg %%v23,248(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchedb %%v4,%%v16,%%v17\n\t" - "vfchedb %%v5,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vfchedb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel 
%%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v2,%%v0,1\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcdb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vsteg %%v0,%[amax],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamax],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchdb %%v4,%%v2,%%v0\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "std %%f0,%[amax]\n\t" - "vlgvg %[iamax],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamax] "=r"(iamax),[amax] "=m"(*amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); + "vleg %%v1,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v1,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v1,%%v1\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,8\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg %%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + 
"vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v16,%%v17\n\t" + "vfchedb %%v5,%%v18,%%v19\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v16,%%v17\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amax],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamax],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v2,%%v0\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amax]\n\t" + "vlgvg %[iamax],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); return iamax; } diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 3edbe3d58..9ababb91f 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -34,134 +34,134 @@ static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) { BLASLONG iamin; __asm__("vleg %%v0,0(%[x]),0\n\t" - "vleg %%v1,8(%[x]),0\n\t" - "vleg %%v0,16(%[x]),1\n\t" - "vleg %%v1,24(%[x]),1\n\t" - "vflpdb %%v0,%%v0\n\t" - "vflpdb %%v1,%%v1\n\t" - "vfadb %%v0,%%v0,%%v1\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,1,1\n\t" - "vrepig %%v2,8\n\t" - "vzero %%v3\n\t" - "vleig %%v24,0,0\n\t" - "vleig %%v24,1,1\n\t" - "vleig %%v25,2,0\n\t" - "vleig %%v25,3,1\n\t" - "vleig %%v26,4,0\n\t" - "vleig %%v26,5,1\n\t" - "vleig %%v27,6,0\n\t" - "vleig %%v27,7,1\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vleg %%v16,0(%%r1,%[x]),0\n\t" - "vleg %%v17,8(%%r1,%[x]),0\n\t" - "vleg %%v16,16(%%r1,%[x]),1\n\t" - "vleg %%v17,24(%%r1,%[x]),1\n\t" - "vleg %%v18,32(%%r1,%[x]),0\n\t" - "vleg %%v19,40(%%r1,%[x]),0\n\t" - "vleg %%v18,48(%%r1,%[x]),1\n\t" - "vleg %%v19,56(%%r1,%[x]),1\n\t" - "vleg %%v20,64(%%r1,%[x]),0\n\t" - "vleg %%v21,72(%%r1,%[x]),0\n\t" - "vleg %%v20,80(%%r1,%[x]),1\n\t" - "vleg %%v21,88(%%r1,%[x]),1\n\t" - "vleg %%v22,96(%%r1,%[x]),0\n\t" - "vleg %%v23,104(%%r1,%[x]),0\n\t" - "vleg %%v22,112(%%r1,%[x]),1\n\t" - "vleg %%v23,120(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchedb %%v4,%%v17,%%v16\n\t" - "vfchedb %%v5,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vfchedb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "vleg %%v16,128(%%r1,%[x]),0\n\t" - "vleg 
%%v17,136(%%r1,%[x]),0\n\t" - "vleg %%v16,144(%%r1,%[x]),1\n\t" - "vleg %%v17,152(%%r1,%[x]),1\n\t" - "vleg %%v18,160(%%r1,%[x]),0\n\t" - "vleg %%v19,168(%%r1,%[x]),0\n\t" - "vleg %%v18,176(%%r1,%[x]),1\n\t" - "vleg %%v19,184(%%r1,%[x]),1\n\t" - "vleg %%v20,192(%%r1,%[x]),0\n\t" - "vleg %%v21,200(%%r1,%[x]),0\n\t" - "vleg %%v20,208(%%r1,%[x]),1\n\t" - "vleg %%v21,216(%%r1,%[x]),1\n\t" - "vleg %%v22,224(%%r1,%[x]),0\n\t" - "vleg %%v23,232(%%r1,%[x]),0\n\t" - "vleg %%v22,240(%%r1,%[x]),1\n\t" - "vleg %%v23,248(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchedb %%v4,%%v17,%%v16\n\t" - "vfchedb %%v5,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vfchedb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v2,%%v0,1\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcdb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vsteg %%v0,%[amin],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamin],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchdb %%v4,%%v0,%%v2\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "std %%f0,%[amin]\n\t" - "vlgvg %[iamin],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamin] "=r"(iamin),[amin] "=m"(*amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); + "vleg %%v1,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v1,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v1,%%v1\n\t" + "vfadb %%v0,%%v0,%%v1\n\t" + "vleig %%v1,0,0\n\t" + "vleig %%v1,1,1\n\t" + "vrepig %%v2,8\n\t" + "vzero %%v3\n\t" + "vleig %%v24,0,0\n\t" + "vleig %%v24,1,1\n\t" + "vleig %%v25,2,0\n\t" + "vleig %%v25,3,1\n\t" + "vleig %%v26,4,0\n\t" + "vleig %%v26,5,1\n\t" + "vleig %%v27,6,0\n\t" + "vleig %%v27,7,1\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vsel 
%%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg %%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchedb %%v4,%%v17,%%v16\n\t" + "vfchedb %%v5,%%v19,%%v18\n\t" + "vsel %%v16,%%v16,%%v17,%%v4\n\t" + "vsel %%v4,%%v24,%%v25,%%v4\n\t" + "vsel %%v17,%%v18,%%v19,%%v5\n\t" + "vsel %%v5,%%v26,%%v27,%%v5\n\t" + "vfchedb %%v18,%%v17,%%v16\n\t" + "vsel %%v16,%%v16,%%v17,%%v18\n\t" + "vsel %%v4,%%v4,%%v5,%%v18\n\t" + "vag %%v4,%%v4,%%v3\n\t" + "vfchedb %%v5,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v5\n\t" + "vsel %%v1,%%v1,%%v4,%%v5\n\t" + "vag %%v3,%%v3,%%v2\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v2,%%v0,1\n\t" + "vrepg %%v3,%%v1,1\n\t" + "wfcdb %%v2,%%v0\n\t" + "jne 1f\n\t" + "vsteg %%v0,%[amin],0\n\t" + "vmnlg %%v0,%%v1,%%v3\n\t" + "vlgvg %[iamin],%%v0,0\n\t" + "j 2f\n\t" + "1:\n\t" + "wfchdb %%v4,%%v0,%%v2\n\t" + "vsel %%v1,%%v3,%%v1,%%v4\n\t" + "vsel %%v0,%%v2,%%v0,%%v4\n\t" + "std %%f0,%[amin]\n\t" + "vlgvg %[iamin],%%v1,0\n\t" + "2:\n\t" + "nop" + : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", + "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); return iamin; } diff --git a/kernel/zarch/samax.c b/kernel/zarch/samax.c index efbc0318c..fdda6dd32 100644 --- a/kernel/zarch/samax.c +++ b/kernel/zarch/samax.c @@ -34,53 +34,53 @@ static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) { FLOAT amax; __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfmaxsb %%v16,%%v16,%%v24,8\n\t" - "vfmaxsb %%v17,%%v17,%%v25,8\n\t" - "vfmaxsb %%v18,%%v18,%%v26,8\n\t" - "vfmaxsb %%v19,%%v19,%%v27,8\n\t" - "vfmaxsb %%v20,%%v20,%%v28,8\n\t" - "vfmaxsb %%v21,%%v21,%%v29,8\n\t" - 
"vfmaxsb %%v22,%%v22,%%v30,8\n\t" - "vfmaxsb %%v23,%%v23,%%v31,8\n\t" - "vfmaxsb %%v16,%%v16,%%v20,8\n\t" - "vfmaxsb %%v17,%%v17,%%v21,8\n\t" - "vfmaxsb %%v18,%%v18,%%v22,8\n\t" - "vfmaxsb %%v19,%%v19,%%v23,8\n\t" - "vfmaxsb %%v16,%%v16,%%v18,8\n\t" - "vfmaxsb %%v17,%%v17,%%v19,8\n\t" - "vfmaxsb %%v16,%%v16,%%v17,8\n\t" - "vfmaxsb %%v0,%%v0,%%v16,8\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v16,%%v0,32\n\t" - "vfmaxsb %%v0,%%v0,%%v16,8\n\t" - "vrepf %%v16,%%v0,2\n\t" - "wfmaxsb %%v0,%%v0,%%v16,8\n\t" - "lper %[amax],%%f0" - : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmaxsb %%v16,%%v16,%%v24,8\n\t" + "vfmaxsb %%v17,%%v17,%%v25,8\n\t" + "vfmaxsb %%v18,%%v18,%%v26,8\n\t" + "vfmaxsb %%v19,%%v19,%%v27,8\n\t" + "vfmaxsb %%v20,%%v20,%%v28,8\n\t" + "vfmaxsb %%v21,%%v21,%%v29,8\n\t" + "vfmaxsb %%v22,%%v22,%%v30,8\n\t" + "vfmaxsb %%v23,%%v23,%%v31,8\n\t" + "vfmaxsb %%v16,%%v16,%%v20,8\n\t" + "vfmaxsb %%v17,%%v17,%%v21,8\n\t" + "vfmaxsb %%v18,%%v18,%%v22,8\n\t" + "vfmaxsb %%v19,%%v19,%%v23,8\n\t" + "vfmaxsb %%v16,%%v16,%%v18,8\n\t" + "vfmaxsb %%v17,%%v17,%%v19,8\n\t" + "vfmaxsb %%v16,%%v16,%%v17,8\n\t" + "vfmaxsb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfmaxsb %%v0,%%v0,%%v16,8\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfmaxsb %%v0,%%v0,%%v16,8\n\t" + "lper %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return amax; } diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c index 138836ce5..f05e851f9 100644 --- a/kernel/zarch/samin.c +++ b/kernel/zarch/samin.c @@ -34,53 +34,53 @@ static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) { FLOAT amin; __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfminsb %%v16,%%v16,%%v24,8\n\t" - "vfminsb %%v17,%%v17,%%v25,8\n\t" - "vfminsb %%v18,%%v18,%%v26,8\n\t" - "vfminsb %%v19,%%v19,%%v27,8\n\t" - "vfminsb %%v20,%%v20,%%v28,8\n\t" - "vfminsb %%v21,%%v21,%%v29,8\n\t" - "vfminsb %%v22,%%v22,%%v30,8\n\t" - "vfminsb %%v23,%%v23,%%v31,8\n\t" - "vfminsb %%v16,%%v16,%%v20,8\n\t" - "vfminsb 
%%v17,%%v17,%%v21,8\n\t" - "vfminsb %%v18,%%v18,%%v22,8\n\t" - "vfminsb %%v19,%%v19,%%v23,8\n\t" - "vfminsb %%v16,%%v16,%%v18,8\n\t" - "vfminsb %%v17,%%v17,%%v19,8\n\t" - "vfminsb %%v16,%%v16,%%v17,8\n\t" - "vfminsb %%v0,%%v0,%%v16,8\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v16,%%v0,32\n\t" - "vfminsb %%v0,%%v0,%%v16,8\n\t" - "vrepf %%v16,%%v0,2\n\t" - "wfminsb %%v0,%%v0,%%v16,8\n\t" - "lper %[amin],%%f0" - : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfminsb %%v16,%%v16,%%v24,8\n\t" + "vfminsb %%v17,%%v17,%%v25,8\n\t" + "vfminsb %%v18,%%v18,%%v26,8\n\t" + "vfminsb %%v19,%%v19,%%v27,8\n\t" + "vfminsb %%v20,%%v20,%%v28,8\n\t" + "vfminsb %%v21,%%v21,%%v29,8\n\t" + "vfminsb %%v22,%%v22,%%v30,8\n\t" + "vfminsb %%v23,%%v23,%%v31,8\n\t" + "vfminsb %%v16,%%v16,%%v20,8\n\t" + "vfminsb %%v17,%%v17,%%v21,8\n\t" + "vfminsb %%v18,%%v18,%%v22,8\n\t" + "vfminsb %%v19,%%v19,%%v23,8\n\t" + "vfminsb %%v16,%%v16,%%v18,8\n\t" + "vfminsb %%v17,%%v17,%%v19,8\n\t" + "vfminsb %%v16,%%v16,%%v17,8\n\t" + "vfminsb %%v0,%%v0,%%v16,8\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfminsb %%v0,%%v0,%%v16,8\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfminsb %%v0,%%v0,%%v16,8\n\t" + "lper %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return amin; } diff --git a/kernel/zarch/sasum.c b/kernel/zarch/sasum.c index 0c3057a92..d56f2697b 100644 --- a/kernel/zarch/sasum.c +++ b/kernel/zarch/sasum.c @@ -34,83 +34,83 @@ static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) { FLOAT asum; __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v24,%%v24,%%v16\n\t" - "vfasb %%v25,%%v25,%%v17\n\t" - "vfasb %%v26,%%v26,%%v18\n\t" - "vfasb %%v27,%%v27,%%v19\n\t" - "vfasb %%v28,%%v28,%%v20\n\t" - "vfasb %%v29,%%v29,%%v21\n\t" - "vfasb %%v30,%%v30,%%v22\n\t" - "vfasb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x])\n\t" - "vl %%v17, 144(%%r1,%[x])\n\t" - "vl %%v18, 
160(%%r1,%[x])\n\t" - "vl %%v19, 176(%%r1,%[x])\n\t" - "vl %%v20, 192(%%r1,%[x])\n\t" - "vl %%v21, 208(%%r1,%[x])\n\t" - "vl %%v22, 224(%%r1,%[x])\n\t" - "vl %%v23, 240(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v24,%%v24,%%v16\n\t" - "vfasb %%v25,%%v25,%%v17\n\t" - "vfasb %%v26,%%v26,%%v18\n\t" - "vfasb %%v27,%%v27,%%v19\n\t" - "vfasb %%v28,%%v28,%%v20\n\t" - "vfasb %%v29,%%v29,%%v21\n\t" - "vfasb %%v30,%%v30,%%v22\n\t" - "vfasb %%v31,%%v31,%%v23\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vfasb %%v24,%%v24,%%v26\n\t" - "vfasb %%v24,%%v24,%%v27\n\t" - "vfasb %%v24,%%v24,%%v28\n\t" - "vfasb %%v24,%%v24,%%v29\n\t" - "vfasb %%v24,%%v24,%%v30\n\t" - "vfasb %%v24,%%v24,%%v31\n\t" - "veslg %%v25,%%v24,32\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vrepf %%v25,%%v24,2\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vstef %%v24,%[asum],0" - : [asum] "=m"(asum),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpsb %%v16, %%v16\n\t" + "vflpsb %%v17, %%v17\n\t" + "vflpsb %%v18, %%v18\n\t" + "vflpsb %%v19, %%v19\n\t" + "vflpsb %%v20, %%v20\n\t" + "vflpsb %%v21, %%v21\n\t" + "vflpsb %%v22, %%v22\n\t" + "vflpsb %%v23, %%v23\n\t" + "vfasb %%v24,%%v24,%%v16\n\t" + "vfasb %%v25,%%v25,%%v17\n\t" + "vfasb %%v26,%%v26,%%v18\n\t" + "vfasb %%v27,%%v27,%%v19\n\t" + "vfasb %%v28,%%v28,%%v20\n\t" + "vfasb %%v29,%%v29,%%v21\n\t" + "vfasb %%v30,%%v30,%%v22\n\t" + "vfasb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vfasb %%v24,%%v24,%%v26\n\t" + "vfasb %%v24,%%v24,%%v27\n\t" + "vfasb %%v24,%%v24,%%v28\n\t" + "vfasb %%v24,%%v24,%%v29\n\t" + "vfasb %%v24,%%v24,%%v30\n\t" + "vfasb %%v24,%%v24,%%v31\n\t" + "veslg %%v25,%%v24,32\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vrepf %%v25,%%v24,2\n\t" + "vfasb %%v24,%%v24,%%v25\n\t" + "vstef %%v24,%[asum],0" + : [asum] "=Q"(asum),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", 
"v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return asum; } diff --git a/kernel/zarch/saxpy.c b/kernel/zarch/saxpy.c index e41e87af0..ca34a47ff 100644 --- a/kernel/zarch/saxpy.c +++ b/kernel/zarch/saxpy.c @@ -29,82 +29,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { __asm__("vlrepf %%v0,%[alpha]\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,0(%%r1,%[y])\n\t" - "vl %%v21,16(%%r1,%[y])\n\t" - "vl %%v22,32(%%r1,%[y])\n\t" - "vl %%v23,48(%%r1,%[y])\n\t" - "vl %%v24,64(%%r1,%[x])\n\t" - "vl %%v25,80(%%r1,%[x])\n\t" - "vl %%v26,96(%%r1,%[x])\n\t" - "vl %%v27,112(%%r1,%[x])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" - "vfmasb %%v16,%%v0,%%v16,%%v20\n\t" - "vfmasb %%v17,%%v0,%%v17,%%v21\n\t" - "vfmasb %%v18,%%v0,%%v18,%%v22\n\t" - "vfmasb %%v19,%%v0,%%v19,%%v23\n\t" - "vfmasb %%v24,%%v0,%%v24,%%v28\n\t" - "vfmasb %%v25,%%v0,%%v25,%%v29\n\t" - "vfmasb %%v26,%%v0,%%v26,%%v30\n\t" - "vfmasb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,0(%%r1,%[y])\n\t" - "vst %%v17,16(%%r1,%[y])\n\t" - "vst %%v18,32(%%r1,%[y])\n\t" - "vst %%v19,48(%%r1,%[y])\n\t" - "vst %%v24,64(%%r1,%[y])\n\t" - "vst %%v25,80(%%r1,%[y])\n\t" - "vst %%v26,96(%%r1,%[y])\n\t" - "vst %%v27,112(%%r1,%[y])\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,128(%%r1,%[y])\n\t" - "vl %%v21,144(%%r1,%[y])\n\t" - "vl %%v22,160(%%r1,%[y])\n\t" - "vl %%v23,176(%%r1,%[y])\n\t" - "vl %%v24,192(%%r1,%[x])\n\t" - "vl %%v25,208(%%r1,%[x])\n\t" - "vl %%v26,224(%%r1,%[x])\n\t" - "vl %%v27,240(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[y])\n\t" - "vl %%v29,208(%%r1,%[y])\n\t" - "vl %%v30,224(%%r1,%[y])\n\t" - "vl %%v31,240(%%r1,%[y])\n\t" - "vfmasb %%v16,%%v0,%%v16,%%v20\n\t" - "vfmasb %%v17,%%v0,%%v17,%%v21\n\t" - "vfmasb %%v18,%%v0,%%v18,%%v22\n\t" - "vfmasb %%v19,%%v0,%%v19,%%v23\n\t" - "vfmasb %%v24,%%v0,%%v24,%%v28\n\t" - "vfmasb %%v25,%%v0,%%v25,%%v29\n\t" - "vfmasb %%v26,%%v0,%%v26,%%v30\n\t" - "vfmasb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,128(%%r1,%[y])\n\t" - "vst %%v17,144(%%r1,%[y])\n\t" - "vst %%v18,160(%%r1,%[y])\n\t" - "vst %%v19,176(%%r1,%[y])\n\t" - "vst %%v24,192(%%r1,%[y])\n\t" - "vst %%v25,208(%%r1,%[y])\n\t" - "vst %%v26,224(%%r1,%[y])\n\t" - "vst %%v27,240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), - [alpha] "m"(*alpha) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,0(%%r1,%[y])\n\t" + "vl %%v21,16(%%r1,%[y])\n\t" + "vl %%v22,32(%%r1,%[y])\n\t" + "vl %%v23,48(%%r1,%[y])\n\t" + "vl %%v24,64(%%r1,%[x])\n\t" + "vl %%v25,80(%%r1,%[x])\n\t" + "vl %%v26,96(%%r1,%[x])\n\t" + "vl %%v27,112(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl 
%%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmasb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmasb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmasb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmasb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmasb %%v24,%%v0,%%v24,%%v28\n\t" + "vfmasb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmasb %%v26,%%v0,%%v26,%%v30\n\t" + "vfmasb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,0(%%r1,%[y])\n\t" + "vst %%v17,16(%%r1,%[y])\n\t" + "vst %%v18,32(%%r1,%[y])\n\t" + "vst %%v19,48(%%r1,%[y])\n\t" + "vst %%v24,64(%%r1,%[y])\n\t" + "vst %%v25,80(%%r1,%[y])\n\t" + "vst %%v26,96(%%r1,%[y])\n\t" + "vst %%v27,112(%%r1,%[y])\n\t" + "vl %%v16,128(%%r1,%[x])\n\t" + "vl %%v17,144(%%r1,%[x])\n\t" + "vl %%v18,160(%%r1,%[x])\n\t" + "vl %%v19,176(%%r1,%[x])\n\t" + "vl %%v20,128(%%r1,%[y])\n\t" + "vl %%v21,144(%%r1,%[y])\n\t" + "vl %%v22,160(%%r1,%[y])\n\t" + "vl %%v23,176(%%r1,%[y])\n\t" + "vl %%v24,192(%%r1,%[x])\n\t" + "vl %%v25,208(%%r1,%[x])\n\t" + "vl %%v26,224(%%r1,%[x])\n\t" + "vl %%v27,240(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[y])\n\t" + "vl %%v29,208(%%r1,%[y])\n\t" + "vl %%v30,224(%%r1,%[y])\n\t" + "vl %%v31,240(%%r1,%[y])\n\t" + "vfmasb %%v16,%%v0,%%v16,%%v20\n\t" + "vfmasb %%v17,%%v0,%%v17,%%v21\n\t" + "vfmasb %%v18,%%v0,%%v18,%%v22\n\t" + "vfmasb %%v19,%%v0,%%v19,%%v23\n\t" + "vfmasb %%v24,%%v0,%%v24,%%v28\n\t" + "vfmasb %%v25,%%v0,%%v25,%%v29\n\t" + "vfmasb %%v26,%%v0,%%v26,%%v30\n\t" + "vfmasb %%v27,%%v0,%%v27,%%v31\n\t" + "vst %%v16,128(%%r1,%[y])\n\t" + "vst %%v17,144(%%r1,%[y])\n\t" + "vst %%v18,160(%%r1,%[y])\n\t" + "vst %%v19,176(%%r1,%[y])\n\t" + "vst %%v24,192(%%r1,%[y])\n\t" + "vst %%v25,208(%%r1,%[y])\n\t" + "vst %%v26,224(%%r1,%[y])\n\t" + "vst %%v27,240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), + [alpha] "Q"(*alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, diff --git a/kernel/zarch/scopy.c b/kernel/zarch/scopy.c index 4e4993737..5c453cfbb 100644 --- a/kernel/zarch/scopy.c +++ b/kernel/zarch/scopy.c @@ -29,16 +29,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
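For reference: the scopy kernel below bypasses the vector registers entirely and moves 256 bytes (64 single-precision elements) per loop iteration with a single mvc. A minimal C sketch of the same loop, assuming OpenBLAS's usual FLOAT/BLASLONG typedefs and a caller that has already truncated n to a multiple of 64 (the _ref name is hypothetical, not part of the patch):

#include <string.h>

/* Hypothetical reference for scopy_kernel_64; assumes FLOAT is float
   (4 bytes) and n is a multiple of 64, as the caller arranges. */
static void scopy_kernel_64_ref(BLASLONG n, FLOAT *x, FLOAT *y) {
    BLASLONG i;
    for (i = 0; i < n; i += 64)
        memcpy(&y[i], &x[i], 256);  /* one mvc: 64 floats = 256 bytes */
}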
static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__("srlg %[n],%[n],6\n\t" - "0:\n\t" - "pfd 1, 1024(%[x])\n\t" - "pfd 2, 1024(%[y])\n\t" - "mvc 0(256,%[y]),0(%[x])\n\t" - "la %[x],256(%[x])\n\t" - "la %[y],256(%[y])\n\t" - "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x) - : "cc"); + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n]; } *) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x) + : "cc"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { diff --git a/kernel/zarch/sdot.c b/kernel/zarch/sdot.c index f659b0c8a..d870b30f0 100644 --- a/kernel/zarch/sdot.c +++ b/kernel/zarch/sdot.c @@ -31,64 +31,64 @@ static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { FLOAT dot; __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "pfd 1,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[y])\n\t" - "vl %%v25,16(%%r1,%[y])\n\t" - "vl %%v26,32(%%r1,%[y])\n\t" - "vl %%v27,48(%%r1,%[y])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "vfmasb %%v1,%%v17,%%v25,%%v1\n\t" - "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" - "vfmasb %%v3,%%v19,%%v27,%%v3\n\t" - "vfmasb %%v4,%%v20,%%v28,%%v4\n\t" - "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" - "vfmasb %%v6,%%v22,%%v30,%%v6\n\t" - "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b\n\t" - "vfasb %%v0,%%v0,%%v1\n\t" - "vfasb %%v0,%%v0,%%v2\n\t" - "vfasb %%v0,%%v0,%%v3\n\t" - "vfasb %%v0,%%v0,%%v4\n\t" - "vfasb %%v0,%%v0,%%v5\n\t" - "vfasb %%v0,%%v0,%%v6\n\t" - "vfasb %%v0,%%v0,%%v7\n\t" - "vrepf %%v1,%%v0,1\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepf %%v3,%%v0,3\n\t" - "aebr %%f0,%%f1\n\t" - "aebr %%f0,%%f2\n\t" - "aebr %%f0,%%f3\n\t" - "ler %[dot],%%f0" - : [dot] "=f"(dot),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x), "m"(*(const FLOAT (*)[n]) y), - [y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "pfd 1,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + 
"vfmasb %%v1,%%v17,%%v25,%%v1\n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" + "vfmasb %%v3,%%v19,%%v27,%%v3\n\t" + "vfmasb %%v4,%%v20,%%v28,%%v4\n\t" + "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" + "vfmasb %%v6,%%v22,%%v30,%%v6\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vfasb %%v0,%%v0,%%v3\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v0,%%v0,%%v5\n\t" + "vfasb %%v0,%%v0,%%v6\n\t" + "vfasb %%v0,%%v0,%%v7\n\t" + "vrepf %%v1,%%v0,1\n\t" + "vrepf %%v2,%%v0,2\n\t" + "vrepf %%v3,%%v0,3\n\t" + "aebr %%f0,%%f1\n\t" + "aebr %%f0,%%f2\n\t" + "aebr %%f0,%%f3\n\t" + "ler %[dot],%%f0" + : [dot] "=f"(dot),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); return dot; } diff --git a/kernel/zarch/sgemv_n_4.c b/kernel/zarch/sgemv_n_4.c index 86ac24993..a1efef373 100644 --- a/kernel/zarch/sgemv_n_4.c +++ b/kernel/zarch/sgemv_n_4.c @@ -31,304 +31,314 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + __asm__("vlrepf %%v0,0(%[x])\n\t" - "vlrepf %%v1,4(%[x])\n\t" - "vlrepf %%v2,8(%[x])\n\t" - "vlrepf %%v3,12(%[x])\n\t" - "vlrepf %%v4,%[alpha]\n\t" - "vfmsb %%v0,%%v0,%%v4\n\t" - "vfmsb %%v1,%%v1,%%v4\n\t" - "vfmsb %%v2,%%v2,%%v4\n\t" - "vfmsb %%v3,%%v3,%%v4\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,0(%%r1,%[ap2])\n\t" - "vl %%v19,0(%%r1,%[ap3])\n\t" - "vl %%v20,16(%%r1,%[ap0])\n\t" - "vl %%v21,16(%%r1,%[ap1])\n\t" - "vl %%v22,16(%%r1,%[ap2])\n\t" - "vl %%v23,16(%%r1,%[ap3])\n\t" - "vl %%v24,32(%%r1,%[ap0])\n\t" - "vl %%v25,32(%%r1,%[ap1])\n\t" - "vl %%v26,32(%%r1,%[ap2])\n\t" - "vl %%v27,32(%%r1,%[ap3])\n\t" - "vl %%v28,48(%%r1,%[ap0])\n\t" - "vl %%v29,48(%%r1,%[ap1])\n\t" - "vl %%v30,48(%%r1,%[ap2])\n\t" - "vl %%v31,48(%%r1,%[ap3])\n\t" - "vl %%v4,0(%%r1,%[y])\n\t" - "vl %%v5,16(%%r1,%[y])\n\t" - "vl %%v6,32(%%r1,%[y])\n\t" - "vl %%v7,48(%%r1,%[y])\n\t" - "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" - "vfmasb %%v5,%%v20,%%v0,%%v5\n\t" - "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" - "vfmasb %%v7,%%v28,%%v0,%%v7\n\t" - "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" - "vfmasb %%v5,%%v21,%%v1,%%v5\n\t" - "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" - "vfmasb %%v7,%%v29,%%v1,%%v7\n\t" - "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" - "vfmasb %%v5,%%v22,%%v2,%%v5\n\t" - "vfmasb %%v6,%%v26,%%v2,%%v6\n\t" - "vfmasb %%v7,%%v30,%%v2,%%v7\n\t" - "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" - "vfmasb %%v5,%%v23,%%v3,%%v5\n\t" - "vfmasb %%v6,%%v27,%%v3,%%v6\n\t" - "vfmasb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,0(%%r1,%[y])\n\t" - "vst %%v5,16(%%r1,%[y])\n\t" - "vst %%v6,32(%%r1,%[y])\n\t" - "vst %%v7,48(%%r1,%[y])\n\t" - "vl %%v16,64(%%r1,%[ap0])\n\t" - "vl %%v17,64(%%r1,%[ap1])\n\t" - "vl %%v18,64(%%r1,%[ap2])\n\t" - "vl %%v19,64(%%r1,%[ap3])\n\t" - "vl %%v20,80(%%r1,%[ap0])\n\t" - "vl 
%%v21,80(%%r1,%[ap1])\n\t" - "vl %%v22,80(%%r1,%[ap2])\n\t" - "vl %%v23,80(%%r1,%[ap3])\n\t" - "vl %%v24,96(%%r1,%[ap0])\n\t" - "vl %%v25,96(%%r1,%[ap1])\n\t" - "vl %%v26,96(%%r1,%[ap2])\n\t" - "vl %%v27,96(%%r1,%[ap3])\n\t" - "vl %%v28,112(%%r1,%[ap0])\n\t" - "vl %%v29,112(%%r1,%[ap1])\n\t" - "vl %%v30,112(%%r1,%[ap2])\n\t" - "vl %%v31,112(%%r1,%[ap3])\n\t" - "vl %%v4,64(%%r1,%[y])\n\t" - "vl %%v5,80(%%r1,%[y])\n\t" - "vl %%v6,96(%%r1,%[y])\n\t" - "vl %%v7,112(%%r1,%[y])\n\t" - "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" - "vfmasb %%v5,%%v20,%%v0,%%v5\n\t" - "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" - "vfmasb %%v7,%%v28,%%v0,%%v7\n\t" - "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" - "vfmasb %%v5,%%v21,%%v1,%%v5\n\t" - "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" - "vfmasb %%v7,%%v29,%%v1,%%v7\n\t" - "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" - "vfmasb %%v5,%%v22,%%v2,%%v5\n\t" - "vfmasb %%v6,%%v26,%%v2,%%v6\n\t" - "vfmasb %%v7,%%v30,%%v2,%%v7\n\t" - "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" - "vfmasb %%v5,%%v23,%%v3,%%v5\n\t" - "vfmasb %%v6,%%v27,%%v3,%%v6\n\t" - "vfmasb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,64(%%r1,%[y])\n\t" - "vst %%v5,80(%%r1,%[y])\n\t" - "vst %%v6,96(%%r1,%[y])\n\t" - "vst %%v7,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,0(%%r1,%[ap2])\n\t" - "vl %%v19,0(%%r1,%[ap3])\n\t" - "vl %%v4,0(%%r1,%[y])\n\t" - "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" - "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" - "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" - "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" - "vst %%v4,0(%%r1,%[y])\n\t" - "agfi %%r1,16\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(FLOAT (*)[n]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), - "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), - "m"(*(const FLOAT (*)[4]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); + "vlrepf %%v1,4(%[x])\n\t" + "vlrepf %%v2,8(%[x])\n\t" + "vlrepf %%v3,12(%[x])\n\t" + "vlrepf %%v4,%[alpha]\n\t" + "vfmsb %%v0,%%v0,%%v4\n\t" + "vfmsb %%v1,%%v1,%%v4\n\t" + "vfmsb %%v2,%%v2,%%v4\n\t" + "vfmsb %%v3,%%v3,%%v4\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v20,16(%%r1,%[ap0])\n\t" + "vl %%v21,16(%%r1,%[ap1])\n\t" + "vl %%v22,16(%%r1,%[ap2])\n\t" + "vl %%v23,16(%%r1,%[ap3])\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vl %%v5,16(%%r1,%[y])\n\t" + "vl %%v6,32(%%r1,%[y])\n\t" + "vl %%v7,48(%%r1,%[y])\n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmasb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmasb 
%%v7,%%v28,%%v0,%%v7\n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmasb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmasb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmasb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmasb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmasb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmasb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmasb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmasb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "vst %%v5,16(%%r1,%[y])\n\t" + "vst %%v6,32(%%r1,%[y])\n\t" + "vst %%v7,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[ap0])\n\t" + "vl %%v17,64(%%r1,%[ap1])\n\t" + "vl %%v18,64(%%r1,%[ap2])\n\t" + "vl %%v19,64(%%r1,%[ap3])\n\t" + "vl %%v20,80(%%r1,%[ap0])\n\t" + "vl %%v21,80(%%r1,%[ap1])\n\t" + "vl %%v22,80(%%r1,%[ap2])\n\t" + "vl %%v23,80(%%r1,%[ap3])\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vl %%v4,64(%%r1,%[y])\n\t" + "vl %%v5,80(%%r1,%[y])\n\t" + "vl %%v6,96(%%r1,%[y])\n\t" + "vl %%v7,112(%%r1,%[y])\n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmasb %%v5,%%v20,%%v0,%%v5\n\t" + "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmasb %%v7,%%v28,%%v0,%%v7\n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmasb %%v5,%%v21,%%v1,%%v5\n\t" + "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmasb %%v7,%%v29,%%v1,%%v7\n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmasb %%v5,%%v22,%%v2,%%v5\n\t" + "vfmasb %%v6,%%v26,%%v2,%%v6\n\t" + "vfmasb %%v7,%%v30,%%v2,%%v7\n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" + "vfmasb %%v5,%%v23,%%v3,%%v5\n\t" + "vfmasb %%v6,%%v27,%%v3,%%v6\n\t" + "vfmasb %%v7,%%v31,%%v3,%%v7\n\t" + "vst %%v4,64(%%r1,%[y])\n\t" + "vst %%v5,80(%%r1,%[y])\n\t" + "vst %%v6,96(%%r1,%[y])\n\t" + "vst %%v7,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,0(%%r1,%[ap2])\n\t" + "vl %%v19,0(%%r1,%[ap3])\n\t" + "vl %%v4,0(%%r1,%[y])\n\t" + "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" + "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" + "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" + "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" + "vst %%v4,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + __asm__("vlrepf %%v0,0(%[x])\n\t" - "vlrepf %%v1,4(%[x])\n\t" - "vlrepf %%v2,%[alpha]\n\t" - "vfmsb %%v0,%%v0,%%v2\n\t" - "vfmsb %%v1,%%v1,%%v2\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 
1,1024(%%r1,%[ap1])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,16(%%r1,%[ap0])\n\t" - "vl %%v19,16(%%r1,%[ap1])\n\t" - "vl %%v20,32(%%r1,%[ap0])\n\t" - "vl %%v21,32(%%r1,%[ap1])\n\t" - "vl %%v22,48(%%r1,%[ap0])\n\t" - "vl %%v23,48(%%r1,%[ap1])\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" - "vl %%v26,80(%%r1,%[ap0])\n\t" - "vl %%v27,80(%%r1,%[ap1])\n\t" - "vl %%v28,96(%%r1,%[ap0])\n\t" - "vl %%v29,96(%%r1,%[ap1])\n\t" - "vl %%v30,112(%%r1,%[ap0])\n\t" - "vl %%v31,112(%%r1,%[ap1])\n\t" - "vl %%v2,0(%%r1,%[y])\n\t" - "vl %%v3,16(%%r1,%[y])\n\t" - "vl %%v4,32(%%r1,%[y])\n\t" - "vl %%v5,48(%%r1,%[y])\n\t" - "vl %%v6,64(%%r1,%[y])\n\t" - "vl %%v7,80(%%r1,%[y])\n\t" - "vl %%v8,96(%%r1,%[y])\n\t" - "vl %%v9,112(%%r1,%[y])\n\t" - "vfmasb %%v2,%%v16,%%v0,%%v2\n\t" - "vfmasb %%v3,%%v18,%%v0,%%v3\n\t" - "vfmasb %%v4,%%v20,%%v0,%%v4\n\t" - "vfmasb %%v5,%%v22,%%v0,%%v5\n\t" - "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" - "vfmasb %%v7,%%v26,%%v0,%%v7\n\t" - "vfmasb %%v8,%%v28,%%v0,%%v8\n\t" - "vfmasb %%v9,%%v30,%%v0,%%v9\n\t" - "vfmasb %%v2,%%v17,%%v1,%%v2\n\t" - "vfmasb %%v3,%%v19,%%v1,%%v3\n\t" - "vfmasb %%v4,%%v21,%%v1,%%v4\n\t" - "vfmasb %%v5,%%v23,%%v1,%%v5\n\t" - "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" - "vfmasb %%v7,%%v27,%%v1,%%v7\n\t" - "vfmasb %%v8,%%v29,%%v1,%%v8\n\t" - "vfmasb %%v9,%%v31,%%v1,%%v9\n\t" - "vst %%v2,0(%%r1,%[y])\n\t" - "vst %%v3,16(%%r1,%[y])\n\t" - "vst %%v4,32(%%r1,%[y])\n\t" - "vst %%v5,48(%%r1,%[y])\n\t" - "vst %%v6,64(%%r1,%[y])\n\t" - "vst %%v7,80(%%r1,%[y])\n\t" - "vst %%v8,96(%%r1,%[y])\n\t" - "vst %%v9,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v2,0(%%r1,%[y])\n\t" - "vfmasb %%v2,%%v16,%%v0,%%v2\n\t" - "vfmasb %%v2,%%v17,%%v1,%%v2\n\t" - "vst %%v2,0(%%r1,%[y])\n\t" - "agfi %%r1,16\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(FLOAT (*)[n]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[2]) x),[x] "a"(x),[alpha] "m"(*alpha),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vlrepf %%v1,4(%[x])\n\t" + "vlrepf %%v2,%[alpha]\n\t" + "vfmsb %%v0,%%v0,%%v2\n\t" + "vfmsb %%v1,%%v1,%%v2\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v18,16(%%r1,%[ap0])\n\t" + "vl %%v19,16(%%r1,%[ap1])\n\t" + "vl %%v20,32(%%r1,%[ap0])\n\t" + "vl %%v21,32(%%r1,%[ap1])\n\t" + "vl %%v22,48(%%r1,%[ap0])\n\t" + "vl %%v23,48(%%r1,%[ap1])\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vl %%v30,112(%%r1,%[ap0])\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vl %%v3,16(%%r1,%[y])\n\t" + "vl %%v4,32(%%r1,%[y])\n\t" + "vl %%v5,48(%%r1,%[y])\n\t" + "vl %%v6,64(%%r1,%[y])\n\t" + "vl %%v7,80(%%r1,%[y])\n\t" + 
"vl %%v8,96(%%r1,%[y])\n\t" + "vl %%v9,112(%%r1,%[y])\n\t" + "vfmasb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmasb %%v3,%%v18,%%v0,%%v3\n\t" + "vfmasb %%v4,%%v20,%%v0,%%v4\n\t" + "vfmasb %%v5,%%v22,%%v0,%%v5\n\t" + "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" + "vfmasb %%v7,%%v26,%%v0,%%v7\n\t" + "vfmasb %%v8,%%v28,%%v0,%%v8\n\t" + "vfmasb %%v9,%%v30,%%v0,%%v9\n\t" + "vfmasb %%v2,%%v17,%%v1,%%v2\n\t" + "vfmasb %%v3,%%v19,%%v1,%%v3\n\t" + "vfmasb %%v4,%%v21,%%v1,%%v4\n\t" + "vfmasb %%v5,%%v23,%%v1,%%v5\n\t" + "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" + "vfmasb %%v7,%%v27,%%v1,%%v7\n\t" + "vfmasb %%v8,%%v29,%%v1,%%v8\n\t" + "vfmasb %%v9,%%v31,%%v1,%%v9\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "vst %%v3,16(%%r1,%[y])\n\t" + "vst %%v4,32(%%r1,%[y])\n\t" + "vst %%v5,48(%%r1,%[y])\n\t" + "vst %%v6,64(%%r1,%[y])\n\t" + "vst %%v7,80(%%r1,%[y])\n\t" + "vst %%v8,96(%%r1,%[y])\n\t" + "vst %%v9,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[ap0])\n\t" + "vl %%v17,0(%%r1,%[ap1])\n\t" + "vl %%v2,0(%%r1,%[y])\n\t" + "vfmasb %%v2,%%v16,%%v0,%%v2\n\t" + "vfmasb %%v2,%%v17,%%v1,%%v2\n\t" + "vst %%v2,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, FLOAT *alpha) { __asm__("vlrepf %%v0,0(%[x])\n\t" - "vlrepf %%v16,%[alpha]\n\t" - "vfmsb %%v0,%%v0,%%v16\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[a0])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[a0])\n\t" - "vl %%v17,16(%%r1,%[a0])\n\t" - "vl %%v18,32(%%r1,%[a0])\n\t" - "vl %%v19,48(%%r1,%[a0])\n\t" - "vl %%v20,64(%%r1,%[a0])\n\t" - "vl %%v21,80(%%r1,%[a0])\n\t" - "vl %%v22,96(%%r1,%[a0])\n\t" - "vl %%v23,112(%%r1,%[a0])\n\t" - "vl %%v24,0(%%r1,%[y])\n\t" - "vl %%v25,16(%%r1,%[y])\n\t" - "vl %%v26,32(%%r1,%[y])\n\t" - "vl %%v27,48(%%r1,%[y])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" - "vfmasb %%v25,%%v17,%%v0,%%v25\n\t" - "vfmasb %%v26,%%v18,%%v0,%%v26\n\t" - "vfmasb %%v27,%%v19,%%v0,%%v27\n\t" - "vfmasb %%v28,%%v20,%%v0,%%v28\n\t" - "vfmasb %%v29,%%v21,%%v0,%%v29\n\t" - "vfmasb %%v30,%%v22,%%v0,%%v30\n\t" - "vfmasb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v24,0(%%r1,%[y])\n\t" - "vst %%v25,16(%%r1,%[y])\n\t" - "vst %%v26,32(%%r1,%[y])\n\t" - "vst %%v27,48(%%r1,%[y])\n\t" - "vst %%v28,64(%%r1,%[y])\n\t" - "vst %%v29,80(%%r1,%[y])\n\t" - "vst %%v30,96(%%r1,%[y])\n\t" - "vst %%v31,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[a0])\n\t" - "vl %%v17,0(%%r1,%[y])\n\t" - "vfmasb %%v17,%%v16,%%v0,%%v17\n\t" - "vst %%v17,0(%%r1,%[y])\n\t" - "agfi %%r1,16\n\t" - "brctg 
%%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(FLOAT (*)[n]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), - "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "m"(*alpha), - [n] "r"(n) - : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vlrepf %%v16,%[alpha]\n\t" + "vfmsb %%v0,%%v0,%%v16\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,16(%%r1,%[a0])\n\t" + "vl %%v18,32(%%r1,%[a0])\n\t" + "vl %%v19,48(%%r1,%[a0])\n\t" + "vl %%v20,64(%%r1,%[a0])\n\t" + "vl %%v21,80(%%r1,%[a0])\n\t" + "vl %%v22,96(%%r1,%[a0])\n\t" + "vl %%v23,112(%%r1,%[a0])\n\t" + "vl %%v24,0(%%r1,%[y])\n\t" + "vl %%v25,16(%%r1,%[y])\n\t" + "vl %%v26,32(%%r1,%[y])\n\t" + "vl %%v27,48(%%r1,%[y])\n\t" + "vl %%v28,64(%%r1,%[y])\n\t" + "vl %%v29,80(%%r1,%[y])\n\t" + "vl %%v30,96(%%r1,%[y])\n\t" + "vl %%v31,112(%%r1,%[y])\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmasb %%v25,%%v17,%%v0,%%v25\n\t" + "vfmasb %%v26,%%v18,%%v0,%%v26\n\t" + "vfmasb %%v27,%%v19,%%v0,%%v27\n\t" + "vfmasb %%v28,%%v20,%%v0,%%v28\n\t" + "vfmasb %%v29,%%v21,%%v0,%%v29\n\t" + "vfmasb %%v30,%%v22,%%v0,%%v30\n\t" + "vfmasb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v24,0(%%r1,%[y])\n\t" + "vst %%v25,16(%%r1,%[y])\n\t" + "vst %%v26,32(%%r1,%[y])\n\t" + "vst %%v27,48(%%r1,%[y])\n\t" + "vst %%v28,64(%%r1,%[y])\n\t" + "vst %%v29,80(%%r1,%[y])\n\t" + "vst %%v30,96(%%r1,%[y])\n\t" + "vst %%v31,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[a0])\n\t" + "vl %%v17,0(%%r1,%[y])\n\t" + "vfmasb %%v17,%%v16,%%v0,%%v17\n\t" + "vst %%v17,0(%%r1,%[y])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), + "m"(*(const struct { FLOAT x[1]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), + [n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c index 6ae9b6d7f..81d7c9fe7 100644 --- a/kernel/zarch/sgemv_t_4.c +++ b/kernel/zarch/sgemv_t_4.c @@ -30,330 +30,338 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
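For reference: the transposed kernels in this file compute one dot product per column pointer; sgemv_kernel_4x4 below spreads the four sums over eight vector accumulators (%%v0-%%v7) and reduces them horizontally after label 3. A minimal scalar sketch of the same computation, assuming n is a multiple of 4 as the caller arranges (the _ref name is hypothetical, not part of the patch):

/* Hypothetical scalar reference for sgemv_kernel_4x4 (transposed case);
   uses OpenBLAS's FLOAT/BLASLONG typedefs. The results are stored to
   y[0..3], not accumulated, matching the final ste instructions. */
static void sgemv_t_4x4_ref(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) {
    FLOAT s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
    BLASLONG i;
    for (i = 0; i < n; i++) {
        s0 += x[i] * ap[0][i];
        s1 += x[i] * ap[1][i];
        s2 += x[i] * ap[2][i];
        s3 += x[i] * ap[3][i];
    }
    y[0] = s0;
    y[1] = s1;
    y[2] = s2;
    y[3] = s3;
}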
#define NBMAX 2048 static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2])\n\t" - "vfmasb %%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3])\n\t" - "vfmasb %%v3,%%v16,%%v27,%%v3\n\t" - "vl %%v28,16(%%r1,%[ap0])\n\t" - "vfmasb %%v4,%%v17,%%v28,%%v4\n\t" - "vl %%v29,16(%%r1,%[ap1])\n\t" - "vfmasb %%v5,%%v17,%%v29,%%v5\n\t" - "vl %%v30,16(%%r1,%[ap2])\n\t" - "vfmasb %%v6,%%v17,%%v30,%%v6\n\t" - "vl %%v31,16(%%r1,%[ap3])\n\t" - "vfmasb %%v7,%%v17,%%v31,%%v7\n\t" - "vl %%v24,32(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v18,%%v24,%%v0\n\t" - "vl %%v25,32(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v18,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[ap2])\n\t" - "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,32(%%r1,%[ap3])\n\t" - "vfmasb %%v3,%%v18,%%v27,%%v3\n\t" - "vl %%v28,48(%%r1,%[ap0])\n\t" - "vfmasb %%v4,%%v19,%%v28,%%v4\n\t" - "vl %%v29,48(%%r1,%[ap1])\n\t" - "vfmasb %%v5,%%v19,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap2])\n\t" - "vfmasb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap3])\n\t" - "vfmasb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v20,%%v25,%%v1\n\t" - "vl %%v26,64(%%r1,%[ap2])\n\t" - "vfmasb %%v2,%%v20,%%v26,%%v2\n\t" - "vl %%v27,64(%%r1,%[ap3])\n\t" - "vfmasb %%v3,%%v20,%%v27,%%v3\n\t" - "vl %%v28,80(%%r1,%[ap0])\n\t" - "vfmasb %%v4,%%v21,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[ap1])\n\t" - "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,80(%%r1,%[ap2])\n\t" - "vfmasb %%v6,%%v21,%%v30,%%v6\n\t" - "vl %%v31,80(%%r1,%[ap3])\n\t" - "vfmasb %%v7,%%v21,%%v31,%%v7\n\t" - "vl %%v24,96(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v22,%%v24,%%v0\n\t" - "vl %%v25,96(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v22,%%v25,%%v1\n\t" - "vl %%v26,96(%%r1,%[ap2])\n\t" - "vfmasb %%v2,%%v22,%%v26,%%v2\n\t" - "vl %%v27,96(%%r1,%[ap3])\n\t" - "vfmasb %%v3,%%v22,%%v27,%%v3\n\t" - "vl %%v28,112(%%r1,%[ap0])\n\t" - "vfmasb %%v4,%%v23,%%v28,%%v4\n\t" - "vl %%v29,112(%%r1,%[ap1])\n\t" - "vfmasb %%v5,%%v23,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap2])\n\t" - "vfmasb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap3])\n\t" - "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2])\n\t" - "vfmasb 
%%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3])\n\t" - "vfmasb %%v3,%%v16,%%v27,%%v3\n\t" - "agfi %%r1,16\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "vfasb %%v0,%%v0,%%v4\n\t" - "vfasb %%v1,%%v1,%%v5\n\t" - "vfasb %%v2,%%v2,%%v6\n\t" - "vfasb %%v3,%%v3,%%v7\n\t" - "veslg %%v4,%%v0,32\n\t" - "vfasb %%v0,%%v0,%%v4\n\t" - "vrepg %%v4,%%v0,1\n\t" - "aebr %%f0,%%f4\n\t" - "ste %%f0,0(%[y])\n\t" - "veslg %%v4,%%v1,32\n\t" - "vfasb %%v1,%%v1,%%v4\n\t" - "vrepg %%v4,%%v1,1\n\t" - "aebr %%f1,%%f4\n\t" - "ste %%f1,4(%[y])\n\t" - "veslg %%v4,%%v2,32\n\t" - "vfasb %%v2,%%v2,%%v4\n\t" - "vrepg %%v4,%%v2,1\n\t" - "aebr %%f2,%%f4\n\t" - "ste %%f2,8(%[y])\n\t" - "veslg %%v4,%%v3,32\n\t" - "vfasb %%v3,%%v3,%%v4\n\t" - "vrepg %%v4,%%v3,1\n\t" - "aebr %%f3,%%f4\n\t" - "ste %%f3,12(%[y])" - : "=m"(*(FLOAT (*)[4]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n]) ap[2]),[ap2] "a"(ap[2]), - "m"(*(const FLOAT (*)[n]) ap[3]),[ap3] "a"(ap[3]), - "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3\n\t" + "vl %%v28,16(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v17,%%v28,%%v4\n\t" + "vl %%v29,16(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v17,%%v29,%%v5\n\t" + "vl %%v30,16(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v17,%%v30,%%v6\n\t" + "vl %%v31,16(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v17,%%v31,%%v7\n\t" + "vl %%v24,32(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v18,%%v24,%%v0\n\t" + "vl %%v25,32(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v18,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,32(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v18,%%v27,%%v3\n\t" + "vl %%v28,48(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v19,%%v28,%%v4\n\t" + "vl %%v29,48(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v19,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,64(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v20,%%v26,%%v2\n\t" + "vl %%v27,64(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v20,%%v27,%%v3\n\t" + "vl %%v28,80(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v21,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,80(%%r1,%[ap2])\n\t" + "vfmasb 
%%v6,%%v21,%%v30,%%v6\n\t" + "vl %%v31,80(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v21,%%v31,%%v7\n\t" + "vl %%v24,96(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v22,%%v24,%%v0\n\t" + "vl %%v25,96(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v22,%%v25,%%v1\n\t" + "vl %%v26,96(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v22,%%v26,%%v2\n\t" + "vl %%v27,96(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v22,%%v27,%%v3\n\t" + "vl %%v28,112(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v23,%%v28,%%v4\n\t" + "vl %%v29,112(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v23,%%v29,%%v5\n\t" + "vl %%v30,112(%%r1,%[ap2])\n\t" + "vfmasb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap3])\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,0(%%r1,%[ap2])\n\t" + "vfmasb %%v2,%%v16,%%v26,%%v2\n\t" + "vl %%v27,0(%%r1,%[ap3])\n\t" + "vfmasb %%v3,%%v16,%%v27,%%v3\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v1,%%v1,%%v5\n\t" + "vfasb %%v2,%%v2,%%v6\n\t" + "vfasb %%v3,%%v3,%%v7\n\t" + "veslg %%v4,%%v0,32\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vrepg %%v4,%%v0,1\n\t" + "aebr %%f0,%%f4\n\t" + "ste %%f0,0(%[y])\n\t" + "veslg %%v4,%%v1,32\n\t" + "vfasb %%v1,%%v1,%%v4\n\t" + "vrepg %%v4,%%v1,1\n\t" + "aebr %%f1,%%f4\n\t" + "ste %%f1,4(%[y])\n\t" + "veslg %%v4,%%v2,32\n\t" + "vfasb %%v2,%%v2,%%v4\n\t" + "vrepg %%v4,%%v2,1\n\t" + "aebr %%f2,%%f4\n\t" + "ste %%f2,8(%[y])\n\t" + "veslg %%v4,%%v3,32\n\t" + "vfasb %%v3,%%v3,%%v4\n\t" + "vrepg %%v4,%%v3,1\n\t" + "aebr %%f3,%%f4\n\t" + "ste %%f3,12(%[y])" + : "=m"(*(struct { FLOAT x[4]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,16(%%r1,%[ap0])\n\t" - "vfmasb %%v2,%%v17,%%v26,%%v2\n\t" - "vl %%v27,16(%%r1,%[ap1])\n\t" - "vfmasb %%v3,%%v17,%%v27,%%v3\n\t" - "vl %%v28,32(%%r1,%[ap0])\n\t" - "vfmasb %%v4,%%v18,%%v28,%%v4\n\t" - "vl %%v29,32(%%r1,%[ap1])\n\t" - "vfmasb 
%%v5,%%v18,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap0])\n\t" - "vfmasb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap1])\n\t" - "vfmasb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v20,%%v25,%%v1\n\t" - "vl %%v26,80(%%r1,%[ap0])\n\t" - "vfmasb %%v2,%%v21,%%v26,%%v2\n\t" - "vl %%v27,80(%%r1,%[ap1])\n\t" - "vfmasb %%v3,%%v21,%%v27,%%v3\n\t" - "vl %%v28,96(%%r1,%[ap0])\n\t" - "vfmasb %%v4,%%v22,%%v28,%%v4\n\t" - "vl %%v29,96(%%r1,%[ap1])\n\t" - "vfmasb %%v5,%%v22,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap0])\n\t" - "vfmasb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap1])\n\t" - "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" - "agfi %%r1,16\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "vfasb %%v0,%%v0,%%v2\n\t" - "vfasb %%v0,%%v0,%%v4\n\t" - "vfasb %%v0,%%v0,%%v6\n\t" - "vfasb %%v1,%%v1,%%v3\n\t" - "vfasb %%v1,%%v1,%%v5\n\t" - "vfasb %%v1,%%v1,%%v7\n\t" - "veslg %%v2,%%v0,32\n\t" - "vfasb %%v0,%%v0,%%v2\n\t" - "vrepg %%v2,%%v0,1\n\t" - "aebr %%f0,%%f2\n\t" - "ste %%f0,0(%[y])\n\t" - "veslg %%v2,%%v1,32\n\t" - "vfasb %%v1,%%v1,%%v2\n\t" - "vrepg %%v2,%%v1,1\n\t" - "aebr %%f1,%%f2\n\t" - "ste %%f1,4(%[y])" - : "=m"(*(FLOAT (*)[2]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "vl %%v26,16(%%r1,%[ap0])\n\t" + "vfmasb %%v2,%%v17,%%v26,%%v2\n\t" + "vl %%v27,16(%%r1,%[ap1])\n\t" + "vfmasb %%v3,%%v17,%%v27,%%v3\n\t" + "vl %%v28,32(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v18,%%v28,%%v4\n\t" + "vl %%v29,32(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v18,%%v29,%%v5\n\t" + "vl %%v30,48(%%r1,%[ap0])\n\t" + "vfmasb %%v6,%%v19,%%v30,%%v6\n\t" + "vl %%v31,48(%%r1,%[ap1])\n\t" + "vfmasb %%v7,%%v19,%%v31,%%v7\n\t" + "vl %%v24,64(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v20,%%v24,%%v0\n\t" + "vl %%v25,64(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v20,%%v25,%%v1\n\t" + "vl %%v26,80(%%r1,%[ap0])\n\t" + "vfmasb %%v2,%%v21,%%v26,%%v2\n\t" + "vl %%v27,80(%%r1,%[ap1])\n\t" + "vfmasb %%v3,%%v21,%%v27,%%v3\n\t" + "vl %%v28,96(%%r1,%[ap0])\n\t" + "vfmasb %%v4,%%v22,%%v28,%%v4\n\t" + "vl %%v29,96(%%r1,%[ap1])\n\t" + "vfmasb %%v5,%%v22,%%v29,%%v5\n\t" + "vl 
%%v30,112(%%r1,%[ap0])\n\t" + "vfmasb %%v6,%%v23,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[ap1])\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[ap0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,0(%%r1,%[ap1])\n\t" + "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v0,%%v0,%%v6\n\t" + "vfasb %%v1,%%v1,%%v3\n\t" + "vfasb %%v1,%%v1,%%v5\n\t" + "vfasb %%v1,%%v1,%%v7\n\t" + "veslg %%v2,%%v0,32\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vrepg %%v2,%%v0,1\n\t" + "aebr %%f0,%%f2\n\t" + "ste %%f0,0(%[y])\n\t" + "veslg %%v2,%%v1,32\n\t" + "vfasb %%v1,%%v1,%%v2\n\t" + "vrepg %%v2,%%v1,1\n\t" + "aebr %%f1,%%f2\n\t" + "ste %%f1,4(%[y])" + : "=m"(*(struct { FLOAT x[2]; } *) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[a0])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[a0])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,16(%%r1,%[a0])\n\t" - "vfmasb %%v1,%%v17,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[a0])\n\t" - "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,48(%%r1,%[a0])\n\t" - "vfmasb %%v3,%%v19,%%v27,%%v3\n\t" - "vl %%v28,64(%%r1,%[a0])\n\t" - "vfmasb %%v4,%%v20,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[a0])\n\t" - "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,96(%%r1,%[a0])\n\t" - "vfmasb %%v6,%%v22,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[a0])\n\t" - "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[a0])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "agfi %%r1,16\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "vfasb %%v0,%%v0,%%v1\n\t" - "vfasb %%v0,%%v0,%%v2\n\t" - "vfasb %%v0,%%v0,%%v3\n\t" - "vfasb %%v0,%%v0,%%v4\n\t" - "vfasb %%v0,%%v0,%%v5\n\t" - "vfasb %%v0,%%v0,%%v6\n\t" - "vfasb %%v0,%%v0,%%v7\n\t" - "veslg %%v1,%%v0,32\n\t" - "vfasb %%v0,%%v0,%%v1\n\t" - "vrepg %%v1,%%v0,1\n\t" - "aebr %%f0,%%f1\n\t" - "ste %%f0,0(%[y])" - : "=m"(*(FLOAT (*)[1]) y) - : [y] "a"(y), "m"(*(const FLOAT (*)[n]) a0),[a0] "a"(a0), - "m"(*(const FLOAT (*)[n]) x),[x] "a"(x),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", 
"v25", - "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v1\n\t" + "vzero %%v2\n\t" + "vzero %%v3\n\t" + "vzero %%v4\n\t" + "vzero %%v5\n\t" + "vzero %%v6\n\t" + "vzero %%v7\n\t" + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[a0])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "vl %%v25,16(%%r1,%[a0])\n\t" + "vfmasb %%v1,%%v17,%%v25,%%v1\n\t" + "vl %%v26,32(%%r1,%[a0])\n\t" + "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" + "vl %%v27,48(%%r1,%[a0])\n\t" + "vfmasb %%v3,%%v19,%%v27,%%v3\n\t" + "vl %%v28,64(%%r1,%[a0])\n\t" + "vfmasb %%v4,%%v20,%%v28,%%v4\n\t" + "vl %%v29,80(%%r1,%[a0])\n\t" + "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" + "vl %%v30,96(%%r1,%[a0])\n\t" + "vfmasb %%v6,%%v22,%%v30,%%v6\n\t" + "vl %%v31,112(%%r1,%[a0])\n\t" + "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[a0])\n\t" + "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vfasb %%v0,%%v0,%%v2\n\t" + "vfasb %%v0,%%v0,%%v3\n\t" + "vfasb %%v0,%%v0,%%v4\n\t" + "vfasb %%v0,%%v0,%%v5\n\t" + "vfasb %%v0,%%v0,%%v6\n\t" + "vfasb %%v0,%%v0,%%v7\n\t" + "veslg %%v1,%%v0,32\n\t" + "vfasb %%v0,%%v0,%%v1\n\t" + "vrepg %%v1,%%v0,1\n\t" + "aebr %%f0,%%f1\n\t" + "ste %%f0,0(%[y])" + : "=m"(*(FLOAT (*)[1]) y) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), + "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", + "v26", "v27", "v28", "v29", "v30", "v31"); } static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { @@ -366,70 +374,70 @@ static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { __asm__("vlrepf %%v0,%[da]\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[src])\n\t" - "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v18,32(%%r1,%[src])\n\t" - "vl %%v19,48(%%r1,%[src])\n\t" - "vl %%v20,64(%%r1,%[src])\n\t" - "vl %%v21,80(%%r1,%[src])\n\t" - "vl %%v22,96(%%r1,%[src])\n\t" - "vl %%v23,112(%%r1,%[src])\n\t" - "vl %%v24, 0(%%r1,%[dest])\n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" - "vst %%v24, 0(%%r1,%[dest])\n\t" - "vl %%v25, 16(%%r1,%[dest])\n\t" - "vfmasb %%v25,%%v17,%%v0,%%v25\n\t" - "vst %%v25, 16(%%r1,%[dest])\n\t" - "vl %%v26, 32(%%r1,%[dest])\n\t" - "vfmasb %%v26,%%v18,%%v0,%%v26\n\t" - "vst %%v26, 32(%%r1,%[dest])\n\t" - "vl %%v27, 48(%%r1,%[dest])\n\t" - "vfmasb %%v27,%%v19,%%v0,%%v27\n\t" - "vst %%v27, 48(%%r1,%[dest])\n\t" - "vl %%v28, 64(%%r1,%[dest])\n\t" - "vfmasb %%v28,%%v20,%%v0,%%v28\n\t" - "vst %%v28, 64(%%r1,%[dest])\n\t" - "vl %%v29, 80(%%r1,%[dest])\n\t" - "vfmasb 
%%v29,%%v21,%%v0,%%v29\n\t" - "vst %%v29, 80(%%r1,%[dest])\n\t" - "vl %%v30, 96(%%r1,%[dest])\n\t" - "vfmasb %%v30,%%v22,%%v0,%%v30\n\t" - "vst %%v30, 96(%%r1,%[dest])\n\t" - "vl %%v31, 112(%%r1,%[dest])\n\t" - "vfmasb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v31, 112(%%r1,%[dest])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v24, 0(%%r1,%[dest])\n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" - "vst %%v24, 0(%%r1,%[dest])\n\t" - "agfi %%r1,16\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(FLOAT (*)[n]) dest) - : [dest] "a"(dest),[da] "m"(da), "m"(*(const FLOAT (*)[n]) src), - [src] "a"(src),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "xgr %%r1,%%r1\n\t" + "lghi %%r0,-32\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 1f\n\t" + "srlg %%r0,%%r0,5\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,64(%%r1,%[src])\n\t" + "vl %%v21,80(%%r1,%[src])\n\t" + "vl %%v22,96(%%r1,%[src])\n\t" + "vl %%v23,112(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "vl %%v25, 16(%%r1,%[dest])\n\t" + "vfmasb %%v25,%%v17,%%v0,%%v25\n\t" + "vst %%v25, 16(%%r1,%[dest])\n\t" + "vl %%v26, 32(%%r1,%[dest])\n\t" + "vfmasb %%v26,%%v18,%%v0,%%v26\n\t" + "vst %%v26, 32(%%r1,%[dest])\n\t" + "vl %%v27, 48(%%r1,%[dest])\n\t" + "vfmasb %%v27,%%v19,%%v0,%%v27\n\t" + "vst %%v27, 48(%%r1,%[dest])\n\t" + "vl %%v28, 64(%%r1,%[dest])\n\t" + "vfmasb %%v28,%%v20,%%v0,%%v28\n\t" + "vst %%v28, 64(%%r1,%[dest])\n\t" + "vl %%v29, 80(%%r1,%[dest])\n\t" + "vfmasb %%v29,%%v21,%%v0,%%v29\n\t" + "vst %%v29, 80(%%r1,%[dest])\n\t" + "vl %%v30, 96(%%r1,%[dest])\n\t" + "vfmasb %%v30,%%v22,%%v0,%%v30\n\t" + "vst %%v30, 96(%%r1,%[dest])\n\t" + "vl %%v31, 112(%%r1,%[dest])\n\t" + "vfmasb %%v31,%%v23,%%v0,%%v31\n\t" + "vst %%v31, 112(%%r1,%[dest])\n\t" + "agfi %%r1,128\n\t" + "brctg %%r0,0b\n\t" + "1:\n\t" + "lghi %%r0,28\n\t" + "ngr %%r0,%[n]\n\t" + "ltgr %%r0,%%r0\n\t" + "jz 3f\n\t" + "srlg %%r0,%%r0,2\n\t" + "2:\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v24, 0(%%r1,%[dest])\n\t" + "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" + "vst %%v24, 0(%%r1,%[dest])\n\t" + "agfi %%r1,16\n\t" + "brctg %%r0,2b\n\t" + "3:\n\t" + "nop" + : "+m"(*(struct { FLOAT x[n]; } *) dest) + : [dest] "a"(dest),[da] "Q"(da), "m"(*(const struct { FLOAT x[n]; } *) src), + [src] "a"(src),[n] "r"(n) + : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { diff --git a/kernel/zarch/smax.c b/kernel/zarch/smax.c index df3c9cb4d..7015aaa1d 100644 --- a/kernel/zarch/smax.c +++ b/kernel/zarch/smax.c @@ -31,53 +31,53 @@ static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x) { FLOAT max; __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - 
"vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfmaxsb %%v16,%%v16,%%v24,0\n\t" - "vfmaxsb %%v17,%%v17,%%v25,0\n\t" - "vfmaxsb %%v18,%%v18,%%v26,0\n\t" - "vfmaxsb %%v19,%%v19,%%v27,0\n\t" - "vfmaxsb %%v20,%%v20,%%v28,0\n\t" - "vfmaxsb %%v21,%%v21,%%v29,0\n\t" - "vfmaxsb %%v22,%%v22,%%v30,0\n\t" - "vfmaxsb %%v23,%%v23,%%v31,0\n\t" - "vfmaxsb %%v16,%%v16,%%v20,0\n\t" - "vfmaxsb %%v17,%%v17,%%v21,0\n\t" - "vfmaxsb %%v18,%%v18,%%v22,0\n\t" - "vfmaxsb %%v19,%%v19,%%v23,0\n\t" - "vfmaxsb %%v16,%%v16,%%v18,0\n\t" - "vfmaxsb %%v17,%%v17,%%v19,0\n\t" - "vfmaxsb %%v16,%%v16,%%v17,0\n\t" - "vfmaxsb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v16,%%v0,32\n\t" - "vfmaxsb %%v0,%%v0,%%v16,0\n\t" - "vrepf %%v16,%%v0,2\n\t" - "wfmaxsb %%v0,%%v0,%%v16,0\n\t" - "ler %[max],%%f0" - : [max] "=f"(max),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfmaxsb %%v16,%%v16,%%v24,0\n\t" + "vfmaxsb %%v17,%%v17,%%v25,0\n\t" + "vfmaxsb %%v18,%%v18,%%v26,0\n\t" + "vfmaxsb %%v19,%%v19,%%v27,0\n\t" + "vfmaxsb %%v20,%%v20,%%v28,0\n\t" + "vfmaxsb %%v21,%%v21,%%v29,0\n\t" + "vfmaxsb %%v22,%%v22,%%v30,0\n\t" + "vfmaxsb %%v23,%%v23,%%v31,0\n\t" + "vfmaxsb %%v16,%%v16,%%v20,0\n\t" + "vfmaxsb %%v17,%%v17,%%v21,0\n\t" + "vfmaxsb %%v18,%%v18,%%v22,0\n\t" + "vfmaxsb %%v19,%%v19,%%v23,0\n\t" + "vfmaxsb %%v16,%%v16,%%v18,0\n\t" + "vfmaxsb %%v17,%%v17,%%v19,0\n\t" + "vfmaxsb %%v16,%%v16,%%v17,0\n\t" + "vfmaxsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfmaxsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfmaxsb %%v0,%%v0,%%v16,0\n\t" + "ler %[max],%%f0" + : [max] "=f"(max),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return max; } diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c index 2e9c793c4..b6875c5c6 100644 --- a/kernel/zarch/smin.c +++ b/kernel/zarch/smin.c @@ -31,53 +31,53 @@ static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) { FLOAT min; __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl 
%%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfminsb %%v16,%%v16,%%v24,0\n\t" - "vfminsb %%v17,%%v17,%%v25,0\n\t" - "vfminsb %%v18,%%v18,%%v26,0\n\t" - "vfminsb %%v19,%%v19,%%v27,0\n\t" - "vfminsb %%v20,%%v20,%%v28,0\n\t" - "vfminsb %%v21,%%v21,%%v29,0\n\t" - "vfminsb %%v22,%%v22,%%v30,0\n\t" - "vfminsb %%v23,%%v23,%%v31,0\n\t" - "vfminsb %%v16,%%v16,%%v20,0\n\t" - "vfminsb %%v17,%%v17,%%v21,0\n\t" - "vfminsb %%v18,%%v18,%%v22,0\n\t" - "vfminsb %%v19,%%v19,%%v23,0\n\t" - "vfminsb %%v16,%%v16,%%v18,0\n\t" - "vfminsb %%v17,%%v17,%%v19,0\n\t" - "vfminsb %%v16,%%v16,%%v17,0\n\t" - "vfminsb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v16,%%v0,32\n\t" - "vfminsb %%v0,%%v0,%%v16,0\n\t" - "vrepf %%v16,%%v0,2\n\t" - "wfminsb %%v0,%%v0,%%v16,0\n\t" - "ler %[min],%%f0" - : [min] "=f"(min),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vl %%v24,128(%%r1,%[x])\n\t" + "vl %%v25,144(%%r1,%[x])\n\t" + "vl %%v26,160(%%r1,%[x])\n\t" + "vl %%v27,176(%%r1,%[x])\n\t" + "vl %%v28,192(%%r1,%[x])\n\t" + "vl %%v29,208(%%r1,%[x])\n\t" + "vl %%v30,224(%%r1,%[x])\n\t" + "vl %%v31,240(%%r1,%[x])\n\t" + "vfminsb %%v16,%%v16,%%v24,0\n\t" + "vfminsb %%v17,%%v17,%%v25,0\n\t" + "vfminsb %%v18,%%v18,%%v26,0\n\t" + "vfminsb %%v19,%%v19,%%v27,0\n\t" + "vfminsb %%v20,%%v20,%%v28,0\n\t" + "vfminsb %%v21,%%v21,%%v29,0\n\t" + "vfminsb %%v22,%%v22,%%v30,0\n\t" + "vfminsb %%v23,%%v23,%%v31,0\n\t" + "vfminsb %%v16,%%v16,%%v20,0\n\t" + "vfminsb %%v17,%%v17,%%v21,0\n\t" + "vfminsb %%v18,%%v18,%%v22,0\n\t" + "vfminsb %%v19,%%v19,%%v23,0\n\t" + "vfminsb %%v16,%%v16,%%v18,0\n\t" + "vfminsb %%v17,%%v17,%%v19,0\n\t" + "vfminsb %%v16,%%v16,%%v17,0\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "veslg %%v16,%%v0,32\n\t" + "vfminsb %%v0,%%v0,%%v16,0\n\t" + "vrepf %%v16,%%v0,2\n\t" + "wfminsb %%v0,%%v0,%%v16,0\n\t" + "ler %[min],%%f0" + : [min] "=f"(min),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return min; } diff --git a/kernel/zarch/srot.c b/kernel/zarch/srot.c index 5b21a19dc..4f471d866 100644 --- a/kernel/zarch/srot.c +++ b/kernel/zarch/srot.c @@ -29,151 +29,152 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { __asm__("vlrepf %%v0,%[c]\n\t" - "vlrepf %%v1,%[s]\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x])\n\t" - "vl %%v25, 16(%%r1,%[x])\n\t" - "vl %%v26, 32(%%r1,%[x])\n\t" - "vl %%v27, 48(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[y])\n\t" - "vl %%v17, 16(%%r1,%[y])\n\t" - "vl %%v18, 32(%%r1,%[y])\n\t" - "vl %%v19, 48(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x])\n\t" - "vst %%v29, 16(%%r1,%[x])\n\t" - "vst %%v30, 32(%%r1,%[x])\n\t" - "vst %%v31, 48(%%r1,%[x])\n\t" - "vst %%v20, 0(%%r1,%[y])\n\t" - "vst %%v21, 16(%%r1,%[y])\n\t" - "vst %%v22, 32(%%r1,%[y])\n\t" - "vst %%v23, 48(%%r1,%[y])\n\t" - "vl %%v24, 64(%%r1,%[x])\n\t" - "vl %%v25, 80(%%r1,%[x])\n\t" - "vl %%v26, 96(%%r1,%[x])\n\t" - "vl %%v27, 112(%%r1,%[x])\n\t" - "vl %%v16, 64(%%r1,%[y])\n\t" - "vl %%v17, 80(%%r1,%[y])\n\t" - "vl %%v18, 96(%%r1,%[y])\n\t" - "vl %%v19, 112(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x])\n\t" - "vst %%v29, 80(%%r1,%[x])\n\t" - "vst %%v30, 96(%%r1,%[x])\n\t" - "vst %%v31, 112(%%r1,%[x])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v16, 128(%%r1,%[y])\n\t" - "vl %%v17, 144(%%r1,%[y])\n\t" - "vl %%v18, 160(%%r1,%[y])\n\t" - "vl %%v19, 176(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb 
%%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x])\n\t" - "vst %%v29, 144(%%r1,%[x])\n\t" - "vst %%v30, 160(%%r1,%[x])\n\t" - "vst %%v31, 176(%%r1,%[x])\n\t" - "vst %%v20, 128(%%r1,%[y])\n\t" - "vst %%v21, 144(%%r1,%[y])\n\t" - "vst %%v22, 160(%%r1,%[y])\n\t" - "vst %%v23, 176(%%r1,%[y])\n\t" - "vl %%v24, 192(%%r1,%[x])\n\t" - "vl %%v25, 208(%%r1,%[x])\n\t" - "vl %%v26, 224(%%r1,%[x])\n\t" - "vl %%v27, 240(%%r1,%[x])\n\t" - "vl %%v16, 192(%%r1,%[y])\n\t" - "vl %%v17, 208(%%r1,%[y])\n\t" - "vl %%v18, 224(%%r1,%[y])\n\t" - "vl %%v19, 240(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 192(%%r1,%[x])\n\t" - "vst %%v29, 208(%%r1,%[x])\n\t" - "vst %%v30, 224(%%r1,%[x])\n\t" - "vst %%v31, 240(%%r1,%[x])\n\t" - "vst %%v20, 192(%%r1,%[y])\n\t" - "vst %%v21, 208(%%r1,%[y])\n\t" - "vst %%v22, 224(%%r1,%[y])\n\t" - "vst %%v23, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vlrepf %%v1,%[s]\n\t" + "srlg %[n],%[n],6\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb 
%%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmsb %%v28,%%v24,%%v0\n\t" + "vfmsb %%v29,%%v25,%%v0\n\t" + "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v30,%%v26,%%v0\n\t" + "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmsb %%v31,%%v27,%%v0\n\t" + "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), + [n] "+&r"(n) + : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s) + : "cc", "r1", "v0", "v1", "v16", "v17", 
"v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, diff --git a/kernel/zarch/sscal.c b/kernel/zarch/sscal.c index 07e6845c6..9b9930dc8 100644 --- a/kernel/zarch/sscal.c +++ b/kernel/zarch/sscal.c @@ -29,61 +29,61 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) { __asm__("vlrepf %%v0,%[da]\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[x])\n\t" - "vfmsb %%v24,%%v24,%%v0\n\t" - "vst %%v24,0(%%r1,%[x])\n\t" - "vl %%v25,16(%%r1,%[x])\n\t" - "vfmsb %%v25,%%v25,%%v0\n\t" - "vst %%v25,16(%%r1,%[x])\n\t" - "vl %%v26,32(%%r1,%[x])\n\t" - "vfmsb %%v26,%%v26,%%v0\n\t" - "vst %%v26,32(%%r1,%[x])\n\t" - "vl %%v27,48(%%r1,%[x])\n\t" - "vfmsb %%v27,%%v27,%%v0\n\t" - "vst %%v27,48(%%r1,%[x])\n\t" - "vl %%v28,64(%%r1,%[x])\n\t" - "vfmsb %%v28,%%v28,%%v0\n\t" - "vst %%v28,64(%%r1,%[x])\n\t" - "vl %%v29,80(%%r1,%[x])\n\t" - "vfmsb %%v29,%%v29,%%v0\n\t" - "vst %%v29,80(%%r1,%[x])\n\t" - "vl %%v30,96(%%r1,%[x])\n\t" - "vfmsb %%v30,%%v30,%%v0\n\t" - "vst %%v30,96(%%r1,%[x])\n\t" - "vl %%v31,112(%%r1,%[x])\n\t" - "vfmsb %%v31,%%v31,%%v0\n\t" - "vst %%v31,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) - : [x] "a"(x),[da] "m"(da) - : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v24,0(%%r1,%[x])\n\t" + "vfmsb %%v24,%%v24,%%v0\n\t" + "vst %%v24,0(%%r1,%[x])\n\t" + "vl %%v25,16(%%r1,%[x])\n\t" + "vfmsb %%v25,%%v25,%%v0\n\t" + "vst %%v25,16(%%r1,%[x])\n\t" + "vl %%v26,32(%%r1,%[x])\n\t" + "vfmsb %%v26,%%v26,%%v0\n\t" + "vst %%v26,32(%%r1,%[x])\n\t" + "vl %%v27,48(%%r1,%[x])\n\t" + "vfmsb %%v27,%%v27,%%v0\n\t" + "vst %%v27,48(%%r1,%[x])\n\t" + "vl %%v28,64(%%r1,%[x])\n\t" + "vfmsb %%v28,%%v28,%%v0\n\t" + "vst %%v28,64(%%r1,%[x])\n\t" + "vl %%v29,80(%%r1,%[x])\n\t" + "vfmsb %%v29,%%v29,%%v0\n\t" + "vst %%v29,80(%%r1,%[x])\n\t" + "vl %%v30,96(%%r1,%[x])\n\t" + "vfmsb %%v30,%%v30,%%v0\n\t" + "vst %%v30,96(%%r1,%[x])\n\t" + "vl %%v31,112(%%r1,%[x])\n\t" + "vfmsb %%v31,%%v31,%%v0\n\t" + "vst %%v31,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n) + : [x] "a"(x),[da] "Q"(da) + : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) { __asm__("vzero %%v0\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x])\n\t" - "vst %%v0,16(%%r1,%[x])\n\t" - "vst %%v0,32(%%r1,%[x])\n\t" - "vst %%v0,48(%%r1,%[x])\n\t" - "vst %%v0,64(%%r1,%[x])\n\t" - "vst %%v0,80(%%r1,%[x])\n\t" - "vst %%v0,96(%%r1,%[x])\n\t" - "vst %%v0,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n]) x),[n] "+&r"(n) - : [x] "a"(x) - : "cc", "r1", "v0"); + "srlg %[n],%[n],5\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n) 
+ : [x] "a"(x) + : "cc", "r1", "v0"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, diff --git a/kernel/zarch/sswap.c b/kernel/zarch/sswap.c index dc7113143..0c62f189d 100644 --- a/kernel/zarch/sswap.c +++ b/kernel/zarch/sswap.c @@ -29,81 +29,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__("srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v28, 192(%%r1,%[x])\n\t" - "vl %%v29, 208(%%r1,%[x])\n\t" - "vl %%v30, 224(%%r1,%[x])\n\t" - "vl %%v31, 240(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "vl %%v4, 64(%%r1,%[y])\n\t" - "vl %%v5, 80(%%r1,%[y])\n\t" - "vl %%v6, 96(%%r1,%[y])\n\t" - "vl %%v7, 112(%%r1,%[y])\n\t" - "vst %%v0, 0(%%r1,%[x])\n\t" - "vst %%v1, 16(%%r1,%[x])\n\t" - "vst %%v2, 32(%%r1,%[x])\n\t" - "vst %%v3, 48(%%r1,%[x])\n\t" - "vst %%v4, 64(%%r1,%[x])\n\t" - "vst %%v5, 80(%%r1,%[x])\n\t" - "vst %%v6, 96(%%r1,%[x])\n\t" - "vst %%v7, 112(%%r1,%[x])\n\t" - "vl %%v0, 128(%%r1,%[y])\n\t" - "vl %%v1, 144(%%r1,%[y])\n\t" - "vl %%v2, 160(%%r1,%[y])\n\t" - "vl %%v3, 176(%%r1,%[y])\n\t" - "vl %%v4, 192(%%r1,%[y])\n\t" - "vl %%v5, 208(%%r1,%[y])\n\t" - "vl %%v6, 224(%%r1,%[y])\n\t" - "vl %%v7, 240(%%r1,%[y])\n\t" - "vst %%v0, 128(%%r1,%[x])\n\t" - "vst %%v1, 144(%%r1,%[x])\n\t" - "vst %%v2, 160(%%r1,%[x])\n\t" - "vst %%v3, 176(%%r1,%[x])\n\t" - "vst %%v4, 192(%%r1,%[x])\n\t" - "vst %%v5, 208(%%r1,%[x])\n\t" - "vst %%v6, 224(%%r1,%[x])\n\t" - "vst %%v7, 240(%%r1,%[x])\n\t" - "vst %%v16, 0(%%r1,%[y])\n\t" - "vst %%v17, 16(%%r1,%[y])\n\t" - "vst %%v18, 32(%%r1,%[y])\n\t" - "vst %%v19, 48(%%r1,%[y])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vst %%v24, 128(%%r1,%[y])\n\t" - "vst %%v25, 144(%%r1,%[y])\n\t" - "vst %%v26, 160(%%r1,%[y])\n\t" - "vst %%v27, 176(%%r1,%[y])\n\t" - "vst %%v28, 192(%%r1,%[y])\n\t" - "vst %%v29, 208(%%r1,%[y])\n\t" - "vst %%v30, 224(%%r1,%[y])\n\t" - "vst %%v31, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n]) x), "+m"(*(FLOAT (*)[n]) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + 
"vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), + [n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c index 531e47a0b..aa04ab91f 100644 --- a/kernel/zarch/zamax.c +++ b/kernel/zarch/zamax.c @@ -34,89 +34,89 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) { FLOAT amax; __asm__("vleg %%v0,0(%[x]),0\n\t" - "vleg %%v16,8(%[x]),0\n\t" - "vleg %%v0,16(%[x]),1\n\t" - "vleg %%v16,24(%[x]),1\n\t" - "vflpdb %%v0,%%v0\n\t" - "vflpdb %%v16,%%v16\n\t" - "vfadb %%v0,%%v0,%%v16\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vleg %%v16,0(%%r1,%[x]),0\n\t" - "vleg %%v17,8(%%r1,%[x]),0\n\t" - "vleg %%v16,16(%%r1,%[x]),1\n\t" - "vleg %%v17,24(%%r1,%[x]),1\n\t" - "vleg %%v18,32(%%r1,%[x]),0\n\t" - "vleg %%v19,40(%%r1,%[x]),0\n\t" - "vleg %%v18,48(%%r1,%[x]),1\n\t" - "vleg %%v19,56(%%r1,%[x]),1\n\t" - "vleg %%v20,64(%%r1,%[x]),0\n\t" - "vleg %%v21,72(%%r1,%[x]),0\n\t" - "vleg %%v20,80(%%r1,%[x]),1\n\t" - "vleg %%v21,88(%%r1,%[x]),1\n\t" - "vleg %%v22,96(%%r1,%[x]),0\n\t" - "vleg %%v23,104(%%r1,%[x]),0\n\t" - "vleg %%v22,112(%%r1,%[x]),1\n\t" - "vleg %%v23,120(%%r1,%[x]),1\n\t" - "vleg %%v24,128(%%r1,%[x]),0\n\t" - "vleg %%v25,136(%%r1,%[x]),0\n\t" - "vleg %%v24,144(%%r1,%[x]),1\n\t" - "vleg %%v25,152(%%r1,%[x]),1\n\t" - "vleg %%v26,160(%%r1,%[x]),0\n\t" - "vleg %%v27,168(%%r1,%[x]),0\n\t" - "vleg %%v26,176(%%r1,%[x]),1\n\t" - "vleg %%v27,184(%%r1,%[x]),1\n\t" - "vleg %%v28,192(%%r1,%[x]),0\n\t" - "vleg %%v29,200(%%r1,%[x]),0\n\t" - "vleg %%v28,208(%%r1,%[x]),1\n\t" - "vleg %%v29,216(%%r1,%[x]),1\n\t" - "vleg %%v30,224(%%r1,%[x]),0\n\t" - "vleg 
%%v31,232(%%r1,%[x]),0\n\t" - "vleg %%v30,240(%%r1,%[x]),1\n\t" - "vleg %%v31,248(%%r1,%[x]),1\n\t" - "vflpdb %%v16,%%v16\n\t" - "vflpdb %%v17,%%v17\n\t" - "vflpdb %%v18,%%v18\n\t" - "vflpdb %%v19,%%v19\n\t" - "vflpdb %%v20,%%v20\n\t" - "vflpdb %%v21,%%v21\n\t" - "vflpdb %%v22,%%v22\n\t" - "vflpdb %%v23,%%v23\n\t" - "vflpdb %%v24,%%v24\n\t" - "vflpdb %%v25,%%v25\n\t" - "vflpdb %%v26,%%v26\n\t" - "vflpdb %%v27,%%v27\n\t" - "vflpdb %%v28,%%v28\n\t" - "vflpdb %%v29,%%v29\n\t" - "vflpdb %%v30,%%v30\n\t" - "vflpdb %%v31,%%v31\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v18,%%v18,%%v19\n\t" - "vfadb %%v20,%%v20,%%v21\n\t" - "vfadb %%v22,%%v22,%%v23\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vfadb %%v26,%%v26,%%v27\n\t" - "vfadb %%v28,%%v28,%%v29\n\t" - "vfadb %%v30,%%v30,%%v31\n\t" - "vfmaxdb %%v16,%%v16,%%v24,0\n\t" - "vfmaxdb %%v18,%%v18,%%v26,0\n\t" - "vfmaxdb %%v20,%%v20,%%v28,0\n\t" - "vfmaxdb %%v22,%%v22,%%v30,0\n\t" - "vfmaxdb %%v16,%%v16,%%v20,0\n\t" - "vfmaxdb %%v18,%%v18,%%v22,0\n\t" - "vfmaxdb %%v16,%%v16,%%v18,0\n\t" - "vfmaxdb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfmaxdb %%v0,%%v0,%%v16,0\n\t" - "ldr %[amax],%%f0" - : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vleg %%v24,128(%%r1,%[x]),0\n\t" + "vleg %%v25,136(%%r1,%[x]),0\n\t" + "vleg %%v24,144(%%r1,%[x]),1\n\t" + "vleg %%v25,152(%%r1,%[x]),1\n\t" + "vleg %%v26,160(%%r1,%[x]),0\n\t" + "vleg %%v27,168(%%r1,%[x]),0\n\t" + "vleg %%v26,176(%%r1,%[x]),1\n\t" + "vleg %%v27,184(%%r1,%[x]),1\n\t" + "vleg %%v28,192(%%r1,%[x]),0\n\t" + "vleg %%v29,200(%%r1,%[x]),0\n\t" + "vleg %%v28,208(%%r1,%[x]),1\n\t" + "vleg %%v29,216(%%r1,%[x]),1\n\t" + "vleg %%v30,224(%%r1,%[x]),0\n\t" + "vleg %%v31,232(%%r1,%[x]),0\n\t" + "vleg %%v30,240(%%r1,%[x]),1\n\t" + "vleg %%v31,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16,%%v16\n\t" + "vflpdb %%v17,%%v17\n\t" + "vflpdb %%v18,%%v18\n\t" + "vflpdb %%v19,%%v19\n\t" + "vflpdb %%v20,%%v20\n\t" + "vflpdb %%v21,%%v21\n\t" + "vflpdb %%v22,%%v22\n\t" + "vflpdb %%v23,%%v23\n\t" + "vflpdb %%v24,%%v24\n\t" + "vflpdb %%v25,%%v25\n\t" + "vflpdb %%v26,%%v26\n\t" + "vflpdb %%v27,%%v27\n\t" + "vflpdb %%v28,%%v28\n\t" + "vflpdb %%v29,%%v29\n\t" + "vflpdb %%v30,%%v30\n\t" + "vflpdb %%v31,%%v31\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v18,%%v18,%%v19\n\t" + "vfadb %%v20,%%v20,%%v21\n\t" + "vfadb %%v22,%%v22,%%v23\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v26,%%v26,%%v27\n\t" + "vfadb %%v28,%%v28,%%v29\n\t" + "vfadb %%v30,%%v30,%%v31\n\t" + "vfmaxdb %%v16,%%v16,%%v24,0\n\t" + "vfmaxdb 
%%v18,%%v18,%%v26,0\n\t" + "vfmaxdb %%v20,%%v20,%%v28,0\n\t" + "vfmaxdb %%v22,%%v22,%%v30,0\n\t" + "vfmaxdb %%v16,%%v16,%%v20,0\n\t" + "vfmaxdb %%v18,%%v18,%%v22,0\n\t" + "vfmaxdb %%v16,%%v16,%%v18,0\n\t" + "vfmaxdb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmaxdb %%v0,%%v0,%%v16,0\n\t" + "ldr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return amax; } diff --git a/kernel/zarch/zamax_z13.c b/kernel/zarch/zamax_z13.c index cac2da938..37278d6db 100644 --- a/kernel/zarch/zamax_z13.c +++ b/kernel/zarch/zamax_z13.c @@ -34,98 +34,98 @@ static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) { FLOAT amax; __asm__("vleg %%v0,0(%[x]),0\n\t" - "vleg %%v16,8(%[x]),0\n\t" - "vleg %%v0,16(%[x]),1\n\t" - "vleg %%v16,24(%[x]),1\n\t" - "vflpdb %%v0,%%v0\n\t" - "vflpdb %%v16,%%v16\n\t" - "vfadb %%v0,%%v0,%%v16\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vleg %%v16,0(%%r1,%[x]),0\n\t" - "vleg %%v17,8(%%r1,%[x]),0\n\t" - "vleg %%v16,16(%%r1,%[x]),1\n\t" - "vleg %%v17,24(%%r1,%[x]),1\n\t" - "vleg %%v18,32(%%r1,%[x]),0\n\t" - "vleg %%v19,40(%%r1,%[x]),0\n\t" - "vleg %%v18,48(%%r1,%[x]),1\n\t" - "vleg %%v19,56(%%r1,%[x]),1\n\t" - "vleg %%v20,64(%%r1,%[x]),0\n\t" - "vleg %%v21,72(%%r1,%[x]),0\n\t" - "vleg %%v20,80(%%r1,%[x]),1\n\t" - "vleg %%v21,88(%%r1,%[x]),1\n\t" - "vleg %%v22,96(%%r1,%[x]),0\n\t" - "vleg %%v23,104(%%r1,%[x]),0\n\t" - "vleg %%v22,112(%%r1,%[x]),1\n\t" - "vleg %%v23,120(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchdb %%v24,%%v16,%%v17\n\t" - "vfchdb %%v25,%%v18,%%v19\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vfchdb %%v26,%%v24,%%v25\n\t" - "vsel %%v26,%%v24,%%v25,%%v26\n\t" - "vfchdb %%v27,%%v26,%%v0\n\t" - "vsel %%v0,%%v26,%%v0,%%v27\n\t" - "vleg %%v16,128(%%r1,%[x]),0\n\t" - "vleg %%v17,136(%%r1,%[x]),0\n\t" - "vleg %%v16,144(%%r1,%[x]),1\n\t" - "vleg %%v17,152(%%r1,%[x]),1\n\t" - "vleg %%v18,160(%%r1,%[x]),0\n\t" - "vleg %%v19,168(%%r1,%[x]),0\n\t" - "vleg %%v18,176(%%r1,%[x]),1\n\t" - "vleg %%v19,184(%%r1,%[x]),1\n\t" - "vleg %%v20,192(%%r1,%[x]),0\n\t" - "vleg %%v21,200(%%r1,%[x]),0\n\t" - "vleg %%v20,208(%%r1,%[x]),1\n\t" - "vleg %%v21,216(%%r1,%[x]),1\n\t" - "vleg %%v22,224(%%r1,%[x]),0\n\t" - "vleg %%v23,232(%%r1,%[x]),0\n\t" - "vleg %%v22,240(%%r1,%[x]),1\n\t" - "vleg %%v23,248(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchdb %%v24,%%v16,%%v17\n\t" - "vfchdb %%v25,%%v18,%%v19\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vfchdb %%v26,%%v24,%%v25\n\t" - "vsel %%v26,%%v24,%%v25,%%v26\n\t" - "vfchdb %%v27,%%v26,%%v0\n\t" - "vsel %%v0,%%v26,%%v0,%%v27\n\t" - 
"agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfchdb %%v17,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v17\n\t" - "ldr %[amax],%%f0" - : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27"); + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vfchdb %%v26,%%v24,%%v25\n\t" + "vsel %%v26,%%v24,%%v25,%%v26\n\t" + "vfchdb %%v27,%%v26,%%v0\n\t" + "vsel %%v0,%%v26,%%v0,%%v27\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg %%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg %%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchdb %%v24,%%v16,%%v17\n\t" + "vfchdb %%v25,%%v18,%%v19\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vfchdb %%v26,%%v24,%%v25\n\t" + "vsel %%v26,%%v24,%%v25,%%v26\n\t" + "vfchdb %%v27,%%v26,%%v0\n\t" + "vsel %%v0,%%v26,%%v0,%%v27\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v0,%%v16\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[amax],%%f0" + : [amax] "=f"(amax),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27"); return amax; } diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c index 940d81dd2..0b5402853 100644 --- a/kernel/zarch/zamin.c +++ b/kernel/zarch/zamin.c @@ -34,89 +34,89 @@ static 
FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) { FLOAT amin; __asm__("vleg %%v0,0(%[x]),0\n\t" - "vleg %%v16,8(%[x]),0\n\t" - "vleg %%v0,16(%[x]),1\n\t" - "vleg %%v16,24(%[x]),1\n\t" - "vflpdb %%v0,%%v0\n\t" - "vflpdb %%v16,%%v16\n\t" - "vfadb %%v0,%%v0,%%v16\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vleg %%v16,0(%%r1,%[x]),0\n\t" - "vleg %%v17,8(%%r1,%[x]),0\n\t" - "vleg %%v16,16(%%r1,%[x]),1\n\t" - "vleg %%v17,24(%%r1,%[x]),1\n\t" - "vleg %%v18,32(%%r1,%[x]),0\n\t" - "vleg %%v19,40(%%r1,%[x]),0\n\t" - "vleg %%v18,48(%%r1,%[x]),1\n\t" - "vleg %%v19,56(%%r1,%[x]),1\n\t" - "vleg %%v20,64(%%r1,%[x]),0\n\t" - "vleg %%v21,72(%%r1,%[x]),0\n\t" - "vleg %%v20,80(%%r1,%[x]),1\n\t" - "vleg %%v21,88(%%r1,%[x]),1\n\t" - "vleg %%v22,96(%%r1,%[x]),0\n\t" - "vleg %%v23,104(%%r1,%[x]),0\n\t" - "vleg %%v22,112(%%r1,%[x]),1\n\t" - "vleg %%v23,120(%%r1,%[x]),1\n\t" - "vleg %%v24,128(%%r1,%[x]),0\n\t" - "vleg %%v25,136(%%r1,%[x]),0\n\t" - "vleg %%v24,144(%%r1,%[x]),1\n\t" - "vleg %%v25,152(%%r1,%[x]),1\n\t" - "vleg %%v26,160(%%r1,%[x]),0\n\t" - "vleg %%v27,168(%%r1,%[x]),0\n\t" - "vleg %%v26,176(%%r1,%[x]),1\n\t" - "vleg %%v27,184(%%r1,%[x]),1\n\t" - "vleg %%v28,192(%%r1,%[x]),0\n\t" - "vleg %%v29,200(%%r1,%[x]),0\n\t" - "vleg %%v28,208(%%r1,%[x]),1\n\t" - "vleg %%v29,216(%%r1,%[x]),1\n\t" - "vleg %%v30,224(%%r1,%[x]),0\n\t" - "vleg %%v31,232(%%r1,%[x]),0\n\t" - "vleg %%v30,240(%%r1,%[x]),1\n\t" - "vleg %%v31,248(%%r1,%[x]),1\n\t" - "vflpdb %%v16,%%v16\n\t" - "vflpdb %%v17,%%v17\n\t" - "vflpdb %%v18,%%v18\n\t" - "vflpdb %%v19,%%v19\n\t" - "vflpdb %%v20,%%v20\n\t" - "vflpdb %%v21,%%v21\n\t" - "vflpdb %%v22,%%v22\n\t" - "vflpdb %%v23,%%v23\n\t" - "vflpdb %%v24,%%v24\n\t" - "vflpdb %%v25,%%v25\n\t" - "vflpdb %%v26,%%v26\n\t" - "vflpdb %%v27,%%v27\n\t" - "vflpdb %%v28,%%v28\n\t" - "vflpdb %%v29,%%v29\n\t" - "vflpdb %%v30,%%v30\n\t" - "vflpdb %%v31,%%v31\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v18,%%v18,%%v19\n\t" - "vfadb %%v20,%%v20,%%v21\n\t" - "vfadb %%v22,%%v22,%%v23\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vfadb %%v26,%%v26,%%v27\n\t" - "vfadb %%v28,%%v28,%%v29\n\t" - "vfadb %%v30,%%v30,%%v31\n\t" - "vfmindb %%v16,%%v16,%%v24,0\n\t" - "vfmindb %%v18,%%v18,%%v26,0\n\t" - "vfmindb %%v20,%%v20,%%v28,0\n\t" - "vfmindb %%v22,%%v22,%%v30,0\n\t" - "vfmindb %%v16,%%v16,%%v20,0\n\t" - "vfmindb %%v18,%%v18,%%v22,0\n\t" - "vfmindb %%v16,%%v16,%%v18,0\n\t" - "vfmindb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfmindb %%v0,%%v0,%%v16,0\n\t" - "ldr %[amin],%%f0" - : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg 
%%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vleg %%v24,128(%%r1,%[x]),0\n\t" + "vleg %%v25,136(%%r1,%[x]),0\n\t" + "vleg %%v24,144(%%r1,%[x]),1\n\t" + "vleg %%v25,152(%%r1,%[x]),1\n\t" + "vleg %%v26,160(%%r1,%[x]),0\n\t" + "vleg %%v27,168(%%r1,%[x]),0\n\t" + "vleg %%v26,176(%%r1,%[x]),1\n\t" + "vleg %%v27,184(%%r1,%[x]),1\n\t" + "vleg %%v28,192(%%r1,%[x]),0\n\t" + "vleg %%v29,200(%%r1,%[x]),0\n\t" + "vleg %%v28,208(%%r1,%[x]),1\n\t" + "vleg %%v29,216(%%r1,%[x]),1\n\t" + "vleg %%v30,224(%%r1,%[x]),0\n\t" + "vleg %%v31,232(%%r1,%[x]),0\n\t" + "vleg %%v30,240(%%r1,%[x]),1\n\t" + "vleg %%v31,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16,%%v16\n\t" + "vflpdb %%v17,%%v17\n\t" + "vflpdb %%v18,%%v18\n\t" + "vflpdb %%v19,%%v19\n\t" + "vflpdb %%v20,%%v20\n\t" + "vflpdb %%v21,%%v21\n\t" + "vflpdb %%v22,%%v22\n\t" + "vflpdb %%v23,%%v23\n\t" + "vflpdb %%v24,%%v24\n\t" + "vflpdb %%v25,%%v25\n\t" + "vflpdb %%v26,%%v26\n\t" + "vflpdb %%v27,%%v27\n\t" + "vflpdb %%v28,%%v28\n\t" + "vflpdb %%v29,%%v29\n\t" + "vflpdb %%v30,%%v30\n\t" + "vflpdb %%v31,%%v31\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v18,%%v18,%%v19\n\t" + "vfadb %%v20,%%v20,%%v21\n\t" + "vfadb %%v22,%%v22,%%v23\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v26,%%v26,%%v27\n\t" + "vfadb %%v28,%%v28,%%v29\n\t" + "vfadb %%v30,%%v30,%%v31\n\t" + "vfmindb %%v16,%%v16,%%v24,0\n\t" + "vfmindb %%v18,%%v18,%%v26,0\n\t" + "vfmindb %%v20,%%v20,%%v28,0\n\t" + "vfmindb %%v22,%%v22,%%v30,0\n\t" + "vfmindb %%v16,%%v16,%%v20,0\n\t" + "vfmindb %%v18,%%v18,%%v22,0\n\t" + "vfmindb %%v16,%%v16,%%v18,0\n\t" + "vfmindb %%v0,%%v0,%%v16,0\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfmindb %%v0,%%v0,%%v16,0\n\t" + "ldr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return amin; } diff --git a/kernel/zarch/zamin_z13.c b/kernel/zarch/zamin_z13.c index 7417e0b74..e37bb2236 100644 --- a/kernel/zarch/zamin_z13.c +++ b/kernel/zarch/zamin_z13.c @@ -34,98 +34,98 @@ static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) { FLOAT amin; __asm__("vleg %%v0,0(%[x]),0\n\t" - "vleg %%v16,8(%[x]),0\n\t" - "vleg %%v0,16(%[x]),1\n\t" - "vleg %%v16,24(%[x]),1\n\t" - "vflpdb %%v0,%%v0\n\t" - "vflpdb %%v16,%%v16\n\t" - "vfadb %%v0,%%v0,%%v16\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vleg %%v16,0(%%r1,%[x]),0\n\t" - "vleg %%v17,8(%%r1,%[x]),0\n\t" - "vleg %%v16,16(%%r1,%[x]),1\n\t" - "vleg %%v17,24(%%r1,%[x]),1\n\t" - "vleg %%v18,32(%%r1,%[x]),0\n\t" - "vleg %%v19,40(%%r1,%[x]),0\n\t" - "vleg %%v18,48(%%r1,%[x]),1\n\t" - "vleg %%v19,56(%%r1,%[x]),1\n\t" - "vleg %%v20,64(%%r1,%[x]),0\n\t" - "vleg %%v21,72(%%r1,%[x]),0\n\t" - "vleg %%v20,80(%%r1,%[x]),1\n\t" - "vleg %%v21,88(%%r1,%[x]),1\n\t" - "vleg %%v22,96(%%r1,%[x]),0\n\t" - "vleg %%v23,104(%%r1,%[x]),0\n\t" - "vleg %%v22,112(%%r1,%[x]),1\n\t" - "vleg %%v23,120(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchdb %%v24,%%v17,%%v16\n\t" - "vfchdb %%v25,%%v19,%%v18\n\t" - "vsel 
%%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vfchdb %%v26,%%v25,%%v24\n\t" - "vsel %%v26,%%v24,%%v25,%%v26\n\t" - "vfchdb %%v27,%%v0,%%v26\n\t" - "vsel %%v0,%%v26,%%v0,%%v27\n\t" - "vleg %%v16,128(%%r1,%[x]),0\n\t" - "vleg %%v17,136(%%r1,%[x]),0\n\t" - "vleg %%v16,144(%%r1,%[x]),1\n\t" - "vleg %%v17,152(%%r1,%[x]),1\n\t" - "vleg %%v18,160(%%r1,%[x]),0\n\t" - "vleg %%v19,168(%%r1,%[x]),0\n\t" - "vleg %%v18,176(%%r1,%[x]),1\n\t" - "vleg %%v19,184(%%r1,%[x]),1\n\t" - "vleg %%v20,192(%%r1,%[x]),0\n\t" - "vleg %%v21,200(%%r1,%[x]),0\n\t" - "vleg %%v20,208(%%r1,%[x]),1\n\t" - "vleg %%v21,216(%%r1,%[x]),1\n\t" - "vleg %%v22,224(%%r1,%[x]),0\n\t" - "vleg %%v23,232(%%r1,%[x]),0\n\t" - "vleg %%v22,240(%%r1,%[x]),1\n\t" - "vleg %%v23,248(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchdb %%v24,%%v17,%%v16\n\t" - "vfchdb %%v25,%%v19,%%v18\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vfchdb %%v26,%%v25,%%v24\n\t" - "vsel %%v26,%%v24,%%v25,%%v26\n\t" - "vfchdb %%v27,%%v0,%%v26\n\t" - "vsel %%v0,%%v26,%%v0,%%v27\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfchdb %%v17,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v17\n\t" - "ldr %[amin],%%f0" - : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n]) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27"); + "vleg %%v16,8(%[x]),0\n\t" + "vleg %%v0,16(%[x]),1\n\t" + "vleg %%v16,24(%[x]),1\n\t" + "vflpdb %%v0,%%v0\n\t" + "vflpdb %%v16,%%v16\n\t" + "vfadb %%v0,%%v0,%%v16\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vleg %%v16,0(%%r1,%[x]),0\n\t" + "vleg %%v17,8(%%r1,%[x]),0\n\t" + "vleg %%v16,16(%%r1,%[x]),1\n\t" + "vleg %%v17,24(%%r1,%[x]),1\n\t" + "vleg %%v18,32(%%r1,%[x]),0\n\t" + "vleg %%v19,40(%%r1,%[x]),0\n\t" + "vleg %%v18,48(%%r1,%[x]),1\n\t" + "vleg %%v19,56(%%r1,%[x]),1\n\t" + "vleg %%v20,64(%%r1,%[x]),0\n\t" + "vleg %%v21,72(%%r1,%[x]),0\n\t" + "vleg %%v20,80(%%r1,%[x]),1\n\t" + "vleg %%v21,88(%%r1,%[x]),1\n\t" + "vleg %%v22,96(%%r1,%[x]),0\n\t" + "vleg %%v23,104(%%r1,%[x]),0\n\t" + "vleg %%v22,112(%%r1,%[x]),1\n\t" + "vleg %%v23,120(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vfchdb %%v26,%%v25,%%v24\n\t" + "vsel %%v26,%%v24,%%v25,%%v26\n\t" + "vfchdb %%v27,%%v0,%%v26\n\t" + "vsel %%v0,%%v26,%%v0,%%v27\n\t" + "vleg %%v16,128(%%r1,%[x]),0\n\t" + "vleg %%v17,136(%%r1,%[x]),0\n\t" + "vleg %%v16,144(%%r1,%[x]),1\n\t" + "vleg %%v17,152(%%r1,%[x]),1\n\t" + "vleg %%v18,160(%%r1,%[x]),0\n\t" + "vleg %%v19,168(%%r1,%[x]),0\n\t" + "vleg %%v18,176(%%r1,%[x]),1\n\t" + "vleg %%v19,184(%%r1,%[x]),1\n\t" + "vleg %%v20,192(%%r1,%[x]),0\n\t" + "vleg 
%%v21,200(%%r1,%[x]),0\n\t" + "vleg %%v20,208(%%r1,%[x]),1\n\t" + "vleg %%v21,216(%%r1,%[x]),1\n\t" + "vleg %%v22,224(%%r1,%[x]),0\n\t" + "vleg %%v23,232(%%r1,%[x]),0\n\t" + "vleg %%v22,240(%%r1,%[x]),1\n\t" + "vleg %%v23,248(%%r1,%[x]),1\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vfadb %%v17,%%v18,%%v19\n\t" + "vfadb %%v18,%%v20,%%v21\n\t" + "vfadb %%v19,%%v22,%%v23\n\t" + "vfchdb %%v24,%%v17,%%v16\n\t" + "vfchdb %%v25,%%v19,%%v18\n\t" + "vsel %%v24,%%v16,%%v17,%%v24\n\t" + "vsel %%v25,%%v18,%%v19,%%v25\n\t" + "vfchdb %%v26,%%v25,%%v24\n\t" + "vsel %%v26,%%v24,%%v25,%%v26\n\t" + "vfchdb %%v27,%%v0,%%v26\n\t" + "vsel %%v0,%%v26,%%v0,%%v27\n\t" + "agfi %%r1, 256\n\t" + "brctg %[n], 0b\n\t" + "vrepg %%v16,%%v0,1\n\t" + "wfchdb %%v17,%%v16,%%v0\n\t" + "vsel %%v0,%%v0,%%v16,%%v17\n\t" + "ldr %[amin],%%f0" + : [amin] "=f"(amin),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23", "v24", "v25", "v26", "v27"); return amin; } diff --git a/kernel/zarch/zasum.c b/kernel/zarch/zasum.c index 43ae8ff8b..aeef8d77e 100644 --- a/kernel/zarch/zasum.c +++ b/kernel/zarch/zasum.c @@ -34,81 +34,81 @@ static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { FLOAT asum; __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v24,%%v24,%%v16\n\t" - "vfadb %%v25,%%v25,%%v17\n\t" - "vfadb %%v26,%%v26,%%v18\n\t" - "vfadb %%v27,%%v27,%%v19\n\t" - "vfadb %%v28,%%v28,%%v20\n\t" - "vfadb %%v29,%%v29,%%v21\n\t" - "vfadb %%v30,%%v30,%%v22\n\t" - "vfadb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x])\n\t" - "vl %%v17, 144(%%r1,%[x])\n\t" - "vl %%v18, 160(%%r1,%[x])\n\t" - "vl %%v19, 176(%%r1,%[x])\n\t" - "vl %%v20, 192(%%r1,%[x])\n\t" - "vl %%v21, 208(%%r1,%[x])\n\t" - "vl %%v22, 224(%%r1,%[x])\n\t" - "vl %%v23, 240(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v24,%%v24,%%v16\n\t" - "vfadb %%v25,%%v25,%%v17\n\t" - "vfadb %%v26,%%v26,%%v18\n\t" - "vfadb %%v27,%%v27,%%v19\n\t" - "vfadb %%v28,%%v28,%%v20\n\t" - "vfadb %%v29,%%v29,%%v21\n\t" - "vfadb %%v30,%%v30,%%v22\n\t" - "vfadb %%v31,%%v31,%%v23\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vfadb %%v24,%%v24,%%v26\n\t" - "vfadb %%v24,%%v24,%%v27\n\t" - "vfadb %%v24,%%v24,%%v28\n\t" - "vfadb %%v24,%%v24,%%v29\n\t" - "vfadb %%v24,%%v24,%%v30\n\t" - "vfadb %%v24,%%v24,%%v31\n\t" - "vrepg %%v25,%%v24,1\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vsteg 
%%v24,%[asum],0" - : [asum] "=m"(asum),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x) - : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "vl %%v16, 128(%%r1,%[x])\n\t" + "vl %%v17, 144(%%r1,%[x])\n\t" + "vl %%v18, 160(%%r1,%[x])\n\t" + "vl %%v19, 176(%%r1,%[x])\n\t" + "vl %%v20, 192(%%r1,%[x])\n\t" + "vl %%v21, 208(%%r1,%[x])\n\t" + "vl %%v22, 224(%%r1,%[x])\n\t" + "vl %%v23, 240(%%r1,%[x])\n\t" + "vflpdb %%v16, %%v16\n\t" + "vflpdb %%v17, %%v17\n\t" + "vflpdb %%v18, %%v18\n\t" + "vflpdb %%v19, %%v19\n\t" + "vflpdb %%v20, %%v20\n\t" + "vflpdb %%v21, %%v21\n\t" + "vflpdb %%v22, %%v22\n\t" + "vflpdb %%v23, %%v23\n\t" + "vfadb %%v24,%%v24,%%v16\n\t" + "vfadb %%v25,%%v25,%%v17\n\t" + "vfadb %%v26,%%v26,%%v18\n\t" + "vfadb %%v27,%%v27,%%v19\n\t" + "vfadb %%v28,%%v28,%%v20\n\t" + "vfadb %%v29,%%v29,%%v21\n\t" + "vfadb %%v30,%%v30,%%v22\n\t" + "vfadb %%v31,%%v31,%%v23\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v27\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v29\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v24,%%v24,%%v31\n\t" + "vrepg %%v25,%%v24,1\n\t" + "vfadb %%v24,%%v24,%%v25\n\t" + "vsteg %%v24,%[asum],0" + : [asum] "=Q"(asum),[n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) + : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); return asum; } diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index 31549849d..9363ec32d 100644 --- a/kernel/zarch/zaxpy.c +++ b/kernel/zarch/zaxpy.c @@ -30,77 +30,77 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { __asm__( #if !defined(CONJ) - "vlrepg %%v0,0(%[alpha])\n\t" - "vleg %%v1,8(%[alpha]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,8(%[alpha]),1\n\t" + "vlrepg %%v0,0(%[alpha])\n\t" + "vleg %%v1,8(%[alpha]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%[alpha]),1\n\t" #else - "vleg %%v0,0(%[alpha]),1\n\t" - "vflcdb %%v0,%%v0\n\t" - "vleg %%v0,0(%[alpha]),0\n\t" - "vlrepg %%v1,8(%[alpha])\n\t" + "vleg %%v0,0(%[alpha]),1\n\t" + "vflcdb %%v0,%%v0\n\t" + "vleg %%v0,0(%[alpha]),0\n\t" + "vlrepg %%v1,8(%[alpha])\n\t" #endif - "srlg %[n],%[n],3\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v8,0(%%r1,%[x])\n\t" - "vl %%v9,16(%%r1,%[x])\n\t" - "vl %%v10,32(%%r1,%[x])\n\t" - "vl %%v11,48(%%r1,%[x])\n\t" - "vl %%v12,0(%%r1,%[y])\n\t" - "vl %%v13,16(%%r1,%[y])\n\t" - "vl %%v14,32(%%r1,%[y])\n\t" - "vl %%v15,48(%%r1,%[y])\n\t" - "vl %%v16,64(%%r1,%[x])\n\t" - "vl %%v17,80(%%r1,%[x])\n\t" - "vl %%v18,96(%%r1,%[x])\n\t" - "vl %%v19,112(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[y])\n\t" - "vl %%v21,80(%%r1,%[y])\n\t" - "vl %%v22,96(%%r1,%[y])\n\t" - "vl %%v23,112(%%r1,%[y])\n\t" - "vpdi %%v24,%%v8,%%v8,4\n\t" - "vpdi %%v25,%%v9,%%v9,4\n\t" - "vpdi %%v26,%%v10,%%v10,4\n\t" - "vpdi %%v27,%%v11,%%v11,4\n\t" - "vpdi %%v28,%%v16,%%v16,4\n\t" - "vpdi %%v29,%%v17,%%v17,4\n\t" - "vpdi %%v30,%%v18,%%v18,4\n\t" - "vpdi %%v31,%%v19,%%v19,4\n\t" - "vfmadb %%v8,%%v8,%%v0,%%v12\n\t" - "vfmadb %%v9,%%v9,%%v0,%%v13\n\t" - "vfmadb %%v10,%%v10,%%v0,%%v14\n\t" - "vfmadb %%v11,%%v11,%%v0,%%v15\n\t" - "vfmadb %%v16,%%v16,%%v0,%%v20\n\t" - "vfmadb %%v17,%%v17,%%v0,%%v21\n\t" - "vfmadb %%v18,%%v18,%%v0,%%v22\n\t" - "vfmadb %%v19,%%v19,%%v0,%%v23\n\t" - "vfmadb %%v8,%%v24,%%v1,%%v8\n\t" - "vfmadb %%v9,%%v25,%%v1,%%v9\n\t" - "vfmadb %%v10,%%v26,%%v1,%%v10\n\t" - "vfmadb %%v11,%%v27,%%v1,%%v11\n\t" - "vfmadb %%v16,%%v28,%%v1,%%v16\n\t" - "vfmadb %%v17,%%v29,%%v1,%%v17\n\t" - "vfmadb %%v18,%%v30,%%v1,%%v18\n\t" - "vfmadb %%v19,%%v31,%%v1,%%v19\n\t" - "vst %%v8,0(%%r1,%[y])\n\t" - "vst %%v9,16(%%r1,%[y])\n\t" - "vst %%v10,32(%%r1,%[y])\n\t" - "vst %%v11,48(%%r1,%[y])\n\t" - "vst %%v16,64(%%r1,%[y])\n\t" - "vst %%v17,80(%%r1,%[y])\n\t" - "vst %%v18,96(%%r1,%[y])\n\t" - "vst %%v19,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", - "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v8,0(%%r1,%[x])\n\t" + "vl %%v9,16(%%r1,%[x])\n\t" + "vl %%v10,32(%%r1,%[x])\n\t" + "vl %%v11,48(%%r1,%[x])\n\t" + "vl %%v12,0(%%r1,%[y])\n\t" + "vl %%v13,16(%%r1,%[y])\n\t" + "vl %%v14,32(%%r1,%[y])\n\t" + "vl %%v15,48(%%r1,%[y])\n\t" + "vl %%v16,64(%%r1,%[x])\n\t" + "vl %%v17,80(%%r1,%[x])\n\t" + "vl %%v18,96(%%r1,%[x])\n\t" + "vl %%v19,112(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[y])\n\t" + "vl %%v21,80(%%r1,%[y])\n\t" + "vl %%v22,96(%%r1,%[y])\n\t" + "vl %%v23,112(%%r1,%[y])\n\t" + "vpdi %%v24,%%v8,%%v8,4\n\t" + "vpdi %%v25,%%v9,%%v9,4\n\t" + "vpdi %%v26,%%v10,%%v10,4\n\t" + "vpdi %%v27,%%v11,%%v11,4\n\t" + "vpdi %%v28,%%v16,%%v16,4\n\t" + "vpdi %%v29,%%v17,%%v17,4\n\t" + "vpdi 
%%v30,%%v18,%%v18,4\n\t" + "vpdi %%v31,%%v19,%%v19,4\n\t" + "vfmadb %%v8,%%v8,%%v0,%%v12\n\t" + "vfmadb %%v9,%%v9,%%v0,%%v13\n\t" + "vfmadb %%v10,%%v10,%%v0,%%v14\n\t" + "vfmadb %%v11,%%v11,%%v0,%%v15\n\t" + "vfmadb %%v16,%%v16,%%v0,%%v20\n\t" + "vfmadb %%v17,%%v17,%%v0,%%v21\n\t" + "vfmadb %%v18,%%v18,%%v0,%%v22\n\t" + "vfmadb %%v19,%%v19,%%v0,%%v23\n\t" + "vfmadb %%v8,%%v24,%%v1,%%v8\n\t" + "vfmadb %%v9,%%v25,%%v1,%%v9\n\t" + "vfmadb %%v10,%%v26,%%v1,%%v10\n\t" + "vfmadb %%v11,%%v27,%%v1,%%v11\n\t" + "vfmadb %%v16,%%v28,%%v1,%%v16\n\t" + "vfmadb %%v17,%%v29,%%v1,%%v17\n\t" + "vfmadb %%v18,%%v30,%%v1,%%v18\n\t" + "vfmadb %%v19,%%v31,%%v1,%%v19\n\t" + "vst %%v8,0(%%r1,%[y])\n\t" + "vst %%v9,16(%%r1,%[y])\n\t" + "vst %%v10,32(%%r1,%[y])\n\t" + "vst %%v11,48(%%r1,%[y])\n\t" + "vst %%v16,64(%%r1,%[y])\n\t" + "vst %%v17,80(%%r1,%[y])\n\t" + "vst %%v18,96(%%r1,%[y])\n\t" + "vst %%v19,112(%%r1,%[y])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", + "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", + "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, diff --git a/kernel/zarch/zcopy.c b/kernel/zarch/zcopy.c index 50ff18646..5a46aec1c 100644 --- a/kernel/zarch/zcopy.c +++ b/kernel/zarch/zcopy.c @@ -29,16 +29,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__("srlg %[n],%[n],4\n\t" - "0:\n\t" - "pfd 1, 1024(%[x])\n\t" - "pfd 2, 1024(%[y])\n\t" - "mvc 0(256,%[y]),0(%[x])\n\t" - "la %[x],256(%[x])\n\t" - "la %[y],256(%[y])\n\t" - "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n * 2]) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) - : "m"(*(const FLOAT (*)[n * 2]) x) - : "cc"); + "0:\n\t" + "pfd 1, 1024(%[x])\n\t" + "pfd 2, 1024(%[y])\n\t" + "mvc 0(256,%[y]),0(%[x])\n\t" + "la %[x],256(%[x])\n\t" + "la %[y],256(%[y])\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n * 2]; } *) y),[x] "+&a"(x),[y] "+&a"(y), + [n] "+&r"(n) + : "m"(*(const struct { FLOAT x[n * 2]; } *) x) + : "cc"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { diff --git a/kernel/zarch/zdot.c b/kernel/zarch/zdot.c index 7a67ef734..ac6e69c23 100644 --- a/kernel/zarch/zdot.c +++ b/kernel/zarch/zdot.c @@ -29,76 +29,76 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
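[Editor's note on the kernel below: it never forms the complex product directly. It keeps separate accumulator families for x.*y and for the doubleword-swapped swap(x).*y, then stores four partial sums to d[0..3]; the caller assembles the conjugated or unconjugated result from them (for the plain case, re = d[0] - d[1], im = d[2] + d[3]). A scalar sketch of those partials, assuming that lane mapping; zdot_partials_ref is a hypothetical name.]

#include <stddef.h>

/* Scalar model of zdot_kernel_8's four running sums. */
static void zdot_partials_ref(size_t n, const double *x, const double *y,
                              double d[4]) {
  d[0] = d[1] = d[2] = d[3] = 0.0;
  for (size_t i = 0; i < n; i++) {
    double xre = x[2 * i], xim = x[2 * i + 1];
    double yre = y[2 * i], yim = y[2 * i + 1];
    d[0] += xre * yre; /* lane 0 of the v24 accumulator family */
    d[1] += xim * yim; /* lane 1 of the v24 accumulator family */
    d[2] += xre * yim; /* lane 1 of the v25 (swapped-x) family */
    d[3] += xim * yre; /* lane 0 of the v25 (swapped-x) family */
  }
}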
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],3\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "pfd 1, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "vpdi %%v20,%%v16,%%v16,4\n\t" - "vpdi %%v21,%%v17,%%v17,4\n\t" - "vpdi %%v22,%%v18,%%v18,4\n\t" - "vpdi %%v23,%%v19,%%v19,4\n\t" - "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vfmadb %%v25,%%v20,%%v0,%%v25\n\t" - "vfmadb %%v26,%%v17,%%v1,%%v26\n\t" - "vfmadb %%v27,%%v21,%%v1,%%v27\n\t" - "vfmadb %%v28,%%v18,%%v2,%%v28\n\t" - "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" - "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" - "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" - "vl %%v16, 64(%%r1,%[x])\n\t" - "vl %%v17, 80(%%r1,%[x])\n\t" - "vl %%v18, 96(%%r1,%[x])\n\t" - "vl %%v19, 112(%%r1,%[x])\n\t" - "vl %%v0, 64(%%r1,%[y])\n\t" - "vl %%v1, 80(%%r1,%[y])\n\t" - "vl %%v2, 96(%%r1,%[y])\n\t" - "vl %%v3, 112(%%r1,%[y])\n\t" - "vpdi %%v20,%%v16,%%v16,4\n\t" - "vpdi %%v21,%%v17,%%v17,4\n\t" - "vpdi %%v22,%%v18,%%v18,4\n\t" - "vpdi %%v23,%%v19,%%v19,4\n\t" - "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vfmadb %%v25,%%v20,%%v0,%%v25\n\t" - "vfmadb %%v26,%%v17,%%v1,%%v26\n\t" - "vfmadb %%v27,%%v21,%%v1,%%v27\n\t" - "vfmadb %%v28,%%v18,%%v2,%%v28\n\t" - "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" - "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" - "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v24,%%v24,%%v26\n\t" - "vfadb %%v24,%%v24,%%v28\n\t" - "vfadb %%v24,%%v24,%%v30\n\t" - "vfadb %%v25,%%v25,%%v27\n\t" - "vfadb %%v25,%%v25,%%v29\n\t" - "vfadb %%v25,%%v25,%%v31\n\t" - "vsteg %%v24,0(%[d]),0\n\t" - "vsteg %%v24,8(%[d]),1\n\t" - "vsteg %%v25,16(%[d]),1\n\t" - "vsteg %%v25,24(%[d]),0" - : "=m"(*(FLOAT (*)[4]) d),[n] "+&r"(n) - : [d] "a"(d), "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[n * 2]) y),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vzero %%v25\n\t" + "vzero %%v26\n\t" + "vzero %%v27\n\t" + "vzero %%v28\n\t" + "vzero %%v29\n\t" + "vzero %%v30\n\t" + "vzero %%v31\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 1, 1024(%%r1,%[x])\n\t" + "pfd 1, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vpdi %%v20,%%v16,%%v16,4\n\t" + "vpdi %%v21,%%v17,%%v17,4\n\t" + "vpdi %%v22,%%v18,%%v18,4\n\t" + "vpdi %%v23,%%v19,%%v19,4\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmadb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmadb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmadb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmadb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" + "vl %%v16, 64(%%r1,%[x])\n\t" + "vl %%v17, 80(%%r1,%[x])\n\t" + "vl %%v18, 96(%%r1,%[x])\n\t" + "vl %%v19, 112(%%r1,%[x])\n\t" + "vl %%v0, 64(%%r1,%[y])\n\t" + "vl %%v1, 80(%%r1,%[y])\n\t" + "vl %%v2, 
96(%%r1,%[y])\n\t" + "vl %%v3, 112(%%r1,%[y])\n\t" + "vpdi %%v20,%%v16,%%v16,4\n\t" + "vpdi %%v21,%%v17,%%v17,4\n\t" + "vpdi %%v22,%%v18,%%v18,4\n\t" + "vpdi %%v23,%%v19,%%v19,4\n\t" + "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" + "vfmadb %%v25,%%v20,%%v0,%%v25\n\t" + "vfmadb %%v26,%%v17,%%v1,%%v26\n\t" + "vfmadb %%v27,%%v21,%%v1,%%v27\n\t" + "vfmadb %%v28,%%v18,%%v2,%%v28\n\t" + "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" + "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" + "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v24,%%v24,%%v26\n\t" + "vfadb %%v24,%%v24,%%v28\n\t" + "vfadb %%v24,%%v24,%%v30\n\t" + "vfadb %%v25,%%v25,%%v27\n\t" + "vfadb %%v25,%%v25,%%v29\n\t" + "vfadb %%v25,%%v25,%%v31\n\t" + "vsteg %%v24,0(%[d]),0\n\t" + "vsteg %%v24,8(%[d]),1\n\t" + "vsteg %%v25,16(%[d]),1\n\t" + "vsteg %%v25,24(%[d]),0" + : "=m"(*(struct { FLOAT x[4]; } *) d),[n] "+&r"(n) + : [d] "a"(d), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[n * 2]; } *) y),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, diff --git a/kernel/zarch/zgemv_n_4.c b/kernel/zarch/zgemv_n_4.c index 7f21985ec..5ca8da3c1 100644 --- a/kernel/zarch/zgemv_n_4.c +++ b/kernel/zarch/zgemv_n_4.c @@ -30,235 +30,243 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define NBMAX 1024 static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + __asm__("vl %%v16,0(%[x])\n\t" - "vl %%v17,16(%[x])\n\t" - "vl %%v18,32(%[x])\n\t" - "vl %%v19,48(%[x])\n\t" + "vl %%v17,16(%[x])\n\t" + "vl %%v18,32(%[x])\n\t" + "vl %%v19,48(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v20,8(%[x]),0\n\t" - "wflcdb %%v20,%%v20\n\t" - "vleg %%v20,0(%[x]),1\n\t" - "vleg %%v21,24(%[x]),0\n\t" - "wflcdb %%v21,%%v21\n\t" - "vleg %%v21,16(%[x]),1\n\t" - "vleg %%v22,40(%[x]),0\n\t" - "wflcdb %%v22,%%v22\n\t" - "vleg %%v22,32(%[x]),1\n\t" - "vleg %%v23,56(%[x]),0\n\t" - "wflcdb %%v23,%%v23\n\t" - "vleg %%v23,48(%[x]),1\n\t" + "vleg %%v20,8(%[x]),0\n\t" + "wflcdb %%v20,%%v20\n\t" + "vleg %%v20,0(%[x]),1\n\t" + "vleg %%v21,24(%[x]),0\n\t" + "wflcdb %%v21,%%v21\n\t" + "vleg %%v21,16(%[x]),1\n\t" + "vleg %%v22,40(%[x]),0\n\t" + "wflcdb %%v22,%%v22\n\t" + "vleg %%v22,32(%[x]),1\n\t" + "vleg %%v23,56(%[x]),0\n\t" + "wflcdb %%v23,%%v23\n\t" + "vleg %%v23,48(%[x]),1\n\t" #else - "vleg %%v20,0(%[x]),1\n\t" - "vflcdb %%v20,%%v20\n\t" - "vleg %%v20,8(%[x]),0\n\t" - "vleg %%v21,16(%[x]),1\n\t" - "vflcdb %%v21,%%v21\n\t" - "vleg %%v21,24(%[x]),0\n\t" - "vleg %%v22,32(%[x]),1\n\t" - "vflcdb %%v22,%%v22\n\t" - "vleg %%v22,40(%[x]),0\n\t" - "vleg %%v23,48(%[x]),1\n\t" - "vflcdb %%v23,%%v23\n\t" - "vleg %%v23,56(%[x]),0\n\t" + "vleg %%v20,0(%[x]),1\n\t" + "vflcdb %%v20,%%v20\n\t" + "vleg %%v20,8(%[x]),0\n\t" + "vleg %%v21,16(%[x]),1\n\t" + "vflcdb %%v21,%%v21\n\t" + "vleg %%v21,24(%[x]),0\n\t" + "vleg %%v22,32(%[x]),1\n\t" + "vflcdb %%v22,%%v22\n\t" + "vleg %%v22,40(%[x]),0\n\t" + "vleg %%v23,48(%[x]),1\n\t" + "vflcdb %%v23,%%v23\n\t" + "vleg %%v23,56(%[x]),0\n\t" #endif - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 
1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vl %%v1,16(%%r1,%[y])\n\t" - "vlrepg %%v24,0(%%r1,%[ap0])\n\t" - "vlrepg %%v25,8(%%r1,%[ap0])\n\t" - "vlrepg %%v26,0(%%r1,%[ap1])\n\t" - "vlrepg %%v27,8(%%r1,%[ap1])\n\t" - "vlrepg %%v28,16(%%r1,%[ap0])\n\t" - "vlrepg %%v29,24(%%r1,%[ap0])\n\t" - "vlrepg %%v30,16(%%r1,%[ap1])\n\t" - "vlrepg %%v31,24(%%r1,%[ap1])\n\t" - "vfmadb %%v0,%%v24,%%v16,%%v0\n\t" - "vfmadb %%v1,%%v28,%%v16,%%v1\n\t" - "vfmadb %%v0,%%v25,%%v20,%%v0\n\t" - "vfmadb %%v1,%%v29,%%v20,%%v1\n\t" - "vfmadb %%v0,%%v26,%%v17,%%v0\n\t" - "vfmadb %%v1,%%v30,%%v17,%%v1\n\t" - "vfmadb %%v0,%%v27,%%v21,%%v0\n\t" - "vfmadb %%v1,%%v31,%%v21,%%v1\n\t" - "vlrepg %%v24,0(%%r1,%[ap2])\n\t" - "vlrepg %%v25,8(%%r1,%[ap2])\n\t" - "vlrepg %%v26,0(%%r1,%[ap3])\n\t" - "vlrepg %%v27,8(%%r1,%[ap3])\n\t" - "vlrepg %%v28,16(%%r1,%[ap2])\n\t" - "vlrepg %%v29,24(%%r1,%[ap2])\n\t" - "vlrepg %%v30,16(%%r1,%[ap3])\n\t" - "vlrepg %%v31,24(%%r1,%[ap3])\n\t" - "vfmadb %%v0,%%v24,%%v18,%%v0\n\t" - "vfmadb %%v1,%%v28,%%v18,%%v1\n\t" - "vfmadb %%v0,%%v25,%%v22,%%v0\n\t" - "vfmadb %%v1,%%v29,%%v22,%%v1\n\t" - "vfmadb %%v0,%%v26,%%v19,%%v0\n\t" - "vfmadb %%v1,%%v30,%%v19,%%v1\n\t" - "vfmadb %%v0,%%v27,%%v23,%%v0\n\t" - "vfmadb %%v1,%%v31,%%v23,%%v1\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "vst %%v1,16(%%r1,%[y])\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n * 2]) ap[2]),[ap2] "a"(ap[2]), - "m"(*(const FLOAT (*)[n * 2]) ap[3]),[ap3] "a"(ap[3]), - "m"(*(const FLOAT (*)[8]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" + "vlrepg %%v24,0(%%r1,%[ap0])\n\t" + "vlrepg %%v25,8(%%r1,%[ap0])\n\t" + "vlrepg %%v26,0(%%r1,%[ap1])\n\t" + "vlrepg %%v27,8(%%r1,%[ap1])\n\t" + "vlrepg %%v28,16(%%r1,%[ap0])\n\t" + "vlrepg %%v29,24(%%r1,%[ap0])\n\t" + "vlrepg %%v30,16(%%r1,%[ap1])\n\t" + "vlrepg %%v31,24(%%r1,%[ap1])\n\t" + "vfmadb %%v0,%%v24,%%v16,%%v0\n\t" + "vfmadb %%v1,%%v28,%%v16,%%v1\n\t" + "vfmadb %%v0,%%v25,%%v20,%%v0\n\t" + "vfmadb %%v1,%%v29,%%v20,%%v1\n\t" + "vfmadb %%v0,%%v26,%%v17,%%v0\n\t" + "vfmadb %%v1,%%v30,%%v17,%%v1\n\t" + "vfmadb %%v0,%%v27,%%v21,%%v0\n\t" + "vfmadb %%v1,%%v31,%%v21,%%v1\n\t" + "vlrepg %%v24,0(%%r1,%[ap2])\n\t" + "vlrepg %%v25,8(%%r1,%[ap2])\n\t" + "vlrepg %%v26,0(%%r1,%[ap3])\n\t" + "vlrepg %%v27,8(%%r1,%[ap3])\n\t" + "vlrepg %%v28,16(%%r1,%[ap2])\n\t" + "vlrepg %%v29,24(%%r1,%[ap2])\n\t" + "vlrepg %%v30,16(%%r1,%[ap3])\n\t" + "vlrepg %%v31,24(%%r1,%[ap3])\n\t" + "vfmadb %%v0,%%v24,%%v18,%%v0\n\t" + "vfmadb %%v1,%%v28,%%v18,%%v1\n\t" + "vfmadb %%v0,%%v25,%%v22,%%v0\n\t" + "vfmadb %%v1,%%v29,%%v22,%%v1\n\t" + "vfmadb %%v0,%%v26,%%v19,%%v0\n\t" + "vfmadb %%v1,%%v30,%%v19,%%v1\n\t" + "vfmadb %%v0,%%v27,%%v23,%%v0\n\t" + "vfmadb %%v1,%%v31,%%v23,%%v1\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 
2]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[8]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + __asm__("vl %%v16,0(%[x])\n\t" - "vl %%v17,16(%[x])\n\t" + "vl %%v17,16(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v18,8(%[x]),0\n\t" - "wflcdb %%v18,%%v18\n\t" - "vleg %%v18,0(%[x]),1\n\t" - "vleg %%v19,24(%[x]),0\n\t" - "wflcdb %%v19,%%v19\n\t" - "vleg %%v19,16(%[x]),1\n\t" + "vleg %%v18,8(%[x]),0\n\t" + "wflcdb %%v18,%%v18\n\t" + "vleg %%v18,0(%[x]),1\n\t" + "vleg %%v19,24(%[x]),0\n\t" + "wflcdb %%v19,%%v19\n\t" + "vleg %%v19,16(%[x]),1\n\t" #else - "vleg %%v18,0(%[x]),1\n\t" - "vflcdb %%v18,%%v18\n\t" - "vleg %%v18,8(%[x]),0\n\t" - "vleg %%v19,16(%[x]),1\n\t" - "vflcdb %%v19,%%v19\n\t" - "vleg %%v19,24(%[x]),0\n\t" + "vleg %%v18,0(%[x]),1\n\t" + "vflcdb %%v18,%%v18\n\t" + "vleg %%v18,8(%[x]),0\n\t" + "vleg %%v19,16(%[x]),1\n\t" + "vflcdb %%v19,%%v19\n\t" + "vleg %%v19,24(%[x]),0\n\t" #endif - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vl %%v1,16(%%r1,%[y])\n\t" - "vlrepg %%v20,0(%%r1,%[ap0])\n\t" - "vlrepg %%v21,8(%%r1,%[ap0])\n\t" - "vlrepg %%v22,0(%%r1,%[ap1])\n\t" - "vlrepg %%v23,8(%%r1,%[ap1])\n\t" - "vlrepg %%v24,16(%%r1,%[ap0])\n\t" - "vlrepg %%v25,24(%%r1,%[ap0])\n\t" - "vlrepg %%v26,16(%%r1,%[ap1])\n\t" - "vlrepg %%v27,24(%%r1,%[ap1])\n\t" - "vfmadb %%v0,%%v20,%%v16,%%v0\n\t" - "vfmadb %%v1,%%v24,%%v16,%%v1\n\t" - "vfmadb %%v0,%%v21,%%v18,%%v0\n\t" - "vfmadb %%v1,%%v25,%%v18,%%v1\n\t" - "vfmadb %%v0,%%v22,%%v17,%%v0\n\t" - "vfmadb %%v1,%%v26,%%v17,%%v1\n\t" - "vfmadb %%v0,%%v23,%%v19,%%v0\n\t" - "vfmadb %%v1,%%v27,%%v19,%%v1\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "vst %%v1,16(%%r1,%[y])\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[4]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27"); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" + "vlrepg %%v20,0(%%r1,%[ap0])\n\t" + "vlrepg %%v21,8(%%r1,%[ap0])\n\t" + "vlrepg %%v22,0(%%r1,%[ap1])\n\t" + "vlrepg %%v23,8(%%r1,%[ap1])\n\t" + "vlrepg %%v24,16(%%r1,%[ap0])\n\t" + "vlrepg %%v25,24(%%r1,%[ap0])\n\t" + "vlrepg %%v26,16(%%r1,%[ap1])\n\t" + "vlrepg %%v27,24(%%r1,%[ap1])\n\t" + "vfmadb %%v0,%%v20,%%v16,%%v0\n\t" + "vfmadb %%v1,%%v24,%%v16,%%v1\n\t" + "vfmadb %%v0,%%v21,%%v18,%%v0\n\t" + "vfmadb %%v1,%%v25,%%v18,%%v1\n\t" + "vfmadb %%v0,%%v22,%%v17,%%v0\n\t" + "vfmadb %%v1,%%v26,%%v17,%%v1\n\t" + "vfmadb %%v0,%%v23,%%v19,%%v0\n\t" + "vfmadb %%v1,%%v27,%%v19,%%v1\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg 
%[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27"); } static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { __asm__("vl %%v16,0(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v17,8(%[x]),0\n\t" - "wflcdb %%v17,%%v17\n\t" - "vleg %%v17,0(%[x]),1\n\t" + "vleg %%v17,8(%[x]),0\n\t" + "wflcdb %%v17,%%v17\n\t" + "vleg %%v17,0(%[x]),1\n\t" #else - "vleg %%v17,0(%[x]),1\n\t" - "vflcdb %%v17,%%v17\n\t" - "vleg %%v17,8(%[x]),0\n\t" + "vleg %%v17,0(%[x]),1\n\t" + "vflcdb %%v17,%%v17\n\t" + "vleg %%v17,8(%[x]),0\n\t" #endif - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vl %%v1,16(%%r1,%[y])\n\t" - "vlrepg %%v18,0(%%r1,%[ap])\n\t" - "vlrepg %%v19,8(%%r1,%[ap])\n\t" - "vlrepg %%v20,16(%%r1,%[ap])\n\t" - "vlrepg %%v21,24(%%r1,%[ap])\n\t" - "vfmadb %%v0,%%v18,%%v16,%%v0\n\t" - "vfmadb %%v1,%%v20,%%v16,%%v1\n\t" - "vfmadb %%v0,%%v19,%%v17,%%v0\n\t" - "vfmadb %%v1,%%v21,%%v17,%%v1\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "vst %%v1,16(%%r1,%[y])\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), - "m"(*(const FLOAT (*)[2]) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21"); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap])\n\t" + "pfd 2,1024(%%r1,%[y])\n\t" + "vl %%v0,0(%%r1,%[y])\n\t" + "vl %%v1,16(%%r1,%[y])\n\t" + "vlrepg %%v18,0(%%r1,%[ap])\n\t" + "vlrepg %%v19,8(%%r1,%[ap])\n\t" + "vlrepg %%v20,16(%%r1,%[ap])\n\t" + "vlrepg %%v21,24(%%r1,%[ap])\n\t" + "vfmadb %%v0,%%v18,%%v16,%%v0\n\t" + "vfmadb %%v1,%%v20,%%v16,%%v1\n\t" + "vfmadb %%v0,%%v19,%%v17,%%v0\n\t" + "vfmadb %%v1,%%v21,%%v17,%%v1\n\t" + "vst %%v0,0(%%r1,%[y])\n\t" + "vst %%v1,16(%%r1,%[y])\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap), + "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21"); } static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, FLOAT alpha_i) { __asm__( #if !defined(XCONJ) - "vlrepg %%v0,%[alpha_r]\n\t" - "vleg %%v1,%[alpha_i],0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,%[alpha_i],1\n\t" + "vlrepg %%v0,%[alpha_r]\n\t" + "vleg %%v1,%[alpha_i],0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,%[alpha_i],1\n\t" #else - "vleg %%v0,%[alpha_r],1\n\t" - "vflcdb %%v0,%%v0\n\t" - "vleg %%v0,%[alpha_r],0\n\t" - "vlrepg %%v1,%[alpha_i]\n\t" + "vleg %%v0,%[alpha_r],1\n\t" + "vflcdb %%v0,%%v0\n\t" + "vleg %%v0,%[alpha_r],0\n\t" + "vlrepg %%v1,%[alpha_i]\n\t" #endif - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],2\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[src])\n\t" - "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v18,32(%%r1,%[src])\n\t" - "vl %%v19,48(%%r1,%[src])\n\t" - "vl %%v20,0(%%r1,%[dest])\n\t" - "vl %%v21,16(%%r1,%[dest])\n\t" - "vl %%v22,32(%%r1,%[dest])\n\t" - "vl %%v23,48(%%r1,%[dest])\n\t" - "vpdi 
%%v24,%%v16,%%v16,4\n\t" - "vpdi %%v25,%%v17,%%v17,4\n\t" - "vpdi %%v26,%%v18,%%v18,4\n\t" - "vpdi %%v27,%%v19,%%v19,4\n\t" - "vfmadb %%v28,%%v16,%%v0,%%v20\n\t" - "vfmadb %%v29,%%v17,%%v0,%%v21\n\t" - "vfmadb %%v30,%%v18,%%v0,%%v22\n\t" - "vfmadb %%v31,%%v19,%%v0,%%v23\n\t" - "vfmadb %%v28,%%v24,%%v1,%%v28\n\t" - "vfmadb %%v29,%%v25,%%v1,%%v29\n\t" - "vfmadb %%v30,%%v26,%%v1,%%v30\n\t" - "vfmadb %%v31,%%v27,%%v1,%%v31\n\t" - "vst %%v28,0(%%r1,%[dest])\n\t" - "vst %%v29,16(%%r1,%[dest])\n\t" - "vst %%v30,32(%%r1,%[dest])\n\t" - "vst %%v31,48(%%r1,%[dest])\n\t" - "agfi %%r1,64\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) dest),[n] "+&r"(n) - : [dest] "a"(dest), "m"(*(const FLOAT (*)[n * 2]) src),[src] "a"(src), - [alpha_r] "m"(alpha_r),[alpha_i] "m"(alpha_i) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],2\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[src])\n\t" + "pfd 2,1024(%%r1,%[dest])\n\t" + "vl %%v16,0(%%r1,%[src])\n\t" + "vl %%v17,16(%%r1,%[src])\n\t" + "vl %%v18,32(%%r1,%[src])\n\t" + "vl %%v19,48(%%r1,%[src])\n\t" + "vl %%v20,0(%%r1,%[dest])\n\t" + "vl %%v21,16(%%r1,%[dest])\n\t" + "vl %%v22,32(%%r1,%[dest])\n\t" + "vl %%v23,48(%%r1,%[dest])\n\t" + "vpdi %%v24,%%v16,%%v16,4\n\t" + "vpdi %%v25,%%v17,%%v17,4\n\t" + "vpdi %%v26,%%v18,%%v18,4\n\t" + "vpdi %%v27,%%v19,%%v19,4\n\t" + "vfmadb %%v28,%%v16,%%v0,%%v20\n\t" + "vfmadb %%v29,%%v17,%%v0,%%v21\n\t" + "vfmadb %%v30,%%v18,%%v0,%%v22\n\t" + "vfmadb %%v31,%%v19,%%v0,%%v23\n\t" + "vfmadb %%v28,%%v24,%%v1,%%v28\n\t" + "vfmadb %%v29,%%v25,%%v1,%%v29\n\t" + "vfmadb %%v30,%%v26,%%v1,%%v30\n\t" + "vfmadb %%v31,%%v27,%%v1,%%v31\n\t" + "vst %%v28,0(%%r1,%[dest])\n\t" + "vst %%v29,16(%%r1,%[dest])\n\t" + "vst %%v30,32(%%r1,%[dest])\n\t" + "vst %%v31,48(%%r1,%[dest])\n\t" + "agfi %%r1,64\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n) + : [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src), + [src] "a"(src),[alpha_r] "m"(alpha_r),[alpha_i] "m"(alpha_i) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, diff --git a/kernel/zarch/zgemv_t_4.c b/kernel/zarch/zgemv_t_4.c index 7b3e6c1fc..031c31e29 100644 --- a/kernel/zarch/zgemv_t_4.c +++ b/kernel/zarch/zgemv_t_4.c @@ -31,266 +31,274 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
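[Editor's note on the transposed-GEMV kernels below: each column of A contributes one complex dot product with x, accumulated in vector lanes and only scaled by alpha and folded into y after the column loop. A scalar sketch of the plain variant only (no CONJ, no XCONJ), with a hypothetical zgemv_t_ref generalized over the column count; the vector code specializes this for 4, 2, and 1 columns.]

#include <stddef.h>

/* Scalar model of the zgemv_t micro-kernels, plain variant:
 * y[j] += alpha * (column j of A dotted with x). */
static void zgemv_t_ref(size_t n, size_t cols, const double *const *ap,
                        const double *x, double *y, double ar, double ai) {
  for (size_t j = 0; j < cols; j++) {
    double sre = 0.0, sim = 0.0;
    for (size_t i = 0; i < n; i++) {
      double are = ap[j][2 * i], aim = ap[j][2 * i + 1];
      double xre = x[2 * i], xim = x[2 * i + 1];
      sre += are * xre - aim * xim; /* complex multiply, real part */
      sim += are * xim + aim * xre; /* complex multiply, imag part */
    }
    y[2 * j]     += ar * sre - ai * sim; /* scale sum by alpha */
    y[2 * j + 1] += ar * sim + ai * sre;
  }
}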
static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + register FLOAT *ap2 = ap[2]; + register FLOAT *ap3 = ap[3]; + __asm__("vzero %%v16\n\t" - "vzero %%v17\n\t" - "vzero %%v18\n\t" - "vzero %%v19\n\t" - "vzero %%v20\n\t" - "vzero %%v21\n\t" - "vzero %%v22\n\t" - "vzero %%v23\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x])\n\t" + "vzero %%v17\n\t" + "vzero %%v18\n\t" + "vzero %%v19\n\t" + "vzero %%v20\n\t" + "vzero %%v21\n\t" + "vzero %%v22\n\t" + "vzero %%v23\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[ap2])\n\t" + "pfd 1,1024(%%r1,%[ap3])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v0,0(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v1,8(%%r1,%[x]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,0(%%r1,%[x]),1\n\t" + "vleg %%v1,8(%%r1,%[x]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,0(%%r1,%[x]),1\n\t" #else - "vleg %%v1,0(%%r1,%[x]),1\n\t" - "vflcdb %%v1,%%v1\n\t" - "vleg %%v1,8(%%r1,%[x]),0\n\t" + "vleg %%v1,0(%%r1,%[x]),1\n\t" + "vflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%%r1,%[x]),0\n\t" #endif - "vlrepg %%v24,0(%%r1,%[ap0])\n\t" - "vlrepg %%v25,8(%%r1,%[ap0])\n\t" - "vlrepg %%v26,0(%%r1,%[ap1])\n\t" - "vlrepg %%v27,8(%%r1,%[ap1])\n\t" - "vlrepg %%v28,0(%%r1,%[ap2])\n\t" - "vlrepg %%v29,8(%%r1,%[ap2])\n\t" - "vlrepg %%v30,0(%%r1,%[ap3])\n\t" - "vlrepg %%v31,8(%%r1,%[ap3])\n\t" - "vfmadb %%v16,%%v24,%%v0,%%v16\n\t" - "vfmadb %%v20,%%v25,%%v1,%%v20\n\t" - "vfmadb %%v17,%%v26,%%v0,%%v17\n\t" - "vfmadb %%v21,%%v27,%%v1,%%v21\n\t" - "vfmadb %%v18,%%v28,%%v0,%%v18\n\t" - "vfmadb %%v22,%%v29,%%v1,%%v22\n\t" - "vfmadb %%v19,%%v30,%%v0,%%v19\n\t" - "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" - "vl %%v0,16(%%r1,%[x])\n\t" + "vlrepg %%v24,0(%%r1,%[ap0])\n\t" + "vlrepg %%v25,8(%%r1,%[ap0])\n\t" + "vlrepg %%v26,0(%%r1,%[ap1])\n\t" + "vlrepg %%v27,8(%%r1,%[ap1])\n\t" + "vlrepg %%v28,0(%%r1,%[ap2])\n\t" + "vlrepg %%v29,8(%%r1,%[ap2])\n\t" + "vlrepg %%v30,0(%%r1,%[ap3])\n\t" + "vlrepg %%v31,8(%%r1,%[ap3])\n\t" + "vfmadb %%v16,%%v24,%%v0,%%v16\n\t" + "vfmadb %%v20,%%v25,%%v1,%%v20\n\t" + "vfmadb %%v17,%%v26,%%v0,%%v17\n\t" + "vfmadb %%v21,%%v27,%%v1,%%v21\n\t" + "vfmadb %%v18,%%v28,%%v0,%%v18\n\t" + "vfmadb %%v22,%%v29,%%v1,%%v22\n\t" + "vfmadb %%v19,%%v30,%%v0,%%v19\n\t" + "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" + "vl %%v0,16(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v1,24(%%r1,%[x]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,16(%%r1,%[x]),1\n\t" + "vleg %%v1,24(%%r1,%[x]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,16(%%r1,%[x]),1\n\t" #else - "vleg %%v1,16(%%r1,%[x]),1\n\t" - "vflcdb %%v1,%%v1\n\t" - "vleg %%v1,24(%%r1,%[x]),0\n\t" + "vleg %%v1,16(%%r1,%[x]),1\n\t" + "vflcdb %%v1,%%v1\n\t" + "vleg %%v1,24(%%r1,%[x]),0\n\t" #endif - "vlrepg %%v24,16(%%r1,%[ap0])\n\t" - "vlrepg %%v25,24(%%r1,%[ap0])\n\t" - "vlrepg %%v26,16(%%r1,%[ap1])\n\t" - "vlrepg %%v27,24(%%r1,%[ap1])\n\t" - "vlrepg %%v28,16(%%r1,%[ap2])\n\t" - "vlrepg %%v29,24(%%r1,%[ap2])\n\t" - "vlrepg %%v30,16(%%r1,%[ap3])\n\t" - "vlrepg %%v31,24(%%r1,%[ap3])\n\t" - "vfmadb %%v16,%%v24,%%v0,%%v16\n\t" - "vfmadb %%v20,%%v25,%%v1,%%v20\n\t" - 
"vfmadb %%v17,%%v26,%%v0,%%v17\n\t" - "vfmadb %%v21,%%v27,%%v1,%%v21\n\t" - "vfmadb %%v18,%%v28,%%v0,%%v18\n\t" - "vfmadb %%v22,%%v29,%%v1,%%v22\n\t" - "vfmadb %%v19,%%v30,%%v0,%%v19\n\t" - "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v16,%%v16,%%v20\n\t" - "vfadb %%v17,%%v17,%%v21\n\t" - "vfadb %%v18,%%v18,%%v22\n\t" - "vfadb %%v19,%%v19,%%v23\n\t" - "vpdi %%v20,%%v16,%%v16,4\n\t" - "vpdi %%v21,%%v17,%%v17,4\n\t" - "vpdi %%v22,%%v18,%%v18,4\n\t" - "vpdi %%v23,%%v19,%%v19,4\n\t" + "vlrepg %%v24,16(%%r1,%[ap0])\n\t" + "vlrepg %%v25,24(%%r1,%[ap0])\n\t" + "vlrepg %%v26,16(%%r1,%[ap1])\n\t" + "vlrepg %%v27,24(%%r1,%[ap1])\n\t" + "vlrepg %%v28,16(%%r1,%[ap2])\n\t" + "vlrepg %%v29,24(%%r1,%[ap2])\n\t" + "vlrepg %%v30,16(%%r1,%[ap3])\n\t" + "vlrepg %%v31,24(%%r1,%[ap3])\n\t" + "vfmadb %%v16,%%v24,%%v0,%%v16\n\t" + "vfmadb %%v20,%%v25,%%v1,%%v20\n\t" + "vfmadb %%v17,%%v26,%%v0,%%v17\n\t" + "vfmadb %%v21,%%v27,%%v1,%%v21\n\t" + "vfmadb %%v18,%%v28,%%v0,%%v18\n\t" + "vfmadb %%v22,%%v29,%%v1,%%v22\n\t" + "vfmadb %%v19,%%v30,%%v0,%%v19\n\t" + "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v16,%%v16,%%v20\n\t" + "vfadb %%v17,%%v17,%%v21\n\t" + "vfadb %%v18,%%v18,%%v22\n\t" + "vfadb %%v19,%%v19,%%v23\n\t" + "vpdi %%v20,%%v16,%%v16,4\n\t" + "vpdi %%v21,%%v17,%%v17,4\n\t" + "vpdi %%v22,%%v18,%%v18,4\n\t" + "vpdi %%v23,%%v19,%%v19,4\n\t" #if !defined(XCONJ) - "vlrepg %%v24,0(%[alpha])\n\t" - "vleg %%v25,8(%[alpha]),0\n\t" - "wflcdb %%v25,%%v25\n\t" - "vleg %%v25,8(%[alpha]),1\n\t" + "vlrepg %%v24,0(%[alpha])\n\t" + "vleg %%v25,8(%[alpha]),0\n\t" + "wflcdb %%v25,%%v25\n\t" + "vleg %%v25,8(%[alpha]),1\n\t" #else - "vleg %%v24,0(%[alpha]),1\n\t" - "vflcdb %%v24,%%v24\n\t" - "vleg %%v24,0(%[alpha]),0\n\t" - "vlrepg %%v25,8(%[alpha])\n\t" + "vleg %%v24,0(%[alpha]),1\n\t" + "vflcdb %%v24,%%v24\n\t" + "vleg %%v24,0(%[alpha]),0\n\t" + "vlrepg %%v25,8(%[alpha])\n\t" #endif - "vl %%v26,0(%[y])\n\t" - "vl %%v27,16(%[y])\n\t" - "vl %%v28,32(%[y])\n\t" - "vl %%v29,48(%[y])\n\t" - "vfmadb %%v26,%%v16,%%v24,%%v26\n\t" - "vfmadb %%v26,%%v20,%%v25,%%v26\n\t" - "vfmadb %%v27,%%v17,%%v24,%%v27\n\t" - "vfmadb %%v27,%%v21,%%v25,%%v27\n\t" - "vfmadb %%v28,%%v18,%%v24,%%v28\n\t" - "vfmadb %%v28,%%v22,%%v25,%%v28\n\t" - "vfmadb %%v29,%%v19,%%v24,%%v29\n\t" - "vfmadb %%v29,%%v23,%%v25,%%v29\n\t" - "vst %%v26,0(%[y])\n\t" - "vst %%v27,16(%[y])\n\t" - "vst %%v28,32(%[y])\n\t" - "vst %%v29,48(%[y])" - : "+m"(*(FLOAT (*)[8]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n * 2]) ap[2]),[ap2] "a"(ap[2]), - "m"(*(const FLOAT (*)[n * 2]) ap[3]),[ap3] "a"(ap[3]), - "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vl %%v26,0(%[y])\n\t" + "vl %%v27,16(%[y])\n\t" + "vl %%v28,32(%[y])\n\t" + "vl %%v29,48(%[y])\n\t" + "vfmadb %%v26,%%v16,%%v24,%%v26\n\t" + "vfmadb %%v26,%%v20,%%v25,%%v26\n\t" + "vfmadb %%v27,%%v17,%%v24,%%v27\n\t" + "vfmadb %%v27,%%v21,%%v25,%%v27\n\t" + "vfmadb %%v28,%%v18,%%v24,%%v28\n\t" + "vfmadb %%v28,%%v22,%%v25,%%v28\n\t" + "vfmadb %%v29,%%v19,%%v24,%%v29\n\t" + "vfmadb %%v29,%%v23,%%v25,%%v29\n\t" + "vst %%v26,0(%[y])\n\t" + "vst %%v27,16(%[y])\n\t" + "vst %%v28,32(%[y])\n\t" + "vst %%v29,48(%[y])" + : "+m"(*(struct { 
FLOAT x[8]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3), + "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { + register FLOAT *ap0 = ap[0]; + register FLOAT *ap1 = ap[1]; + __asm__("vzero %%v16\n\t" - "vzero %%v17\n\t" - "vzero %%v18\n\t" - "vzero %%v19\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x])\n\t" + "vzero %%v17\n\t" + "vzero %%v18\n\t" + "vzero %%v19\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap0])\n\t" + "pfd 1,1024(%%r1,%[ap1])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v0,0(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v1,8(%%r1,%[x]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,0(%%r1,%[x]),1\n\t" + "vleg %%v1,8(%%r1,%[x]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,0(%%r1,%[x]),1\n\t" #else - "vleg %%v1,0(%%r1,%[x]),1\n\t" - "vflcdb %%v1,%%v1\n\t" - "vleg %%v1,8(%%r1,%[x]),0\n\t" + "vleg %%v1,0(%%r1,%[x]),1\n\t" + "vflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%%r1,%[x]),0\n\t" #endif - "vlrepg %%v20,0(%%r1,%[ap0])\n\t" - "vlrepg %%v21,8(%%r1,%[ap0])\n\t" - "vlrepg %%v22,0(%%r1,%[ap1])\n\t" - "vlrepg %%v23,8(%%r1,%[ap1])\n\t" - "vfmadb %%v16,%%v20,%%v0,%%v16\n\t" - "vfmadb %%v18,%%v21,%%v1,%%v18\n\t" - "vfmadb %%v17,%%v22,%%v0,%%v17\n\t" - "vfmadb %%v19,%%v23,%%v1,%%v19\n\t" - "vl %%v0,16(%%r1,%[x])\n\t" + "vlrepg %%v20,0(%%r1,%[ap0])\n\t" + "vlrepg %%v21,8(%%r1,%[ap0])\n\t" + "vlrepg %%v22,0(%%r1,%[ap1])\n\t" + "vlrepg %%v23,8(%%r1,%[ap1])\n\t" + "vfmadb %%v16,%%v20,%%v0,%%v16\n\t" + "vfmadb %%v18,%%v21,%%v1,%%v18\n\t" + "vfmadb %%v17,%%v22,%%v0,%%v17\n\t" + "vfmadb %%v19,%%v23,%%v1,%%v19\n\t" + "vl %%v0,16(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v1,24(%%r1,%[x]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,16(%%r1,%[x]),1\n\t" + "vleg %%v1,24(%%r1,%[x]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,16(%%r1,%[x]),1\n\t" #else - "vleg %%v1,16(%%r1,%[x]),1\n\t" - "vflcdb %%v1,%%v1\n\t" - "vleg %%v1,24(%%r1,%[x]),0\n\t" + "vleg %%v1,16(%%r1,%[x]),1\n\t" + "vflcdb %%v1,%%v1\n\t" + "vleg %%v1,24(%%r1,%[x]),0\n\t" #endif - "vlrepg %%v20,16(%%r1,%[ap0])\n\t" - "vlrepg %%v21,24(%%r1,%[ap0])\n\t" - "vlrepg %%v22,16(%%r1,%[ap1])\n\t" - "vlrepg %%v23,24(%%r1,%[ap1])\n\t" - "vfmadb %%v16,%%v20,%%v0,%%v16\n\t" - "vfmadb %%v18,%%v21,%%v1,%%v18\n\t" - "vfmadb %%v17,%%v22,%%v0,%%v17\n\t" - "vfmadb %%v19,%%v23,%%v1,%%v19\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v16,%%v16,%%v18\n\t" - "vfadb %%v17,%%v17,%%v19\n\t" - "vpdi %%v18,%%v16,%%v16,4\n\t" - "vpdi %%v19,%%v17,%%v17,4\n\t" + "vlrepg %%v20,16(%%r1,%[ap0])\n\t" + "vlrepg %%v21,24(%%r1,%[ap0])\n\t" + "vlrepg %%v22,16(%%r1,%[ap1])\n\t" + "vlrepg %%v23,24(%%r1,%[ap1])\n\t" + "vfmadb %%v16,%%v20,%%v0,%%v16\n\t" + "vfmadb %%v18,%%v21,%%v1,%%v18\n\t" + "vfmadb %%v17,%%v22,%%v0,%%v17\n\t" + 
"vfmadb %%v19,%%v23,%%v1,%%v19\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v16,%%v16,%%v18\n\t" + "vfadb %%v17,%%v17,%%v19\n\t" + "vpdi %%v18,%%v16,%%v16,4\n\t" + "vpdi %%v19,%%v17,%%v17,4\n\t" #if !defined(XCONJ) - "vlrepg %%v20,0(%[alpha])\n\t" - "vleg %%v21,8(%[alpha]),0\n\t" - "wflcdb %%v21,%%v21\n\t" - "vleg %%v21,8(%[alpha]),1\n\t" + "vlrepg %%v20,0(%[alpha])\n\t" + "vleg %%v21,8(%[alpha]),0\n\t" + "wflcdb %%v21,%%v21\n\t" + "vleg %%v21,8(%[alpha]),1\n\t" #else - "vleg %%v20,0(%[alpha]),1\n\t" - "vflcdb %%v20,%%v20\n\t" - "vleg %%v20,0(%[alpha]),0\n\t" - "vlrepg %%v21,8(%[alpha])\n\t" + "vleg %%v20,0(%[alpha]),1\n\t" + "vflcdb %%v20,%%v20\n\t" + "vleg %%v20,0(%[alpha]),0\n\t" + "vlrepg %%v21,8(%[alpha])\n\t" #endif - "vl %%v22,0(%[y])\n\t" - "vl %%v23,16(%[y])\n\t" - "vfmadb %%v22,%%v16,%%v20,%%v22\n\t" - "vfmadb %%v22,%%v18,%%v21,%%v22\n\t" - "vfmadb %%v23,%%v17,%%v20,%%v23\n\t" - "vfmadb %%v23,%%v19,%%v21,%%v23\n\t" - "vst %%v22,0(%[y])\n\t" - "vst %%v23,16(%[y])\n\t" - : "+m"(*(FLOAT (*)[4]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap[0]),[ap0] "a"(ap[0]), - "m"(*(const FLOAT (*)[n * 2]) ap[1]),[ap1] "a"(ap[1]), - "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23"); + "vl %%v22,0(%[y])\n\t" + "vl %%v23,16(%[y])\n\t" + "vfmadb %%v22,%%v16,%%v20,%%v22\n\t" + "vfmadb %%v22,%%v18,%%v21,%%v22\n\t" + "vfmadb %%v23,%%v17,%%v20,%%v23\n\t" + "vfmadb %%v23,%%v19,%%v21,%%v23\n\t" + "vst %%v22,0(%[y])\n\t" + "vst %%v23,16(%[y])\n\t" + : "+m"(*(struct { FLOAT x[4]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), + "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), + "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23"); } static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) { __asm__("vzero %%v16\n\t" - "vzero %%v17\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x])\n\t" + "vzero %%v17\n\t" + "xgr %%r1,%%r1\n\t" + "srlg %[n],%[n],1\n\t" + "0:\n\t" + "pfd 1,1024(%%r1,%[ap])\n\t" + "pfd 1,1024(%%r1,%[x])\n\t" + "vl %%v0,0(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v1,8(%%r1,%[x]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,0(%%r1,%[x]),1\n\t" + "vleg %%v1,8(%%r1,%[x]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,0(%%r1,%[x]),1\n\t" #else - "vleg %%v1,0(%%r1,%[x]),1\n\t" - "vflcdb %%v1,%%v1\n\t" - "vleg %%v1,8(%%r1,%[x]),0\n\t" + "vleg %%v1,0(%%r1,%[x]),1\n\t" + "vflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%%r1,%[x]),0\n\t" #endif - "vlrepg %%v18,0(%%r1,%[ap])\n\t" - "vlrepg %%v19,8(%%r1,%[ap])\n\t" - "vfmadb %%v16,%%v18,%%v0,%%v16\n\t" - "vfmadb %%v17,%%v19,%%v1,%%v17\n\t" - "vl %%v0,16(%%r1,%[x])\n\t" + "vlrepg %%v18,0(%%r1,%[ap])\n\t" + "vlrepg %%v19,8(%%r1,%[ap])\n\t" + "vfmadb %%v16,%%v18,%%v0,%%v16\n\t" + "vfmadb %%v17,%%v19,%%v1,%%v17\n\t" + "vl %%v0,16(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v1,24(%%r1,%[x]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,16(%%r1,%[x]),1\n\t" + "vleg %%v1,24(%%r1,%[x]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg 
%%v1,16(%%r1,%[x]),1\n\t" #else - "vleg %%v1,16(%%r1,%[x]),1\n\t" - "vflcdb %%v1,%%v1\n\t" - "vleg %%v1,24(%%r1,%[x]),0\n\t" + "vleg %%v1,16(%%r1,%[x]),1\n\t" + "vflcdb %%v1,%%v1\n\t" + "vleg %%v1,24(%%r1,%[x]),0\n\t" #endif - "vlrepg %%v18,16(%%r1,%[ap])\n\t" - "vlrepg %%v19,24(%%r1,%[ap])\n\t" - "vfmadb %%v16,%%v18,%%v0,%%v16\n\t" - "vfmadb %%v17,%%v19,%%v1,%%v17\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vpdi %%v17,%%v16,%%v16,4\n\t" + "vlrepg %%v18,16(%%r1,%[ap])\n\t" + "vlrepg %%v19,24(%%r1,%[ap])\n\t" + "vfmadb %%v16,%%v18,%%v0,%%v16\n\t" + "vfmadb %%v17,%%v19,%%v1,%%v17\n\t" + "agfi %%r1,32\n\t" + "brctg %[n],0b\n\t" + "vfadb %%v16,%%v16,%%v17\n\t" + "vpdi %%v17,%%v16,%%v16,4\n\t" #if !defined(XCONJ) - "vlrepg %%v18,0(%[alpha])\n\t" - "vleg %%v19,8(%[alpha]),0\n\t" - "wflcdb %%v19,%%v19\n\t" - "vleg %%v19,8(%[alpha]),1\n\t" + "vlrepg %%v18,0(%[alpha])\n\t" + "vleg %%v19,8(%[alpha]),0\n\t" + "wflcdb %%v19,%%v19\n\t" + "vleg %%v19,8(%[alpha]),1\n\t" #else - "vleg %%v18,0(%[alpha]),1\n\t" - "vflcdb %%v18,%%v18\n\t" - "vleg %%v18,0(%[alpha]),0\n\t" - "vlrepg %%v19,8(%[alpha])\n\t" + "vleg %%v18,0(%[alpha]),1\n\t" + "vflcdb %%v18,%%v18\n\t" + "vleg %%v18,0(%[alpha]),0\n\t" + "vlrepg %%v19,8(%[alpha])\n\t" #endif - "vl %%v0,0(%[y])\n\t" - "vfmadb %%v0,%%v16,%%v18,%%v0\n\t" - "vfmadb %%v0,%%v17,%%v19,%%v0\n\t" - "vst %%v0,0(%[y])\n\t" - : "+m"(*(FLOAT (*)[2]) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const FLOAT (*)[n * 2]) ap),[ap] "a"(ap), - "m"(*(const FLOAT (*)[n * 2]) x),[x] "a"(x), - "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19"); + "vl %%v0,0(%[y])\n\t" + "vfmadb %%v0,%%v16,%%v18,%%v0\n\t" + "vfmadb %%v0,%%v17,%%v19,%%v0\n\t" + "vst %%v0,0(%[y])\n\t" + : "+m"(*(struct { FLOAT x[2]; } *) y),[n] "+&r"(n) + : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap), + "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), + "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19"); } static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { diff --git a/kernel/zarch/zrot.c b/kernel/zarch/zrot.c index aa7f16605..6284d5a47 100644 --- a/kernel/zarch/zrot.c +++ b/kernel/zarch/zrot.c @@ -29,151 +29,152 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
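[Editor's note on the kernel below: c and s are real scalars applied to both the real and imaginary halves of every element (BLAS zdrot semantics), so the rotation reduces to a plain pairwise update over 2n doubles, matching the yn=x*s and yn=y*c-yn comments in the assembly. A scalar sketch, illustrative only; zrot_ref is a hypothetical name.]

#include <stddef.h>

/* Scalar model of zrot_kernel_16: apply the Givens rotation
 *   x' = c*x + s*y,  y' = c*y - s*x
 * componentwise across the interleaved re/im doubles. */
static void zrot_ref(size_t n, double *x, double *y, double c, double s) {
  for (size_t i = 0; i < 2 * n; i++) {
    double xi = x[i], yi = y[i];
    x[i] = c * xi + s * yi;
    y[i] = c * yi - s * xi;
  }
}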
static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { __asm__("vlrepg %%v0,%[c]\n\t" - "vlrepg %%v1,%[s]\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x])\n\t" - "vl %%v25, 16(%%r1,%[x])\n\t" - "vl %%v26, 32(%%r1,%[x])\n\t" - "vl %%v27, 48(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[y])\n\t" - "vl %%v17, 16(%%r1,%[y])\n\t" - "vl %%v18, 32(%%r1,%[y])\n\t" - "vl %%v19, 48(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x])\n\t" - "vst %%v29, 16(%%r1,%[x])\n\t" - "vst %%v30, 32(%%r1,%[x])\n\t" - "vst %%v31, 48(%%r1,%[x])\n\t" - "vst %%v20, 0(%%r1,%[y])\n\t" - "vst %%v21, 16(%%r1,%[y])\n\t" - "vst %%v22, 32(%%r1,%[y])\n\t" - "vst %%v23, 48(%%r1,%[y])\n\t" - "vl %%v24, 64(%%r1,%[x])\n\t" - "vl %%v25, 80(%%r1,%[x])\n\t" - "vl %%v26, 96(%%r1,%[x])\n\t" - "vl %%v27, 112(%%r1,%[x])\n\t" - "vl %%v16, 64(%%r1,%[y])\n\t" - "vl %%v17, 80(%%r1,%[y])\n\t" - "vl %%v18, 96(%%r1,%[y])\n\t" - "vl %%v19, 112(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x])\n\t" - "vst %%v29, 80(%%r1,%[x])\n\t" - "vst %%v30, 96(%%r1,%[x])\n\t" - "vst %%v31, 112(%%r1,%[x])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v16, 128(%%r1,%[y])\n\t" - "vl %%v17, 144(%%r1,%[y])\n\t" - "vl %%v18, 160(%%r1,%[y])\n\t" - "vl %%v19, 176(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb 
%%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x])\n\t" - "vst %%v29, 144(%%r1,%[x])\n\t" - "vst %%v30, 160(%%r1,%[x])\n\t" - "vst %%v31, 176(%%r1,%[x])\n\t" - "vst %%v20, 128(%%r1,%[y])\n\t" - "vst %%v21, 144(%%r1,%[y])\n\t" - "vst %%v22, 160(%%r1,%[y])\n\t" - "vst %%v23, 176(%%r1,%[y])\n\t" - "vl %%v24, 192(%%r1,%[x])\n\t" - "vl %%v25, 208(%%r1,%[x])\n\t" - "vl %%v26, 224(%%r1,%[x])\n\t" - "vl %%v27, 240(%%r1,%[x])\n\t" - "vl %%v16, 192(%%r1,%[y])\n\t" - "vl %%v17, 208(%%r1,%[y])\n\t" - "vl %%v18, 224(%%r1,%[y])\n\t" - "vl %%v19, 240(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 192(%%r1,%[x])\n\t" - "vst %%v29, 208(%%r1,%[x])\n\t" - "vst %%v30, 224(%%r1,%[x])\n\t" - "vst %%v31, 240(%%r1,%[x])\n\t" - "vst %%v20, 192(%%r1,%[y])\n\t" - "vst %%v21, 208(%%r1,%[y])\n\t" - "vst %%v22, 224(%%r1,%[y])\n\t" - "vst %%v23, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y),[c] "m"(*c),[s] "m"(*s) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vlrepg %%v1,%[s]\n\t" + "srlg %[n],%[n],4\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v24, 0(%%r1,%[x])\n\t" + "vl %%v25, 16(%%r1,%[x])\n\t" + "vl %%v26, 32(%%r1,%[x])\n\t" + "vl %%v27, 48(%%r1,%[x])\n\t" + "vl %%v16, 0(%%r1,%[y])\n\t" + "vl %%v17, 16(%%r1,%[y])\n\t" + "vl %%v18, 32(%%r1,%[y])\n\t" + "vl %%v19, 48(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 0(%%r1,%[x])\n\t" + "vst %%v29, 16(%%r1,%[x])\n\t" + "vst %%v30, 32(%%r1,%[x])\n\t" + "vst %%v31, 48(%%r1,%[x])\n\t" + "vst %%v20, 0(%%r1,%[y])\n\t" + "vst %%v21, 16(%%r1,%[y])\n\t" + "vst %%v22, 32(%%r1,%[y])\n\t" + "vst %%v23, 48(%%r1,%[y])\n\t" + "vl %%v24, 64(%%r1,%[x])\n\t" + "vl %%v25, 80(%%r1,%[x])\n\t" + "vl %%v26, 96(%%r1,%[x])\n\t" + "vl %%v27, 112(%%r1,%[x])\n\t" + "vl %%v16, 64(%%r1,%[y])\n\t" + "vl %%v17, 80(%%r1,%[y])\n\t" + "vl %%v18, 96(%%r1,%[y])\n\t" + "vl %%v19, 112(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + 
"vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 64(%%r1,%[x])\n\t" + "vst %%v29, 80(%%r1,%[x])\n\t" + "vst %%v30, 96(%%r1,%[x])\n\t" + "vst %%v31, 112(%%r1,%[x])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v16, 128(%%r1,%[y])\n\t" + "vl %%v17, 144(%%r1,%[y])\n\t" + "vl %%v18, 160(%%r1,%[y])\n\t" + "vl %%v19, 176(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 128(%%r1,%[x])\n\t" + "vst %%v29, 144(%%r1,%[x])\n\t" + "vst %%v30, 160(%%r1,%[x])\n\t" + "vst %%v31, 176(%%r1,%[x])\n\t" + "vst %%v20, 128(%%r1,%[y])\n\t" + "vst %%v21, 144(%%r1,%[y])\n\t" + "vst %%v22, 160(%%r1,%[y])\n\t" + "vst %%v23, 176(%%r1,%[y])\n\t" + "vl %%v24, 192(%%r1,%[x])\n\t" + "vl %%v25, 208(%%r1,%[x])\n\t" + "vl %%v26, 224(%%r1,%[x])\n\t" + "vl %%v27, 240(%%r1,%[x])\n\t" + "vl %%v16, 192(%%r1,%[y])\n\t" + "vl %%v17, 208(%%r1,%[y])\n\t" + "vl %%v18, 224(%%r1,%[y])\n\t" + "vl %%v19, 240(%%r1,%[y])\n\t" + "vfmdb %%v28,%%v24,%%v0\n\t" + "vfmdb %%v29,%%v25,%%v0\n\t" + "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0\n\t" + "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0\n\t" + "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ + /* 2nd parts */ + "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ + "vst %%v28, 192(%%r1,%[x])\n\t" + "vst %%v29, 208(%%r1,%[x])\n\t" + "vst %%v30, 224(%%r1,%[x])\n\t" + "vst %%v31, 240(%%r1,%[x])\n\t" + "vst %%v20, 192(%%r1,%[y])\n\t" + "vst %%v21, 208(%%r1,%[y])\n\t" + "vst %%v22, 224(%%r1,%[y])\n\t" + "vst %%v23, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x), + "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s) + : "cc", "r1", "v0", "v1", 
"v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index fbcc0c5b9..e497a6d7b 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -29,167 +29,170 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__("vlrepg %%v0,0(%[alpha])\n\t" - "vleg %%v1,8(%[alpha]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,8(%[alpha]),1\n\t" - "srlg %[n],%[n],3\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vpdi %%v24,%%v16,%%v16,4\n\t" - "vpdi %%v25,%%v17,%%v17,4\n\t" - "vpdi %%v26,%%v18,%%v18,4\n\t" - "vpdi %%v27,%%v19,%%v19,4\n\t" - "vpdi %%v28,%%v20,%%v20,4\n\t" - "vpdi %%v29,%%v21,%%v21,4\n\t" - "vpdi %%v30,%%v22,%%v22,4\n\t" - "vpdi %%v31,%%v23,%%v23,4\n\t" - "vfmdb %%v16,%%v16,%%v0\n\t" - "vfmdb %%v17,%%v17,%%v0\n\t" - "vfmdb %%v18,%%v18,%%v0\n\t" - "vfmdb %%v19,%%v19,%%v0\n\t" - "vfmdb %%v20,%%v20,%%v0\n\t" - "vfmdb %%v21,%%v21,%%v0\n\t" - "vfmdb %%v22,%%v22,%%v0\n\t" - "vfmdb %%v23,%%v23,%%v0\n\t" - "vfmadb %%v16,%%v24,%%v1,%%v16\n\t" - "vfmadb %%v17,%%v25,%%v1,%%v17\n\t" - "vfmadb %%v18,%%v26,%%v1,%%v18\n\t" - "vfmadb %%v19,%%v27,%%v1,%%v19\n\t" - "vfmadb %%v20,%%v28,%%v1,%%v20\n\t" - "vfmadb %%v21,%%v29,%%v1,%%v21\n\t" - "vfmadb %%v22,%%v30,%%v1,%%v22\n\t" - "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + "vleg %%v1,8(%[alpha]),0\n\t" + "wflcdb %%v1,%%v1\n\t" + "vleg %%v1,8(%[alpha]),1\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vpdi %%v24,%%v16,%%v16,4\n\t" + "vpdi %%v25,%%v17,%%v17,4\n\t" + "vpdi %%v26,%%v18,%%v18,4\n\t" + "vpdi %%v27,%%v19,%%v19,4\n\t" + "vpdi %%v28,%%v20,%%v20,4\n\t" + "vpdi %%v29,%%v21,%%v21,4\n\t" + "vpdi %%v30,%%v22,%%v22,4\n\t" + "vpdi %%v31,%%v23,%%v23,4\n\t" + "vfmdb %%v16,%%v16,%%v0\n\t" + "vfmdb %%v17,%%v17,%%v0\n\t" + "vfmdb %%v18,%%v18,%%v0\n\t" + "vfmdb %%v19,%%v19,%%v0\n\t" + "vfmdb %%v20,%%v20,%%v0\n\t" + "vfmdb %%v21,%%v21,%%v0\n\t" + "vfmdb %%v22,%%v22,%%v0\n\t" + "vfmdb %%v23,%%v23,%%v0\n\t" + "vfmadb %%v16,%%v24,%%v1,%%v16\n\t" + "vfmadb %%v17,%%v25,%%v1,%%v17\n\t" + "vfmadb %%v18,%%v26,%%v1,%%v18\n\t" + "vfmadb %%v19,%%v27,%%v1,%%v19\n\t" + "vfmadb %%v20,%%v28,%%v1,%%v20\n\t" + "vfmadb %%v21,%%v29,%%v1,%%v21\n\t" + "vfmadb %%v22,%%v30,%%v1,%%v22\n\t" + "vfmadb 
%%v23,%%v31,%%v1,%%v23\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + [alpha] "a"(alpha) + : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", + "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", + "v31"); } static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__("vleg %%v0,8(%[alpha]),0\n\t" - "wflcdb %%v0,%%v0\n\t" - "vleg %%v0,8(%[alpha]),1\n\t" - "srlg %[n],%[n],3\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vpdi %%v16,%%v16,%%v16,4\n\t" - "vpdi %%v17,%%v17,%%v17,4\n\t" - "vpdi %%v18,%%v18,%%v18,4\n\t" - "vpdi %%v19,%%v19,%%v19,4\n\t" - "vpdi %%v20,%%v20,%%v20,4\n\t" - "vpdi %%v21,%%v21,%%v21,4\n\t" - "vpdi %%v22,%%v22,%%v22,4\n\t" - "vpdi %%v23,%%v23,%%v23,4\n\t" - "vfmdb %%v16,%%v16,%%v0\n\t" - "vfmdb %%v17,%%v17,%%v0\n\t" - "vfmdb %%v18,%%v18,%%v0\n\t" - "vfmdb %%v19,%%v19,%%v0\n\t" - "vfmdb %%v20,%%v20,%%v0\n\t" - "vfmdb %%v21,%%v21,%%v0\n\t" - "vfmdb %%v22,%%v22,%%v0\n\t" - "vfmdb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23"); + "wflcdb %%v0,%%v0\n\t" + "vleg %%v0,8(%[alpha]),1\n\t" + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vpdi %%v16,%%v16,%%v16,4\n\t" + "vpdi %%v17,%%v17,%%v17,4\n\t" + "vpdi %%v18,%%v18,%%v18,4\n\t" + "vpdi %%v19,%%v19,%%v19,4\n\t" + "vpdi %%v20,%%v20,%%v20,4\n\t" + "vpdi %%v21,%%v21,%%v21,4\n\t" + "vpdi %%v22,%%v22,%%v22,4\n\t" + "vpdi %%v23,%%v23,%%v23,4\n\t" + "vfmdb %%v16,%%v16,%%v0\n\t" + "vfmdb %%v17,%%v17,%%v0\n\t" + "vfmdb %%v18,%%v18,%%v0\n\t" + "vfmdb %%v19,%%v19,%%v0\n\t" + "vfmdb %%v20,%%v20,%%v0\n\t" + "vfmdb %%v21,%%v21,%%v0\n\t" + "vfmdb %%v22,%%v22,%%v0\n\t" + "vfmdb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + [alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } static void 
zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { __asm__("vlrepg %%v0,0(%[alpha])\n\t" - "srlg %[n],%[n],3\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfmdb %%v16,%%v16,%%v0\n\t" - "vfmdb %%v17,%%v17,%%v0\n\t" - "vfmdb %%v18,%%v18,%%v0\n\t" - "vfmdb %%v19,%%v19,%%v0\n\t" - "vfmdb %%v20,%%v20,%%v0\n\t" - "vfmdb %%v21,%%v21,%%v0\n\t" - "vfmdb %%v22,%%v22,%%v0\n\t" - "vfmdb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const FLOAT (*)[2]) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23"); + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vl %%v16,0(%%r1,%[x])\n\t" + "vl %%v17,16(%%r1,%[x])\n\t" + "vl %%v18,32(%%r1,%[x])\n\t" + "vl %%v19,48(%%r1,%[x])\n\t" + "vl %%v20,64(%%r1,%[x])\n\t" + "vl %%v21,80(%%r1,%[x])\n\t" + "vl %%v22,96(%%r1,%[x])\n\t" + "vl %%v23,112(%%r1,%[x])\n\t" + "vfmdb %%v16,%%v16,%%v0\n\t" + "vfmdb %%v17,%%v17,%%v0\n\t" + "vfmdb %%v18,%%v18,%%v0\n\t" + "vfmdb %%v19,%%v19,%%v0\n\t" + "vfmdb %%v20,%%v20,%%v0\n\t" + "vfmdb %%v21,%%v21,%%v0\n\t" + "vfmdb %%v22,%%v22,%%v0\n\t" + "vfmdb %%v23,%%v23,%%v0\n\t" + "vst %%v16,0(%%r1,%[x])\n\t" + "vst %%v17,16(%%r1,%[x])\n\t" + "vst %%v18,32(%%r1,%[x])\n\t" + "vst %%v19,48(%%r1,%[x])\n\t" + "vst %%v20,64(%%r1,%[x])\n\t" + "vst %%v21,80(%%r1,%[x])\n\t" + "vst %%v22,96(%%r1,%[x])\n\t" + "vst %%v23,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), + [alpha] "a"(alpha) + : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", + "v23"); } static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { __asm__("vzero %%v0\n\t" - "srlg %[n],%[n],3\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x])\n\t" - "vst %%v0,16(%%r1,%[x])\n\t" - "vst %%v0,32(%%r1,%[x])\n\t" - "vst %%v0,48(%%r1,%[x])\n\t" - "vst %%v0,64(%%r1,%[x])\n\t" - "vst %%v0,80(%%r1,%[x])\n\t" - "vst %%v0,96(%%r1,%[x])\n\t" - "vst %%v0,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "=m"(*(FLOAT (*)[n * 2]) x),[n] "+&r"(n) - : [x] "a"(x) - : "cc", "r1", "v0"); + "srlg %[n],%[n],3\n\t" + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "vst %%v0,0(%%r1,%[x])\n\t" + "vst %%v0,16(%%r1,%[x])\n\t" + "vst %%v0,32(%%r1,%[x])\n\t" + "vst %%v0,48(%%r1,%[x])\n\t" + "vst %%v0,64(%%r1,%[x])\n\t" + "vst %%v0,80(%%r1,%[x])\n\t" + "vst %%v0,96(%%r1,%[x])\n\t" + "vst %%v0,112(%%r1,%[x])\n\t" + "agfi %%r1,128\n\t" + "brctg %[n],0b" + : "=m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) + : [x] "a"(x) + : "cc", "r1", "v0"); } static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, diff --git a/kernel/zarch/zswap.c b/kernel/zarch/zswap.c index 0f38103be..bc466866c 100644 --- a/kernel/zarch/zswap.c +++ b/kernel/zarch/zswap.c @@ -29,81 +29,82 @@ USE OF THIS SOFTWARE, EVEN IF 
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { __asm__("srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v28, 192(%%r1,%[x])\n\t" - "vl %%v29, 208(%%r1,%[x])\n\t" - "vl %%v30, 224(%%r1,%[x])\n\t" - "vl %%v31, 240(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "vl %%v4, 64(%%r1,%[y])\n\t" - "vl %%v5, 80(%%r1,%[y])\n\t" - "vl %%v6, 96(%%r1,%[y])\n\t" - "vl %%v7, 112(%%r1,%[y])\n\t" - "vst %%v0, 0(%%r1,%[x])\n\t" - "vst %%v1, 16(%%r1,%[x])\n\t" - "vst %%v2, 32(%%r1,%[x])\n\t" - "vst %%v3, 48(%%r1,%[x])\n\t" - "vst %%v4, 64(%%r1,%[x])\n\t" - "vst %%v5, 80(%%r1,%[x])\n\t" - "vst %%v6, 96(%%r1,%[x])\n\t" - "vst %%v7, 112(%%r1,%[x])\n\t" - "vl %%v0, 128(%%r1,%[y])\n\t" - "vl %%v1, 144(%%r1,%[y])\n\t" - "vl %%v2, 160(%%r1,%[y])\n\t" - "vl %%v3, 176(%%r1,%[y])\n\t" - "vl %%v4, 192(%%r1,%[y])\n\t" - "vl %%v5, 208(%%r1,%[y])\n\t" - "vl %%v6, 224(%%r1,%[y])\n\t" - "vl %%v7, 240(%%r1,%[y])\n\t" - "vst %%v0, 128(%%r1,%[x])\n\t" - "vst %%v1, 144(%%r1,%[x])\n\t" - "vst %%v2, 160(%%r1,%[x])\n\t" - "vst %%v3, 176(%%r1,%[x])\n\t" - "vst %%v4, 192(%%r1,%[x])\n\t" - "vst %%v5, 208(%%r1,%[x])\n\t" - "vst %%v6, 224(%%r1,%[x])\n\t" - "vst %%v7, 240(%%r1,%[x])\n\t" - "vst %%v16, 0(%%r1,%[y])\n\t" - "vst %%v17, 16(%%r1,%[y])\n\t" - "vst %%v18, 32(%%r1,%[y])\n\t" - "vst %%v19, 48(%%r1,%[y])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vst %%v24, 128(%%r1,%[y])\n\t" - "vst %%v25, 144(%%r1,%[y])\n\t" - "vst %%v26, 160(%%r1,%[y])\n\t" - "vst %%v27, 176(%%r1,%[y])\n\t" - "vst %%v28, 192(%%r1,%[y])\n\t" - "vst %%v29, 208(%%r1,%[y])\n\t" - "vst %%v30, 224(%%r1,%[y])\n\t" - "vst %%v31, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(FLOAT (*)[n * 2]) x), "+m"(*(FLOAT (*)[n * 2]) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + "xgr %%r1,%%r1\n\t" + "0:\n\t" + "pfd 2, 1024(%%r1,%[x])\n\t" + "pfd 2, 1024(%%r1,%[y])\n\t" + "vl %%v16, 0(%%r1,%[x])\n\t" + "vl %%v17, 16(%%r1,%[x])\n\t" + "vl %%v18, 32(%%r1,%[x])\n\t" + "vl %%v19, 48(%%r1,%[x])\n\t" + "vl %%v20, 64(%%r1,%[x])\n\t" + "vl %%v21, 80(%%r1,%[x])\n\t" + "vl %%v22, 96(%%r1,%[x])\n\t" + "vl %%v23, 112(%%r1,%[x])\n\t" + "vl %%v24, 128(%%r1,%[x])\n\t" + "vl %%v25, 144(%%r1,%[x])\n\t" + "vl %%v26, 160(%%r1,%[x])\n\t" + "vl %%v27, 176(%%r1,%[x])\n\t" + "vl %%v28, 192(%%r1,%[x])\n\t" + "vl %%v29, 208(%%r1,%[x])\n\t" + "vl %%v30, 224(%%r1,%[x])\n\t" + "vl %%v31, 240(%%r1,%[x])\n\t" + "vl %%v0, 0(%%r1,%[y])\n\t" + "vl %%v1, 16(%%r1,%[y])\n\t" + "vl %%v2, 32(%%r1,%[y])\n\t" + "vl %%v3, 48(%%r1,%[y])\n\t" + "vl %%v4, 64(%%r1,%[y])\n\t" + "vl %%v5, 80(%%r1,%[y])\n\t" + "vl %%v6, 96(%%r1,%[y])\n\t" + "vl %%v7, 112(%%r1,%[y])\n\t" + "vst %%v0, 0(%%r1,%[x])\n\t" + "vst %%v1, 
16(%%r1,%[x])\n\t" + "vst %%v2, 32(%%r1,%[x])\n\t" + "vst %%v3, 48(%%r1,%[x])\n\t" + "vst %%v4, 64(%%r1,%[x])\n\t" + "vst %%v5, 80(%%r1,%[x])\n\t" + "vst %%v6, 96(%%r1,%[x])\n\t" + "vst %%v7, 112(%%r1,%[x])\n\t" + "vl %%v0, 128(%%r1,%[y])\n\t" + "vl %%v1, 144(%%r1,%[y])\n\t" + "vl %%v2, 160(%%r1,%[y])\n\t" + "vl %%v3, 176(%%r1,%[y])\n\t" + "vl %%v4, 192(%%r1,%[y])\n\t" + "vl %%v5, 208(%%r1,%[y])\n\t" + "vl %%v6, 224(%%r1,%[y])\n\t" + "vl %%v7, 240(%%r1,%[y])\n\t" + "vst %%v0, 128(%%r1,%[x])\n\t" + "vst %%v1, 144(%%r1,%[x])\n\t" + "vst %%v2, 160(%%r1,%[x])\n\t" + "vst %%v3, 176(%%r1,%[x])\n\t" + "vst %%v4, 192(%%r1,%[x])\n\t" + "vst %%v5, 208(%%r1,%[x])\n\t" + "vst %%v6, 224(%%r1,%[x])\n\t" + "vst %%v7, 240(%%r1,%[x])\n\t" + "vst %%v16, 0(%%r1,%[y])\n\t" + "vst %%v17, 16(%%r1,%[y])\n\t" + "vst %%v18, 32(%%r1,%[y])\n\t" + "vst %%v19, 48(%%r1,%[y])\n\t" + "vst %%v20, 64(%%r1,%[y])\n\t" + "vst %%v21, 80(%%r1,%[y])\n\t" + "vst %%v22, 96(%%r1,%[y])\n\t" + "vst %%v23, 112(%%r1,%[y])\n\t" + "vst %%v24, 128(%%r1,%[y])\n\t" + "vst %%v25, 144(%%r1,%[y])\n\t" + "vst %%v26, 160(%%r1,%[y])\n\t" + "vst %%v27, 176(%%r1,%[y])\n\t" + "vst %%v28, 192(%%r1,%[y])\n\t" + "vst %%v29, 208(%%r1,%[y])\n\t" + "vst %%v30, 224(%%r1,%[y])\n\t" + "vst %%v31, 240(%%r1,%[y])\n\t" + "agfi %%r1,256\n\t" + "brctg %[n],0b" + : "+m"(*(struct { FLOAT x[n * 2]; } *) x), + "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) + : [x] "a"(x),[y] "a"(y) + : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28", "v29", "v30", "v31"); } int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, From f5836741092ca3f9358c2a24c6056bf098b3f748 Mon Sep 17 00:00:00 2001 From: maamountki Date: Tue, 12 Feb 2019 13:12:28 +0200 Subject: [PATCH 076/133] [ZARCH] Fix cgemv_t_4 --- kernel/zarch/cgemv_t_4.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/zarch/cgemv_t_4.c b/kernel/zarch/cgemv_t_4.c index 91ea1c10c..e10edfab0 100644 --- a/kernel/zarch/cgemv_t_4.c +++ b/kernel/zarch/cgemv_t_4.c @@ -120,10 +120,10 @@ static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmasb %%v23,%%v31,%%v1,%%v23\n\t" "agfi %%r1,16\n\t" "brctg %[n],0b\n\t" - "vfadb %%v16,%%v16,%%v20\n\t" - "vfadb %%v17,%%v17,%%v21\n\t" - "vfadb %%v18,%%v18,%%v22\n\t" - "vfadb %%v19,%%v19,%%v23\n\t" + "vfasb %%v16,%%v16,%%v20\n\t" + "vfasb %%v17,%%v17,%%v21\n\t" + "vfasb %%v18,%%v18,%%v22\n\t" + "vfasb %%v19,%%v19,%%v23\n\t" "vrepg %%v20,%%v16,1\n\t" "vrepg %%v21,%%v17,1\n\t" "vrepg %%v22,%%v18,1\n\t" @@ -244,8 +244,8 @@ static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, "vfmasb %%v19,%%v23,%%v1,%%v19\n\t" "agfi %%r1,16\n\t" "brctg %[n],0b\n\t" - "vfadb %%v16,%%v16,%%v18\n\t" - "vfadb %%v17,%%v17,%%v19\n\t" + "vfasb %%v16,%%v16,%%v18\n\t" + "vfasb %%v17,%%v17,%%v19\n\t" "vrepg %%v18,%%v16,1\n\t" "vrepg %%v19,%%v17,1\n\t" "vfasb %%v16,%%v16,%%v18\n\t" @@ -342,7 +342,7 @@ static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, "vfmasb %%v17,%%v19,%%v1,%%v17\n\t" "agfi %%r1,16\n\t" "brctg %[n],0b\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" + "vfasb %%v16,%%v16,%%v17\n\t" "vrepg %%v17,%%v16,1\n\t" "vfasb %%v16,%%v16,%%v17\n\t" "verllg %%v17,%%v16,32\n\t" From dc6ac9eab0c59bcf56c1c512c099723215609fb2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Feb 2019 15:33:48 +0100 Subject: [PATCH 077/133] Fix declaration of input arguments in the x86_64 s/dGEMV_T and s/dGEMV_N kernels 
Arguments 0 and 1 need to be tagged as both input and output --- kernel/x86_64/dgemv_n_4.c | 10 +++++----- kernel/x86_64/dgemv_t_4.c | 18 +++++++++--------- kernel/x86_64/sgemv_n_4.c | 14 +++++++------- kernel/x86_64/sgemv_t_4.c | 18 +++++++++--------- 4 files changed, 30 insertions(+), 30 deletions(-) diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index 6d2530e81..6d33641e9 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -111,9 +111,9 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 @@ -166,9 +166,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a "jnz 1b \n\t" : + "+r" (i), // 0 + "+r" (n) // 1 : - "r" (i), // 0 - "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap), // 4 diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index a7478e3a8..ed672a757 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -127,9 +127,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "movsd %%xmm11,8(%2) \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (y), // 2 "r" (ap0), // 3 "r" (ap1), // 4 @@ -195,9 +195,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "movsd %%xmm10, (%2) \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (y), // 2 "r" (ap), // 3 "r" (x) // 4 @@ -259,9 +259,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (&da), // 2 "r" (src), // 3 "r" (dest) // 4 diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 65305ac59..63697970f 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -149,9 +149,9 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 @@ -223,9 +223,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a "3: \n\t" : + "+r" (i), // 0 + "+r" (n1) // 1 : - "r" (i), // 0 - "r" (n1), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap), // 4 @@ -277,9 +277,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) "jnz 1b \n\t" : + "+r" (i), // 0 + "+r" (n) // 1 : - "r" (i), // 0 - "r" (n), // 1 "r" (src), // 2 "r" (dest) // 3 : "cc", diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 065e5b385..86ecaf516 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -139,9 +139,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "movss %%xmm11,4(%2) \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (y), // 2 "r" (ap0), // 3 "r" (ap1), // 4 @@ -208,9 +208,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "movss %%xmm10, (%2) \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (y), // 2 "r" (ap), // 3 "r" (x) // 4 @@ -272,9 +272,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d "jnz 1b \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (&da), // 2 "r" (src), // 3 "r" (dest) // 4 From 
91481a3e4e88b26be920aff7d5c9e72ee82d6abc Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Feb 2019 15:51:43 +0100 Subject: [PATCH 078/133] Fix declaration of input arguments in inline assembly Argument 0 is modified as it doubles as a counter --- kernel/x86_64/dscal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index ef9a0a6ba..d0d7801fd 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -136,8 +136,8 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_ "jnz 1b \n\t" : + "+r" (n) // 0 : - "r" (n), // 0 "r" (x), // 1 "r" (x1), // 2 "r" (alpha), // 3 From b824fa70ebdd0b66ed045dbb17c08519525af782 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Feb 2019 16:00:18 +0100 Subject: [PATCH 079/133] Fix declaration of assembly arguments in SSYMV and DSYMV microkernels Arguments 0 and 1 are both input and output --- kernel/x86_64/dsymv_U_microk_bulldozer-2.c | 6 +++--- kernel/x86_64/dsymv_U_microk_haswell-2.c | 6 +++--- kernel/x86_64/dsymv_U_microk_nehalem-2.c | 6 +++--- kernel/x86_64/dsymv_U_microk_sandy-2.c | 6 +++--- kernel/x86_64/ssymv_U_microk_bulldozer-2.c | 6 +++--- kernel/x86_64/ssymv_U_microk_haswell-2.c | 6 +++--- kernel/x86_64/ssymv_U_microk_nehalem-2.c | 6 +++--- kernel/x86_64/ssymv_U_microk_sandy-2.c | 6 +++--- 8 files changed, 24 insertions(+), 24 deletions(-) diff --git a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c index d7166fe4b..ae287b6d8 100644 --- a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c +++ b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c @@ -106,9 +106,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/dsymv_U_microk_haswell-2.c b/kernel/x86_64/dsymv_U_microk_haswell-2.c index d83d20f8e..4778f644a 100644 --- a/kernel/x86_64/dsymv_U_microk_haswell-2.c +++ b/kernel/x86_64/dsymv_U_microk_haswell-2.c @@ -107,9 +107,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/dsymv_U_microk_nehalem-2.c b/kernel/x86_64/dsymv_U_microk_nehalem-2.c index 1344c75f7..065182286 100644 --- a/kernel/x86_64/dsymv_U_microk_nehalem-2.c +++ b/kernel/x86_64/dsymv_U_microk_nehalem-2.c @@ -101,9 +101,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "movsd %%xmm3 , 24(%9) \n\t" // save temp2 : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/dsymv_U_microk_sandy-2.c b/kernel/x86_64/dsymv_U_microk_sandy-2.c index 1ef6fbafd..d84e703bd 100644 --- a/kernel/x86_64/dsymv_U_microk_sandy-2.c +++ b/kernel/x86_64/dsymv_U_microk_sandy-2.c @@ -116,9 +116,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c index 8c01ab806..4a4f4d68d 100644 --- a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c +++ b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c @@ 
-90,9 +90,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/ssymv_U_microk_haswell-2.c b/kernel/x86_64/ssymv_U_microk_haswell-2.c index a32e59b44..e6a09ccf8 100644 --- a/kernel/x86_64/ssymv_U_microk_haswell-2.c +++ b/kernel/x86_64/ssymv_U_microk_haswell-2.c @@ -112,9 +112,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/ssymv_U_microk_nehalem-2.c b/kernel/x86_64/ssymv_U_microk_nehalem-2.c index b8e6ee732..c56ff3b15 100644 --- a/kernel/x86_64/ssymv_U_microk_nehalem-2.c +++ b/kernel/x86_64/ssymv_U_microk_nehalem-2.c @@ -106,9 +106,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "movss %%xmm3 , 12(%9) \n\t" // save temp2 : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/ssymv_U_microk_sandy-2.c b/kernel/x86_64/ssymv_U_microk_sandy-2.c index e8650650c..c4919a39a 100644 --- a/kernel/x86_64/ssymv_U_microk_sandy-2.c +++ b/kernel/x86_64/ssymv_U_microk_sandy-2.c @@ -120,9 +120,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vzeroupper \n\t" : - : - "r" (i), // 0 - "r" (n), // 1 + "+r" (i), // 0 + "+r" (n) // 1 + : "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 From ab1630f9fac57245fbbfc20af91a060354e41c71 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Feb 2019 16:14:02 +0100 Subject: [PATCH 080/133] Fix declaration of arguments in inline assembly Argument 0 is modified so should be input and output --- kernel/x86_64/dsymv_L_microk_bulldozer-2.c | 4 ++-- kernel/x86_64/dsymv_L_microk_haswell-2.c | 4 ++-- kernel/x86_64/dsymv_L_microk_nehalem-2.c | 4 ++-- kernel/x86_64/dsymv_L_microk_sandy-2.c | 4 ++-- kernel/x86_64/ssymv_L_microk_bulldozer-2.c | 4 ++-- kernel/x86_64/ssymv_L_microk_haswell-2.c | 4 ++-- kernel/x86_64/ssymv_L_microk_nehalem-2.c | 4 ++-- kernel/x86_64/ssymv_L_microk_sandy-2.c | 8 ++++---- 8 files changed, 18 insertions(+), 18 deletions(-) diff --git a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c index d84470cc4..bfa07b6d0 100644 --- a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c +++ b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c @@ -113,8 +113,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/dsymv_L_microk_haswell-2.c b/kernel/x86_64/dsymv_L_microk_haswell-2.c index 866782ee6..6241879d5 100644 --- a/kernel/x86_64/dsymv_L_microk_haswell-2.c +++ b/kernel/x86_64/dsymv_L_microk_haswell-2.c @@ -105,8 +105,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vzeroupper \n\t" : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/dsymv_L_microk_nehalem-2.c b/kernel/x86_64/dsymv_L_microk_nehalem-2.c index 38479f77a..a161dcd8b 100644 --- a/kernel/x86_64/dsymv_L_microk_nehalem-2.c +++ b/kernel/x86_64/dsymv_L_microk_nehalem-2.c @@ -108,8 +108,8 @@ static void 
dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "movsd %%xmm3 , 24(%9) \n\t" // save temp2 : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/dsymv_L_microk_sandy-2.c b/kernel/x86_64/dsymv_L_microk_sandy-2.c index b4e6ab369..b205b1019 100644 --- a/kernel/x86_64/dsymv_L_microk_sandy-2.c +++ b/kernel/x86_64/dsymv_L_microk_sandy-2.c @@ -114,8 +114,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vzeroupper \n\t" : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c index 9002228f3..602c3edf2 100644 --- a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c +++ b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c @@ -98,8 +98,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/ssymv_L_microk_haswell-2.c b/kernel/x86_64/ssymv_L_microk_haswell-2.c index 69db008b6..fdfe4349a 100644 --- a/kernel/x86_64/ssymv_L_microk_haswell-2.c +++ b/kernel/x86_64/ssymv_L_microk_haswell-2.c @@ -99,8 +99,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vzeroupper \n\t" : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/ssymv_L_microk_nehalem-2.c b/kernel/x86_64/ssymv_L_microk_nehalem-2.c index c0fe5d640..6bb9c02f6 100644 --- a/kernel/x86_64/ssymv_L_microk_nehalem-2.c +++ b/kernel/x86_64/ssymv_L_microk_nehalem-2.c @@ -113,8 +113,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F "movss %%xmm3 , 12(%9) \n\t" // save temp2 : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/ssymv_L_microk_sandy-2.c b/kernel/x86_64/ssymv_L_microk_sandy-2.c index 093ca8073..0c78212e7 100644 --- a/kernel/x86_64/ssymv_L_microk_sandy-2.c +++ b/kernel/x86_64/ssymv_L_microk_sandy-2.c @@ -109,8 +109,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vzeroupper \n\t" : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 @@ -217,8 +217,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vzeroupper \n\t" : - : - "r" (from), // 0 + "+r" (from) // 0 + : "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 From bec54ae366ebce932b6bd6bdc89d4e585a0da798 Mon Sep 17 00:00:00 2001 From: maamountki Date: Wed, 13 Feb 2019 12:54:35 +0200 Subject: [PATCH 081/133] [ZARCH] Fix caxpy --- kernel/zarch/caxpy.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/zarch/caxpy.c b/kernel/zarch/caxpy.c index e4b484ab7..14a124ae2 100644 --- a/kernel/zarch/caxpy.c +++ b/kernel/zarch/caxpy.c @@ -65,6 +65,14 @@ static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { "vl %%v21,80(%%r1,%[y])\n\t" "vl %%v22,96(%%r1,%[y])\n\t" "vl %%v23,112(%%r1,%[y])\n\t" + "verllg %%v24,%%v8,32\n\t" + "verllg %%v25,%%v9,32\n\t" + "verllg %%v26,%%v10,32\n\t" + "verllg %%v27,%%v11,32\n\t" + "verllg %%v28,%%v16,32\n\t" + "verllg %%v29,%%v17,32\n\t" + "verllg %%v30,%%v18,32\n\t" + "verllg %%v31,%%v19,32\n\t" "vfmasb %%v8,%%v8,%%v0,%%v12\n\t" "vfmasb %%v9,%%v9,%%v0,%%v13\n\t" "vfmasb %%v10,%%v10,%%v0,%%v14\n\t" From 
0a54c98b9d9a6ad8364297bbef0eea4b000a92f0 Mon Sep 17 00:00:00 2001 From: maamountki Date: Wed, 13 Feb 2019 21:06:25 +0200 Subject: [PATCH 082/133] [ZARCH] Modify constraints --- kernel/zarch/cgemv_n_4.c | 2 +- kernel/zarch/zgemv_n_4.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c index adba05d47..5c36bc338 100644 --- a/kernel/zarch/cgemv_n_4.c +++ b/kernel/zarch/cgemv_n_4.c @@ -352,7 +352,7 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, "brctg %[n],0b" : "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n) : [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src), - [src] "a"(src),[alpha_r] "m"(alpha_r),[alpha_i] "m"(alpha_i) + [src] "a"(src),[alpha_r] "Q"(alpha_r),[alpha_i] "Q"(alpha_i) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23"); } diff --git a/kernel/zarch/zgemv_n_4.c b/kernel/zarch/zgemv_n_4.c index 5ca8da3c1..13045a359 100644 --- a/kernel/zarch/zgemv_n_4.c +++ b/kernel/zarch/zgemv_n_4.c @@ -263,7 +263,7 @@ static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, "brctg %[n],0b" : "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n) : [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src), - [src] "a"(src),[alpha_r] "m"(alpha_r),[alpha_i] "m"(alpha_i) + [src] "a"(src),[alpha_r] "Q"(alpha_r),[alpha_i] "Q"(alpha_i) : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); From f9d67bb5e8e895fd5fe7e36e43febef7aa06ef35 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 13 Feb 2019 22:06:41 +0100 Subject: [PATCH 083/133] Fix out-of-bounds memory access in gemm_beta Fixes #2011 (as suggested by davemq) presuming typo by K.Goto --- kernel/power/gemm_beta.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/gemm_beta.S b/kernel/power/gemm_beta.S index 62d7761ec..7acc05b4d 100644 --- a/kernel/power/gemm_beta.S +++ b/kernel/power/gemm_beta.S @@ -129,7 +129,7 @@ LL(12): STFD f0, 14 * SIZE(CO1) STFD f0, 15 * SIZE(CO1) - dcbst PRE, CO1 + dcbtst PRE, CO1 addi CO1, CO1, 16 * SIZE bdnz LL(12) .align 4 From 718efcec6fb6d45d5dd461ed47b26f49c2c4e77d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 13 Feb 2019 22:08:37 +0100 Subject: [PATCH 084/133] Fix out-of-bounds memory access in gemm_beta Fixes #2011 (as suggested by davemq), assuming typo by K.Goto --- kernel/power/zgemm_beta.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/power/zgemm_beta.S b/kernel/power/zgemm_beta.S index 43b72ca15..1f4c29210 100644 --- a/kernel/power/zgemm_beta.S +++ b/kernel/power/zgemm_beta.S @@ -134,7 +134,7 @@ LL(12): STFD f0, 14 * SIZE(CO1) STFD f0, 15 * SIZE(CO1) - dcbst PRE, CO1 + dcbtst PRE, CO1 addi CO1, CO1, 16 * SIZE bdnz LL(12) .align 4 From b55c586faca28863db16a2148b69aaa37aaa797e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 14 Feb 2019 15:21:36 +0100 Subject: [PATCH 085/133] Fix missing clobber in x86/x86_64 blas_quickdivide inline assembly function (#2017) * Fix missing clobber in blas_quickdivide assembly --- common_x86.h | 2 +- common_x86_64.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common_x86.h b/common_x86.h index 4f538c948..3fdffe2a8 100644 --- a/common_x86.h +++ b/common_x86.h @@ -187,7 +187,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ y = blas_quick_divide_table[y]; - __asm__ __volatile__ ("mull %0" :"=d" (result) 
:"a"(x), "0" (y)); + __asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x): "0" (y)); return result; #endif diff --git a/common_x86_64.h b/common_x86_64.h index f27c1e9be..718a81050 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -210,7 +210,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ y = blas_quick_divide_table[y]; - __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); + __asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y)); return result; } From 69a97ca7b9d7bbbb9b9f018592586e3c17b51a57 Mon Sep 17 00:00:00 2001 From: Bart Oldeman Date: Thu, 14 Feb 2019 16:19:41 +0000 Subject: [PATCH 086/133] dgemv_kernel_4x4(Haswell): add missing clobbers for xmm0,xmm1,xmm2,xmm3 This fixes a crash in dblat2 when OpenBLAS is compiled using -march=znver1 -ftree-vectorize -O2 See also: https://github.com/easybuilders/easybuild-easyconfigs/issues/7180 --- kernel/x86_64/dgemv_n_microk_haswell-4.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c index 584a6c6b5..da0fa2fff 100644 --- a/kernel/x86_64/dgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c @@ -104,6 +104,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "r" (ap[3]), // 7 "r" (alpha) // 8 : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", From 46e415b1405044b038586537d213e4f2f04b8536 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 14 Feb 2019 22:43:18 +0100 Subject: [PATCH 087/133] Save and restore input argument 8 (lda4) Fixes miscompilation with gcc9 -ftree-vectorize (related to issue #2009) --- kernel/x86_64/sgemv_n_microk_haswell-4.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c index 2c90f8aa9..e89a16785 100644 --- a/kernel/x86_64/sgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c @@ -26,7 +26,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ - #define HAVE_KERNEL_4x8 1 static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); @@ -49,6 +48,8 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vbroadcastss (%9), %%ymm6 \n\t" // alpha + "movq %8, %%xmm10 \n\t" //save lda + "testq $0x04, %1 \n\t" "jz 2f \n\t" @@ -151,6 +152,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "4: \n\t" "vzeroupper \n\t" + "movq %%xmm10, %8 \n\t" //restore lda : "+r" (i), // 0 @@ -170,6 +172,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", + "%xmm10", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); @@ -177,7 +180,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO } - #define HAVE_KERNEL_4x4 1 static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); @@ -196,6 +198,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vbroadcastss (%8), %%ymm6 \n\t" // alpha + "testq $0x04, %1 \n\t" "jz 2f \n\t" From adb419ed67cb6b3c416a7e6babdd28390cefe37d Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 14 Feb 2019 22:57:30 +0100 Subject: [PATCH 088/133] With the Intel compiler on Linux, prefer ifort for the final link step icc has known problems with mixed-language builds that ifort can handle just fine. Fixes #1956 --- exports/Makefile | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/exports/Makefile b/exports/Makefile index 3a5f77db3..b1348bd4a 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -141,6 +141,14 @@ else $(OBJCOPY) --redefine-syms objcopy.def ../$(LIBNAME) ../$(LIBNAME).renamed ../$(LIBSONAME) : ../$(LIBNAME).renamed linktest.c endif + +ifeq ($(F_COMPILER), INTEL) + $(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ + -Wl,--whole-archive $< -Wl,--no-whole-archive \ + -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) + $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. +else + ifneq ($(C_COMPILER), LSB) $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive $< -Wl,--no-whole-archive \ @@ -152,6 +160,7 @@ else -Wl,--whole-archive $< -Wl,--no-whole-archive \ -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) $(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. 
+endif endif rm -f linktest From 4255a58cd22d5395dbd6573683298849bd3a23b5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 15 Feb 2019 10:10:04 +0100 Subject: [PATCH 089/133] Rename operands to put lda on the input/output constraint list --- kernel/x86_64/sgemv_n_microk_haswell-4.c | 126 +++++++++++------------ 1 file changed, 61 insertions(+), 65 deletions(-) diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c index e89a16785..93e1e26e8 100644 --- a/kernel/x86_64/sgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c @@ -37,43 +37,41 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO __asm__ __volatile__ ( "vzeroupper \n\t" - "vbroadcastss (%2), %%ymm12 \n\t" // x0 - "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 - "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 - "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 - "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 - "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 - "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 - "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 + "vbroadcastss (%3), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%3), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%3), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%3), %%ymm15 \n\t" // x3 + "vbroadcastss 16(%3), %%ymm0 \n\t" // x4 + "vbroadcastss 20(%3), %%ymm1 \n\t" // x5 + "vbroadcastss 24(%3), %%ymm2 \n\t" // x6 + "vbroadcastss 28(%3), %%ymm3 \n\t" // x7 "vbroadcastss (%9), %%ymm6 \n\t" // alpha - "movq %8, %%xmm10 \n\t" //save lda - "testq $0x04, %1 \n\t" "jz 2f \n\t" - "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y + "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y "vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t" "vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t" - "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" - "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t" - "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" - "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t" + "vfmadd231ps (%5,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmadd231ps (%6,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmadd231ps (%7,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmadd231ps (%8,%0,4), %%xmm15, %%xmm5 \n\t" - "vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t" - "vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t" - "vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t" - "vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t" + "vfmadd231ps (%5,%2,4), %%xmm0 , %%xmm4 \n\t" + "vfmadd231ps (%6,%2,4), %%xmm1 , %%xmm5 \n\t" + "vfmadd231ps (%7,%2,4), %%xmm2 , %%xmm4 \n\t" + "vfmadd231ps (%8,%2,4), %%xmm3 , %%xmm5 \n\t" "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t" "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t" - "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y - "addq $4 , %8 \n\t" + "addq $4 , %2 \n\t" "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" @@ -82,28 +80,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "testq $0x08, %1 \n\t" "jz 3f \n\t" - "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y + "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" - "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t" - "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" - "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm5 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm5 \n\t" - "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" - "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t" - 
"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" - "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t" + "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t" + "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm5 \n\t" + "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t" + "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm5 \n\t" "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t" "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t" - "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y - "addq $8 , %8 \n\t" + "addq $8 , %2 \n\t" "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" @@ -118,53 +116,52 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y - "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y - - "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" - "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" - "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" - "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" - "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" - "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" - "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" - "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" - - "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" + "vmovups (%4,%0,4), %%ymm8 \n\t" // 8 * y + "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 8 * y + + "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps 32(%5,%0,4), %%ymm12, %%ymm5 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm4 \n\t" + "vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm5 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps 32(%7,%0,4), %%ymm14, %%ymm5 \n\t" + "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm4 \n\t" + "vfmadd231ps 32(%8,%0,4), %%ymm15, %%ymm5 \n\t" + + "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t" "addq $16, %0 \n\t" - "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t" - "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t" - "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t" - "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" - "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t" - "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t" - "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t" + "vfmadd231ps 32(%5,%2,4), %%ymm0 , %%ymm5 \n\t" + "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm4 \n\t" + "vfmadd231ps 32(%6,%2,4), %%ymm1 , %%ymm5 \n\t" + "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t" + "vfmadd231ps 32(%7,%2,4), %%ymm2 , %%ymm5 \n\t" + "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm4 \n\t" + "vfmadd231ps 32(%8,%2,4), %%ymm3 , %%ymm5 \n\t" "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t" "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t" - "addq $16, %8 \n\t" - "vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y + "addq $16, %2 \n\t" + "vmovups %%ymm8,-64(%4,%0,4) \n\t" // 8 * y "subq $16, %1 \n\t" - "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm9,-32(%4,%0,4) \n\t" // 8 * y "jnz 1b \n\t" "4: \n\t" "vzeroupper \n\t" - "movq %%xmm10, %8 \n\t" //restore lda : "+r" (i), // 0 - "+r" (n) // 1 + "+r" (n), // 1 + "+r" (lda4) // 2 : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (lda4), // 8 + "r" (x), // 3 + "r" (y), // 4 + "r" (ap[0]), // 5 + "r" (ap[1]), // 6 + "r" (ap[2]), // 7 + "r" (ap[3]), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", @@ -172,7 +169,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", - "%xmm10", "%xmm12", "%xmm13", "%xmm14", "%xmm15", "memory" ); From 
c26c0b77a7ef7f1e71b7415efeae15a0e61a244a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 15 Feb 2019 15:08:16 +0100 Subject: [PATCH 090/133] Fix wrong constraints in inline assembly for #2009 --- kernel/x86_64/dtrsm_kernel_RN_haswell.c | 98 ++++++++++++------------- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/kernel/x86_64/dtrsm_kernel_RN_haswell.c b/kernel/x86_64/dtrsm_kernel_RN_haswell.c index fcab8e2c7..9ab78fc8e 100644 --- a/kernel/x86_64/dtrsm_kernel_RN_haswell.c +++ b/kernel/x86_64/dtrsm_kernel_RN_haswell.c @@ -119,9 +119,9 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " cmpq $0, %0 \n\t" " je 4f \n\t" - " vmovups (%2,%1,4), %%ymm0 \n\t" // read a - " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 - " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 + " vmovups (%8,%1,4), %%ymm0 \n\t" // read a + " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 + " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1 " addq $8, %1 \n\t" @@ -131,18 +131,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .p2align 4 \n\t" "1: \n\t" - " vmovups (%2,%1,4), %%ymm4 \n\t" // read a + " vmovups (%8,%1,4), %%ymm4 \n\t" // read a " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" - " vmovups (%3,%1,8), %%ymm5 \n\t" // read b0 + " vmovups (%9,%1,8), %%ymm5 \n\t" // read b0 " vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t" " vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t" " vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t" - " vmovups 32(%3,%1,8), %%ymm6 \n\t" // read b1 + " vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1 " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" @@ -155,18 +155,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " jz 22f \n\t" - " vmovups (%2,%1,4), %%ymm0 \n\t" // read a + " vmovups (%8,%1,4), %%ymm0 \n\t" // read a " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" - " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 + " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" " vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t" - " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 + " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1 " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" @@ -268,7 +268,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7 " vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t" - " vmovups (%9), %%ymm0 \n\t" + " vmovups (%3), %%ymm0 \n\t" " vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t" " vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t" " vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t" @@ -278,7 +278,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t" " vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t" - " vmovups 32(%9), %%ymm4 \n\t" + " vmovups 32(%3), %%ymm4 \n\t" " vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t" " vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t" " vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t" @@ -290,15 +290,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "5: \n\t" // i = 0 - " addq $64, %9 \n\t" // b=b+8 + " addq $64, %3 \n\t" // b=b+8 " vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb - " vmovups (%9), %%ymm0 
\n\t" - " vmovups %%ymm8 , (%8) \n\t" // write a + " vmovups (%3), %%ymm0 \n\t" + " vmovups %%ymm8 , (%2) \n\t" // write a " vmovups %%ymm8 , (%4) \n\t" // write c " vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t" - " vmovups 32(%9), %%ymm1 \n\t" + " vmovups 32(%3), %%ymm1 \n\t" " vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t" " vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t" " vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t" @@ -313,15 +313,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" - " addq $64, %9 \n\t" // b=b+8 - " addq $32, %8 \n\t" // a=a+8 + " addq $64, %3 \n\t" // b=b+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb - " vmovups (%9), %%ymm0 \n\t" - " vmovups 32(%9), %%ymm1 \n\t" - " vmovups %%ymm9 , (%8) \n\t" // write a + " vmovups (%3), %%ymm0 \n\t" + " vmovups 32(%3), %%ymm1 \n\t" + " vmovups %%ymm9 , (%2) \n\t" // write a " vmovups %%ymm9 , (%4,%7,1) \n\t" // write c " vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t" @@ -337,13 +337,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" - " addq $64, %9 \n\t" // b=b+8 - " addq $32, %8 \n\t" // a=a+8 + " addq $64, %3 \n\t" // b=b+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb - " vmovups (%9), %%ymm0 \n\t" - " vmovups 32(%9), %%ymm1 \n\t" - " vmovups %%ymm10, (%8) \n\t" // write a + " vmovups (%3), %%ymm0 \n\t" + " vmovups 32(%3), %%ymm1 \n\t" + " vmovups %%ymm10, (%2) \n\t" // write a " vmovups %%ymm10, (%4,%7,2) \n\t" // write c " vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t" @@ -358,14 +358,14 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" - " addq $64, %9 \n\t" // b=b+8 - " addq $32, %8 \n\t" // a=a+8 + " addq $64, %3 \n\t" // b=b+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb - " vmovups 32(%9), %%ymm1 \n\t" - " vmovups %%ymm11, (%8) \n\t" // write a + " vmovups 32(%3), %%ymm1 \n\t" + " vmovups %%ymm11, (%2) \n\t" // write a " vmovups %%ymm11, (%5) \n\t" // write c " vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t" @@ -378,13 +378,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t" - " addq $64, %9 \n\t" // b=b+8 - " addq $32, %8 \n\t" // a=a+8 + " addq $64, %3 \n\t" // b=b+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb - " vmovups 32(%9), %%ymm1 \n\t" - " vmovups %%ymm12, (%8) \n\t" // write a + " vmovups 32(%3), %%ymm1 \n\t" + " vmovups %%ymm12, (%2) \n\t" // write a " vmovups %%ymm12, (%5,%7,1) \n\t" // write c " vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t" @@ -394,12 +394,12 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" " vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t" - " addq $64, %9 \n\t" // b=b+8 - " addq $32, %8 \n\t" // a=a+8 + " addq $64, %3 \n\t" // b=b+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb - " vmovups 32(%9), %%ymm1 \n\t" - " vmovups %%ymm13, (%8) \n\t" // write a + " vmovups 32(%3), %%ymm1 \n\t" + " vmovups %%ymm13, (%2) \n\t" // write a " vmovups %%ymm13, (%5,%7,2) \n\t" // write c " vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t" @@ -408,39 +408,39 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT 
*b, FLOAT *c, BLASLON " vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t" - " addq $64, %9 \n\t" // b=b+8 - " addq $32, %8 \n\t" // a=a+8 + " addq $64, %3 \n\t" // b=b+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb - " vmovups 32(%9), %%ymm1 \n\t" - " vmovups %%ymm14, (%8) \n\t" // write a + " vmovups 32(%3), %%ymm1 \n\t" + " vmovups %%ymm14, (%2) \n\t" // write a " vmovups %%ymm14, (%6) \n\t" // write c " vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t" " vpermpd $0xff , %%ymm1 , %%ymm0 \n\t" - " addq $32, %8 \n\t" // a=a+8 + " addq $32, %2 \n\t" // a=a+8 " vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb - " vmovups %%ymm15, (%8) \n\t" // write a + " vmovups %%ymm15, (%2) \n\t" // write a " vmovups %%ymm15, (%6,%7,1) \n\t" // write c " vzeroupper \n\t" : + "+r" (n1), // 0 + "+a" (i), // 1 + "+r" (as), // 2 + "+r" (bs) // 3 : - "r" (n1), // 0 - "a" (i), // 1 - "r" (a), // 2 - "r" (b), // 3 "r" (c), // 4 "r" (c3), // 5 "r" (c6), // 6 "r" (ldc), // 7 - "r" (as), // 8 - "r" (bs) // 9 + "r" (a), // 8 + "r" (b) // 9 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", From f209fc7fa90a583e60ff2c667821d39ae0efbe70 Mon Sep 17 00:00:00 2001 From: TiborGY Date: Sat, 16 Feb 2019 12:12:39 +0100 Subject: [PATCH 091/133] Update Makefile.rule add note about NUM_THREADS for package maintainers, add examples of programs that cause affinity troubles --- Makefile.rule | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index faf34c0a1..bba3d1588 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -72,10 +72,16 @@ VERSION = 0.3.6.dev # You can define the maximum number of threads. Basically it should be less # than or equal to the number of CPU threads. If you don't specify one, it's -# automatically detected by the the script. +# automatically detected by the build system. # If SMT (aka. HT) is enabled on the system, it may or may not be beneficial to # restrict NUM_THREADS to the number of physical cores. By default, the automatic # detection includes logical CPUs, thus allowing the use of SMT. +# Users may opt at runtime to use less than NUM_THREADS threads. +# +# Note for package maintainers: you can build OpenBLAS with a large NUM_THREADS +# value (e.g. 32-256) if you expect your users to use that many threads. Due to the way +# some internal structures are allocated, using a large NUM_THREADS value has a RAM +# footprint penalty, even if users reduce the actual number of threads at runtime. # NUM_THREADS = 24 # If you have enabled USE_OPENMP and your application would call @@ -138,6 +144,7 @@ NO_WARMUP = 1 # to the same core(s) as OpenBLAS, possibly binding all threads to a single core. # For this reason, affinity handling is disabled by default. Can be safely enabled if nothing # else modifies affinity settings. +# Note: enabling affinity has been known to cause problems with NumPy and R NO_AFFINITY = 1 # If you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus From 9d8be1578983d9fec6a1a7ae81d4ef9c1ac4c08c Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 16 Feb 2019 18:24:11 +0100 Subject: [PATCH 092/133] Fix inline assembly constraints rework indices to allow marking argument lda4 as input and output.
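The underlying rule, which recurs throughout this series, is the one the GCC extended-asm documentation states: any operand the template itself modifies (a loop counter, or a pointer the loop advances) must be declared as an input/output operand ("+r"), not as a plain input ("r"). With an input-only constraint the compiler is entitled to assume the register still holds its original value after the asm statement, which is exactly what breaks under -ftree-vectorize and similar optimizations. A minimal self-contained sketch of the rule, assuming x86-64 GNU C; the helper sum8 is illustrative only, not OpenBLAS code:

#include <stdio.h>

static long sum8(const long *p, long n) {
    long acc = 0;
    /* The template advances p and decrements n, so both must be
     * "+r" (input/output); with plain "r" inputs the compiler could
     * legally reuse those registers as if they were unchanged. */
    __asm__ volatile (
        "1:\n\t"
        "addq (%[p]), %[acc]\n\t"
        "addq $8, %[p]\n\t"
        "subq $1, %[n]\n\t"
        "jnz 1b"
        : [acc] "+r" (acc), [p] "+r" (p), [n] "+r" (n)
        :
        : "cc", "memory");
    return acc;
}

int main(void) {
    long v[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
    printf("%ld\n", sum8(v, 8)); /* prints 36 */
    return 0;
}

Both p and n are updated inside the template, so declaring them "+r" keeps the compiler's view of those registers consistent with reality; the "memory" clobber covers the loads from *p.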
For #2009 --- kernel/x86_64/sgemv_n_microk_nehalem-4.c | 54 ++++++++++++------------ 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-4.c b/kernel/x86_64/sgemv_n_microk_nehalem-4.c index 11a3e943b..d21232bfa 100644 --- a/kernel/x86_64/sgemv_n_microk_nehalem-4.c +++ b/kernel/x86_64/sgemv_n_microk_nehalem-4.c @@ -37,19 +37,19 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO __asm__ __volatile__ ( - "movss (%2), %%xmm12 \n\t" // x0 - "movss 4(%2), %%xmm13 \n\t" // x1 - "movss 8(%2), %%xmm14 \n\t" // x2 - "movss 12(%2), %%xmm15 \n\t" // x3 + "movss (%3), %%xmm12 \n\t" // x0 + "movss 4(%3), %%xmm13 \n\t" // x1 + "movss 8(%3), %%xmm14 \n\t" // x2 + "movss 12(%3), %%xmm15 \n\t" // x3 "shufps $0, %%xmm12, %%xmm12\n\t" "shufps $0, %%xmm13, %%xmm13\n\t" "shufps $0, %%xmm14, %%xmm14\n\t" "shufps $0, %%xmm15, %%xmm15\n\t" - "movss 16(%2), %%xmm0 \n\t" // x4 - "movss 20(%2), %%xmm1 \n\t" // x5 - "movss 24(%2), %%xmm2 \n\t" // x6 - "movss 28(%2), %%xmm3 \n\t" // x7 + "movss 16(%3), %%xmm0 \n\t" // x4 + "movss 20(%3), %%xmm1 \n\t" // x5 + "movss 24(%3), %%xmm2 \n\t" // x6 + "movss 28(%3), %%xmm3 \n\t" // x7 "shufps $0, %%xmm0 , %%xmm0 \n\t" "shufps $0, %%xmm1 , %%xmm1 \n\t" "shufps $0, %%xmm2 , %%xmm2 \n\t" @@ -63,13 +63,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "1: \n\t" "xorps %%xmm4 , %%xmm4 \n\t" "xorps %%xmm5 , %%xmm5 \n\t" - "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y + "movups (%4,%0,4), %%xmm7 \n\t" // 4 * y ".p2align 1 \n\t" - "movups (%4,%0,4), %%xmm8 \n\t" - "movups (%5,%0,4), %%xmm9 \n\t" - "movups (%6,%0,4), %%xmm10 \n\t" - "movups (%7,%0,4), %%xmm11 \n\t" + "movups (%5,%0,4), %%xmm8 \n\t" + "movups (%6,%0,4), %%xmm9 \n\t" + "movups (%7,%0,4), %%xmm10 \n\t" + "movups (%8,%0,4), %%xmm11 \n\t" ".p2align 1 \n\t" "mulps %%xmm12, %%xmm8 \n\t" "mulps %%xmm13, %%xmm9 \n\t" @@ -80,10 +80,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "addps %%xmm10, %%xmm4 \n\t" "addps %%xmm11, %%xmm5 \n\t" - "movups (%4,%8,4), %%xmm8 \n\t" - "movups (%5,%8,4), %%xmm9 \n\t" - "movups (%6,%8,4), %%xmm10 \n\t" - "movups (%7,%8,4), %%xmm11 \n\t" + "movups (%5,%2,4), %%xmm8 \n\t" + "movups (%6,%2,4), %%xmm9 \n\t" + "movups (%7,%2,4), %%xmm10 \n\t" + "movups (%8,%2,4), %%xmm11 \n\t" ".p2align 1 \n\t" "mulps %%xmm0 , %%xmm8 \n\t" "mulps %%xmm1 , %%xmm9 \n\t" @@ -94,28 +94,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "addps %%xmm10, %%xmm4 \n\t" "addps %%xmm11, %%xmm5 \n\t" - "addq $4 , %8 \n\t" + "addq $4 , %2 \n\t" "addps %%xmm5 , %%xmm4 \n\t" "addq $4 , %0 \n\t" "mulps %%xmm6 , %%xmm4 \n\t" "subq $4 , %1 \n\t" "addps %%xmm4 , %%xmm7 \n\t" - "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y + "movups %%xmm7 , -16(%4,%0,4) \n\t" // 4 * y "jnz 1b \n\t" : "+r" (i), // 0 - "+r" (n) // 1 + "+r" (n), // 1 + "+r" (lda4) // 2 : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (lda4), // 8 + "r" (x), // 3 + "r" (y), // 4 + "r" (ap[0]), // 5 + "r" (ap[1]), // 6 + "r" (ap[2]), // 7 + "r" (ap[3]), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", From e976557d2965efb687aaaf88e7829bdd9438a7a6 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 16 Feb 2019 18:36:39 +0100 Subject: [PATCH 093/133] Fix inline assembly constraints rework indices to allow marking argument lda as input and output. 
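The companion rule, applied earlier in this series in the blas_quickdivide and Haswell dgemv fixes, covers scratch registers rather than operands: any register the template overwrites without returning it through an output operand must appear in the clobber list, or the compiler may keep a live value there across the asm. A minimal sketch under the same x86-64 GNU C assumptions; double_it is illustrative only, not OpenBLAS code:

#include <stdio.h>

static double double_it(double x) {
    double r;
    /* xmm7 is used as scratch, so it is declared clobbered;
     * omitting it can silently corrupt surrounding FP code once
     * the optimizer allocates xmm7 for something else. */
    __asm__ ("movapd %[x], %%xmm7\n\t"
             "addsd  %%xmm7, %%xmm7\n\t"
             "movapd %%xmm7, %[r]"
             : [r] "=x" (r)
             : [x] "x" (x)
             : "xmm7");
    return r;
}

int main(void) {
    printf("%f\n", double_it(21.0)); /* prints 42.000000 */
    return 0;
}

Omitting the "xmm7" clobber can appear to work until register allocation changes, which is why such bugs typically surface only with a new compiler release or different optimization flags.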
--- kernel/x86_64/sgemv_n_microk_sandy-4.c | 130 ++++++++++++------------- 1 file changed, 65 insertions(+), 65 deletions(-) diff --git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c index b35daa35b..3fc46542b 100644 --- a/kernel/x86_64/sgemv_n_microk_sandy-4.c +++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c @@ -39,14 +39,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO __asm__ __volatile__ ( "vzeroupper \n\t" - "vbroadcastss (%2), %%ymm12 \n\t" // x0 - "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 - "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 - "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 - "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 - "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 - "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 - "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 + "vbroadcastss (%3), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%3), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%3), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%3), %%ymm15 \n\t" // x3 + "vbroadcastss 16(%3), %%ymm0 \n\t" // x4 + "vbroadcastss 20(%3), %%ymm1 \n\t" // x5 + "vbroadcastss 24(%3), %%ymm2 \n\t" // x6 + "vbroadcastss 28(%3), %%ymm3 \n\t" // x7 "vbroadcastss (%9), %%ymm6 \n\t" // alpha @@ -55,21 +55,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t" - "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y + "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y - "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" - "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" - "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" - "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" + "vmulps (%5,%0,4), %%xmm12, %%xmm8 \n\t" + "vmulps (%6,%0,4), %%xmm13, %%xmm10 \n\t" + "vmulps (%7,%0,4), %%xmm14, %%xmm9 \n\t" + "vmulps (%8,%0,4), %%xmm15, %%xmm11 \n\t" "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" - "vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t" - "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t" - "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t" - "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t" + "vmulps (%5,%2,4), %%xmm0 , %%xmm8 \n\t" + "vmulps (%6,%2,4), %%xmm1 , %%xmm10 \n\t" + "vmulps (%7,%2,4), %%xmm2 , %%xmm9 \n\t" + "vmulps (%8,%2,4), %%xmm3 , %%xmm11 \n\t" "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" @@ -79,9 +79,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t" "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t" - "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y - "addq $4, %8 \n\t" + "addq $4, %2 \n\t" "addq $4, %0 \n\t" "subq $4, %1 \n\t" @@ -92,21 +92,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" - "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y + "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y - "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" - "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" - "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" - "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" + "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t" + "vmulps (%7,%0,4), %%ymm14, %%ymm9 \n\t" + "vmulps (%8,%0,4), %%ymm15, %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - 
"vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" - "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" - "vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t" - "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t" + "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t" + "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t" + "vmulps (%7,%2,4), %%ymm2 , %%ymm9 \n\t" + "vmulps (%8,%2,4), %%ymm3 , %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" @@ -116,9 +116,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t" "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t" - "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y - "addq $8, %8 \n\t" + "addq $8, %2 \n\t" "addq $8, %0 \n\t" "subq $8, %1 \n\t" @@ -134,45 +134,45 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" - "prefetcht0 192(%4,%0,4) \n\t" - "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" - "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t" "prefetcht0 192(%5,%0,4) \n\t" - "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" - "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" + "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps 32(%5,%0,4), %%ymm12, %%ymm9 \n\t" + "prefetcht0 192(%6,%0,4) \n\t" + "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t" + "vmulps 32(%6,%0,4), %%ymm13, %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - "prefetcht0 192(%6,%0,4) \n\t" - "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" - "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" "prefetcht0 192(%7,%0,4) \n\t" - "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" - "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t" + "vmulps (%7,%0,4), %%ymm14, %%ymm8 \n\t" + "vmulps 32(%7,%0,4), %%ymm14, %%ymm9 \n\t" + "prefetcht0 192(%8,%0,4) \n\t" + "vmulps (%8,%0,4), %%ymm15, %%ymm10 \n\t" + "vmulps 32(%8,%0,4), %%ymm15, %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - "prefetcht0 192(%4,%8,4) \n\t" - "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" - "vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t" - "prefetcht0 192(%5,%8,4) \n\t" - "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" - "vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t" + "prefetcht0 192(%5,%2,4) \n\t" + "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t" + "vmulps 32(%5,%2,4), %%ymm0 , %%ymm9 \n\t" + "prefetcht0 192(%6,%2,4) \n\t" + "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t" + "vmulps 32(%6,%2,4), %%ymm1 , %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - "prefetcht0 192(%6,%8,4) \n\t" - "vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t" - "vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t" - "prefetcht0 192(%7,%8,4) \n\t" - "vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t" - "vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t" + "prefetcht0 192(%7,%2,4) \n\t" + "vmulps (%7,%2,4), %%ymm2 , %%ymm8 \n\t" + "vmulps 32(%7,%2,4), %%ymm2 , %%ymm9 \n\t" + "prefetcht0 192(%8,%2,4) \n\t" + "vmulps (%8,%2,4), %%ymm3 , %%ymm10 \n\t" + "vmulps 32(%8,%2,4), %%ymm3 , %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" @@ -181,13 +181,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vmulps 
%%ymm6, %%ymm4 , %%ymm4 \n\t" "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t" - "vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y - "vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y + "vaddps (%4,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y + "vaddps 32(%4,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y - "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y - "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm4, (%4,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, 32(%4,%0,4) \n\t" // 8 * y - "addq $16, %8 \n\t" + "addq $16, %2 \n\t" "addq $16, %0 \n\t" "subq $16, %1 \n\t" "jnz 1b \n\t" @@ -197,15 +197,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO : "+r" (i), // 0 - "+r" (n) // 1 + "+r" (n), // 1 + "+r" (lda4) // 2 : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (lda4), // 8 + "r" (x), // 3 + "r" (y), // 4 + "r" (ap[0]), // 5 + "r" (ap[1]), // 6 + "r" (ap[2]), // 7 + "r" (ap[3]), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", From efb9038f7273cddc1ef30fce6ed4df7967a2fb03 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 16 Feb 2019 18:46:17 +0100 Subject: [PATCH 094/133] Fix inline assembly constraints --- kernel/x86_64/sgemv_n_microk_bulldozer-4.c | 194 ++++++++++----------- 1 file changed, 97 insertions(+), 97 deletions(-) diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c index 31001c7f3..bbf06c84b 100644 --- a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c +++ b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c @@ -37,14 +37,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO __asm__ __volatile__ ( - "vbroadcastss (%2), %%xmm12 \n\t" // x0 - "vbroadcastss 4(%2), %%xmm13 \n\t" // x1 - "vbroadcastss 8(%2), %%xmm14 \n\t" // x2 - "vbroadcastss 12(%2), %%xmm15 \n\t" // x3 - "vbroadcastss 16(%2), %%xmm0 \n\t" // x4 - "vbroadcastss 20(%2), %%xmm1 \n\t" // x5 - "vbroadcastss 24(%2), %%xmm2 \n\t" // x6 - "vbroadcastss 28(%2), %%xmm3 \n\t" // x7 + "vbroadcastss (%3), %%xmm12 \n\t" // x0 + "vbroadcastss 4(%3), %%xmm13 \n\t" // x1 + "vbroadcastss 8(%3), %%xmm14 \n\t" // x2 + "vbroadcastss 12(%3), %%xmm15 \n\t" // x3 + "vbroadcastss 16(%3), %%xmm0 \n\t" // x4 + "vbroadcastss 20(%3), %%xmm1 \n\t" // x5 + "vbroadcastss 24(%3), %%xmm2 \n\t" // x6 + "vbroadcastss 28(%3), %%xmm3 \n\t" // x7 "vbroadcastss (%9), %%xmm8 \n\t" // alpha @@ -54,22 +54,22 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" - "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t" - "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" - "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%6,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%8,%0,4), %%xmm15, %%xmm5 \n\t" "addq $4 , %0 \n\t" - "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" - "vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" - "vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t" - "addq $4 , %8 \n\t" + "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" + "vfmaddps %%xmm5, (%6,%2,4), %%xmm1 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" + "vfmaddps %%xmm5, (%8,%2,4), %%xmm3 , %%xmm5 \n\t" + "addq $4 , %2 \n\t" "vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t" - 
"vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t" + "vfmaddps -16(%4,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t" "subq $4 , %1 \n\t" - "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm6, -16(%4,%0,4) \n\t" // 4 * y "2: \n\t" @@ -79,31 +79,31 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" - "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" - "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" - "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" - - "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t" + + "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t" - "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" - "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" - "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y - "vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y + "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" + "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" + "vmovups %%xmm4, (%4,%0,4) \n\t" // 4 * y + "vmovups %%xmm5, 16(%4,%0,4) \n\t" // 4 * y "addq $8 , %0 \n\t" - "addq $8 , %8 \n\t" + "addq $8 , %2 \n\t" "subq $8 , %1 \n\t" @@ -120,62 +120,62 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t" "vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t" - "prefetcht0 192(%4,%0,4) \n\t" - "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" "prefetcht0 192(%5,%0,4) \n\t" - "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t" "prefetcht0 192(%6,%0,4) \n\t" - "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t" "prefetcht0 192(%7,%0,4) \n\t" - "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" + "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, 
%%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t" + "prefetcht0 192(%8,%0,4) \n\t" + "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t" ".align 2 \n\t" - "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" - - "vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t" - - "prefetcht0 192(%4,%8,4) \n\t" - "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" - "prefetcht0 192(%5,%8,4) \n\t" - "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" - "prefetcht0 192(%6,%8,4) \n\t" - "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" - "prefetcht0 192(%7,%8,4) \n\t" - "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" + "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t" + + "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm12, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm12, %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm13, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm13, %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm14, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm14, %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%8,%0,4), %%xmm15, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%8,%0,4), %%xmm15, %%xmm7 \n\t" + + "prefetcht0 192(%5,%2,4) \n\t" + "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t" + "prefetcht0 192(%6,%2,4) \n\t" + "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t" + "prefetcht0 192(%7,%2,4) \n\t" + "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t" + "prefetcht0 192(%8,%2,4) \n\t" + "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t" - "vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%5,%2,4), %%xmm0 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%5,%2,4), %%xmm0 , %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%6,%2,4), %%xmm1 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%6,%2,4), %%xmm1 , %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%7,%2,4), %%xmm2 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%7,%2,4), %%xmm2 , %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%8,%2,4), %%xmm3 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%8,%2,4), %%xmm3 , %%xmm7 \n\t" - "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" - "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" - "vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t" - "vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t" + "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" + "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" + "vfmaddps 32(%4,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t" + "vfmaddps 
48(%4,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t" "addq $16, %0 \n\t" - "vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y - "vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y - "addq $16, %8 \n\t" - "vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y - "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm4,-64(%4,%0,4) \n\t" // 4 * y + "vmovups %%xmm5,-48(%4,%0,4) \n\t" // 4 * y + "addq $16, %2 \n\t" + "vmovups %%xmm6,-32(%4,%0,4) \n\t" // 4 * y + "vmovups %%xmm7,-16(%4,%0,4) \n\t" // 4 * y "subq $16, %1 \n\t" "jnz 1b \n\t" @@ -184,15 +184,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO : "+r" (i), // 0 - "+r" (n) // 1 + "+r" (n), // 1 + "+r" (lda4) // 2 : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (lda4), // 8 + "r" (x), // 3 + "r" (y), // 4 + "r" (ap[0]), // 5 + "r" (ap[1]), // 6 + "r" (ap[2]), // 7 + "r" (ap[3]), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", From 8242b1fe3f6c3a49b342d99157cd04632267c009 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 16 Feb 2019 18:51:09 +0100 Subject: [PATCH 095/133] Fix inline assembly constraints --- dgemv_n_microk_piledriver-4.c | 247 ++++++++++++++++++++++++++++++++++ 1 file changed, 247 insertions(+) create mode 100644 dgemv_n_microk_piledriver-4.c diff --git a/dgemv_n_microk_piledriver-4.c b/dgemv_n_microk_piledriver-4.c new file mode 100644 index 000000000..466931b82 --- /dev/null +++ b/dgemv_n_microk_piledriver-4.c @@ -0,0 +1,247 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#define HAVE_KERNEL_4x8 1 +static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastsd (%3), %%ymm12 \n\t" // x0 + "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1 + "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2 + "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3 + "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4 + "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5 + "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6 + "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7 + + "vbroadcastsd (%9), %%ymm6 \n\t" // alpha + + "testq $0x04, %1 \n\t" + "jz 2f \n\t" + + "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + + "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t" + + "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" + "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t" + "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" + "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t" + + "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" + "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" + + + "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y + + "addq $4 , %2 \n\t" + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + "2: \n\t" + + "cmpq $0, %1 \n\t" + "je 3f \n\t" + + + ".align 16 \n\t" + "1: \n\t" + + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y + "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y + + "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t" + "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t" + "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t" + "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t" + + "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" + "addq $8 , %0 \n\t" + "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t" + "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t" + "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t" + "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" + "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t" + "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t" + "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t" + + "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" + "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" + + "addq $8 , %2 \n\t" + "vmovupd %%ymm8,-64(%4,%0,8) \n\t" // 4 * y + "subq $8 , %1 \n\t" + "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y + + "jnz 1b \n\t" + + "3: \n\t" + "vzeroupper \n\t" + + : + "+r" (i), // 0 + "+r" (n), // 1 + "+r" (lda4) // 2 + : + "r" (x), // 3 + "r" (y), // 4 + "r" (ap[0]), // 5 + "r" (ap[1]), // 6 + "r" (ap[2]), // 7 + "r" (ap[3]), // 8 + "r" (alpha) // 9 + : "cc", + "%xmm0", "%xmm1", + "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + + +#define HAVE_KERNEL_4x4 1 +static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT
*x, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG register i = 0; + + __asm__ __volatile__ + ( + "vzeroupper \n\t" + "vbroadcastsd (%2), %%ymm12 \n\t" // x0 + "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 + "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 + "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 + + "vbroadcastsd (%8), %%ymm6 \n\t" // alpha + + "testq $0x04, %1 \n\t" + "jz 2f \n\t" + + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y + + "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" + + "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" + "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" + + "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y + + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + + "2: \n\t" + + "cmpq $0, %1 \n\t" + "je 3f \n\t" + + + ".align 16 \n\t" + "1: \n\t" + "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" + "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y + "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y + + "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" + "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" + "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" + + "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" + "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" + + "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y + "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y + + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + "jnz 1b \n\t" + + "3: \n\t" + "vzeroupper \n\t" + + : + "+r" (i), // 0 + "+r" (n) // 1 + : + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (alpha) // 8 + : "cc", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); + +} + + From f9bb76d29af48f448a8ab2bdfffc962d9623a3df Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 16 Feb 2019 20:06:48 +0100 Subject: [PATCH 096/133] Fix inline assembly constraints in Bulldozer TRSM kernels rework indices to allow marking i,as and bs as both input and output (marked operand n1 as well for simplicity). 
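Why the rework has to renumber every %N in these long templates, sketched with illustrative code that is not taken from the TRSM kernels: GCC numbers asm operands in declaration order, outputs first, so promoting as and bs (and i, n1) from the input list to "+"-qualified outputs moves them into low-numbered slots and shifts every remaining operand number in the template string.

#include <stdio.h>

/* Sums x[0..n-1].  The pointer x and counter n are advanced inside the
 * asm, so they sit in the output list as "+r"; the memory result s
 * follows them.  Any pure inputs would start at %3 -- the same
 * outputs-first numbering that forced the index rework above. */
static float fsum(long n, const float *x)   /* assumes n > 0 */
{
    float s = 0.0f;
    __asm__ __volatile__ (
        "xorps %%xmm0, %%xmm0   \n\t"
        "1:                     \n\t"
        "addss (%1), %%xmm0     \n\t"   /* s += *x           */
        "addq  $4, %1           \n\t"   /* x++  -> needs "+r" */
        "subq  $1, %0           \n\t"   /* n--  -> needs "+r" */
        "jnz   1b               \n\t"
        "movss %%xmm0, %2       \n\t"
        : "+r" (n),                     /* 0: in-out          */
          "+r" (x),                     /* 1: in-out          */
          "=m" (s)                      /* 2: output          */
        :                               /* inputs begin at %3 */
        : "cc", "%xmm0"
    );
    return s;
}

int main(void)
{
    const float v[3] = { 1.0f, 2.0f, 3.0f };
    printf("%f\n", fsum(3, v));         /* prints 6.000000 */
    return 0;
}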
For #2009 --- kernel/x86_64/dtrsm_kernel_RT_bulldozer.c | 96 ++++---- kernel/x86_64/strsm_kernel_LN_bulldozer.c | 252 ++++++++++----------- kernel/x86_64/strsm_kernel_LT_bulldozer.c | 256 +++++++++++----------- kernel/x86_64/strsm_kernel_RN_bulldozer.c | 54 ++--- kernel/x86_64/strsm_kernel_RT_bulldozer.c | 54 ++--- 5 files changed, 356 insertions(+), 356 deletions(-) diff --git a/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c b/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c index 54df5b359..35ed4cc01 100644 --- a/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c +++ b/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c @@ -125,14 +125,14 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .align 16 \n\t" "1: \n\t" - " prefetcht0 384(%2,%1,8) \n\t" - " prefetcht0 384(%3,%1,8) \n\t" - " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vmovddup 8(%3,%1,2), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " prefetcht0 384(%6,%1,8) \n\t" + " prefetcht0 384(%7,%1,8) \n\t" + " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vmovddup 8(%7,%1,2), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -147,13 +147,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " jz 2f \n\t" - " prefetcht0 384(%2,%1,8) \n\t" - " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vmovddup 8(%3,%1,2), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " prefetcht0 384(%6,%1,8) \n\t" + " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vmovddup 8(%7,%1,2), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -168,13 +168,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " jz 2f \n\t" - " prefetcht0 384(%2,%1,8) \n\t" - " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vmovddup 8(%3,%1,2), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " prefetcht0 384(%6,%1,8) \n\t" + " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vmovddup 8(%7,%1,2), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -189,13 +189,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " jz 2f \n\t" - " prefetcht0 384(%2,%1,8) \n\t" - " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b - " vmovddup 8(%3,%1,2), %%xmm1 \n\t" - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " prefetcht0 384(%6,%1,8) \n\t" + " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b + " vmovddup 8(%7,%1,2), %%xmm1 \n\t" + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 
32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -235,18 +235,18 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "3: \n\t" // i = 1 - " vmovddup (%7), %%xmm1 \n\t" // read b - " vmovddup 8(%7), %%xmm0 \n\t" // read bb + " vmovddup (%3), %%xmm1 \n\t" // read b + " vmovddup 8(%3), %%xmm0 \n\t" // read bb " vmulpd %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb " vmulpd %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb " vmulpd %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb " vmulpd %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb - " vmovups %%xmm12 , (%6) \n\t" // write a - " vmovups %%xmm13 , 16(%6) \n\t" // write a - " vmovups %%xmm14 , 32(%6) \n\t" // write a - " vmovups %%xmm15 , 48(%6) \n\t" // write a + " vmovups %%xmm12 , (%2) \n\t" // write a + " vmovups %%xmm13 , 16(%2) \n\t" // write a + " vmovups %%xmm14 , 32(%2) \n\t" // write a + " vmovups %%xmm15 , 48(%2) \n\t" // write a " vmovups %%xmm12 , (%5) \n\t" // write c1 " vmovups %%xmm13 , 16(%5) \n\t" @@ -259,20 +259,20 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddpd %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t" " \n\t" // i = 0 - " subq $16 , %7 \n\t" // b = b - 2 - " subq $64 , %6 \n\t" // a = a - 8 + " subq $16 , %3 \n\t" // b = b - 2 + " subq $64 , %2 \n\t" // a = a - 8 - " vmovddup (%7), %%xmm0 \n\t" // read bb + " vmovddup (%3), %%xmm0 \n\t" // read bb " vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb " vmulpd %%xmm9 , %%xmm0 , %%xmm9 \n\t" " vmulpd %%xmm10 , %%xmm0 , %%xmm10 \n\t" " vmulpd %%xmm11 , %%xmm0 , %%xmm11 \n\t" - " vmovups %%xmm8 , (%6) \n\t" // write a - " vmovups %%xmm9 , 16(%6) \n\t" - " vmovups %%xmm10 , 32(%6) \n\t" - " vmovups %%xmm11 , 48(%6) \n\t" + " vmovups %%xmm8 , (%2) \n\t" // write a + " vmovups %%xmm9 , 16(%2) \n\t" + " vmovups %%xmm10 , 32(%2) \n\t" + " vmovups %%xmm11 , 48(%2) \n\t" " vmovups %%xmm8 , (%4) \n\t" // write c0 " vmovups %%xmm9 , 16(%4) \n\t" @@ -282,15 +282,15 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vzeroupper \n\t" : + "+r" (n1), // 0 + "+a" (i), // 1 + "+r" (as), // 2 + "+r" (bs) // 3 : - "r" (n1), // 0 - "a" (i), // 1 - "r" (a), // 2 - "r" (b), // 3 "r" (c), // 4 "r" (c1), // 5 - "r" (as), // 6 - "r" (bs) // 7 + "r" (a), // 6 + "r" (b) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/kernel/x86_64/strsm_kernel_LN_bulldozer.c b/kernel/x86_64/strsm_kernel_LN_bulldozer.c index 1b8991c6c..3cd215000 100644 --- a/kernel/x86_64/strsm_kernel_LN_bulldozer.c +++ b/kernel/x86_64/strsm_kernel_LN_bulldozer.c @@ -126,12 +126,12 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .align 16 \n\t" "1: \n\t" - " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -171,20 +171,20 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "3: 
\n\t" - " vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i] + " vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i] " vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -194,23 +194,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i] + " vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i] " vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -220,23 +220,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i] + " vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i] " vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) 
\n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -246,22 +246,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i] + " vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i] " vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -269,22 +269,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i] + " vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i] " vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 
, %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -292,22 +292,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i] + " vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i] " vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -315,22 +315,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9 , read aa[i] + " vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9 , read aa[i] " vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -338,179 +338,179 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8 , read aa[i] + " vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8 , read aa[i] " vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 
* aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7 , read aa[i] + " vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7 , read aa[i] " vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6 , read aa[i] + " vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6 , read aa[i] " vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5 , read aa[i] + " vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5 , read aa[i] " vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // 
bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4 , read aa[i] + " vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4 , read aa[i] " vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3 , read aa[i] + " vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3 , read aa[i] " vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2 , read aa[i] + " vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2 , read aa[i] " vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // 
read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1 , read aa[i] + " vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1 , read aa[i] " vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " subq $64 , %6 \n\t" // a -= m - " subq $8 , %7 \n\t" // b -= n + " subq $64 , %2 \n\t" // a -= m + " subq $8 , %3 \n\t" // b -= n - " vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0 , read aa[i] + " vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0 , read aa[i] " vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa " vzeroupper \n\t" : + "+r" (n1), // 0 + "+a" (i), // 1 + "+r" (as), // 2 + "+r" (bs) // 3 : - "r" (n1), // 0 - "a" (i), // 1 - "r" (a), // 2 - "r" (b), // 3 "r" (c), // 4 "r" (c1), // 5 - "r" (as), // 6 - "r" (bs) // 7 + "r" (a), // 6 + "r" (b) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/kernel/x86_64/strsm_kernel_LT_bulldozer.c b/kernel/x86_64/strsm_kernel_LT_bulldozer.c index 0623dddb0..a4a62491c 100644 --- a/kernel/x86_64/strsm_kernel_LT_bulldozer.c +++ b/kernel/x86_64/strsm_kernel_LT_bulldozer.c @@ -121,12 +121,12 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .align 16 \n\t" "1: \n\t" - " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -166,20 +166,20 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "3: \n\t" - " vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0, read aa[i] + " vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0, read aa[i] " vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // 
extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -189,23 +189,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1, read aa[i] + " vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1, read aa[i] " vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -215,23 +215,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2, read aa[i] + " vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2, read aa[i] " vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%6) , %%xmm5 \n\t" // read 
a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -241,22 +241,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3, read aa[i] + " vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3, read aa[i] " vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" @@ -264,22 +264,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4, read aa[i] + " vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4, read aa[i] " vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" @@ -287,22 +287,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 
\n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5, read aa[i] + " vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5, read aa[i] " vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" @@ -310,22 +310,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6, read aa[i] + " vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6, read aa[i] " vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" @@ -333,179 +333,179 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7, read aa[i] + " vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7, read aa[i] " vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " 
vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8, read aa[i] + " vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8, read aa[i] " vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9, read aa[i] + " vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9, read aa[i] " vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i] + " vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i] " vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" 
// b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i] + " vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i] " vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i] + " vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i] " vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i] + " vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i] " vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 
\n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i] + " vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i] " vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] + " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %6 \n\t" // a -= m - " addq $8 , %7 \n\t" // b -= n + " addq $64 , %2 \n\t" // a -= m + " addq $8 , %3 \n\t" // b -= n - " vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i] + " vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i] " vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa " vzeroupper \n\t" : + "+r" (n1), // 0 + "+a" (i), // 1 + "+r" (as), // 2 + "+r" (bs) // 3 : - "r" (n1), // 0 - "a" (i), // 1 - "r" (a), // 2 - "r" (b), // 3 - "r" (c), // 4 - "r" (c1), // 5 - "r" (as), // 6 - "r" (bs) // 7 + "r" (c), // 4 + "r" (c1), // 5 + "r" (a), // 6 + "r" (b) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/kernel/x86_64/strsm_kernel_RN_bulldozer.c b/kernel/x86_64/strsm_kernel_RN_bulldozer.c index 4cc557d55..c11c84cec 100644 --- a/kernel/x86_64/strsm_kernel_RN_bulldozer.c +++ b/kernel/x86_64/strsm_kernel_RN_bulldozer.c @@ -121,12 +121,12 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .align 16 \n\t" "1: \n\t" - " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -166,18 +166,18 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "3: \n\t" // i = 0 - " vbroadcastss (%7), %%xmm0 \n\t" // read bb - " vbroadcastss 4(%7), %%xmm1 \n\t" // read b + " vbroadcastss (%3), %%xmm0 \n\t" // read bb + " vbroadcastss 4(%3), %%xmm1 \n\t" // read b " vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb " vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t" " vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t" " vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t" - " vmovups %%xmm8 , 
(%6) \n\t" // write a - " vmovups %%xmm9 , 16(%6) \n\t" - " vmovups %%xmm10 , 32(%6) \n\t" - " vmovups %%xmm11 , 48(%6) \n\t" + " vmovups %%xmm8 , (%2) \n\t" // write a + " vmovups %%xmm9 , 16(%2) \n\t" + " vmovups %%xmm10 , 32(%2) \n\t" + " vmovups %%xmm11 , 48(%2) \n\t" " vmovups %%xmm8 , (%4) \n\t" // write c0 " vmovups %%xmm9 , 16(%4) \n\t" @@ -190,20 +190,20 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm15 , %%xmm11 , %%xmm1 , %%xmm15 \n\t" " \n\t" // i = 1 - " addq $8 , %7 \n\t" // b = b + 2 - " addq $64 , %6 \n\t" // a = a + 16 + " addq $8 , %3 \n\t" // b = b + 2 + " addq $64 , %2 \n\t" // a = a + 16 - " vbroadcastss 4(%7), %%xmm0 \n\t" // read bb + " vbroadcastss 4(%3), %%xmm0 \n\t" // read bb " vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb " vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb " vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb " vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb - " vmovups %%xmm12 , (%6) \n\t" // write a - " vmovups %%xmm13 , 16(%6) \n\t" // write a - " vmovups %%xmm14 , 32(%6) \n\t" // write a - " vmovups %%xmm15 , 48(%6) \n\t" // write a + " vmovups %%xmm12 , (%2) \n\t" // write a + " vmovups %%xmm13 , 16(%2) \n\t" // write a + " vmovups %%xmm14 , 32(%2) \n\t" // write a + " vmovups %%xmm15 , 48(%2) \n\t" // write a " vmovups %%xmm12 , (%5) \n\t" // write c1 " vmovups %%xmm13 , 16(%5) \n\t" @@ -213,15 +213,15 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vzeroupper \n\t" : + "+r" (n1), // 0 + "+a" (i), // 1 + "+r" (as), // 2 + "+r" (bs) // 3 : - "r" (n1), // 0 - "a" (i), // 1 - "r" (a), // 2 - "r" (b), // 3 - "r" (c), // 4 - "r" (c1), // 5 - "r" (as), // 6 - "r" (bs) // 7 + "r" (c), // 4 + "r" (c1), // 5 + "r" (a), // 6 + "r" (b) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/kernel/x86_64/strsm_kernel_RT_bulldozer.c b/kernel/x86_64/strsm_kernel_RT_bulldozer.c index 73f6e8a95..326ca2976 100644 --- a/kernel/x86_64/strsm_kernel_RT_bulldozer.c +++ b/kernel/x86_64/strsm_kernel_RT_bulldozer.c @@ -125,12 +125,12 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .align 16 \n\t" "1: \n\t" - " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b - " vmovups (%2,%1,8), %%xmm4 \n\t" - " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" - " vmovups 16(%2,%1,8), %%xmm5 \n\t" - " vmovups 32(%2,%1,8), %%xmm6 \n\t" - " vmovups 48(%2,%1,8), %%xmm7 \n\t" + " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b + " vmovups (%6,%1,8), %%xmm4 \n\t" + " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" + " vmovups 16(%6,%1,8), %%xmm5 \n\t" + " vmovups 32(%6,%1,8), %%xmm6 \n\t" + " vmovups 48(%6,%1,8), %%xmm7 \n\t" " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -170,18 +170,18 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "3: \n\t" // i = 1 - " vbroadcastss (%7), %%xmm1 \n\t" // read b - " vbroadcastss 4(%7), %%xmm0 \n\t" // read bb + " vbroadcastss (%3), %%xmm1 \n\t" // read b + " vbroadcastss 4(%3), %%xmm0 \n\t" // read bb " vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb " vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb " vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb " vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb - " vmovups %%xmm12 , (%6) \n\t" // write a - " vmovups %%xmm13 , 16(%6) \n\t" // write a - " vmovups %%xmm14 , 32(%6) \n\t" // write a - " vmovups %%xmm15 , 48(%6) \n\t" // write a + " vmovups 
%%xmm12 , (%2) \n\t" // write a + " vmovups %%xmm13 , 16(%2) \n\t" // write a + " vmovups %%xmm14 , 32(%2) \n\t" // write a + " vmovups %%xmm15 , 48(%2) \n\t" // write a " vmovups %%xmm12 , (%5) \n\t" // write c1 " vmovups %%xmm13 , 16(%5) \n\t" @@ -194,20 +194,20 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t" " \n\t" // i = 0 - " subq $8 , %7 \n\t" // b = b - 2 - " subq $64 , %6 \n\t" // a = a - 16 + " subq $8 , %3 \n\t" // b = b - 2 + " subq $64 , %2 \n\t" // a = a - 16 - " vbroadcastss (%7), %%xmm0 \n\t" // read bb + " vbroadcastss (%3), %%xmm0 \n\t" // read bb " vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb " vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t" " vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t" " vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t" - " vmovups %%xmm8 , (%6) \n\t" // write a - " vmovups %%xmm9 , 16(%6) \n\t" - " vmovups %%xmm10 , 32(%6) \n\t" - " vmovups %%xmm11 , 48(%6) \n\t" + " vmovups %%xmm8 , (%2) \n\t" // write a + " vmovups %%xmm9 , 16(%2) \n\t" + " vmovups %%xmm10 , 32(%2) \n\t" + " vmovups %%xmm11 , 48(%2) \n\t" " vmovups %%xmm8 , (%4) \n\t" // write c0 " vmovups %%xmm9 , 16(%4) \n\t" @@ -217,15 +217,15 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vzeroupper \n\t" : + "+r" (n1), // 0 + "+a" (i), // 1 + "+r" (as), // 2 + "+r" (bs) // 3 : - "r" (n1), // 0 - "a" (i), // 1 - "r" (a), // 2 - "r" (b), // 3 - "r" (c), // 4 - "r" (c1), // 5 - "r" (as), // 6 - "r" (bs) // 7 + "r" (c), // 4 + "r" (c1), // 5 + "r" (a), // 6 + "r" (b) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", From 56089991e2305ce692482186825c44c89a535518 Mon Sep 17 00:00:00 2001 From: TiborGY Date: Sat, 16 Feb 2019 23:26:13 +0100 Subject: [PATCH 097/133] fix the the --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index bba3d1588..91f42e396 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -72,7 +72,7 @@ VERSION = 0.3.6.dev # You can define the maximum number of threads. Basically it should be less # than or equal to the number of CPU threads. If you don't specify one, it's -# automatically detected by the the build system. +# automatically detected by the build system. # If SMT (aka. HT) is enabled on the system, it may or may not be beneficial to # restrict NUM_THREADS to the number of physical cores. By default, the automatic # detection includes logical CPUs, thus allowing the use of SMT. 
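
The three strsm_*_bulldozer changes above all apply the same inline-assembly constraint fix. The asm bodies advance their pointers in place ("addq $64" for a, "addq $8" for b) and step the loop counters, yet the old constraint lists declared every operand as a plain "r" input. GCC is entitled to assume an input-only operand still holds its original value when the asm block ends, so surrounding code that reused those registers could see stale pointers. The fix promotes the modified operands (n1, i, as, bs) to "+r" input/output operands; since outputs are numbered before inputs, every %6/%7 reference in the asm text becomes %2/%3, which accounts for the bulk of the diff. Below is a minimal sketch of the pattern, assuming GCC/Clang extended asm on x86_64; the sum4 helper is illustrative only, not code from any of these patches.

#include <stdio.h>

/* Sums four doubles while the asm body advances the pointer, mirroring
 * the in-asm "addq" pointer updates in the kernels above. */
static double sum4(const double *p) {
    double s;
    __asm__ __volatile__ (
        "movsd    (%[p]), %[s]        \n\t"
        "addsd   8(%[p]), %[s]        \n\t"
        "addq    $16, %[p]            \n\t"  /* the asm modifies the pointer... */
        "addsd    (%[p]), %[s]        \n\t"
        "addsd   8(%[p]), %[s]        \n\t"
        : [s] "=&x" (s), [p] "+r" (p)        /* ...so it must be "+r", not an input-only "r" */
        :
        : "cc", "memory");
    return s;
}

int main(void) {
    double v[4] = { 1.0, 2.0, 3.0, 4.0 };
    printf("%f\n", sum4(v));  /* prints 10.000000 */
    return 0;
}

With the old input-only constraint the same code can appear to work at -O0 and fail only once register allocation or inlining exposes the stale value, which is why bugs of this kind tend to surface as optimization-level-dependent crashes.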
From 78d9910236739e98a16244679bbd814f1d79ca7f Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Feb 2019 20:59:48 +0100 Subject: [PATCH 098/133] Correct range_n limiting same bug as seen in #1388, somehow missed in corresponding PR #1389 --- driver/level2/trmv_thread.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c index 24b881a93..00092e956 100644 --- a/driver/level2/trmv_thread.c +++ b/driver/level2/trmv_thread.c @@ -346,8 +346,8 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); - if (range_n[num_cpu] > m) range_n[num_cpu] = m; - + if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu; + } queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; queue[num_cpu].args = &args; @@ -386,8 +386,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); - if (range_n[num_cpu] > m) range_n[num_cpu] = m; - + if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu; queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; queue[num_cpu].args = &args; From e29b0cfcc439b1598ba26486763b3cfa46583a9e Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Feb 2019 21:03:30 +0100 Subject: [PATCH 099/133] Allow multithreading TRMV again revert workaround introduced for issue #1332 as the actual cause appears to be my incorrect fix from #1262 (see #1388) --- interface/trmv.c | 5 +---- interface/ztrmv.c | 3 --- 2 files changed, 1 insertion(+), 7 deletions(-) diff --git a/interface/trmv.c b/interface/trmv.c index 7c40ae976..2e52527a3 100644 --- a/interface/trmv.c +++ b/interface/trmv.c @@ -218,11 +218,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP -/* nthreads = num_cpu_avail(2); + nthreads = num_cpu_avail(2); -FIXME trmv_thread was found to be broken, see issue 1332 */ - nthreads = 1; - if (nthreads == 1) { #endif diff --git a/interface/ztrmv.c b/interface/ztrmv.c index 0e16632e0..4c47e9e91 100644 --- a/interface/ztrmv.c +++ b/interface/ztrmv.c @@ -239,9 +239,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, } else nthreads = 1; -/* FIXME TRMV multithreading appears to be broken, see issue 1332*/ - nthreads = 1; - if(nthreads > 1) { buffer_size = n > 16 ? 
0 : n * 4 + 40; } From 45333d57931ddc64fb3e8a091e0616dd9528cef1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 19 Feb 2019 22:16:33 +0100 Subject: [PATCH 100/133] Fix error introduced during cleanup --- driver/level2/trmv_thread.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c index 00092e956..43eeb40d2 100644 --- a/driver/level2/trmv_thread.c +++ b/driver/level2/trmv_thread.c @@ -347,7 +347,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu; - } + queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; queue[num_cpu].args = &args; @@ -387,6 +387,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu; + queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; queue[num_cpu].args = &args; From 343b301d14875a17ff4357bd98bea29d0df70741 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 20 Feb 2019 10:27:48 +0100 Subject: [PATCH 101/133] Reduce list of kernels in the dynamic arch build to make compilation complete reliably within the 1h limit again --- appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/appveyor.yml b/appveyor.yml index 741c66291..44a616aaa 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -55,7 +55,7 @@ before_build: - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. - - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON .. + - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. build_script: - cmake --build . From 6eee1beac524b5582a6c6de14d9d35a78c1ece74 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Sun, 24 Feb 2019 20:41:02 +0200 Subject: [PATCH 103/133] move fix to right place --- dgemv_n_microk_piledriver-4.c | 247 -------------------- kernel/x86_64/dgemv_n_microk_piledriver-4.c | 98 ++++---- 2 files changed, 49 insertions(+), 296 deletions(-) delete mode 100644 dgemv_n_microk_piledriver-4.c diff --git a/dgemv_n_microk_piledriver-4.c b/dgemv_n_microk_piledriver-4.c deleted file mode 100644 index 466931b82..000000000 --- a/dgemv_n_microk_piledriver-4.c +++ /dev/null @@ -1,247 +0,0 @@ -/*************************************************************************** -Copyright (c) 2014, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. 
Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - - -#define HAVE_KERNEL_4x8 1 -static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); - -static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) -{ - - BLASLONG register i = 0; - - __asm__ __volatile__ - ( - "vzeroupper \n\t" - "vbroadcastsd (%3), %%ymm12 \n\t" // x0 - "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1 - "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2 - "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3 - "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4 - "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5 - "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6 - "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7 - - "vbroadcastsd (%9), %%ymm6 \n\t" // alpha - - "testq $0x04, %1 \n\t" - "jz 2f \n\t" - - "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y - "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - - "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t" - - "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" - "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t" - "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" - "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t" - - "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" - "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" - "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" - - - "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y - - "addq $4 , %2 \n\t" - "addq $4 , %0 \n\t" - "subq $4 , %1 \n\t" - - "2: \n\t" - - "cmpq $0, %1 \n\t" - "je 3f \n\t" - - - ".align 16 \n\t" - "1: \n\t" - - "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y - "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y - - "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t" - "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t" - "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t" - "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t" - - "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" - "addq $8 , %0 \n\t" - "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 
\n\t" - "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t" - "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t" - "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" - "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t" - "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t" - "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t" - - "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" - "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" - - "addq $8 , %2 \n\t" - "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y - "subq $8 , %1 \n\t" - "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y - - "jnz 1b \n\t" - - "3: \n\t" - "vzeroupper \n\t" - - : - "+r" (i), // 0 - "+r" (n), // 1 - "+r" (lda4) // 2 - : - "r" (x), // 3 - "r" (y), // 4 - "r" (ap[0]), // 5 - "r" (ap[1]), // 6 - "r" (ap[2]), // 7 - "r" (ap[3]), // 8 - "r" (alpha) // 9 - : "cc", - "%xmm0", "%xmm1", - "%xmm2", "%xmm3", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm8", "%xmm9", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - - -#define HAVE_KERNEL_4x4 1 -static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); - -static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG register i = 0; - - __asm__ __volatile__ - ( - "vzeroupper \n\t" - "vbroadcastsd (%2), %%ymm12 \n\t" // x0 - "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 - "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 - "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 - - "vbroadcastsd (%8), %%ymm6 \n\t" // alpha - - "testq $0x04, %1 \n\t" - "jz 2f \n\t" - - "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y - - "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" - - "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" - "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" - "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" - - "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y - - "addq $4 , %0 \n\t" - "subq $4 , %1 \n\t" - - "2: \n\t" - - "cmpq $0, %1 \n\t" - "je 3f \n\t" - - - ".align 16 \n\t" - "1: \n\t" - "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y - "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y - - "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" - "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" - "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" - "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" - - "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" - "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" - - "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y - "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y - - "addq $8 , %0 \n\t" - "subq $8 , %1 \n\t" - "jnz 1b \n\t" - - "3: \n\t" - "vzeroupper \n\t" - - : - "+r" (i), // 0 - "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (alpha) // 8 - : "cc", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm8", "%xmm9", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); - -} - - diff --git a/kernel/x86_64/dgemv_n_microk_piledriver-4.c b/kernel/x86_64/dgemv_n_microk_piledriver-4.c index 530780bab..466931b82 100644 --- a/kernel/x86_64/dgemv_n_microk_piledriver-4.c +++ b/kernel/x86_64/dgemv_n_microk_piledriver-4.c @@ 
-38,42 +38,42 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO __asm__ __volatile__ ( "vzeroupper \n\t" - "vbroadcastsd (%2), %%ymm12 \n\t" // x0 - "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 - "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 - "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 - "vbroadcastsd 32(%2), %%ymm0 \n\t" // x4 - "vbroadcastsd 40(%2), %%ymm1 \n\t" // x5 - "vbroadcastsd 48(%2), %%ymm2 \n\t" // x6 - "vbroadcastsd 56(%2), %%ymm3 \n\t" // x7 + "vbroadcastsd (%3), %%ymm12 \n\t" // x0 + "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1 + "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2 + "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3 + "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4 + "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5 + "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6 + "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7 "vbroadcastsd (%9), %%ymm6 \n\t" // alpha "testq $0x04, %1 \n\t" "jz 2f \n\t" - "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y + "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t" - "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" - "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t" - "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" - "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t" + "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" + "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t" + "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" + "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t" "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" - "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y + "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y - "addq $4 , %8 \n\t" + "addq $4 , %2 \n\t" "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" @@ -88,35 +88,35 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y - "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y - - "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" - "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" - "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" - "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" - - "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" + "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y + "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y + + "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t" + "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t" + "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t" + "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t" + + "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" "addq $8 , %0 \n\t" - "vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t" - "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t" - "vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t" - 
"vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" - "vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t" - "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t" - "vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t" + "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t" + "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t" + "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t" + "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" + "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t" + "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t" + "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t" "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" - "addq $8 , %8 \n\t" + "addq $8 , %2 \n\t" "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y "subq $8 , %1 \n\t" - "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y + "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y "jnz 1b \n\t" @@ -125,15 +125,15 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO : "+r" (i), // 0 - "+r" (n) // 1 + "+r" (n), // 1 + "+r" (lda4) // 2 : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 - "r" (ap[1]), // 5 - "r" (ap[2]), // 6 - "r" (ap[3]), // 7 - "r" (lda4), // 8 + "r" (x), // 3 + "r" (y), // 4 + "r" (ap[0]), // 5 + "r" (ap[1]), // 6 + "r" (ap[2]), // 7 + "r" (ap[3]), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", From 918a0cc4d1548617478f925c8341461c055268e5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 25 Feb 2019 17:55:36 +0100 Subject: [PATCH 104/133] Fix missing -c option in AVX512 test --- c_check | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/c_check b/c_check index 38f9170ca..d93b756d5 100644 --- a/c_check +++ b/c_check @@ -232,7 +232,7 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) { ($fh,$tmpf) = tempfile( UNLINK => 1 ); $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; print $tmpf "#include \n\nint main(void){ __asm__ volatile($code); }\n"; - $args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf"; + $args = " -march=skylake-avx512 -c -o $tmpf.o -x c $tmpf"; my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); system(@cmd) == 0; if ($? 
!= 0) { From fd34820b99bd302ed2b31ca0e5fedeb492a179c7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 25 Feb 2019 17:58:31 +0100 Subject: [PATCH 105/133] Fix AVX512 test always returning false due to missing compiler option --- cmake/system_check.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 6b602c1b0..88bb081a6 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -78,7 +78,7 @@ endif() if (X86_64 OR X86) file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include \n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") -execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) +execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -c -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) if (NO_AVX512 EQUAL 1) set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") endif() From d66214c94628bb2050b2ab83361d1ac54d3373b5 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 28 Feb 2019 09:58:25 +0100 Subject: [PATCH 106/133] Make x86_32 imply NO_AVX2, NO_AVX512 in addition to NO_AVX fixes #2033 --- Makefile.system | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.system b/Makefile.system index 67c8cd197..bbd777448 100644 --- a/Makefile.system +++ b/Makefile.system @@ -155,7 +155,7 @@ GETARCH_FLAGS += -DNO_AVX endif ifeq ($(BINARY), 32) -GETARCH_FLAGS += -DNO_AVX +GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512 endif ifeq ($(NO_AVX2), 1) From 2ffb72718787bea52f7958d2fe5b91c489cd2aee Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Thu, 28 Feb 2019 10:51:54 +0100 Subject: [PATCH 107/133] Keep xcode8.3 for osx BINARY=32 build as xcode10 deprecated i386 --- .travis.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.travis.yml b/.travis.yml index ec5dc8a9b..eee7674fe 100644 --- a/.travis.yml +++ b/.travis.yml @@ -160,6 +160,7 @@ matrix: - BTYPE="BINARY=64 INTERFACE64=1" - <<: *test-macos + osx_image: xcode8.3 env: - BTYPE="BINARY=32" From c4868d11c02f1ac97e71afdef3dc49429678959b Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Fri, 1 Mar 2019 09:23:03 +0100 Subject: [PATCH 108/133] Make sure that AVX512 is disabled in 32bit builds for #2033 --- Makefile.system | 1 + 1 file changed, 1 insertion(+) diff --git a/Makefile.system b/Makefile.system index bbd777448..53f89b2fa 100644 --- a/Makefile.system +++ b/Makefile.system @@ -156,6 +156,7 @@ endif ifeq ($(BINARY), 32) GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512 +NO_AVX512 = 1 endif ifeq ($(NO_AVX2), 1) From 25427926bc8b74a48e335ae05c56cbfd8d0187b9 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 2 Mar 2019 23:36:36 +0100 Subject: [PATCH 109/133] Improve handling of NO_STATIC and NO_SHARED to avoid surprises from defining either as zero. 
Fixes #2035 by addressing some concerns from #1422 --- Makefile | 2 +- Makefile.install | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 21096f893..273fde33e 100644 --- a/Makefile +++ b/Makefile @@ -96,7 +96,7 @@ endif @echo shared : -ifndef NO_SHARED +ifneq ($(NO_SHARED), 1) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) @$(MAKE) -C exports so @ln -fs $(LIBSONAME) $(LIBPREFIX).so diff --git a/Makefile.install b/Makefile.install index 069c96c6a..fefecd98d 100644 --- a/Makefile.install +++ b/Makefile.install @@ -58,14 +58,14 @@ ifndef NO_LAPACKE endif #for install static library -ifndef NO_STATIC +ifneq ($(NO_STATIC),1) @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) endif #for install shared library -ifndef NO_SHARED +ifneq ($(NO_SHARED),1) @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @@ -106,14 +106,14 @@ ifndef NO_LAPACKE endif #for install static library -ifndef NO_STATIC +ifneq ($(NO_STATIC),1) @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) endif #for install shared library -ifndef NO_SHARED +ifneq ($(NO_SHARED),1) @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @@ -138,7 +138,7 @@ endif @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" -ifndef NO_SHARED +ifneq ($(NO_SHARED),1) #ifeq logical or ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" From e4a79be6bb9fac2ba18d820d83bc7bf9173a63c2 Mon Sep 17 00:00:00 2001 From: Andrew <16061801+brada4@users.noreply.github.com> Date: Sun, 3 Mar 2019 09:05:11 +0200 Subject: [PATCH 111/133] address warning introed with #1814 et al --- driver/others/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 09851f15c..c30ca71cb 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -2584,7 +2584,7 @@ void *blas_memory_alloc(int procpos){ int position; #if defined(WHEREAMI) && !defined(USE_OPENMP) - int mypos; + int mypos = 0; #endif void *map_address; From af480b02a4a45df377acf9be0d6078609bb345c2 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 3 Mar 2019 14:17:07 +0100 Subject: [PATCH 112/133] Restore locking optimizations for OpenMP case restore another accidentally dropped part of #1468 that was missed in #2004 to address performance regression reported in #1461 --- driver/others/memory.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index 2e185593e..a40cb442a 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ 
-2647,21 +2647,26 @@ void *blas_memory_alloc(int procpos){ position = 0; +#if defined(SMP) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); +#endif do { -/* if (!memory[position].used) { */ -/* blas_lock(&memory[position].lock);*/ - +#if defined(USE_OPENMP) + if (!memory[position].used) { + blas_lock(&memory[position].lock); +#endif if (!memory[position].used) goto allocation; -/* blas_unlock(&memory[position].lock);*/ -/* } */ - +#if defined(USE_OPENMP) + blas_unlock(&memory[position].lock); + } +#endif position ++; } while (position < NUM_BUFFERS); - UNLOCK_COMMAND(&alloc_lock); - +#if defined(SMP) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); +#endif goto error; allocation : @@ -2671,9 +2676,11 @@ void *blas_memory_alloc(int procpos){ #endif memory[position].used = 1; - +#if defined(SMP) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); - +#else + blas_unlock(&memory[position].lock); +#endif if (!memory[position].addr) { do { #ifdef DEBUG From 783ba8058fbc6d5f0a56d27bc368b659448b1fb1 Mon Sep 17 00:00:00 2001 From: maomao194313 Date: Mon, 4 Mar 2019 16:30:50 +0800 Subject: [PATCH 113/133] HiSilicon tsv110 CPUs optimization branch add HiSilicon tsv110 CPUs optimization branch --- kernel/arm64/KERNEL.TSV110 | 175 +++++++++++++++++++++++++++++++++++++ 1 file changed, 175 insertions(+) create mode 100644 kernel/arm64/KERNEL.TSV110 diff --git a/kernel/arm64/KERNEL.TSV110 b/kernel/arm64/KERNEL.TSV110 new file mode 100644 index 000000000..04d6940d7 --- /dev/null +++ b/kernel/arm64/KERNEL.TSV110 @@ -0,0 +1,175 @@ +SAMINKERNEL = ../arm/amin.c +DAMINKERNEL = ../arm/amin.c +CAMINKERNEL = ../arm/zamin.c +ZAMINKERNEL = ../arm/zamin.c + +SMAXKERNEL = ../arm/max.c +DMAXKERNEL = ../arm/max.c + +SMINKERNEL = ../arm/min.c +DMINKERNEL = ../arm/min.c + +ISAMINKERNEL = ../arm/iamin.c +IDAMINKERNEL = ../arm/iamin.c +ICAMINKERNEL = ../arm/izamin.c +IZAMINKERNEL = ../arm/izamin.c + +ISMAXKERNEL = ../arm/imax.c +IDMAXKERNEL = ../arm/imax.c + +ISMINKERNEL = ../arm/imin.c +IDMINKERNEL = ../arm/imin.c + +STRMMKERNEL = ../generic/trmmkernel_4x4.c +DTRMMKERNEL = ../generic/trmmkernel_2x2.c +CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c +ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +SAMAXKERNEL = amax.S +DAMAXKERNEL = amax.S +CAMAXKERNEL = zamax.S +ZAMAXKERNEL = zamax.S + +ISAMAXKERNEL = iamax.S +IDAMAXKERNEL = iamax.S +ICAMAXKERNEL = izamax.S +IZAMAXKERNEL = izamax.S + +SASUMKERNEL = asum.S +DASUMKERNEL = asum.S +CASUMKERNEL = casum.S +ZASUMKERNEL = zasum.S + +SAXPYKERNEL = axpy.S +DAXPYKERNEL = axpy.S +CAXPYKERNEL = zaxpy.S +ZAXPYKERNEL = zaxpy.S + +SCOPYKERNEL = copy.S +DCOPYKERNEL = copy.S +CCOPYKERNEL = copy.S +ZCOPYKERNEL = copy.S + +SDOTKERNEL = dot.S +DDOTKERNEL = dot.S +CDOTKERNEL = zdot.S +ZDOTKERNEL = zdot.S +DSDOTKERNEL = dot.S + +SNRM2KERNEL = nrm2.S +DNRM2KERNEL = 
nrm2.S +CNRM2KERNEL = znrm2.S +ZNRM2KERNEL = znrm2.S + +SROTKERNEL = rot.S +DROTKERNEL = rot.S +CROTKERNEL = zrot.S +ZROTKERNEL = zrot.S + +SSCALKERNEL = scal.S +DSCALKERNEL = scal.S +CSCALKERNEL = zscal.S +ZSCALKERNEL = zscal.S + +SSWAPKERNEL = swap.S +DSWAPKERNEL = swap.S +CSWAPKERNEL = swap.S +ZSWAPKERNEL = swap.S + +SGEMVNKERNEL = gemv_n.S +DGEMVNKERNEL = gemv_n.S +CGEMVNKERNEL = zgemv_n.S +ZGEMVNKERNEL = zgemv_n.S + +SGEMVTKERNEL = gemv_t.S +DGEMVTKERNEL = gemv_t.S +CGEMVTKERNEL = zgemv_t.S +ZGEMVTKERNEL = zgemv_t.S + +SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S +ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) +SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c +SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c +SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S +DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S + +ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) + +ifeq ($(DGEMM_UNROLL_M), 8) +DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S +DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S +else +DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c +DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c +endif + +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif + +ifeq ($(DGEMM_UNROLL_N), 4) +DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S +DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S +else +DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c +DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c +endif + +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S +ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) +CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c +CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c +CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) + +ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S +ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) +ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c +ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +endif +ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + From 53f482ee72e56b31ace7860199c8fb3027af5303 Mon Sep 17 00:00:00 2001 From: maomao194313 Date: Mon, 4 Mar 2019 16:41:21 +0800 Subject: [PATCH 114/133] add TARGET support for HiSilicon tsv110 CPUs --- Makefile.arm64 | 5 +++++ 1 file changed, 5 insertions(+) 
diff --git a/Makefile.arm64 b/Makefile.arm64 index cd16dbfae..4d10ff684 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -38,3 +38,8 @@ ifeq ($(CORE), THUNDERX2T99) CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 endif + +ifeq ($(CORE), TSV110) +CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 +FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 +endif From 760842dda1fd8f0475216b46ca25fc016f671d05 Mon Sep 17 00:00:00 2001 From: maomao194313 Date: Mon, 4 Mar 2019 16:45:22 +0800 Subject: [PATCH 115/133] add TARGET support for HiSilicon tsv110 CPUs --- getarch.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/getarch.c b/getarch.c index 242d08004..ac58c8226 100644 --- a/getarch.c +++ b/getarch.c @@ -1065,6 +1065,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_TSV110 +#define FORCE +#define ARCHITECTURE "ARM64" +#define SUBARCHITECTURE "TSV110" +#define SUBDIRNAME "arm64" +#define ARCHCONFIG "-DTSV110 " \ + "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ + "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" +#define LIBNAME "tsv110" +#define CORENAME "TSV110" +#else +#endif + + #ifdef FORCE_ZARCH_GENERIC #define FORCE #define ARCHITECTURE "ZARCH" From fb4dae71240be9ad1e55792a46b38f8e107cb70a Mon Sep 17 00:00:00 2001 From: maomao194313 Date: Mon, 4 Mar 2019 16:48:49 +0800 Subject: [PATCH 116/133] add TARGET support for HiSilicon tsv110 CPUs --- TargetList.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/TargetList.txt b/TargetList.txt index 3a5a32234..aebd0dd18 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -90,6 +90,7 @@ CORTEXA73 FALKOR THUNDERX THUNDERX2T99 +TSV110 9.System Z: ZARCH_GENERIC From e4864a8933f6875bbb434887dc9120dbcf6be4dd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Mon, 4 Mar 2019 21:17:08 +0100 Subject: [PATCH 117/133] Fix module definition conflicts between LAPACK and ReLAPACK for #2043 --- CMakeLists.txt | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9de894f9c..a27c1c0fc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -75,10 +75,10 @@ endif () set(SUBDIRS ${BLASDIRS}) if (NOT NO_LAPACK) - list(APPEND SUBDIRS lapack) if(BUILD_RELAPACK) list(APPEND SUBDIRS relapack/src) endif() + list(APPEND SUBDIRS lapack) endif () # set which float types we want to build for @@ -224,6 +224,14 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES SOVERSION ${OpenBLAS_MAJOR_VERSION} ) +if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) + if (NOT MSVC) + target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition") + else() + target_link_libraries(${OpenBLAS_LIBNAME} "/FORCE:MULTIPLE") + endif() +endif() + if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "") if (NOT DEFINED ARCH) set(ARCH_IN "x86_64") From 11cfd0bd75a1ce8714ca3abf6867d3f45548dab1 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 5 Mar 2019 16:04:25 +0100 Subject: [PATCH 118/133] Do not compile in AVX512 check if AVX support is disabled xgetbv is function depends on NO_AVX being undefined - we could change that too, but that combo is unlikely to work anyway --- driver/others/dynamic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/driver/others/dynamic.c 
b/driver/others/dynamic.c index 99c9254ac..46dfaea6c 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -322,7 +322,7 @@ int support_avx2(){ } int support_avx512(){ -#ifndef NO_AVX512 +#if !defined(NO_AVX) && !defined(NO_AVX512) int eax, ebx, ecx, edx; int ret=0; From 4290afdae247337261b5ca0ea76e5bfcad2cc4a9 Mon Sep 17 00:00:00 2001 From: ken-cunningham-webuse Date: Wed, 6 Mar 2019 20:55:06 -0800 Subject: [PATCH 119/133] ctest.c : add __POWERPC__ for PowerMac --- ctest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ctest.c b/ctest.c index 0571e9e02..5e869b901 100644 --- a/ctest.c +++ b/ctest.c @@ -113,7 +113,7 @@ ARCH_X86 ARCH_X86_64 #endif -#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) +#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) || defined(__POWERPC__) ARCH_POWER #endif From b7f59da42d3978234e7e6ed293365b66f340189d Mon Sep 17 00:00:00 2001 From: Celelibi Date: Thu, 7 Mar 2019 16:39:41 +0100 Subject: [PATCH 120/133] Fix crash in sgemm SSE/nano kernel on x86_64 Fix bug #2047. Signed-off-by: Celelibi --- kernel/x86_64/gemm_kernel_4x8_nano.S | 2 +- kernel/x86_64/gemm_kernel_8x4_sse.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/gemm_kernel_4x8_nano.S b/kernel/x86_64/gemm_kernel_4x8_nano.S index 074562804..e29520fa1 100644 --- a/kernel/x86_64/gemm_kernel_4x8_nano.S +++ b/kernel/x86_64/gemm_kernel_4x8_nano.S @@ -135,7 +135,7 @@ #endif movq %rsp, %rbx # save old stack - subq $128 + LOCAL_BUFFER_SIZE, %rsp + subq $256 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING diff --git a/kernel/x86_64/gemm_kernel_8x4_sse.S b/kernel/x86_64/gemm_kernel_8x4_sse.S index c4ef1f809..1602c13c5 100644 --- a/kernel/x86_64/gemm_kernel_8x4_sse.S +++ b/kernel/x86_64/gemm_kernel_8x4_sse.S @@ -383,7 +383,7 @@ EMMS movq %rsp, %rbx # save old stack - subq $128 + LOCAL_BUFFER_SIZE, %rsp + subq $256 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING From b0c714ef602095c764b58c0a9ba68fddd9008c73 Mon Sep 17 00:00:00 2001 From: ken-cunningham-webuse Date: Thu, 7 Mar 2019 11:36:35 -0800 Subject: [PATCH 121/133] param.h : enable defines for PPC970 on DarwinOS fixes: gemm.c: In function 'sgemm_': ../common_param.h:981:18: error: 'SGEMM_DEFAULT_P' undeclared (first use in this function) #define SGEMM_P SGEMM_DEFAULT_P ^ --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 3cc400b54..48b7ef383 100644 --- a/param.h +++ b/param.h @@ -1999,7 +1999,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 -#ifdef OS_LINUX +#if defined(OS_LINUX) || defined(OS_DARWIN) #if L2_SIZE == 1024976 #define SGEMM_DEFAULT_P 320 #define DGEMM_DEFAULT_P 256 From f7a06463d9a0db120cc530a3298f3290855ccbe9 Mon Sep 17 00:00:00 2001 From: ken-cunningham-webuse Date: Thu, 7 Mar 2019 11:41:58 -0800 Subject: [PATCH 122/133] common_power.h: force DCBT_ARG 0 on PPC970 Darwin. Without this, we see ../kernel/power/gemv_n.S:427: Parameter syntax error and many more similar entries that relate to this assembly command: dcbt 8, r24, r18. This change sets DCBT_ARG = 0, and OpenBLAS builds through to completion on PowerMac 970. Tests pass. --- common_power.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common_power.h b/common_power.h index e3a1a7aef..68087b071 100644 --- a/common_power.h +++ b/common_power.h @@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define HAVE_PREFETCH #endif -#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) +#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || ( defined(PPC970) && defined(OS_DARWIN) ) #define DCBT_ARG 0 #else #define DCBT_ARG 8 From 5b95534afcc80d54f51bd766b617fd3f494ec65a Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sat, 9 Mar 2019 11:21:16 +0100 Subject: [PATCH 123/133] Make TARGET=GENERIC compatible with DYNAMIC_ARCH=1 for issue #2048 --- kernel/Makefile.L3 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index eafcfb1b4..bf5fffe86 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -24,7 +24,7 @@ ifeq ($(TARGET), LOONGSON3B) USE_TRMM = 1 endif -ifeq ($(TARGET), GENERIC) +ifeq ($(CORE), GENERIC) USE_TRMM = 1 endif From f074d7d1463c15bbf838b2305f259160281dead3 Mon Sep 17 00:00:00 2001 From: maomao194313 Date: Tue, 12 Mar 2019 16:05:19 +0800 Subject: [PATCH 124/133] make the DYNAMIC_ARCH=1 package work on TSV110.
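For reference, the detection added to cpuid_arm64.c below boils down to scanning /proc/cpuinfo for the HiSilicon implementer ID 0x48 together with the TSV110 part number 0xd01. A minimal standalone sketch of the same check (illustrative only; the helper name is not part of the patch):

#include <stdio.h>
#include <string.h>

/* Sketch: report a HiSilicon TSV110 when /proc/cpuinfo shows
 * CPU implementer 0x48 and CPU part 0xd01, mirroring the strstr
 * tests added to detect() in the diff that follows. */
static int is_tsv110(void)
{
    char line[128];
    int implementer = 0, part = 0;
    FILE *f = fopen("/proc/cpuinfo", "r");
    if (!f) return 0;
    while (fgets(line, sizeof(line), f)) {
        if (strstr(line, "CPU implementer") && strstr(line, "0x48")) implementer = 1;
        if (strstr(line, "CPU part") && strstr(line, "0xd01")) part = 1;
    }
    fclose(f);
    return implementer && part;
}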
--- cpuid_arm64.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/cpuid_arm64.c b/cpuid_arm64.c index 5077d7b11..a5e731d74 100644 --- a/cpuid_arm64.c +++ b/cpuid_arm64.c @@ -39,6 +39,8 @@ // Cavium #define CPU_THUNDERX 7 #define CPU_THUNDERX2T99 8 +//Hisilicon +#define CPU_TSV110 9 static char *cpuname[] = { "UNKNOWN", @@ -49,7 +51,8 @@ static char *cpuname[] = { "CORTEXA73", "FALKOR", "THUNDERX", - "THUNDERX2T99" + "THUNDERX2T99", + "TSV110" }; static char *cpuname_lower[] = { @@ -61,7 +64,8 @@ static char *cpuname_lower[] = { "cortexa73", "falkor", "thunderx", - "thunderx2t99" + "thunderx2t99", + "tsv110" }; int get_feature(char *search) @@ -145,6 +149,9 @@ int detect(void) return CPU_THUNDERX; else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af")) return CPU_THUNDERX2T99; + // HiSilicon + else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01")) + return CPU_TSV110; } p = (char *) NULL ; @@ -286,6 +293,21 @@ void get_cpuconfig(void) printf("#define DTB_DEFAULT_ENTRIES 64 \n"); printf("#define DTB_SIZE 4096 \n"); break; + + case CPU_TSV110: + printf("#define TSV110 \n"); + printf("#define L1_CODE_SIZE 65536 \n"); + printf("#define L1_CODE_LINESIZE 64 \n"); + printf("#define L1_CODE_ASSOCIATIVE 4 \n"); + printf("#define L1_DATA_SIZE 65536 \n"); + printf("#define L1_DATA_LINESIZE 64 \n"); + printf("#define L1_DATA_ASSOCIATIVE 4 \n"); + printf("#define L2_SIZE 524288 \n"); + printf("#define L2_LINESIZE 64 \n"); + printf("#define L2_ASSOCIATIVE 8 \n"); + printf("#define DTB_DEFAULT_ENTRIES 64 \n"); + printf("#define DTB_SIZE 4096 \n"); + break; } } From 7e3eb9b25d26ca9be337acf0b0fd2c647e353e0c Mon Sep 17 00:00:00 2001 From: maomao194313 Date: Tue, 12 Mar 2019 16:11:01 +0800 Subject: [PATCH 125/133] make the DYNAMIC_ARCH=1 package work on TSV110 --- param.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/param.h b/param.h index 3cc400b54..79fb05380 100644 --- a/param.h +++ b/param.h @@ -2591,7 +2591,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(CORTEXA53) || defined(CORTEXA57) || \ defined(CORTEXA72) || defined(CORTEXA73) || \ - defined(FALKOR) + defined(FALKOR) || defined(TSV110) #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 From b1393c7a97e2da1b64e1f779bdf68b7af0924543 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Mar 2019 16:03:56 +0100 Subject: [PATCH 126/133] Add Intel Denverton for #2048 --- cpuid_x86.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index c45ddd968..884d4b78a 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1359,6 +1359,8 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; case 12: // Apollo Lake + case 15: + // Denverton return CPUTYPE_NEHALEM; } break; @@ -1376,9 +1378,9 @@ int get_cpuname(void){ } break; case 9: - case 8: + case 8: switch (model) { - case 14: // Kaby Lake + case 14: // Kaby Lake and refreshes if(support_avx2()) return CPUTYPE_HASWELL; if(support_avx()) From 04f2226ea6edd95decf888b67bbdd4a8de530b54 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Tue, 12 Mar 2019 16:09:55 +0100 Subject: [PATCH 127/133] Add Intel Denverton --- driver/others/dynamic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 99c9254ac..895bacb50 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -566,8 +566,8 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } - //Apollo Lake - if (model == 12) { + //Apollo Lake or Denverton + if (model == 12 || model == 15) { return &gotoblas_NEHALEM; } return NULL; From c3e30b2bc2234dfafc9e674c8ab5723fabeb04c5 Mon Sep 17 00:00:00 2001 From: Sacha Date: Wed, 13 Mar 2019 23:21:54 +1000 Subject: [PATCH 128/133] Change 64-bit detection as explained in #2056 --- cmake/system_check.cmake | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 88bb081a6..f30a946b4 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -39,7 +39,11 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") set(MIPS64 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") - set(X86_64 1) + if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") + set(X86_64 1) + else() + set(X86 1) + endif() elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") set(X86 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") From 4fc17d0d754b7905667fb84a68cf37a0d28a93bd Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Wed, 13 Mar 2019 19:20:23 +0100 Subject: [PATCH 129/133] Trivial typo fix as suggested in #2022 --- Makefile.rule | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile.rule b/Makefile.rule index 91f42e396..8f72c5a79 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -199,7 +199,7 @@ NO_AFFINITY = 1 # been reported to be optimal for certain workloads (50 is the recommended value for Julia). # GEMM_MULTITHREAD_THRESHOLD = 4 -# If you need santy check by comparing reference BLAS. It'll be very +# If you need sanity check by comparing results to reference BLAS. It'll be very # slow (Not implemented yet). # SANITY_CHECK = 1 From 1006ff8a7bc4ee77150d6f13483838c96789e3fc Mon Sep 17 00:00:00 2001 From: "Erik M. 
Bray" Date: Fri, 15 Mar 2019 15:06:30 +0100 Subject: [PATCH 130/133] Use POSIX getenv on Cygwin. The Windows-native GetEnvironmentVariable cannot be relied on, as Cygwin does not always copy environment variables set through Cygwin to the Windows environment block, particularly after fork(). --- common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common.h b/common.h index 7fcd5e316..f239c3d78 100644 --- a/common.h +++ b/common.h @@ -439,7 +439,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246 typedef char env_var_t[MAX_PATH]; #define readenv(p, n) 0 #else -#ifdef OS_WINDOWS +#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) typedef char env_var_t[MAX_PATH]; #define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p)) #else From 4ad694eda1ff79040778648d44cda5b8f774c38d Mon Sep 17 00:00:00 2001 From: "Erik M. Bray" Date: Mon, 18 Mar 2019 20:32:48 +0100 Subject: [PATCH 131/133] Fix for #2063: The DllMain used in Cygwin did not run the thread memory pool cleanup upon THREAD_DETACH, which is needed when compiled with USE_TLS=1. --- driver/others/memory.c | 11 +++++++++-- exports/dllinit.c | 24 +++++++++++++++++------- 2 files changed, 26 insertions(+), 9 deletions(-) diff --git a/driver/others/memory.c b/driver/others/memory.c index ed407a858..ac8545f35 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1313,6 +1313,13 @@ void blas_memory_free_nolock(void * map_address) { free(map_address); } +#ifdef SMP +void blas_thread_memory_cleanup(void) { + blas_memory_cleanup((void*)get_memory_table()); +} +#endif + + void blas_shutdown(void){ #ifdef SMP BLASFUNC(blas_thread_shutdown)(); @@ -1322,7 +1329,7 @@ void blas_shutdown(void){ /* Only cleanup if we were built for threading and TLS was initialized */ if (local_storage_key) #endif - blas_memory_cleanup((void*)get_memory_table()); + blas_thread_memory_cleanup(); #ifdef SEEK_ADDRESS base_address = 0UL; @@ -1552,7 +1559,7 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser break; case DLL_THREAD_DETACH: #if defined(SMP) - blas_memory_cleanup((void*)get_memory_table()); + blas_thread_memory_cleanup(); #endif break; case DLL_PROCESS_DETACH: diff --git a/exports/dllinit.c b/exports/dllinit.c index 02ff092e9..0e1bb34e3 100644 --- a/exports/dllinit.c +++ b/exports/dllinit.c @@ -40,15 +40,25 @@ void gotoblas_init(void); void gotoblas_quit(void); +#if defined(SMP) && defined(USE_TLS) +void blas_thread_memory_cleanup(void); +#endif BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) { - - if (reason == DLL_PROCESS_ATTACH) { - gotoblas_init(); - } - - if (reason == DLL_PROCESS_DETACH) { - gotoblas_quit(); + switch(reason) { + case DLL_PROCESS_ATTACH: + gotoblas_init(); + break; + case DLL_PROCESS_DETACH: + gotoblas_quit(); + break; + case DLL_THREAD_ATTACH: + break; + case DLL_THREAD_DETACH: +#if defined(SMP) && defined(USE_TLS) + blas_thread_memory_cleanup(void); +#endif + break; } return TRUE; From 8ba9e2a61a1cf34e9b2efc5af61f5ebaaf6ab902 Mon Sep 17 00:00:00 2001 From: "Erik M. Bray" Date: Tue, 19 Mar 2019 10:22:02 +0100 Subject: [PATCH 132/133] Also call CloseHandle on each thread, as well as on the events, so as not to leak thread handles.
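In outline, the shutdown path this change arrives at signals the workers, waits for each one, and then releases every kernel object that was created. A condensed sketch, assuming the pool, blas_threads and blas_num_threads globals from blas_server_win32.c (not a literal copy of the diff below):

/* Sketch: close each worker's thread handle once it has been waited
 * on, then close both pool event handles so nothing leaks. */
static void shutdown_sketch(void)
{
    int i;
    SetEvent(pool.killed);                   /* ask the workers to exit */
    for (i = 0; i < blas_num_threads - 1; i++) {
        WaitForSingleObject(blas_threads[i], 5);
        TerminateThread(blas_threads[i], 0); /* last resort if still running */
        CloseHandle(blas_threads[i]);        /* release the thread handle */
    }
    CloseHandle(pool.filled);                /* release both event handles */
    CloseHandle(pool.killed);
}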
--- driver/others/blas_server_win32.c | 5 +++++ exports/dllinit.c | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index bae344c59..0b38ee365 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -461,13 +461,18 @@ int BLASFUNC(blas_thread_shutdown)(void){ SetEvent(pool.killed); for(i = 0; i < blas_num_threads - 1; i++){ + // Could also just use WaitForMultipleObjects WaitForSingleObject(blas_threads[i], 5); //INFINITE); #ifndef OS_WINDOWSSTORE // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP TerminateThread(blas_threads[i],0); #endif + CloseHandle(blas_threads[i]); } + CloseHandle(pool.filled); + CloseHandle(pool.killed); + blas_server_avail = 0; } diff --git a/exports/dllinit.c b/exports/dllinit.c index 0e1bb34e3..4a05c0e14 100644 --- a/exports/dllinit.c +++ b/exports/dllinit.c @@ -56,7 +56,7 @@ BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) { break; case DLL_THREAD_DETACH: #if defined(SMP) && defined(USE_TLS) - blas_thread_memory_cleanup(void); + blas_thread_memory_cleanup(); #endif break; } From b043a5962e3785c9879f671fca8e7226dc70ff4f Mon Sep 17 00:00:00 2001 From: Ayappan P Date: Mon, 25 Mar 2019 18:53:25 +0530 Subject: [PATCH 133/133] AIX asm syntax changes needed for shared object creation --- common_power.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/common_power.h b/common_power.h index 68087b071..60de48a63 100644 --- a/common_power.h +++ b/common_power.h @@ -598,9 +598,14 @@ REALNAME:;\ #ifndef __64BIT__ #define PROLOGUE \ .machine "any";\ + .toc;\ .globl .REALNAME;\ + .globl REALNAME;\ + .csect REALNAME[DS],3;\ +REALNAME:;\ + .long .REALNAME, TOC[tc0], 0;\ .csect .text[PR],5;\ -.REALNAME:; +.REALNAME: #define EPILOGUE \ _section_.text:;\ @@ -611,9 +616,14 @@ _section_.text:;\ #define PROLOGUE \ .machine "any";\ + .toc;\ .globl .REALNAME;\ + .globl REALNAME;\ + .csect REALNAME[DS],3;\ +REALNAME:;\ + .llong .REALNAME, TOC[tc0], 0;\ .csect .text[PR], 5;\ -.REALNAME:; +.REALNAME: #define EPILOGUE \ _section_.text:;\
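A note on the AIX patch above: on AIX, a global function symbol names a function descriptor rather than the code itself, and the .csect REALNAME[DS] block with its .long/.llong triple emits exactly that descriptor, which the linker needs in order to export the routine from a shared object. A rough C picture of the 32-bit layout (the struct and field names are illustrative, not part of the patch):

/* Illustrative view of the XCOFF function descriptor emitted by the
 * new PROLOGUE as ".long .REALNAME, TOC[tc0], 0". */
typedef struct {
    void *entry; /* address of the code at .REALNAME */
    void *toc;   /* TOC anchor for the callee, TOC[tc0] */
    void *env;   /* environment pointer, zero here */
} aix_function_descriptor;

Calls through function pointers load the entry address and TOC base from this descriptor, which is why the PROLOGUE must define both REALNAME (the descriptor) and .REALNAME (the code).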