| @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) | |||
| project(OpenBLAS C ASM) | |||
| set(OpenBLAS_MAJOR_VERSION 0) | |||
| set(OpenBLAS_MINOR_VERSION 3) | |||
| set(OpenBLAS_PATCH_VERSION 9.dev) | |||
| set(OpenBLAS_PATCH_VERSION 10.dev) | |||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | |||
| # Adhere to GNU filesystem layout conventions | |||
| @@ -249,7 +249,7 @@ if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) | |||
| endif() | |||
| endif() | |||
| if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "") | |||
| if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") | |||
| if (NOT DEFINED ARCH) | |||
| set(ARCH_IN "x86_64") | |||
| else() | |||
| @@ -358,10 +358,21 @@ endif() | |||
| if(NOT NO_CBLAS) | |||
| message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") | |||
| set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) | |||
| file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) | |||
| string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| if (NOT ${SYMBOLPREFIX} STREQUAL "") | |||
| string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
| string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
| string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| endif() | |||
| if (NOT ${SYMBOLSUFFIX} STREQUAL "") | |||
| string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
| string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
| string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| endif() | |||
| file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") | |||
| install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
| endif() | |||
| @@ -365,11 +365,12 @@ clean :: | |||
| @$(MAKE) -C kernel clean | |||
| #endif | |||
| @$(MAKE) -C reference clean | |||
| @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h | |||
| @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h *.so.renamed *.a.renamed *.so.0 | |||
| ifeq ($(OSNAME), Darwin) | |||
| @rm -rf getarch.dSYM getarch_2nd.dSYM | |||
| endif | |||
| @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib | |||
| @rm -f cblas.tmp cblas.tmp2 | |||
| @touch $(NETLIB_LAPACK_DIR)/make.inc | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean | |||
| @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h | |||
| @@ -45,7 +45,22 @@ install : lib.grd | |||
| ifndef NO_CBLAS | |||
| @echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | |||
| @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" | |||
| @cp cblas.h cblas.tmp | |||
| ifdef SYMBOLPREFIX | |||
| @sed 's/cblas[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp > cblas.tmp2 | |||
| @sed 's/openblas[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp2 > cblas.tmp | |||
| #change back any openblas_complex_float and double that got hit | |||
| @sed 's/$(SYMBOLPREFIX)openblas_complex_/openblas_complex_/g' cblas.tmp > cblas.tmp2 | |||
| @sed 's/goto[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp2 > cblas.tmp | |||
| endif | |||
| ifdef SYMBOLSUFFIX | |||
| @sed 's/cblas[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp > cblas.tmp2 | |||
| @sed 's/openblas[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp2 > cblas.tmp | |||
| #change back any openblas_complex_float and double that got hit | |||
| @sed 's/\(openblas_complex_\)\([^ ]*\)$(SYMBOLSUFFIX)/\1\2 /g' cblas.tmp > cblas.tmp2 | |||
| @sed 's/goto[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp2 > cblas.tmp | |||
| endif | |||
| @sed 's/common/openblas_config/g' cblas.tmp > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" | |||
| endif | |||
| ifneq ($(OSNAME), AIX) | |||
| @@ -168,4 +183,3 @@ endif | |||
| @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||
| @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||
| @echo Install OK! | |||
| @@ -10,54 +10,36 @@ USE_OPENMP = 1 | |||
| endif | |||
| ifeq ($(CORE), POWER10) | |||
| ifeq ($(USE_OPENMP), 1) | |||
| COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| else | |||
| COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), POWER9) | |||
| ifeq ($(USE_OPENMP), 1) | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| else | |||
| CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp | |||
| endif | |||
| ifneq ($(F_COMPILER), PGI) | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| CCOMMON_OPT += -Ofast -mvsx -fno-fast-math | |||
| ifneq ($(GCCVERSIONGT4), 1) | |||
| $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) | |||
| CCOMMON_OPT += -mcpu=power8 -mtune=power8 | |||
| else | |||
| FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp | |||
| CCOMMON_OPT += -mcpu=power9 -mtune=power9 | |||
| endif | |||
| else | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CCOMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math | |||
| else | |||
| CCOMMON_OPT += -fast -Mvect=simd -Mcache_align | |||
| endif | |||
| ifneq ($(F_COMPILER), PGI) | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -fno-fast-math | |||
| FCOMMON_OPT += -O2 -frecursive -fno-fast-math | |||
| ifneq ($(GCCVERSIONGT4), 1) | |||
| $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) | |||
| FCOMMON_OPT += -mcpu=power8 -mtune=power8 | |||
| else | |||
| FCOMMON_OPT += -O2 -Mrecursive | |||
| FCOMMON_OPT += -mcpu=power9 -mtune=power9 | |||
| endif | |||
| else | |||
| FCOMMON_OPT += -O2 -Mrecursive | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), POWER8) | |||
| ifeq ($(USE_OPENMP), 1) | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| else | |||
| CCOMMON_OPT += -fast -Mvect=simd -Mcache_align -DUSE_OPENMP -mp | |||
| endif | |||
| ifneq ($(F_COMPILER), PGI) | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| else | |||
| FCOMMON_OPT += -O2 -Mrecursive -DUSE_OPENMP -mp | |||
| endif | |||
| else | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math | |||
| else | |||
| @@ -73,6 +55,18 @@ else | |||
| FCOMMON_OPT += -O2 -Mrecursive | |||
| endif | |||
| endif | |||
| ifeq ($(USE_OPENMP), 1) | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CCOMMON_OPT += -DUSE_OPENMP -fopenmp | |||
| else | |||
| CCOMMON_OPT += -DUSE_OPENMP -mp | |||
| endif | |||
| ifneq ($(F_COMPILER), PGI) | |||
| FCOMMON_OPT += -DUSE_OPENMP -fopenmp | |||
| else | |||
| FCOMMON_OPT += -DUSE_OPENMP -mp | |||
| endif | |||
| endif | |||
| # workaround for C->FORTRAN ABI violation in LAPACKE | |||
| @@ -38,6 +38,7 @@ | |||
| #include <sys/utsname.h> | |||
| #ifdef _AIX | |||
| #include <sys/systemcfg.h> | |||
| #include <sys/vminfo.h> | |||
| #endif | |||
| #ifdef __APPLE__ | |||
| @@ -137,35 +138,19 @@ int detect(void){ | |||
| #endif | |||
| #ifdef _AIX | |||
| FILE *infile; | |||
| char buffer[512], *p; | |||
| p = (char *)NULL; | |||
| infile = popen("prtconf|grep 'Processor Type'", "r"); | |||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||
| if (!strncmp("Pro", buffer, 3)){ | |||
| p = strchr(buffer, ':') + 2; | |||
| #if 0 | |||
| fprintf(stderr, "%s\n", p); | |||
| #endif | |||
| break; | |||
| } | |||
| } | |||
| pclose(infile); | |||
| // Cast from int to unsigned to ensure comparisons work for all bits in | |||
| // the bit mask, even the top bit | |||
| unsigned implementation = (unsigned) _system_configuration.implementation; | |||
| if (strstr(p, "POWER3")) return CPUTYPE_POWER3; | |||
| if (strstr(p, "POWER4")) return CPUTYPE_POWER4; | |||
| if (strstr(p, "PPC970")) return CPUTYPE_PPC970; | |||
| if (strstr(p, "POWER5")) return CPUTYPE_POWER5; | |||
| if (strstr(p, "POWER6")) return CPUTYPE_POWER6; | |||
| if (strstr(p, "POWER7")) return CPUTYPE_POWER6; | |||
| if (strstr(p, "POWER8")) return CPUTYPE_POWER8; | |||
| if (strstr(p, "POWER9")) return CPUTYPE_POWER9; | |||
| if (strstr(p, "POWER10")) return CPUTYPE_POWER10; | |||
| if (strstr(p, "Cell")) return CPUTYPE_CELL; | |||
| if (strstr(p, "7447")) return CPUTYPE_PPCG4; | |||
| return CPUTYPE_POWER5; | |||
| if (implementation >= 0x40000u) return CPUTYPE_POWER10; | |||
| else if (implementation & 0x20000) return CPUTYPE_POWER9; | |||
| else if (implementation & 0x10000) return CPUTYPE_POWER8; | |||
| else if (implementation & 0x08000) return CPUTYPE_POWER7; // POWER 7 | |||
| else if (implementation & 0x04000) return CPUTYPE_POWER6; | |||
| else if (implementation & 0x02000) return CPUTYPE_POWER5; | |||
| else if (implementation & 0x01000) return CPUTYPE_POWER4; // MPC7450 | |||
| else if (implementation & 0x00800) return CPUTYPE_POWER4; | |||
| else return CPUTYPE_POWER3; | |||
| #endif | |||
| #ifdef __APPLE__ | |||
| @@ -47,8 +47,10 @@ endif | |||
| endif | |||
| ifdef USE_CUDA | |||
| ifeq ($(USE_CUDA), 1) | |||
| COMMONOBJS += cuda_init.$(SUFFIX) | |||
| endif | |||
| endif | |||
| ifdef FUNCTION_PROFILE | |||
| COMMONOBJS += profile.$(SUFFIX) | |||
| @@ -187,12 +187,12 @@ ZSWAPKERNEL = zswap.c | |||
| # | |||
| SGEMVNKERNEL = sgemv_n.c | |||
| DGEMVNKERNEL = dgemv_n.c | |||
| DGEMVNKERNEL = dgemv_n_power10.c | |||
| CGEMVNKERNEL = cgemv_n.c | |||
| ZGEMVNKERNEL = zgemv_n_4.c | |||
| # | |||
| SGEMVTKERNEL = sgemv_t.c | |||
| DGEMVTKERNEL = dgemv_t.c | |||
| DGEMVTKERNEL = dgemv_t_power10.c | |||
| CGEMVTKERNEL = cgemv_t.c | |||
| ZGEMVTKERNEL = zgemv_t_4.c | |||
| @@ -0,0 +1,268 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * 2016/03/30 Werner Saar (wernsaar@googlemail.com) | |||
| * BLASTEST : OK | |||
| * CTEST : OK | |||
| * TEST : OK | |||
| * LAPACK-TEST : OK | |||
| **************************************************************************************/ | |||
#define HAVE_KERNEL_4x4 1

/*
 * dgemv_kernel_4x4 (POWER10):
 *   y[0..n-1] += alpha * (a0*x[0] + a1*x[1] + a2*x[2] + a3*x[3])
 * where a0..a3 are four consecutive columns of ap (column stride lda, in
 * elements).  Rows are processed four at a time with 256-bit paired
 * vector loads/stores (lxvp/stxvp); the loop is unrolled 4x with an early
 * exit ("ble two%=") after each unrolled step, and the "two%=" tail
 * applies the columns loaded by the last step.
 * NOTE(review): n is decremented in steps of 4 ("addic. %1, %1, -4"), so
 * the kernel assumes n is a positive multiple of 4 — confirm at call site.
 */
static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha)
{
  double *a0;
  double *a1;
  double *a2;
  double *a3;

  __asm__
    (
     /* Load x[0..3], splat alpha, and form the four scaled multipliers
        alpha*x[0..3] (vs32..vs35); compute the four column pointers. */
     "lxvp 40, 0(%10) \n\t" // x0, x1
     XXSPLTD_S(32,%x9,0)     // alpha, alpha
     "sldi %6, %13, 3 \n\t" // lda * sizeof (double)
     "xvmuldp 34, 41, 32 \n\t" // x0 * alpha, x1 * alpha
     "xvmuldp 35, 40, 32 \n\t" // x2 * alpha, x3 * alpha
     "add %4, %3, %6 \n\t" // a0 = ap, a1 = a0 + lda
     "add %6, %6, %6 \n\t" // 2 * lda
     XXSPLTD_S(32,34,1)      // x0 * alpha, x0 * alpha
     XXSPLTD_S(33,34,0)      // x1 * alpha, x1 * alpha
     XXSPLTD_S(34,35,1)      // x2 * alpha, x2 * alpha
     XXSPLTD_S(35,35,0)      // x3 * alpha, x3 * alpha
     "add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda
     "add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda

     /* Prefetch the four column streams and prime the pipeline with the
        first 4 rows of each column. */
     "dcbt 0, %3 \n\t"
     "dcbt 0, %4 \n\t"
     "dcbt 0, %5 \n\t"
     "dcbt 0, %6 \n\t"
     "lxvp 40, 0(%3) \n\t" // a0[0], a0[1]
     "lxvp 42, 0(%4) \n\t" // a1[0], a1[1]
     "lxvp 44, 0(%5) \n\t" // a2[0], a2[1]
     "lxvp 46, 0(%6) \n\t" // a3[0], a3[1]
     "dcbt 0, %2 \n\t"
     "addi %3, %3, 32 \n\t"
     "addi %4, %4, 32 \n\t"
     "addi %5, %5, 32 \n\t"
     "addi %6, %6, 32 \n\t"
     "addic. %1, %1, -4 \n\t"
     "ble two%= \n\t"

     ".align 5 \n"
     "one%=: \n\t"
     /* Unrolled step 1: accumulate 4 rows into y, reload next rows. */
     "lxvp 36, 0(%2) \n\t" // y0, y1
     "xvmaddadp 36, 40, 32 \n\t"
     "xvmaddadp 37, 41, 32 \n\t"
     "lxvp 40, 0(%3) \n\t" // a0[0], a0[1]
     "xvmaddadp 36, 42, 33 \n\t"
     "addi %3, %3, 32 \n\t"
     "xvmaddadp 37, 43, 33 \n\t"
     "lxvp 42, 0(%4) \n\t" // a1[0], a1[1]
     "xvmaddadp 36, 44, 34 \n\t"
     "addi %4, %4, 32 \n\t"
     "xvmaddadp 37, 45, 34 \n\t"
     "lxvp 44, 0(%5) \n\t" // a2[0], a2[1]
     "xvmaddadp 36, 46, 35 \n\t"
     "addi %5, %5, 32 \n\t"
     "xvmaddadp 37, 47, 35 \n\t"
     "stxvp 36, 0(%2) \n\t" // y0, y1
     "lxvp 46, 0(%6) \n\t" // a3[0], a3[1]
     "addi %6, %6, 32 \n\t"
     "addi %2, %2, 32 \n\t"
     "addic. %1, %1, -4 \n\t"
     "ble two%= \n\t"
     /* Unrolled step 2 (same pattern). */
     "lxvp 36, 0(%2) \n\t" // y0, y1
     "xvmaddadp 36, 40, 32 \n\t"
     "xvmaddadp 37, 41, 32 \n\t"
     "lxvp 40, 0(%3) \n\t" // a0[0], a0[1]
     "xvmaddadp 36, 42, 33 \n\t"
     "addi %3, %3, 32 \n\t"
     "xvmaddadp 37, 43, 33 \n\t"
     "lxvp 42, 0(%4) \n\t" // a1[0], a1[1]
     "xvmaddadp 36, 44, 34 \n\t"
     "addi %4, %4, 32 \n\t"
     "xvmaddadp 37, 45, 34 \n\t"
     "lxvp 44, 0(%5) \n\t" // a2[0], a2[1]
     "xvmaddadp 36, 46, 35 \n\t"
     "addi %5, %5, 32 \n\t"
     "xvmaddadp 37, 47, 35 \n\t"
     "stxvp 36, 0(%2) \n\t" // y0, y1
     "lxvp 46, 0(%6) \n\t" // a3[0], a3[1]
     "addi %6, %6, 32 \n\t"
     "addi %2, %2, 32 \n\t"
     "addic. %1, %1, -4 \n\t"
     "ble two%= \n\t"
     /* Unrolled step 3 (same pattern). */
     "lxvp 36, 0(%2) \n\t" // y0, y1
     "xvmaddadp 36, 40, 32 \n\t"
     "xvmaddadp 37, 41, 32 \n\t"
     "lxvp 40, 0(%3) \n\t" // a0[0], a0[1]
     "xvmaddadp 36, 42, 33 \n\t"
     "addi %3, %3, 32 \n\t"
     "xvmaddadp 37, 43, 33 \n\t"
     "lxvp 42, 0(%4) \n\t" // a1[0], a1[1]
     "xvmaddadp 36, 44, 34 \n\t"
     "addi %4, %4, 32 \n\t"
     "xvmaddadp 37, 45, 34 \n\t"
     "lxvp 44, 0(%5) \n\t" // a2[0], a2[1]
     "xvmaddadp 36, 46, 35 \n\t"
     "addi %5, %5, 32 \n\t"
     "xvmaddadp 37, 47, 35 \n\t"
     "stxvp 36, 0(%2) \n\t" // y0, y1
     "lxvp 46, 0(%6) \n\t" // a3[0], a3[1]
     "addi %6, %6, 32 \n\t"
     "addi %2, %2, 32 \n\t"
     "addic. %1, %1, -4 \n\t"
     "ble two%= \n\t"
     /* Unrolled step 4; falls through to the loop branch. */
     "lxvp 36, 0(%2) \n\t" // y0, y1
     "xvmaddadp 36, 40, 32 \n\t"
     "xvmaddadp 37, 41, 32 \n\t"
     "lxvp 40, 0(%3) \n\t" // a0[0], a0[1]
     "xvmaddadp 36, 42, 33 \n\t"
     "addi %3, %3, 32 \n\t"
     "xvmaddadp 37, 43, 33 \n\t"
     "lxvp 42, 0(%4) \n\t" // a1[0], a1[1]
     "xvmaddadp 36, 44, 34 \n\t"
     "addi %4, %4, 32 \n\t"
     "xvmaddadp 37, 45, 34 \n\t"
     "lxvp 44, 0(%5) \n\t" // a2[0], a2[1]
     "xvmaddadp 36, 46, 35 \n\t"
     "addi %5, %5, 32 \n\t"
     "xvmaddadp 37, 47, 35 \n\t"
     "stxvp 36, 0(%2) \n\t" // y0, y1
     "lxvp 46, 0(%6) \n\t" // a3[0], a3[1]
     "addi %6, %6, 32 \n\t"
     "addi %2, %2, 32 \n\t"
     "addic. %1, %1, -4 \n\t"
     "bgt one%= \n"

     "two%=: \n\t"
     /* Tail: apply the column data loaded by the last step to the final
        four rows of y. */
     "lxvp 36, 0(%2) \n\t" // y0, y1
     "xvmaddadp 36, 40, 32 \n\t"
     "xvmaddadp 37, 41, 32 \n\t"
     "xvmaddadp 36, 42, 33 \n\t"
     "xvmaddadp 37, 43, 33 \n\t"
     "xvmaddadp 36, 44, 34 \n\t"
     "xvmaddadp 37, 45, 34 \n\t"
     "xvmaddadp 36, 46, 35 \n\t"
     "xvmaddadp 37, 47, 35 \n\t"
     "stxvp 36, 0(%2) \n\t" // y0, y1

     "#n=%1 ap=%8=%12 lda=%13 x=%7=%10 y=%0=%2 alpha=%9 o16=%11\n"
     "#a0=%3 a1=%4 a2=%5 a3=%6"
     :
     "+m" (*y),
     "+r" (n),    // 1
     "+b" (y),    // 2
     "=b" (a0),   // 3
     "=b" (a1),   // 4
     "=&b" (a2),  // 5
     "=&b" (a3)   // 6
     :
     "m" (*x),
     "m" (*ap),
     "d" (alpha), // 9
     "r" (x),     // 10
     "b" (16),    // 11
     "3" (ap),    // 12
     "4" (lda)    // 13
     :
     "cr0",
     "vs32","vs33","vs34","vs35","vs36","vs37",
     "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
     );
}
| @@ -0,0 +1,565 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2020, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <altivec.h> | |||
/* vec_t: a 16-byte vector viewed as raw bytes; used to hand the x operand
 * to the MMA builtin. */
typedef __vector unsigned char vec_t;
/* v4sf_t: vector of FLOAT with 16-byte size (two doubles in this file);
 * used for reading/writing y and for disassembled accumulator lanes. */
typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
/* vecp_t: 256-bit __vector_pair used to load two adjacent A elements'
 * worth of data in one operation. */
typedef __vector_pair __attribute__((aligned(8))) vecp_t;

#include "dgemv_n_microk_power10.c"

/* MMA(X, APTR, ACC): rank-1 accumulate into MMA accumulator ACC using
 * __builtin_mma_xvf64gerpp: a 256-bit pair of A elements starting at
 * APTR times the vector containing X.  NOTE(review): expands using the
 * caller's local temporaries rX and rowA — must be declared in scope. */
#define MMA(X, APTR, ACC) \
  rX = (vec_t *) & X; \
  rowA = *((vecp_t*)((void*)&APTR)); \
  __builtin_mma_xvf64gerpp (ACC, rowA, rX[0]);

/* SAVE(ACC, Z): disassemble accumulator ACC into result[0..3], fold the
 * partial sums across lanes, scale by valpha and add into y[Z..Z+3].
 * NOTE(review): relies on the caller's locals rowC, result and valpha. */
#define SAVE(ACC, Z) \
  rowC = (v4sf_t *) &y[Z]; \
  __builtin_mma_disassemble_acc ((void *)result, ACC); \
  result[0][1] = result[1][0]; \
  result[2][1] = result[3][0]; \
  rowC[0] += valpha * result[0]; \
  rowC[1] += valpha * result[2];
/*
 * dgemv_kernel_4x128 (POWER10 MMA):
 *   y[0..n-1] += alpha * A * x  for a stripe of 128 columns of A
 * (column stride lda, in elements).  Rows are processed in tiles of 32
 * (8 accumulators x 4 rows) over the first (n/32)*32 rows, then in tiles
 * of 4 rows (1 accumulator) for the remainder.  The 128 columns are
 * consumed as four passes of 32 columns each (xo += 32, a0 += lda<<5).
 * NOTE(review): the remainder loop steps i += 4, so n is assumed to be a
 * multiple of 4 — confirm at call site.
 */
void
dgemv_kernel_4x128 (BLASLONG n, FLOAT * a_ptr, BLASLONG lda, FLOAT * xo,
		    FLOAT * y, FLOAT alpha)
{
  BLASLONG i, j, tmp;
  FLOAT *a0 = a_ptr;
  FLOAT *x1 = xo;                       /* saved start of x for each row tile */
  vector double valpha = { alpha, alpha };
  v4sf_t *rowC;
  __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7;
  v4sf_t result[4];
  vecp_t rowA;
  vec_t *rX;

  /* Main tiles: 32 rows per iteration, 8 accumulators of 4 rows each. */
  tmp = (n / 32) * 32;
  for (i = 0; i < tmp; i += 32)
    {
      xo = x1;
      a0 = a_ptr;
      __builtin_mma_xxsetaccz (&acc0);
      __builtin_mma_xxsetaccz (&acc1);
      __builtin_mma_xxsetaccz (&acc2);
      __builtin_mma_xxsetaccz (&acc3);
      __builtin_mma_xxsetaccz (&acc4);
      __builtin_mma_xxsetaccz (&acc5);
      __builtin_mma_xxsetaccz (&acc6);
      __builtin_mma_xxsetaccz (&acc7);
      /* Pass 1 of 4: columns 0..31. */
      for (j = 0; j < 32; j++)
	{
	  __builtin_prefetch (xo+j);
	  __builtin_prefetch (a0+i+j+lda);
	  MMA (xo[j], a0[i + 0 + j * lda], &acc0);
	  MMA (xo[j], a0[i + 4 + j * lda], &acc1);
	  MMA (xo[j], a0[i + 8 + j * lda], &acc2);
	  MMA (xo[j], a0[i + 12 + j * lda], &acc3);
	  MMA (xo[j], a0[i + 16 + j * lda], &acc4);
	  MMA (xo[j], a0[i + 20 + j * lda], &acc5);
	  MMA (xo[j], a0[i + 24 + j * lda], &acc6);
	  MMA (xo[j], a0[i + 28 + j * lda], &acc7);
	}
      xo += 32;
      a0 += lda << 5;                   /* advance 32 columns */
      /* Pass 2 of 4: columns 32..63. */
      for (j = 0; j < 32; j++)
	{
	  __builtin_prefetch (xo+j);
	  __builtin_prefetch (a0+i+j+lda);
	  MMA (xo[j], a0[i + 0 + j * lda], &acc0);
	  MMA (xo[j], a0[i + 4 + j * lda], &acc1);
	  MMA (xo[j], a0[i + 8 + j * lda], &acc2);
	  MMA (xo[j], a0[i + 12 + j * lda], &acc3);
	  MMA (xo[j], a0[i + 16 + j * lda], &acc4);
	  MMA (xo[j], a0[i + 20 + j * lda], &acc5);
	  MMA (xo[j], a0[i + 24 + j * lda], &acc6);
	  MMA (xo[j], a0[i + 28 + j * lda], &acc7);
	}
      xo += 32;
      a0 += lda << 5;
      /* Pass 3 of 4: columns 64..95. */
      for (j = 0; j < 32; j++)
	{
	  __builtin_prefetch (xo+j);
	  __builtin_prefetch (a0+i+j+lda);
	  MMA (xo[j], a0[i + 0 + j * lda], &acc0);
	  MMA (xo[j], a0[i + 4 + j * lda], &acc1);
	  MMA (xo[j], a0[i + 8 + j * lda], &acc2);
	  MMA (xo[j], a0[i + 12 + j * lda], &acc3);
	  MMA (xo[j], a0[i + 16 + j * lda], &acc4);
	  MMA (xo[j], a0[i + 20 + j * lda], &acc5);
	  MMA (xo[j], a0[i + 24 + j * lda], &acc6);
	  MMA (xo[j], a0[i + 28 + j * lda], &acc7);
	}
      xo += 32;
      a0 += lda << 5;
      /* Pass 4 of 4: columns 96..127. */
      for (j = 0; j < 32; j++)
	{
	  __builtin_prefetch (xo+j);
	  __builtin_prefetch (a0+i+j+lda);
	  MMA (xo[j], a0[i + 0 + j * lda], &acc0);
	  MMA (xo[j], a0[i + 4 + j * lda], &acc1);
	  MMA (xo[j], a0[i + 8 + j * lda], &acc2);
	  MMA (xo[j], a0[i + 12 + j * lda], &acc3);
	  MMA (xo[j], a0[i + 16 + j * lda], &acc4);
	  MMA (xo[j], a0[i + 20 + j * lda], &acc5);
	  MMA (xo[j], a0[i + 24 + j * lda], &acc6);
	  MMA (xo[j], a0[i + 28 + j * lda], &acc7);
	}
      xo += 32;
      a0 += lda << 5;
      /* Scale by alpha and add the 32 accumulated rows into y. */
      SAVE (&acc0, i + 0);
      SAVE (&acc1, i + 4);
      SAVE (&acc2, i + 8);
      SAVE (&acc3, i + 12);
      SAVE (&acc4, i + 16);
      SAVE (&acc5, i + 20);
      SAVE (&acc6, i + 24);
      SAVE (&acc7, i + 28);
    }

  /* Remainder tiles: 4 rows per iteration, one accumulator. */
  for (i = tmp; i < n; i += 4)
    {
      xo = x1;
      a0 = a_ptr;
      __builtin_mma_xxsetaccz (&acc0);
      for (j = 0; j < 32; j++)
	{
	  __builtin_prefetch (xo+j);
	  __builtin_prefetch (a0+i+j+lda);
	  MMA (xo[j], a0[i + j * lda], &acc0);
	}
      xo += 32;
      a0 += lda << 5;
      for (j = 0; j < 32; j++)
	{
	  __builtin_prefetch (xo+j);
	  __builtin_prefetch (a0+i+j+lda);
	  MMA (xo[j], a0[i + j * lda], &acc0);
	}
      xo += 32;
      a0 += lda << 5;
      for (j = 0; j < 32; j++)
	{
	  __builtin_prefetch (xo+j);
	  __builtin_prefetch (a0+i+j+lda);
	  MMA (xo[j], a0[i + j * lda], &acc0);
	}
      xo += 32;
      a0 += lda << 5;
      for (j = 0; j < 32; j++)
	{
	  __builtin_prefetch (xo+j);
	  __builtin_prefetch (a0+i+j+lda);
	  MMA (xo[j], a0[i + j * lda], &acc0);
	}
      xo += 32;
      a0 += lda << 5;
      SAVE (&acc0, i);
    }
}
| #define NBMAX 4096 | |||
| #ifndef HAVE_KERNEL_4x4 | |||
| static void dgemv_kernel_4x4(BLASLONG n, FLOAT *a_ptr, BLASLONG lda, FLOAT *xo, FLOAT *y, FLOAT alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT x[4] __attribute__ ((aligned (16)));; | |||
| FLOAT *a0 = a_ptr; | |||
| FLOAT *a1 = a0 + lda; | |||
| FLOAT *a2 = a1 + lda; | |||
| FLOAT *a3 = a2 + lda; | |||
| for ( i=0; i<4; i++) | |||
| x[i] = xo[i] * alpha; | |||
| for ( i=0; i< n; i+=4 ) | |||
| { | |||
| y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; | |||
| y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; | |||
| y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; | |||
| y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; | |||
| } | |||
| } | |||
| #endif | |||
| #ifndef HAVE_KERNEL_4x2 | |||
| static void dgemv_kernel_4x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *xo, FLOAT *y, FLOAT alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT x[4] __attribute__ ((aligned (16)));; | |||
| for ( i=0; i<2; i++) | |||
| x[i] = xo[i] * alpha; | |||
| for ( i=0; i< n; i+=4 ) | |||
| { | |||
| y[i] += a0[i]*x[0] + a1[i]*x[1]; | |||
| y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1]; | |||
| y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1]; | |||
| y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1]; | |||
| } | |||
| } | |||
| #endif | |||
| #ifndef HAVE_KERNEL_4x1 | |||
| static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT x[4] __attribute__ ((aligned (16)));; | |||
| for ( i=0; i<1; i++) | |||
| x[i] = xo[i] * alpha; | |||
| for ( i=0; i< n; i+=4 ) | |||
| { | |||
| y[i] += a0[i]*x[0]; | |||
| y[i+1] += a0[i+1]*x[0]; | |||
| y[i+2] += a0[i+2]*x[0]; | |||
| y[i+3] += a0[i+3]*x[0]; | |||
| } | |||
| } | |||
| #endif | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) | |||
| { | |||
| BLASLONG i; | |||
| if ( inc_dest != 1 ) | |||
| { | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| *dest += *src; | |||
| src++; | |||
| dest += inc_dest; | |||
| } | |||
| return; | |||
| } | |||
| } | |||
/*
 * dgemv (no-transpose) driver: y += alpha * A * x, A is m x n, column
 * stride lda, x stride inc_x, y stride inc_y.  dummy1 and buffer follow
 * the common OpenBLAS kernel signature; buffer provides workspace for
 * strided y (must hold at least NBMAX elements — TODO confirm caller
 * contract).  Rows are blocked in chunks of NBMAX; within a chunk the
 * columns are consumed first in stripes of 128 (MMA kernel), then 4, 2
 * and 1.  The final m%4 rows are handled by scalar dot-product code.
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
{
	BLASLONG i;
	FLOAT *a_ptr;
	FLOAT *x_ptr;
	FLOAT *y_ptr;
	BLASLONG n1;            /* number of 4-column stripes after the 128-stripes */
	BLASLONG m1;            /* rows rounded down to a multiple of 4 */
	BLASLONG m2;            /* rows in the final partial NBMAX chunk */
	BLASLONG m3;            /* m % 4: leftover rows handled by scalar code */
	BLASLONG n2;            /* leftover columns (0..3) after the 4-stripes */
	BLASLONG lda4  = lda << 2;   /* byte^0 offset of 4 columns (in elements) */
	BLASLONG lda128 = lda << 7;  /* offset of 128 columns (in elements) */
	FLOAT xbuffer[8] __attribute__ ((aligned (16)));
	FLOAT *ybuffer;

	if ( m < 1 ) return(0);
	if ( n < 1 ) return(0);

	ybuffer = buffer;

	/* Split the columns: n128 stripes of 128, then n1 stripes of 4,
	   then n2 (0..3) leftovers. */
	BLASLONG n128 = n >> 7;
	n1 = (n - (n128 * 128)) >> 2;
	n2 = (n - (n128 * 128)) & 3;

	/* Split the rows: m1 full groups of 4, m3 leftover rows, m2 rows in
	   the last (partial) NBMAX chunk. */
	m3 = m & 3  ;
	m1 = m & -4 ;
	m2 = (m & (NBMAX-1)) - m3 ;

	y_ptr = y;

	/* Process the 4-aligned rows in chunks of NB (= NBMAX, except for a
	   final partial chunk of m2 rows). */
	BLASLONG NB = NBMAX;

	while ( NB == NBMAX )
	{
		m1 -= NB;
		if ( m1 < 0)
		{
			if ( m2 == 0 ) break;
			NB = m2;        /* last, partial chunk */
		}

		a_ptr = a;
		x_ptr = x;

		/* For strided y, accumulate into a zeroed contiguous buffer and
		   scatter at the end of the chunk; for unit stride accumulate
		   in place.  NB*8 assumes sizeof(FLOAT) == 8 (double kernel). */
		if ( inc_y != 1 )
			memset(ybuffer,0,NB*8);
		else
			ybuffer = y_ptr;

		if ( inc_x == 1 )
		{
			/* Contiguous x: feed kernels directly. */
			for( i = 0; i < n128 ; i++)
			{
				dgemv_kernel_4x128(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
				a_ptr += lda128;
				x_ptr += 128;
			}

			for( i = 0; i < n1 ; i++)
			{
				dgemv_kernel_4x4(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
				a_ptr += lda4;
				x_ptr += 4;
			}

			if ( n2 & 2 )
			{
				dgemv_kernel_4x2(NB,a_ptr,a_ptr+lda,x_ptr,ybuffer,alpha);
				a_ptr += lda*2;
				x_ptr += 2;
			}

			if ( n2 & 1 )
			{
				dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,alpha);
				a_ptr += lda;
				x_ptr += 1;
			}
		}
		else
		{
			/* Strided x: gather into a contiguous buffer per stripe. */
			for( i = 0; i < n128 ; i++)
			{
				/* Local 128-entry buffer shadows the 8-entry xbuffer. */
				FLOAT xbuffer[128] __attribute__ ((aligned (16)));
				BLASLONG j;
				for ( j = 0; j < 128 ; j++)
				{
					xbuffer[j] = x_ptr[0];
					x_ptr += inc_x;
				}
				dgemv_kernel_4x128(NB,a_ptr,lda,xbuffer,ybuffer,alpha);
				a_ptr += lda128;
			}

			for( i = 0; i < n1 ; i++)
			{
				xbuffer[0] = x_ptr[0];
				x_ptr += inc_x;
				xbuffer[1] = x_ptr[0];
				x_ptr += inc_x;
				xbuffer[2] = x_ptr[0];
				x_ptr += inc_x;
				xbuffer[3] = x_ptr[0];
				x_ptr += inc_x;
				dgemv_kernel_4x4(NB,a_ptr,lda,xbuffer,ybuffer,alpha);
				a_ptr += lda4;
			}

			/* Leftover columns one at a time (no 4x2 path here). */
			for( i = 0; i < n2 ; i++)
			{
				xbuffer[0] = x_ptr[0];
				x_ptr += inc_x;
				dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha);
				a_ptr += lda;
			}
		}

		a += NB;        /* advance to the next row chunk */
		if ( inc_y != 1 )
		{
			add_y(NB,ybuffer,y_ptr,inc_y);
			y_ptr += NB * inc_y;
		}
		else
			y_ptr += NB ;
	}

	/* Scalar handling of the final m%4 rows via dot products. */
	if ( m3 == 0 ) return(0);

	if ( m3 == 3 )
	{
		a_ptr = a;
		x_ptr = x;
		FLOAT temp0 = 0.0;
		FLOAT temp1 = 0.0;
		FLOAT temp2 = 0.0;
		/* Fast path: rows are contiguous (lda == 3) and x is contiguous. */
		if ( lda == 3 && inc_x ==1 )
		{
			for( i = 0; i < ( n & -4 ); i+=4 )
			{
				temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1];
				temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1];
				temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1];

				temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9]  * x_ptr[3];
				temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3];
				temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3];

				a_ptr += 12;
				x_ptr += 4;
			}

			for( ; i < n; i++ )
			{
				temp0 += a_ptr[0] * x_ptr[0];
				temp1 += a_ptr[1] * x_ptr[0];
				temp2 += a_ptr[2] * x_ptr[0];
				a_ptr += 3;
				x_ptr ++;
			}
		}
		else
		{
			for( i = 0; i < n; i++ )
			{
				temp0 += a_ptr[0] * x_ptr[0];
				temp1 += a_ptr[1] * x_ptr[0];
				temp2 += a_ptr[2] * x_ptr[0];
				a_ptr += lda;
				x_ptr += inc_x;
			}
		}
		y_ptr[0] += alpha * temp0;
		y_ptr += inc_y;
		y_ptr[0] += alpha * temp1;
		y_ptr += inc_y;
		y_ptr[0] += alpha * temp2;
		return(0);
	}

	if ( m3 == 2 )
	{
		a_ptr = a;
		x_ptr = x;
		FLOAT temp0 = 0.0;
		FLOAT temp1 = 0.0;
		/* Fast path: rows contiguous (lda == 2) and x contiguous. */
		if ( lda == 2 && inc_x ==1 )
		{
			for( i = 0; i < (n & -4) ; i+=4 )
			{
				temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1];
				temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1];
				temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3];
				temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3];
				a_ptr += 8;
				x_ptr += 4;
			}

			for( ; i < n; i++ )
			{
				temp0 += a_ptr[0] * x_ptr[0];
				temp1 += a_ptr[1] * x_ptr[0];
				a_ptr += 2;
				x_ptr ++;
			}
		}
		else
		{
			for( i = 0; i < n; i++ )
			{
				temp0 += a_ptr[0] * x_ptr[0];
				temp1 += a_ptr[1] * x_ptr[0];
				a_ptr += lda;
				x_ptr += inc_x;
			}
		}
		y_ptr[0] += alpha * temp0;
		y_ptr += inc_y;
		y_ptr[0] += alpha * temp1;
		return(0);
	}

	if ( m3 == 1 )
	{
		a_ptr = a;
		x_ptr = x;
		FLOAT temp = 0.0;
		/* Fast path: single row (lda == 1) and contiguous x — plain dot. */
		if ( lda == 1 && inc_x ==1 )
		{
			for( i = 0; i < (n & -4); i+=4 )
			{
				temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3];
			}

			for( ; i < n; i++ )
			{
				temp += a_ptr[i] * x_ptr[i];
			}
		}
		else
		{
			for( i = 0; i < n; i++ )
			{
				temp += a_ptr[0] * x_ptr[0];
				a_ptr += lda;
				x_ptr += inc_x;
			}
		}
		y_ptr[0] += alpha * temp;
		return(0);
	}

	return(0);
}
| @@ -0,0 +1,840 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2018, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #define NBMAX 1024 | |||
| //#define PREFETCH 1 | |||
| #include <altivec.h> | |||
| #define HAVE_KERNEL4x8_ASM 1 | |||
| #if defined(HAVE_KERNEL4x8_ASM) | |||
/*
 * dgemv_kernel_4x8 -- hand-written POWER VSX inline assembly.
 *
 * Accumulates eight dot products at once:
 *     y[j] += alpha * dot(column j, x)   for j = 0..7
 * where the eight columns start at ap, ap + lda, ..., ap + 7*lda
 * (lda is in elements; the asm scales it to bytes with sldi).
 *
 * Each loop section consumes 4 doubles of x per column, so n is
 * assumed to be a multiple of 4 -- TODO confirm against the callers.
 *
 * Register plan: vs32/vs33 hold the current four x values, vs36..vs51
 * hold matrix data, and vs34, vs35, vs4..vs9 are the eight two-lane
 * accumulators, whose lanes are merged and scaled by alpha at "two".
 * lxvp/stxvp are paired vector load/stores -- presumably requires an
 * ISA level that provides them (ISA 3.1 / Power10); verify build flags.
 * XXSPLTD_S / XXMRGLD_S / XXMRGHD_S look like asm helper macros from
 * common.h -- not visible in this file.
 */
static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, double *ap, double *x, double *y, double alpha) {
    FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7;
    BLASLONG off2;
    BLASLONG tempR;
    __asm__(
        /* derive byte strides from lda and fan out the 8 column pointers */
        "sldi %[temp],%[off], 4 \n\t" // lda * sizeof (double) *2
        "sldi %[off], %[off], 3 \n\t" // lda * sizeof (double)
        /* zero the eight accumulators (vs34, vs35, vs4..vs9) */
        "xxlxor 34,34,34 \n\t"
        "xxlxor 35,34,34 \n\t"
        "add %[a2], %[a0], %[temp] \n\t"
        "add %[a1], %[a0], %[off] \n\t"
        "xxlxor 4,34,34 \n\t"
        "xxlxor 5,34,34 \n\t"
        "xxlxor 6,34,34 \n\t"
        "xxlxor 7,34,34 \n\t"
        "add %[a3], %[a2], %[off] \n\t"
        "add %[a4], %[a2], %[temp] \n\t"
        "xxlxor 8,34,34 \n\t"
        "xxlxor 9,34,34 \n\t"
        "add %[a5], %[a3], %[temp] \n\t"
        "li %[off],0 \n\t"
        "li %[off2],16 \n\t"
        "add %[a6], %[a4], %[temp] \n\t"
        "add %[a7], %[a5], %[temp] \n\t"
        /* preload 4 x values (vs32/vs33) and 4 elements of each column */
        "lxvp 32, 0(%[x]) \n\t"
        "lxvp 36, 0(%[a0]) \n\t"
        "lxvp 38, 0(%[a1]) \n\t"
        "lxvp 40, 0(%[a2]) \n\t"
        "lxvp 42, 0(%[a3]) \n\t"
        "lxvp 44, 0(%[a4]) \n\t"
        "lxvp 46, 0(%[a5]) \n\t"
        "lxvp 48, 0(%[a6]) \n\t"
        "lxvp 50, 0(%[a7]) \n\t"
#if defined(PREFETCH)
        "li %[temp],896 \n\t"
#endif
        "addic. %[n],%[n],-4 \n\t"
        "li %[off],32 \n\t"
        "ble- two%= \n\t"
        //--------------------------------------------------
        /* main loop, unrolled 4x: each section multiplies the pending
           column data into the accumulators while loading the next
           32 bytes per stream; pointers are bumped by 128 at the end */
        ".align 5 \n\t"
        "one%=: \n\t"
        "xvmaddadp 34,36,32 \n\t"
        "xvmaddadp 35,38,32 \n\t"
        "addi %[off2], %[off2],32 \n\t"
        "xvmaddadp 4,40,32 \n\t"
        "xvmaddadp 5,42,32 \n\t"
        "xvmaddadp 6,44,32 \n\t"
        "xvmaddadp 7,46,32 \n\t"
        "xvmaddadp 8,48,32 \n\t"
        "xvmaddadp 9,50,32 \n\t"
        "xvmaddadp 34,37,33 \n\t"
        "xvmaddadp 35,39,33 \n\t"
        "lxvp 36, 32(%[a0]) \n\t"
        "lxvp 38, 32(%[a1]) \n\t"
        "xvmaddadp 4,41,33 \n\t"
        "xvmaddadp 5,43,33 \n\t"
        "addi %[off], %[off],32 \n\t"
        "lxvp 40, 32(%[a2]) \n\t"
        "lxvp 42, 32(%[a3]) \n\t"
        "xvmaddadp 6,45,33 \n\t"
        "xvmaddadp 7,47,33 \n\t"
        "lxvp 44, 32(%[a4]) \n\t"
        "lxvp 46, 32(%[a5]) \n\t"
        "xvmaddadp 8,49,33 \n\t"
        "xvmaddadp 9,51,33 \n\t"
        "addic. %[n],%[n],-4 \n\t"
        "lxvp 48, 32(%[a6]) \n\t"
        "lxvp 50, 32(%[a7]) \n\t"
        "lxvp 32, 32(%[x]) \n\t"
        "ble- two%= \n\t"
        /* second unrolled section (offset 64) */
        "xvmaddadp 34,36,32 \n\t"
        "xvmaddadp 35,38,32 \n\t"
        "addi %[off2], %[off2],32 \n\t"
        "xvmaddadp 4,40,32 \n\t"
        "xvmaddadp 5,42,32 \n\t"
        "xvmaddadp 6,44,32 \n\t"
        "xvmaddadp 7,46,32 \n\t"
        "xvmaddadp 8,48,32 \n\t"
        "xvmaddadp 9,50,32 \n\t"
        "xvmaddadp 34,37,33 \n\t"
        "xvmaddadp 35,39,33 \n\t"
        "lxvp 36, 64(%[a0]) \n\t"
        "lxvp 38, 64(%[a1]) \n\t"
        "xvmaddadp 4,41,33 \n\t"
        "xvmaddadp 5,43,33 \n\t"
        "addi %[off], %[off],32 \n\t"
        "lxvp 40, 64(%[a2]) \n\t"
        "lxvp 42, 64(%[a3]) \n\t"
        "xvmaddadp 6,45,33 \n\t"
        "xvmaddadp 7,47,33 \n\t"
        "lxvp 44, 64(%[a4]) \n\t"
        "lxvp 46, 64(%[a5]) \n\t"
        "xvmaddadp 8,49,33 \n\t"
        "xvmaddadp 9,51,33 \n\t"
        "addic. %[n],%[n],-4 \n\t"
        "lxvp 48, 64(%[a6]) \n\t"
        "lxvp 50, 64(%[a7]) \n\t"
        "lxvp 32, 64(%[x]) \n\t"
        "ble- two%= \n\t"
        /* third unrolled section (offset 96), with optional prefetching */
        "xvmaddadp 34,36,32 \n\t"
        "xvmaddadp 35,38,32 \n\t"
#if defined(PREFETCH)
        "addi %[temp],%[temp],128 \n\t"
#endif
        "addi %[off2], %[off2],32 \n\t"
        "xvmaddadp 4,40,32 \n\t"
        "xvmaddadp 5,42,32 \n\t"
        "xvmaddadp 6,44,32 \n\t"
        "xvmaddadp 7,46,32 \n\t"
        "xvmaddadp 8,48,32 \n\t"
        "xvmaddadp 9,50,32 \n\t"
#if defined(PREFETCH)
        "dcbt %[temp],%[a0] \n\t"
#endif
        "xvmaddadp 34,37,33 \n\t"
        "xvmaddadp 35,39,33 \n\t"
        "lxvp 36, 96(%[a0]) \n\t"
        "lxvp 38, 96(%[a1]) \n\t"
        "xvmaddadp 4,41,33 \n\t"
        "xvmaddadp 5,43,33 \n\t"
#if defined(PREFETCH)
        "dcbt %[temp],%[a1] \n\t"
#endif
        "lxvp 40, 96(%[a2]) \n\t"
        "lxvp 42, 96(%[a3]) \n\t"
        "addi %[off], %[off],32 \n\t"
        "xvmaddadp 6,45,33 \n\t"
        "xvmaddadp 7,47,33 \n\t"
        "lxvp 44, 96(%[a4]) \n\t"
        "lxvp 46, 96(%[a5]) \n\t"
        "xvmaddadp 8,49,33 \n\t"
        "xvmaddadp 9,51,33 \n\t"
#if defined(PREFETCH)
        "dcbt %[temp],%[a3] \n\t"
#endif
        "lxvp 48, 96(%[a6]) \n\t"
        "lxvp 50, 96(%[a7]) \n\t"
        "lxvp 32, 96(%[x]) \n\t"
        "addic. %[n],%[n],-4 \n\t"
        "ble- two%= \n\t"
        /* fourth unrolled section (offset 128), then bump all pointers */
        "addi %[off2], %[off2],32 \n\t"
#if defined(PREFETCH)
        "dcbt %[temp],%[a2] \n\t"
#endif
        "xvmaddadp 34,36,32 \n\t"
        "xvmaddadp 35,38,32 \n\t"
        "xvmaddadp 4,40,32 \n\t"
        "xvmaddadp 5,42,32 \n\t"
#if defined(PREFETCH)
        "dcbt %[temp],%[a4] \n\t"
#endif
        "xvmaddadp 6,44,32 \n\t"
        "xvmaddadp 7,46,32 \n\t"
        "xvmaddadp 8,48,32 \n\t"
        "xvmaddadp 9,50,32 \n\t"
#if defined(PREFETCH)
        "dcbt %[temp],%[a5] \n\t"
#endif
        "xvmaddadp 34,37,33 \n\t"
        "xvmaddadp 35,39,33 \n\t"
        "lxvp 36, 128(%[a0]) \n\t"
        "lxvp 38, 128(%[a1]) \n\t"
        "xvmaddadp 4,41,33 \n\t"
        "xvmaddadp 5,43,33 \n\t"
        "addi %[off], %[off],32 \n\t"
        "lxvp 40, 128(%[a2]) \n\t"
        "lxvp 42, 128(%[a3]) \n\t"
#if defined(PREFETCH)
        "dcbt %[temp],%[a6] \n\t"
#endif
        "xvmaddadp 6,45,33 \n\t"
        "xvmaddadp 7,47,33 \n\t"
        "lxvp 44, 128(%[a4]) \n\t"
        "lxvp 46, 128(%[a5]) \n\t"
        "xvmaddadp 8,49,33 \n\t"
        "xvmaddadp 9,51,33 \n\t"
#if defined(PREFETCH)
        "dcbt %[temp],%[a7] \n\t"
#endif
        "addic. %[n],%[n],-4 \n\t"
        "lxvp 48, 128(%[a6]) \n\t"
        "lxvp 50, 128(%[a7]) \n\t"
        "lxvp 32, 128(%[x]) \n\t"
#if defined(PREFETCH)
        "dcbt %[temp],%[x] \n\t"
#endif
        "addi %[a0], %[a0], 128 \n\t"
        "addi %[a1], %[a1], 128 \n\t"
        "addi %[a2], %[a2], 128 \n\t"
        "addi %[a3], %[a3], 128 \n\t"
        "addi %[a4], %[a4], 128 \n\t"
        "addi %[a5], %[a5], 128 \n\t"
        "addi %[a6], %[a6], 128 \n\t"
        "addi %[a7], %[a7], 128 \n\t"
        "addi %[x], %[x], 128 \n\t"
        "bgt+ one%= \n\t"
        ".align 5 \n\t"
        "two%=: \n\t"
        //--------------------------------------------
        /* wind-down: fold the already-loaded data into the accumulators,
           then merge the two lanes of each accumulator, scale by alpha
           (splatted into vs36) and read-modify-write the 8 y values */
        "xvmaddadp 34,36,32 \n\t"
        "xvmaddadp 35,38,32 \n\t"
        "xvmaddadp 4,40,32 \n\t"
        "xvmaddadp 5,42,32 \n\t"
        "xvmaddadp 6,44,32 \n\t"
        "xvmaddadp 7,46,32 \n\t"
        "xvmaddadp 8,48,32 \n\t"
        "xvmaddadp 9,50,32 \n\t"
        XXSPLTD_S(36,%x[alpha],0)
        "xvmaddadp 34,37,33 \n\t"
        "xvmaddadp 35,39,33 \n\t"
        "xvmaddadp 4,41,33 \n\t"
        "xvmaddadp 5,43,33 \n\t"
        "xvmaddadp 6,45,33 \n\t"
        "xvmaddadp 7,47,33 \n\t"
        "xvmaddadp 8,49,33 \n\t"
        "xvmaddadp 9,51,33 \n\t"
        "lxvp 38, 0(%[y]) \n\t"
        "lxvp 40, 32(%[y]) \n\t"
        XXMRGLD_S(42,35,34)
        XXMRGHD_S(43,35,34)
        XXMRGLD_S(44,5,4)
        XXMRGHD_S(45,5,4)
        "xvadddp 42,42,43 \n\t"
        XXMRGLD_S(46,7,6)
        XXMRGHD_S(47,7,6)
        "xvadddp 44,44,45 \n\t"
        XXMRGLD_S(48,9,8)
        XXMRGHD_S(49,9,8)
        "xvadddp 46,46,47 \n\t"
        "xvmaddadp 39,42,36 \n\t"
        "xvmaddadp 38,44,36 \n\t"
        "xvadddp 48,48,49 \n\t"
        "xvmaddadp 41,46,36 \n\t"
        "stxvp 38, 0(%[y]) \n\t"
        "xvmaddadp 40,48,36 \n\t"
        "stxvp 40, 32(%[y]) \n\t"
        /* memy/memx/mem_ap are dummy memory operands that tell the
           compiler which memory is read/written by the asm block */
        : [memy] "+m" (*(double (*)[8])y),
          [n] "+&r" (n),
          [a0] "=b" (a0),
          [a1] "=&b" (a1),
          [a2] "=&b" (a2),
          [a3] "=&b" (a3),
          [a4] "=&b" (a4),
          [a5] "=&b" (a5),
          [a6] "=&b" (a6),
          [a7] "=&b" (a7),
          [off] "+&b" (lda),
          [off2]"=&b" (off2),
          [temp] "=&b" (tempR)
        : [memx] "m" (*(const double (*)[n])x),
          [mem_ap] "m" (*(const double (*)[n*8]) ap),
          [alpha] "d" (alpha),
          "[a0]" (ap),
          [x] "b" (x),
          [y] "b" (y)
        : "cc","vs4","vs5","vs6","vs7","vs8","vs9" ,"vs32","vs33","vs34","vs35", "vs36", "vs37", "vs38", "vs39",
          "vs40", "vs41", "vs42", "vs43", "vs44", "vs45", "vs46", "vs47", "vs48", "vs49", "vs50", "vs51"
    );
    return;
}
| #else | |||
| static void dgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
| BLASLONG i; | |||
| #if defined(PREFETCH) | |||
| BLASLONG j, c, k; | |||
| #endif | |||
| FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; | |||
| __vector double *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; | |||
| register __vector double temp0 = {0, 0}; | |||
| register __vector double temp1 = {0, 0}; | |||
| register __vector double temp2 = {0, 0}; | |||
| register __vector double temp3 = {0, 0}; | |||
| register __vector double temp4 = {0, 0}; | |||
| register __vector double temp5 = {0, 0}; | |||
| register __vector double temp6 = {0, 0}; | |||
| register __vector double temp7 = {0, 0}; | |||
| a0 = ap; | |||
| a1 = ap + lda; | |||
| a2 = a1 + lda; | |||
| a3 = a2 + lda; | |||
| a4 = a3 + lda; | |||
| a5 = a4 + lda; | |||
| a6 = a5 + lda; | |||
| a7 = a6 + lda; | |||
| va0 = (__vector double*) a0; | |||
| va1 = (__vector double*) a1; | |||
| va2 = (__vector double*) a2; | |||
| va3 = (__vector double*) a3; | |||
| va4 = (__vector double*) a4; | |||
| va5 = (__vector double*) a5; | |||
| va6 = (__vector double*) a6; | |||
| va7 = (__vector double*) a7; | |||
| v_x = (__vector double*) x; | |||
| #if defined(PREFETCH) | |||
| c = n >> 1; | |||
| for (j = 0; j < c; j += 64) { | |||
| k = (c - j) > 64 ? 64 : (c - j); | |||
| __builtin_prefetch(v_x + 64); | |||
| __builtin_prefetch(va0 + 64); | |||
| __builtin_prefetch(va1 + 64); | |||
| __builtin_prefetch(va2 + 64); | |||
| __builtin_prefetch(va3 + 64); | |||
| __builtin_prefetch(va4 + 64); | |||
| __builtin_prefetch(va5 + 64); | |||
| __builtin_prefetch(va6 + 64); | |||
| __builtin_prefetch(va7 + 64); | |||
| for (i = 0; i < k; i += 2) { | |||
| #else | |||
| for (i = 0; i < n/2; i += 2) { | |||
| #endif | |||
| temp0 += v_x[i] * va0[i]; | |||
| temp1 += v_x[i] * va1[i]; | |||
| temp2 += v_x[i] * va2[i]; | |||
| temp3 += v_x[i] * va3[i]; | |||
| temp4 += v_x[i] * va4[i]; | |||
| temp5 += v_x[i] * va5[i]; | |||
| temp6 += v_x[i] * va6[i]; | |||
| temp7 += v_x[i] * va7[i]; | |||
| temp0 += v_x[i + 1] * va0[i + 1]; | |||
| temp1 += v_x[i + 1] * va1[i + 1]; | |||
| temp2 += v_x[i + 1] * va2[i + 1]; | |||
| temp3 += v_x[i + 1] * va3[i + 1]; | |||
| temp4 += v_x[i + 1] * va4[i + 1]; | |||
| temp5 += v_x[i + 1] * va5[i + 1]; | |||
| temp6 += v_x[i + 1] * va6[i + 1]; | |||
| temp7 += v_x[i + 1] * va7[i + 1]; | |||
| } | |||
| #if defined(PREFETCH) | |||
| va0 += 64; | |||
| va1 += 64; | |||
| va2 += 64; | |||
| va3 += 64; | |||
| va4 += 64; | |||
| va5 += 64; | |||
| va6 += 64; | |||
| va7 += 64; | |||
| v_x += 64; | |||
| } | |||
| #endif | |||
| y[0] += alpha * (temp0[0] + temp0[1]); | |||
| y[1] += alpha * (temp1[0] + temp1[1]); | |||
| y[2] += alpha * (temp2[0] + temp2[1]); | |||
| y[3] += alpha * (temp3[0] + temp3[1]); | |||
| y[4] += alpha * (temp4[0] + temp4[1]); | |||
| y[5] += alpha * (temp5[0] + temp5[1]); | |||
| y[6] += alpha * (temp6[0] + temp6[1]); | |||
| y[7] += alpha * (temp7[0] + temp7[1]); | |||
| } | |||
| #endif | |||
| static void dgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
| BLASLONG i = 0; | |||
| FLOAT *a0, *a1, *a2, *a3; | |||
| a0 = ap; | |||
| a1 = ap + lda; | |||
| a2 = a1 + lda; | |||
| a3 = a2 + lda; | |||
| __vector double* va0 = (__vector double*) a0; | |||
| __vector double* va1 = (__vector double*) a1; | |||
| __vector double* va2 = (__vector double*) a2; | |||
| __vector double* va3 = (__vector double*) a3; | |||
| __vector double* v_x = (__vector double*) x; | |||
| register __vector double temp0 = {0, 0}; | |||
| register __vector double temp1 = {0, 0}; | |||
| register __vector double temp2 = {0, 0}; | |||
| register __vector double temp3 = {0, 0}; | |||
| register __vector double temp4 = {0, 0}; | |||
| register __vector double temp5 = {0, 0}; | |||
| register __vector double temp6 = {0, 0}; | |||
| register __vector double temp7 = {0, 0}; | |||
| for (i = 0; i < n / 2; i += 2) { | |||
| temp0 += v_x[i] * va0[i]; | |||
| temp1 += v_x[i] * va1[i]; | |||
| temp2 += v_x[i] * va2[i]; | |||
| temp3 += v_x[i] * va3[i]; | |||
| temp4 += v_x[i + 1] * va0[i + 1]; | |||
| temp5 += v_x[i + 1] * va1[i + 1]; | |||
| temp6 += v_x[i + 1] * va2[i + 1]; | |||
| temp7 += v_x[i + 1] * va3[i + 1]; | |||
| } | |||
| temp0 += temp4; | |||
| temp1 += temp5; | |||
| temp2 += temp6; | |||
| temp3 += temp7; | |||
| y[0] += alpha * (temp0[0] + temp0[1]); | |||
| y[1] += alpha * (temp1[0] + temp1[1]); | |||
| y[2] += alpha * (temp2[0] + temp2[1]); | |||
| y[3] += alpha * (temp3[0] + temp3[1]); | |||
| } | |||
| static void dgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { | |||
| BLASLONG i; | |||
| FLOAT *a0, *a1; | |||
| a0 = ap; | |||
| a1 = ap + lda; | |||
| __vector double* va0 = (__vector double*) a0; | |||
| __vector double* va1 = (__vector double*) a1; | |||
| __vector double* v_x = (__vector double*) x; | |||
| __vector double temp0 = {0, 0}; | |||
| __vector double temp1 = {0, 0}; | |||
| for (i = 0; i < n / 2; i += 2) { | |||
| temp0 += v_x[i] * va0[i] + v_x[i + 1] * va0[i + 1]; | |||
| temp1 += v_x[i] * va1[i] + v_x[i + 1] * va1[i + 1]; | |||
| } | |||
| y[0] += alpha * (temp0[0] + temp0[1]); | |||
| y[inc_y] += alpha * (temp1[0] + temp1[1]); | |||
| } | |||
| static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { | |||
| BLASLONG i; | |||
| FLOAT *a0; | |||
| a0 = ap; | |||
| __vector double* va0 = (__vector double*) a0; | |||
| __vector double* v_x = (__vector double*) x; | |||
| __vector double temp0 = {0, 0}; | |||
| for (i = 0; i < n / 2; i += 2) { | |||
| temp0 += v_x[i] * va0[i] + v_x[i + 1] * va0[i + 1]; | |||
| } | |||
| *y += alpha * (temp0[0] + temp0[1]); | |||
| } | |||
| static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { | |||
| BLASLONG i; | |||
| for (i = 0; i < n; i++) { | |||
| *dest++ = *src; | |||
| src += inc_src; | |||
| } | |||
| } | |||
/*
 * Transposed DGEMV driver: for each of the n outputs,
 *     y[j * inc_y] += alpha * dot(column j of a, x)
 * where consecutive columns are lda elements apart.  The m rows are
 * processed in stripes of at most NBMAX so that a (possibly strided)
 * stripe of x can be packed into `buffer`; within a stripe, columns
 * are handled 8 / 4 / 2 / 1 at a time by the kernels above.  The last
 * m3 = m % 4 rows are folded in with scalar loops at the end.
 * dummy1 is unused here (kernel ABI slot); `buffer` must hold at
 * least NBMAX elements when inc_x != 1.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) {
    BLASLONG i;
    BLASLONG j;
    FLOAT *a_ptr;
    FLOAT *x_ptr;
    FLOAT *y_ptr;
    BLASLONG n1;   /* number of full 8-column panels */
    BLASLONG m1;   /* rows still to be consumed by full NBMAX stripes */
    BLASLONG m2;   /* rows in the final partial stripe (multiple of 4) */
    BLASLONG m3;   /* leftover rows: m % 4, handled by the scalar tails */
    BLASLONG n2;   /* leftover columns: n % 8 */
    FLOAT ybuffer[8] __attribute__((aligned(16)));
    FLOAT *xbuffer;
    if (m < 1) return (0);
    if (n < 1) return (0);
    xbuffer = buffer;
    n1 = n >> 3;
    n2 = n & 7;
    m3 = m & 3;
    m1 = m - m3;
    m2 = (m & (NBMAX - 1)) - m3;
    BLASLONG NB = NBMAX;
    /* stripe loop over rows; the final iteration shrinks NB to m2,
       which ends the loop since NB != NBMAX afterwards */
    while (NB == NBMAX) {
        m1 -= NB;
        if (m1 < 0) {
            if (m2 == 0) break;
            NB = m2;
        }
        y_ptr = y;
        a_ptr = a;
        x_ptr = x;
        /* pack this stripe of x contiguously, or alias it directly
           when it is already contiguous */
        if (inc_x != 1)
            copy_x(NB, x_ptr, xbuffer, inc_x);
        else
            xbuffer = x_ptr;
        BLASLONG lda8 = lda << 3;  /* advance over one 8-column panel */
        if (inc_y == 1) {
            /* unit-stride y: the 8-column kernel updates y in place */
            for (i = 0; i < n1; i++) {
                dgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha);
                y_ptr += 8;
                a_ptr += lda8;
#if defined(PREFETCH)
                __builtin_prefetch(y_ptr+64);
#endif
            }
        } else {
            /* strided y: accumulate into ybuffer, then scatter */
            for (i = 0; i < n1; i++) {
                ybuffer[0] = 0;
                ybuffer[1] = 0;
                ybuffer[2] = 0;
                ybuffer[3] = 0;
                ybuffer[4] = 0;
                ybuffer[5] = 0;
                ybuffer[6] = 0;
                ybuffer[7] = 0;
                dgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha);
                *y_ptr += ybuffer[0];
                y_ptr += inc_y;
                *y_ptr += ybuffer[1];
                y_ptr += inc_y;
                *y_ptr += ybuffer[2];
                y_ptr += inc_y;
                *y_ptr += ybuffer[3];
                y_ptr += inc_y;
                *y_ptr += ybuffer[4];
                y_ptr += inc_y;
                *y_ptr += ybuffer[5];
                y_ptr += inc_y;
                *y_ptr += ybuffer[6];
                y_ptr += inc_y;
                *y_ptr += ybuffer[7];
                y_ptr += inc_y;
                a_ptr += lda8;
            }
        }
        /* remaining n % 8 columns: 4, then 2, then 1 */
        if (n2 & 4) {
            ybuffer[0] = 0;
            ybuffer[1] = 0;
            ybuffer[2] = 0;
            ybuffer[3] = 0;
            dgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha);
            a_ptr += lda<<2;
            *y_ptr += ybuffer[0];
            y_ptr += inc_y;
            *y_ptr += ybuffer[1];
            y_ptr += inc_y;
            *y_ptr += ybuffer[2];
            y_ptr += inc_y;
            *y_ptr += ybuffer[3];
            y_ptr += inc_y;
        }
        if (n2 & 2) {
            dgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y);
            a_ptr += lda << 1;
            y_ptr += 2 * inc_y;
        }
        if (n2 & 1) {
            dgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha);
            a_ptr += lda;
            y_ptr += inc_y;
        }
        /* advance to the next stripe of rows */
        a += NB;
        x += NB * inc_x;
    }
    if (m3 == 0) return (0);
    /* scalar tails: fold the last m3 rows into every y entry,
       pre-scaling the x values by alpha */
    x_ptr = x;
    a_ptr = a;
    if (m3 == 3) {
        FLOAT xtemp0 = *x_ptr * alpha;
        x_ptr += inc_x;
        FLOAT xtemp1 = *x_ptr * alpha;
        x_ptr += inc_x;
        FLOAT xtemp2 = *x_ptr * alpha;
        FLOAT *aj = a_ptr;
        y_ptr = y;
        if (lda == 3 && inc_y == 1) {
            /* fully packed case: unrolled 4 columns per iteration */
            for (j = 0; j < (n & -4); j += 4) {
                y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
                y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2;
                y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2;
                y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2;
                aj += 12;
            }
            for (; j < n; j++) {
                y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2;
                aj += 3;
            }
        } else {
            if (inc_y == 1) {
                /* general lda, unit-stride y */
                BLASLONG register lda2 = lda << 1;
                BLASLONG register lda4 = lda << 2;
                BLASLONG register lda3 = lda2 + lda;
                for (j = 0; j < (n & -4); j += 4) {
                    y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
                    y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2;
                    y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2;
                    y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2;
                    aj += lda4;
                }
                for (; j < n; j++) {
                    y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
                    aj += lda;
                }
            } else {
                /* strided y */
                for (j = 0; j < n; j++) {
                    *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2;
                    y_ptr += inc_y;
                    aj += lda;
                }
            }
        }
        return (0);
    }
    if (m3 == 2) {
        FLOAT xtemp0 = *x_ptr * alpha;
        x_ptr += inc_x;
        FLOAT xtemp1 = *x_ptr * alpha;
        FLOAT *aj = a_ptr;
        y_ptr = y;
        if (lda == 2 && inc_y == 1) {
            /* fully packed case: unrolled 4 columns per iteration */
            for (j = 0; j < (n & -4); j += 4) {
                y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
                y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1;
                y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1;
                y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1;
                aj += 8;
            }
            for (; j < n; j++) {
                y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1;
                aj += 2;
            }
        } else {
            if (inc_y == 1) {
                /* general lda, unit-stride y */
                BLASLONG register lda2 = lda << 1;
                BLASLONG register lda4 = lda << 2;
                BLASLONG register lda3 = lda2 + lda;
                for (j = 0; j < (n & -4); j += 4) {
                    y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
                    y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1;
                    y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1;
                    y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1;
                    aj += lda4;
                }
                for (; j < n; j++) {
                    y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1;
                    aj += lda;
                }
            } else {
                /* strided y */
                for (j = 0; j < n; j++) {
                    *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1;
                    y_ptr += inc_y;
                    aj += lda;
                }
            }
        }
        return (0);
    }
    /* m3 == 1 */
    FLOAT xtemp = *x_ptr * alpha;
    FLOAT *aj = a_ptr;
    y_ptr = y;
    if (lda == 1 && inc_y == 1) {
        /* fully packed case: y[j] += a[j] * (alpha * x) */
        for (j = 0; j < (n & -4); j += 4) {
            y_ptr[j] += aj[j] * xtemp;
            y_ptr[j + 1] += aj[j + 1] * xtemp;
            y_ptr[j + 2] += aj[j + 2] * xtemp;
            y_ptr[j + 3] += aj[j + 3] * xtemp;
        }
        for (; j < n; j++) {
            y_ptr[j] += aj[j] * xtemp;
        }
    } else {
        if (inc_y == 1) {
            /* general lda, unit-stride y */
            BLASLONG register lda2 = lda << 1;
            BLASLONG register lda4 = lda << 2;
            BLASLONG register lda3 = lda2 + lda;
            for (j = 0; j < (n & -4); j += 4) {
                y_ptr[j] += *aj * xtemp;
                y_ptr[j + 1] += *(aj + lda) * xtemp;
                y_ptr[j + 2] += *(aj + lda2) * xtemp;
                y_ptr[j + 3] += *(aj + lda3) * xtemp;
                aj += lda4;
            }
            for (; j < n; j++) {
                y_ptr[j] += *aj * xtemp;
                aj += lda;
            }
        } else {
            /* strided y */
            for (j = 0; j < n; j++) {
                *y_ptr += *aj * xtemp;
                y_ptr += inc_y;
                aj += lda;
            }
        }
    }
    return (0);
}