diff --git a/.drone.yml b/.drone.yml deleted file mode 100644 index 779912954..000000000 --- a/.drone.yml +++ /dev/null @@ -1,143 +0,0 @@ ---- -kind: pipeline -name: arm64_gcc_make - -platform: - os: linux - arch: arm64 - -steps: -- name: Build and Test - image: ubuntu:19.04 - environment: - CC: gcc - COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' - commands: - - echo "MAKE_FLAGS:= $COMMON_FLAGS" - - apt-get update -y - - apt-get install -y make $CC gfortran perl - - $CC --version - - make QUIET_MAKE=1 $COMMON_FLAGS - - make -C test $COMMON_FLAGS - - make -C ctest $COMMON_FLAGS - - make -C utest $COMMON_FLAGS - ---- -kind: pipeline -name: arm32_gcc_make - -platform: - os: linux - arch: arm - -steps: -- name: Build and Test - image: ubuntu:19.04 - environment: - CC: gcc - COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV6 NUM_THREADS=32' - commands: - - echo "MAKE_FLAGS:= $COMMON_FLAGS" - - apt-get update -y - - apt-get install -y make $CC gfortran perl - - $CC --version - - make QUIET_MAKE=1 $COMMON_FLAGS - - make -C test $COMMON_FLAGS - - make -C ctest $COMMON_FLAGS - - make -C utest $COMMON_FLAGS - ---- -kind: pipeline -name: arm64_clang_make - -platform: - os: linux - arch: arm64 - -steps: -- name: Build and Test - image: ubuntu:18.04 - environment: - CC: clang - COMMON_FLAGS: 'DYNAMIC_ARCH=1 TARGET=ARMV8 NUM_THREADS=32' - commands: - - echo "MAKE_FLAGS:= $COMMON_FLAGS" - - apt-get update -y - - apt-get install -y make $CC gfortran perl - - $CC --version - - make QUIET_MAKE=1 $COMMON_FLAGS - - make -C test $COMMON_FLAGS - - make -C ctest $COMMON_FLAGS - - make -C utest $COMMON_FLAGS - ---- -kind: pipeline -name: arm32_clang_cmake - -platform: - os: linux - arch: arm - -steps: -- name: Build and Test - image: ubuntu:18.04 - environment: - CC: clang - CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV6 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' - commands: - - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" - - apt-get update -y - - apt-get install -y make $CC g++ perl cmake - - $CC --version - - mkdir build && cd build - - cmake $CMAKE_FLAGS .. - - make -j - - ctest - ---- -kind: pipeline -name: arm64_gcc_cmake - -platform: - os: linux - arch: arm64 - -steps: -- name: Build and Test - image: ubuntu:18.04 - environment: - CC: gcc - CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' - commands: - - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" - - apt-get update -y - - apt-get install -y make $CC g++ perl cmake - - $CC --version - - mkdir build && cd build - - cmake $CMAKE_FLAGS .. - - make -j - - ctest - ---- -kind: pipeline -name: arm64_clang_cmake - -platform: - os: linux - arch: arm64 - -steps: -- name: Build and Test - image: ubuntu:18.04 - environment: - CC: clang - CMAKE_FLAGS: '-DDYNAMIC_ARCH=1 -DTARGET=ARMV8 -DNUM_THREADS=32 -DNOFORTRAN=ON -DBUILD_WITHOUT_LAPACK=ON' - commands: - - echo "CMAKE_FLAGS:= $CMAKE_FLAGS" - - apt-get update -y - - apt-get install -y make $CC g++ perl cmake - - $CC --version - - mkdir build && cd build - - cmake $CMAKE_FLAGS .. - - make -j - - ctest diff --git a/.travis.yml b/.travis.yml index a92bb0687..ec5dc8a9b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,15 +25,6 @@ matrix: - TARGET_BOX=LINUX64 - BTYPE="BINARY=64" - - <<: *test-ubuntu - os: linux-ppc64le - before_script: - - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" - env: - # for matrix annotation only - - TARGET_BOX=PPC64LE_LINUX - - BTYPE="BINARY=64 USE_OPENMP=1" - - <<: *test-ubuntu env: - TARGET_BOX=LINUX64 @@ -169,10 +160,45 @@ matrix: - BTYPE="BINARY=64 INTERFACE64=1" - <<: *test-macos - osx_image: xcode8.3 env: - BTYPE="BINARY=32" + - &emulated-arm + dist: trusty + sudo: required + services: docker + env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc + name: "Emulated Build for ARMV6 with gcc" + before_install: sudo docker run --rm --privileged multiarch/qemu-user-static:register --reset + script: | + echo "FROM openblas/alpine:${IMAGE_ARCH} + COPY . /tmp/openblas + RUN mkdir /tmp/openblas/build && \ + cd /tmp/openblas/build && \ + CC=${COMPILER} cmake -D DYNAMIC_ARCH=OFF \ + -D TARGET=${TARGET_ARCH} \ + -D BUILD_SHARED_LIBS=ON \ + -D BUILD_WITHOUT_LAPACK=ON \ + -D BUILD_WITHOUT_CBLAS=ON \ + -D CMAKE_BUILD_TYPE=Release ../ && \ + cmake --build ." > Dockerfile + docker build . + - <<: *emulated-arm + env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang + name: "Emulated Build for ARMV6 with clang" + - <<: *emulated-arm + env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc + name: "Emulated Build for ARMV8 with gcc" + - <<: *emulated-arm + env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang + name: "Emulated Build for ARMV8 with clang" + + allow_failures: + - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=gcc + - env: IMAGE_ARCH=arm32 TARGET_ARCH=ARMV6 COMPILER=clang + - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=gcc + - env: IMAGE_ARCH=arm64 TARGET_ARCH=ARMV8 COMPILER=clang + # whitelist branches: only: diff --git a/CMakeLists.txt b/CMakeLists.txt index d7d9c2fce..812e6bf6f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 7.dev) +set(OpenBLAS_PATCH_VERSION 6.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions @@ -20,14 +20,9 @@ if(MSVC) option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) endif() option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) -option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF) -option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) +option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64 only)" OFF) +option(DYNAMIC_OLDER "Include specific support for older cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF) -if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") -option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) -else() -set(NO_AFFINITY 1) -endif() # Add a prefix or suffix to all exported symbol names in the shared library. # Avoids conflicts with other BLAS libraries, especially when using @@ -47,19 +42,6 @@ endif() ####### -if(MSVC AND MSVC_STATIC_CRT) - set(CompilerFlags - CMAKE_CXX_FLAGS - CMAKE_CXX_FLAGS_DEBUG - CMAKE_CXX_FLAGS_RELEASE - CMAKE_C_FLAGS - CMAKE_C_FLAGS_DEBUG - CMAKE_C_FLAGS_RELEASE - ) - foreach(CompilerFlag ${CompilerFlags}) - string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") - endforeach() -endif() message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") @@ -80,10 +62,10 @@ endif () set(SUBDIRS ${BLASDIRS}) if (NOT NO_LAPACK) + list(APPEND SUBDIRS lapack) if(BUILD_RELAPACK) list(APPEND SUBDIRS relapack/src) endif() - list(APPEND SUBDIRS lapack) endif () # set which float types we want to build for @@ -152,7 +134,7 @@ endif () # Only generate .def for dll on MSVC and always produce pdb files for debug and release if(MSVC) - if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4) + if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4) set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def") endif() set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi") @@ -167,9 +149,15 @@ if (${DYNAMIC_ARCH}) endforeach() endif () +# Only build shared libs for MSVC +if (MSVC) + set(BUILD_SHARED_LIBS ON) +endif() + + # add objects to the openblas lib add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) -target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $) +target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $) # Android needs to explicitly link against libm if(ANDROID) @@ -178,7 +166,7 @@ endif() # Handle MSVC exports if(MSVC AND BUILD_SHARED_LIBS) - if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4) + if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4) include("${PROJECT_SOURCE_DIR}/cmake/export.cmake") else() # Creates verbose .def file (51KB vs 18KB) @@ -211,8 +199,7 @@ if (USE_THREAD) target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT}) endif() -#if (MSVC OR NOT NOFORTRAN) -if (NOT NO_CBLAS) +if (MSVC OR NOT NOFORTRAN) # Broken without fortran on unix add_subdirectory(utest) endif() @@ -230,14 +217,6 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES SOVERSION ${OpenBLAS_MAJOR_VERSION} ) -if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) - if (NOT MSVC) - target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition") - else() - target_link_libraries(${OpenBLAS_LIBNAME} "/FORCE:MULTIPLE") - endif() -endif() - if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "") if (NOT DEFINED ARCH) set(ARCH_IN "x86_64") @@ -335,7 +314,7 @@ install (FILES ${OPENBLAS_CONFIG_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) if(NOT NOFORTRAN) message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}") - set(F77BLAS_H ${CMAKE_BINARY_DIR}/generated/f77blas.h) + set(F77BLAS_H ${CMAKE_BINARY_DIR}/f77blas.h) file(WRITE ${F77BLAS_H} "#ifndef OPENBLAS_F77BLAS_H\n") file(APPEND ${F77BLAS_H} "#define OPENBLAS_F77BLAS_H\n") file(APPEND ${F77BLAS_H} "#include \"openblas_config.h\"\n") @@ -348,11 +327,10 @@ endif() if(NOT NO_CBLAS) message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") - set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") - file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") - install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + file(WRITE ${CMAKE_BINARY_DIR}/cblas.tmp "${CBLAS_H_CONTENTS_NEW}") + install (FILES ${CMAKE_BINARY_DIR}/cblas.tmp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} RENAME cblas.h) endif() if(NOT NO_LAPACKE) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 3859a9c19..08f8cc69d 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -167,7 +167,4 @@ In chronological order: * [2017-02-26] ztrmm kernel for IBM z13 * [2017-03-13] strmm and ctrmm kernel for IBM z13 * [2017-09-01] initial Blas Level-1,2 (double precision) for IBM z13 - * [2018-03-07] added missing Blas Level 1-2 (double precision) simd codes - * [2019-02-01] added missing Blas Level-1,2 (single precision) simd codes - * [2019-03-14] power9 dgemm/dtrmm kernel - * [2019-04-29] power9 sgemm/strmm kernel + diff --git a/Changelog.txt b/Changelog.txt index 8df35d5c3..49b26873a 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,82 +1,4 @@ OpenBLAS ChangeLog -==================================================================== -Version 0.3.6 -29-Apr-2019 - -common: - * the build tools now check that a given cpu TARGET is actually valid - * the build-time check of system features (c_check) has been made - less dependent on particular perl features (this should mainly - benefit building on Windows) - * several problem with the ReLAPACK integration were fixed, - including INTERFACE64 support and building a shared library - * building with CMAKE on BSD systems was improved - * a non-absolute SUM function was added based on the - existing optimized code for ASUM - * CBLAS interfaces to the IxMIN and IxMAX functions were added - * a name clash between LAPACKE and BOOST headers was resolved - * CMAKE builds with OpenMP failed to include the appropriate getrf_parallel - kernels - * a crash on thread (key) deletion with the USE_TLS=1 memory management - option was fixed - * restored several earlier fixes, in particular for OpenMP performance, - building on BSD, and calling fork on CYGWIN, which had inadvertently - been dropped in the 0.3.3 rewrite of the memory management code. - -x86_64: - * the AVX512 DGEMM kernel has been disabled again due to unsolved problems - * building with old versions of MSVC was fixed - * it is now possible to build a static library on Windows with CMAKE - * accessing environment variables on CYGWIN at run time was fixed - * the CMAKE build system now recognizes 32bit userspace on 64bit hardware - * Intel "Denverton" atom and Hygon "Dhyana" zen CPUs are now autodetected - * building for DYNAMIC_ARCH with a DYNAMIC_LIST of targets is now supported - with CMAKE as well - * building for DYNAMIC_ARCH with GENERIC as the default target is now supported - * a buffer overflow in the SSE GEMM kernel for Intel Nano targets was fixed - * assembly bugs involving undeclared modification of input operands were fixed - in the AXPY, DOT, GEMV, GER, SCAL, SYMV and TRSM microkernels for Nehalem, - Sandybridge, Haswell, Bulldozer and Piledriver. These would typically cause - test failures or segfaults when compiled with recent versions of gcc from 8 onward. - * a similar bug was fixed in the blas_quickdivide code used to split workloads - in most functions - * a bug in the IxMIN implementation for the GENERIC target made it return the result of IxMAX - * fixed building on SkylakeX systems when either the compiler or the (emulated) operating - environment does not support AVX512 - * improved GEMM performance on ZEN targets - -x86: - * build failures caused by the recently added checks for AVX512 were fixed - * an inline assembly bug involving undeclared modification of an input argument was - fixed in the blas_quickdivide code used to split workloads in most functions - * a bug in the IMIN implementation for the GENERIC target made it return the result of IMAX - -MIPS32: - * a bug in the IMIN implementation made it return the result of IMAX - -POWER: - * single precision BLAS1/2 functions have received optimized POWER8 kernels - * POWER9 is now a separate target, with an optimized DGEMM/DTRMM kernel - * building on PPC970 systems under OSX Leopard or Tiger is now supported - * out-of-bounds memory accesses in the gemm_beta microkernels were fixed - * building a shared library on AIX is now supported for POWER6 - * DYNAMIC_ARCH support has been added for POWER6 and newer - -ARMv7: - * corrected xDOT behaviour with zero INC_X or INC_Y - * a bug in the IMIN implementation made it return the result of IMAX - -ARMv8: - * added support for HiSilicon TSV110 cpus - * the CMAKE build system now recognizes 32bit userspace on 64bit hardware - * cross-compilation with CMAKE now works again - * a bug in the IMIN implementation made it return the result of IMAX - * ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7 - -IBM Z: - * optimized microkernels for single precicion BLAS1/2 functions have been added - for both Z13 and Z14 - ==================================================================== Version 0.3.5 31-Dec-2018 diff --git a/Makefile b/Makefile index 07b08439e..21096f893 100644 --- a/Makefile +++ b/Makefile @@ -34,7 +34,7 @@ endif LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) -SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test +SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench .PHONY : all libs netlib $(RELA) test ctest shared install .NOTPARALLEL : all libs $(RELA) prof lapack-test install blas-test @@ -96,7 +96,7 @@ endif @echo shared : -ifneq ($(NO_SHARED), 1) +ifndef NO_SHARED ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) @$(MAKE) -C exports so @ln -fs $(LIBSONAME) $(LIBPREFIX).so @@ -123,13 +123,10 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) touch $(LIBNAME) ifndef NO_FBLAS $(MAKE) -C test all -endif $(MAKE) -C utest all +endif ifndef NO_CBLAS $(MAKE) -C ctest all -ifeq ($(CPP_THREAD_SAFETY_TEST), 1) - $(MAKE) -C cpp_thread_test all -endif endif endif diff --git a/Makefile.arm64 b/Makefile.arm64 index 4d10ff684..cd16dbfae 100644 --- a/Makefile.arm64 +++ b/Makefile.arm64 @@ -38,8 +38,3 @@ ifeq ($(CORE), THUNDERX2T99) CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 endif - -ifeq ($(CORE), TSV110) -CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 -FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 -endif diff --git a/Makefile.install b/Makefile.install index fefecd98d..069c96c6a 100644 --- a/Makefile.install +++ b/Makefile.install @@ -58,14 +58,14 @@ ifndef NO_LAPACKE endif #for install static library -ifneq ($(NO_STATIC),1) +ifndef NO_STATIC @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) endif #for install shared library -ifneq ($(NO_SHARED),1) +ifndef NO_SHARED @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @@ -106,14 +106,14 @@ ifndef NO_LAPACKE endif #for install static library -ifneq ($(NO_STATIC),1) +ifndef NO_STATIC @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) endif #for install shared library -ifneq ($(NO_SHARED),1) +ifndef NO_SHARED @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) @installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ @@ -138,7 +138,7 @@ endif @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" -ifneq ($(NO_SHARED),1) +ifndef NO_SHARED #ifeq logical or ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" diff --git a/Makefile.power b/Makefile.power index 24d8aa8a7..a49372ad7 100644 --- a/Makefile.power +++ b/Makefile.power @@ -9,15 +9,7 @@ else USE_OPENMP = 1 endif -ifeq ($(CORE), POWER9) -ifeq ($(USE_OPENMP), 1) -COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp -FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp -else -COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math -FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math -endif -endif + ifeq ($(CORE), POWER8) ifeq ($(USE_OPENMP), 1) @@ -29,10 +21,6 @@ FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fas endif endif -# workaround for C->FORTRAN ABI violation in LAPACKE -ifeq ($(F_COMPILER), GFORTRAN) -FCOMMON_OPT += -fno-optimize-sibling-calls -endif FLAMEPATH = $(HOME)/flame/lib diff --git a/Makefile.rule b/Makefile.rule index a299588e0..91f42e396 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.7.dev +VERSION = 0.3.6.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library @@ -58,12 +58,6 @@ VERSION = 0.3.7.dev # For force setting for multi threaded, specify USE_THREAD = 1 # USE_THREAD = 0 -# If you want to build a single-threaded OpenBLAS, but expect to call this -# from several concurrent threads in some other program, comment this in for -# thread safety. (This is done automatically for USE_THREAD=1 , and should not -# be necessary when USE_OPENMP=1) -# USE_LOCKING = 1 - # If you're going to use this library with OpenMP, please comment it in. # This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8. # USE_OPENMP = 1 @@ -163,10 +157,6 @@ NO_AFFINITY = 1 # Don't use Haswell optimizations if binutils is too old (e.g. RHEL6) # NO_AVX2 = 1 -# Don't use SkylakeX optimizations if binutils or compiler are too old (the build -# system will try to determine this automatically) -# NO_AVX512 = 1 - # Don't use parallel make. # NO_PARALLEL_MAKE = 1 @@ -191,17 +181,17 @@ NO_AFFINITY = 1 # time out to improve performance. This number should be from 4 to 30 # which corresponds to (1 << n) cycles. For example, if you set to 26, # thread will be running for (1 << 26) cycles(about 25ms on 3.0GHz -# system). Also you can control this number by THREAD_TIMEOUT +# system). Also you can control this mumber by THREAD_TIMEOUT # CCOMMON_OPT += -DTHREAD_TIMEOUT=26 -# Using special device driver for mapping physically contiguous memory +# Using special device driver for mapping physically contigous memory # to the user space. If bigphysarea is enabled, it will use it. # DEVICEDRIVER_ALLOCATION = 1 # If you need to synchronize FP CSR between threads (for x86/x86_64 only). # CONSISTENT_FPCSR = 1 -# If any gemm argument m, n or k is less or equal this threshold, gemm will be execute +# If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute # with single thread. (Actually in recent versions this is a factor proportional to the # number of floating point operations necessary for the given problem size, no longer # an individual dimension). You can use this setting to avoid the overhead of multi- @@ -209,7 +199,7 @@ NO_AFFINITY = 1 # been reported to be optimal for certain workloads (50 is the recommended value for Julia). # GEMM_MULTITHREAD_THRESHOLD = 4 -# If you need sanity check by comparing results to reference BLAS. It'll be very +# If you need santy check by comparing reference BLAS. It'll be very # slow (Not implemented yet). # SANITY_CHECK = 1 @@ -249,21 +239,6 @@ COMMON_PROF = -pg # SYMBOLPREFIX= # SYMBOLSUFFIX= -# Run a C++ based thread safety tester after the build is done. -# This is mostly intended as a developer feature to spot regressions, but users and -# package maintainers can enable this if they have doubts about the thread safety of -# the library, given the configuration in this file. -# By default, the thread safety tester launches 52 concurrent calculations at the same -# time. -# -# Please note that the test uses ~1300 MiB of RAM for the DGEMM test. -# -# The test requires CBLAS to be built, a C++11 capable compiler and the presence of -# an OpenMP implementation. If you are cross-compiling this test will probably not -# work at all. -# -# CPP_THREAD_SAFETY_TEST = 1 - # # End of user configuration # diff --git a/Makefile.system b/Makefile.system index 16791bcc2..67c8cd197 100644 --- a/Makefile.system +++ b/Makefile.system @@ -9,11 +9,6 @@ ifndef TOPDIR TOPDIR = . endif -# If ARCH is not set, we use the host system's architecture. -ifndef ARCH -ARCH := $(shell uname -m) -endif - # Catch conflicting usage of ARCH in some BSD environments ifeq ($(ARCH), amd64) override ARCH=x86_64 @@ -142,12 +137,7 @@ endif endif -# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch. -ifeq ($(ARCH), x86_64) -ifneq ($(C_COMPILER), PGI) -GETARCH_FLAGS += -march=native -endif -endif + ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) @@ -165,8 +155,7 @@ GETARCH_FLAGS += -DNO_AVX endif ifeq ($(BINARY), 32) -GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512 -NO_AVX512 = 1 +GETARCH_FLAGS += -DNO_AVX endif ifeq ($(NO_AVX2), 1) @@ -247,10 +236,6 @@ SMP = 1 endif endif -ifeq ($(SMP), 1) -USE_LOCKING = -endif - ifndef NEED_PIC NEED_PIC = 1 endif @@ -402,12 +387,6 @@ ifneq ($(MAX_STACK_ALLOC), 0) CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC) endif -ifdef USE_LOCKING -ifneq ($(USE_LOCKING), 0) -CCOMMON_OPT += -DUSE_LOCKING -endif -endif - # # Architecture dependent settings # @@ -548,12 +527,6 @@ DYNAMIC_CORE += THUNDERX DYNAMIC_CORE += THUNDERX2T99 endif -ifeq ($(ARCH), power) -DYNAMIC_CORE = POWER6 -DYNAMIC_CORE += POWER8 -DYNAMIC_CORE += POWER9 -endif - # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty ifndef DYNAMIC_CORE override DYNAMIC_ARCH= @@ -764,8 +737,6 @@ CCOMMON_OPT += -DF_INTERFACE_GFORT FCOMMON_OPT += -Wall # make single-threaded LAPACK calls thread-safe #1847 FCOMMON_OPT += -frecursive -# work around ABI problem with passing single-character arguments -FCOMMON_OPT += -fno-optimize-sibling-calls #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc ifneq ($(NO_LAPACK), 1) EXTRALIB += -lgfortran @@ -1071,7 +1042,7 @@ ifdef USE_SIMPLE_THREADED_LEVEL3 CCOMMON_OPT += -DUSE_SIMPLE_THREADED_LEVEL3 endif -ifeq ($(USE_TLS), 1) +ifdef USE_TLS CCOMMON_OPT += -DUSE_TLS endif diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 99364752f..1b7fe3ef4 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -28,15 +28,11 @@ endif ifeq ($(CORE), HASWELL) ifndef DYNAMIC_ARCH ifndef NO_AVX2 -ifeq ($(C_COMPILER), GCC) CCOMMON_OPT += -mavx2 -endif -ifeq ($(F_COMPILER), GFORTRAN) FCOMMON_OPT += -mavx2 endif endif endif -endif diff --git a/Makefile.zarch b/Makefile.zarch index 47ea1eb71..9ec9dc79f 100644 --- a/Makefile.zarch +++ b/Makefile.zarch @@ -4,7 +4,3 @@ CCOMMON_OPT += -march=z13 -mzvector FCOMMON_OPT += -march=z13 -mzvector endif -ifeq ($(CORE), Z14) -CCOMMON_OPT += -march=z14 -mzvector -FCOMMON_OPT += -march=z14 -mzvector -endif diff --git a/README.md b/README.md index 14815ff00..26055c745 100644 --- a/README.md +++ b/README.md @@ -6,13 +6,11 @@ Travis CI: [![Build Status](https://travis-ci.org/xianyi/OpenBLAS.svg?branch=dev AppVeyor: [![Build status](https://ci.appveyor.com/api/projects/status/09sohd35n8nkkx64/branch/develop?svg=true)](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) -[![Build Status](https://dev.azure.com/xianyi/OpenBLAS/_apis/build/status/xianyi.OpenBLAS?branchName=develop)](https://dev.azure.com/xianyi/OpenBLAS/_build/latest?definitionId=1&branchName=develop) - ## Introduction OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. -Please read the documentation on the OpenBLAS wiki pages: . +Please read the documentation on the OpenBLAS wiki pages: . ## Binary Packages @@ -24,7 +22,7 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge ## Installation from Source -Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code +Download from project homepage, http://xianyi.github.com/OpenBLAS/, or check out the code using Git from https://github.com/xianyi/OpenBLAS.git. ### Dependencies @@ -65,7 +63,9 @@ A debug version can be built using `make DEBUG=1`. ### Compile with MASS support on Power CPU (optional) -The [IBM MASS](https://www.ibm.com/support/home/product/W511326D80541V01/other_software/mathematical_acceleration_subsystem) library consists of a set of mathematical functions for C, C++, and Fortran applications that are tuned for optimum performance on POWER architectures. +The [IBM MASS](http://www-01.ibm.com/software/awdtools/mass/linux/mass-linux.html) library +consists of a set of mathematical functions for C, C++, and Fortran applications that are +are tuned for optimum performance on POWER architectures. OpenBLAS with MASS requires a 64-bit, little-endian OS on POWER. The library can be installed as shown: @@ -115,7 +115,6 @@ Please read `GotoBLAS_01Readme.txt`. - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. -- **AMD ZEN**: Uses Haswell codes with some optimizations. #### MIPS64 @@ -134,13 +133,11 @@ Please read `GotoBLAS_01Readme.txt`. #### PPC/PPC64 -- **POWER8**: Optimized BLAS, only for PPC64LE (Little Endian), only with `USE_OPENMP=1` -- **POWER9**: Optimized Level-3 BLAS (real) and some Level-1,2. PPC64LE with OpenMP only. +- **POWER8**: Optmized Level-3 BLAS and some Level-1, only with `USE_OPENMP=1` #### IBM zEnterprise System - **Z13**: Optimized Level-3 BLAS and Level-1,2 (double precision) -- **Z14**: Optimized Level-3 BLAS and Level-1,2 (single precision) ### Supported OS diff --git a/TargetList.txt b/TargetList.txt index 6a57bf1af..3d04a57cf 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -48,7 +48,6 @@ POWER5 POWER6 POWER7 POWER8 -POWER9 PPCG4 PPC970 PPC970MP @@ -91,9 +90,7 @@ CORTEXA73 FALKOR THUNDERX THUNDERX2T99 -TSV110 9.System Z: ZARCH_GENERIC Z13 -Z14 diff --git a/appveyor.yml b/appveyor.yml index 2f9cc7b0b..141d3a130 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -35,14 +35,7 @@ environment: DYNAMIC_ARCH: ON WITH_FORTRAN: no - COMPILER: cl - - COMPILER: MinGW64-gcc-7.2.0-mingw - DYNAMIC_ARCH: OFF - WITH_FORTRAN: ignore - - COMPILER: MinGW64-gcc-7.2.0 - - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 - COMPILER: MinGW-gcc-5.3.0 - WITH_FORTRAN: ignore - + install: - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force @@ -59,17 +52,10 @@ install: before_build: - ps: if (-Not (Test-Path .\build)) { mkdir build } - cd build - - set PATH=%PATH:C:\Program Files\Git\usr\bin;=% - - if [%COMPILER%]==[MinGW-gcc-5.3.0] set PATH=C:\MinGW\bin;C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% - - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] set PATH=C:\MinGW\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% - - if [%COMPILER%]==[MinGW64-gcc-7.2.0] set PATH=C:\msys64\usr\bin;C:\mingw-w64\x86_64-7.2.0-posix-seh-rt_v5-rev1\mingw64\bin;%PATH% - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. - - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 .. - - if [%COMPILER%]==[MinGW64-gcc-7.2.0] cmake -G "MSYS Makefiles" -DBINARY=32 -DNOFORTRAN=1 .. - - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. - - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. + - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl .. - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. - - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. + - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON .. build_script: - cmake --build . @@ -78,4 +64,3 @@ test_script: - echo Running Test - cd utest - openblas_utest - diff --git a/azure-pipelines.yml b/azure-pipelines.yml deleted file mode 100644 index 9b4c85367..000000000 --- a/azure-pipelines.yml +++ /dev/null @@ -1,51 +0,0 @@ -trigger: - # start a new build for every push - batch: False - branches: - include: - - develop - -jobs: -# manylinux1 is useful to test because the -# standard Docker container uses an old version -# of gcc / glibc -- job: manylinux1_gcc - pool: - vmImage: 'ubuntu-16.04' - steps: - - script: | - echo "FROM quay.io/pypa/manylinux1_x86_64 - COPY . /tmp/openblas - RUN cd /tmp/openblas && \ - COMMON_FLAGS='DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32' && \ - BTYPE='BINARY=64' CC=gcc && \ - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE && \ - make -C test $COMMON_FLAGS $BTYPE && \ - make -C ctest $COMMON_FLAGS $BTYPE && \ - make -C utest $COMMON_FLAGS $BTYPE" > Dockerfile - docker build . - displayName: Run manylinux1 docker build -- job: Intel_SDE_skx - pool: - vmImage: 'ubuntu-16.04' - steps: - - script: | - # at the time of writing the available Azure Ubuntu vm image - # does not support AVX512VL, so use more recent LTS version - echo "FROM ubuntu:bionic - COPY . /tmp/openblas - RUN apt-get -y update && apt-get -y install \\ - cmake \\ - gfortran \\ - make \\ - wget - RUN mkdir /tmp/SDE && cd /tmp/SDE && \\ - mkdir sde-external-8.35.0-2019-03-11-lin && \\ - wget --quiet -O sde-external-8.35.0-2019-03-11-lin.tar.bz2 https://www.dropbox.com/s/fopsnzj67572sj5/sde-external-8.35.0-2019-03-11-lin.tar.bz2?dl=0 && \\ - tar -xjvf sde-external-8.35.0-2019-03-11-lin.tar.bz2 -C /tmp/SDE/sde-external-8.35.0-2019-03-11-lin --strip-components=1 - RUN cd /tmp/openblas && CC=gcc make QUIET_MAKE=1 DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64 - CMD cd /tmp/openblas && echo 0 > /proc/sys/kernel/yama/ptrace_scope && CC=gcc OPENBLAS_VERBOSE=2 /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/sde64 -cpuid_in /tmp/SDE/sde-external-8.35.0-2019-03-11-lin/misc/cpuid/skx/cpuid.def -- make -C utest DYNAMIC_ARCH=1 NUM_THREADS=32 BINARY=64" > Dockerfile - docker build -t intel_sde . - # we need a privileged docker run for sde process attachment - docker run --privileged intel_sde - displayName: 'Run AVX512 SkylakeX docker build / test' diff --git a/benchmark/gemm.c b/benchmark/gemm.c index dd016a7c3..85bcbc710 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -207,7 +207,7 @@ int main(int argc, char *argv[]){ for (i = 0; i < m * n * COMPSIZE; i++) { c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - + fprintf(stderr, " SIZE Flops Time\n"); for (i = from; i <= to; i += step) { diff --git a/benchmark/scripts/R/deig.R b/benchmark/scripts/R/deig.R index c6d541dcf..ece727fb3 100755 --- a/benchmark/scripts/R/deig.R +++ b/benchmark/scripts/R/deig.R @@ -2,8 +2,6 @@ argv <- commandArgs(trailingOnly = TRUE) -if (!is.null(options("matprod")[[1]])) options(matprod = "blas") - nfrom <- 128 nto <- 2048 nstep <- 128 @@ -21,6 +19,7 @@ if (length(argv) > 0) { loops <- as.numeric(argv[z]) } } + } p <- Sys.getenv("OPENBLAS_LOOPS") @@ -28,21 +27,29 @@ if (p != "") { loops <- as.numeric(p) } -cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) + +cat(sprintf( + "From %.0f To %.0f Step=%.0f Loops=%.0f\n", + nfrom, + nto, + nstep, + loops +)) cat(sprintf(" SIZE Flops Time\n")) n <- nfrom while (n <= nto) { - A <- matrix(rnorm(n * n), nrow = n) + A <- matrix(rnorm(n * n), ncol = n, nrow = n) ev <- 0 z <- system.time(for (l in 1:loops) { ev <- eigen(A) }) - mflops <- (26.66 * n * n * n) * loops / (z[3] * 1e+06) + mflops <- (26.66 * n * n * n) * loops / (z[3] * 1.0e6) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) n <- n + nstep + } diff --git a/benchmark/scripts/R/dgemm.R b/benchmark/scripts/R/dgemm.R index d7c3e8108..75297dfb8 100755 --- a/benchmark/scripts/R/dgemm.R +++ b/benchmark/scripts/R/dgemm.R @@ -2,8 +2,6 @@ argv <- commandArgs(trailingOnly = TRUE) -if (!is.null(options("matprod")[[1]])) options(matprod = "blas") - nfrom <- 128 nto <- 2048 nstep <- 128 @@ -21,6 +19,7 @@ if (length(argv) > 0) { loops <- as.numeric(argv[z]) } } + } p <- Sys.getenv("OPENBLAS_LOOPS") @@ -28,13 +27,26 @@ if (p != "") { loops <- as.numeric(p) } -cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) + +cat(sprintf( + "From %.0f To %.0f Step=%.0f Loops=%.0f\n", + nfrom, + nto, + nstep, + loops +)) cat(sprintf(" SIZE Flops Time\n")) n <- nfrom while (n <= nto) { - A <- matrix(runif(n * n), nrow = n) - B <- matrix(runif(n * n), nrow = n) + A <- matrix(runif(n * n), + ncol = n, + nrow = n, + byrow = TRUE) + B <- matrix(runif(n * n), + ncol = n, + nrow = n, + byrow = TRUE) C <- 1 z <- system.time(for (l in 1:loops) { @@ -42,10 +54,11 @@ while (n <= nto) { l <- l + 1 }) - mflops <- (2.0 * n * n * n) * loops / (z[3] * 1e+06) + mflops <- (2.0 * n * n * n) * loops / (z[3] * 1.0e6) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) n <- n + nstep + } diff --git a/benchmark/scripts/R/dsolve.R b/benchmark/scripts/R/dsolve.R index 46301570b..a3fb78da7 100755 --- a/benchmark/scripts/R/dsolve.R +++ b/benchmark/scripts/R/dsolve.R @@ -2,8 +2,6 @@ argv <- commandArgs(trailingOnly = TRUE) -if (!is.null(options("matprod")[[1]])) options(matprod = "blas") - nfrom <- 128 nto <- 2048 nstep <- 128 @@ -21,6 +19,7 @@ if (length(argv) > 0) { loops <- as.numeric(argv[z]) } } + } p <- Sys.getenv("OPENBLAS_LOOPS") @@ -28,22 +27,31 @@ if (p != "") { loops <- as.numeric(p) } -cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) + +cat(sprintf( + "From %.0f To %.0f Step=%.0f Loops=%.0f\n", + nfrom, + nto, + nstep, + loops +)) cat(sprintf(" SIZE Flops Time\n")) n <- nfrom while (n <= nto) { - A <- matrix(rnorm(n * n), nrow = n) - B <- matrix(rnorm(n * n), nrow = n) + A <- matrix(rnorm(n * n), ncol = n, nrow = n) + B <- matrix(rnorm(n * n), ncol = n, nrow = n) z <- system.time(for (l in 1:loops) { solve(A, B) }) - mflops <- (8.0 / 3 * n * n * n) * loops / (z[3] * 1e+06) + mflops <- + (2.0 / 3.0 * n * n * n + 2.0 * n * n * n) * loops / (z[3] * 1.0e6) st <- sprintf("%.0fx%.0f :", n, n) cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) n <- n + nstep + } diff --git a/c_check b/c_check index 271182c54..9dc237beb 100644 --- a/c_check +++ b/c_check @@ -1,7 +1,7 @@ #!/usr/bin/perl -#use File::Basename; -# use File::Temp qw(tempfile); +use File::Basename; +use File::Temp qw(tempfile); # Checking cross compile $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); @@ -12,7 +12,7 @@ $hostarch = "arm64" if ($hostarch eq "aarch64"); $hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/); $hostarch = "zarch" if ($hostarch eq "s390x"); -#$tmpf = new File::Temp( UNLINK => 1 ); +$tmpf = new File::Temp( UNLINK => 1 ); $binary = $ENV{"BINARY"}; $makefile = shift(@ARGV); @@ -31,25 +31,12 @@ if ($?) { $cross_suffix = ""; -eval "use File::Basename"; -if ($@){ - warn "could not load PERL module File::Basename, emulating its functionality"; - my $dirnam = substr($compiler_name, 0, rindex($compiler_name, "/")-1 ); - if ($dirnam ne ".") { - $cross_suffix .= $dirnam . "/"; - } - my $basnam = substr($compiler_name, rindex($compiler_name,"/")+1, length($compiler_name)-rindex($compiler_name,"/")-1); - if ($basnam =~ /([^\s]*-)(.*)/) { - $cross_suffix .= $1; - } -} else { - if (dirname($compiler_name) ne ".") { - $cross_suffix .= dirname($compiler_name) . "/"; - } +if (dirname($compiler_name) ne ".") { + $cross_suffix .= dirname($compiler_name) . "/"; +} - if (basename($compiler_name) =~ /([^\s]*-)(.*)/) { - $cross_suffix .= $1; - } +if (basename($compiler_name) =~ /([^\s]*-)(.*)/) { + $cross_suffix .= $1; } $compiler = ""; @@ -184,26 +171,20 @@ if ($?) { $have_msa = 0; if (($architecture eq "mips") || ($architecture eq "mips64")) { - eval "use File::Temp qw(tempfile)"; - if ($@){ - warn "could not load PERL module File::Temp, so could not check MSA capatibility"; + $code = '"addvi.b $w0, $w1, 1"'; + $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; + print $tmpf "#include \n\n"; + print $tmpf "void main(void){ __asm__ volatile($code); }\n"; + + $args = "$msa_flags -o $tmpf.o -x c $tmpf"; + my @cmd = ("$compiler_name $args"); + system(@cmd) == 0; + if ($? != 0) { + $have_msa = 0; } else { - $tmpf = new File::Temp( UNLINK => 1 ); - $code = '"addvi.b $w0, $w1, 1"'; - $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; - print $tmpf "#include \n\n"; - print $tmpf "void main(void){ __asm__ volatile($code); }\n"; - - $args = "$msa_flags -o $tmpf.o -x c $tmpf"; - my @cmd = ("$compiler_name $args"); - system(@cmd) == 0; - if ($? != 0) { - $have_msa = 0; - } else { - $have_msa = 1; - } - unlink("$tmpf.o"); + $have_msa = 1; } + unlink("$tmpf.o"); } $architecture = x86 if ($data =~ /ARCH_X86/); @@ -223,25 +204,17 @@ $binformat = bin64 if ($data =~ /BINARY_64/); $no_avx512= 0; if (($architecture eq "x86") || ($architecture eq "x86_64")) { - eval "use File::Temp qw(tempfile)"; - if ($@){ - warn "could not load PERL module File::Temp, so could not check compiler compatibility with AVX512"; - $no_avx512 = 0; + $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; + print $tmpf "#include \n\nint main(void){ __asm__ volatile($code); }\n"; + $args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf"; + my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); + system(@cmd) == 0; + if ($? != 0) { + $no_avx512 = 1; } else { -# $tmpf = new File::Temp( UNLINK => 1 ); - ($fh,$tmpf) = tempfile( UNLINK => 1 ); - $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; - print $tmpf "#include \n\nint main(void){ __asm__ volatile($code); }\n"; - $args = " -march=skylake-avx512 -c -o $tmpf.o -x c $tmpf"; - my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); - system(@cmd) == 0; - if ($? != 0) { - $no_avx512 = 1; - } else { - $no_avx512 = 0; - } - unlink("$tmpf.o"); + $no_avx512 = 0; } + unlink("tmpf.o"); } $data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; diff --git a/cblas.h b/cblas.h index 1a87074d6..d340a2037 100644 --- a/cblas.h +++ b/cblas.h @@ -73,11 +73,6 @@ double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); -float cblas_ssum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); -double cblas_dsum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); -float cblas_scsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); -double cblas_dzsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); - float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX); double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX); float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX); @@ -93,16 +88,6 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); -CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); -CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); -CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); -CBLAS_INDEX cblas_izmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); - -CBLAS_INDEX cblas_ismin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); -CBLAS_INDEX cblas_idmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); -CBLAS_INDEX cblas_icmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); -CBLAS_INDEX cblas_izmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); - void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 5a7434551..63fb86fa2 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -73,16 +73,11 @@ if (DYNAMIC_ARCH) endif () if (NOT NO_AVX512) set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) - string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS ${CMAKE_C_FLAGS}) - endif () - if (DYNAMIC_LIST) - set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST}) endif () endif () if (NOT DYNAMIC_CORE) - message (STATUS "DYNAMIC_ARCH is not supported on this architecture, removing from options") - unset(DYNAMIC_ARCH CACHE) + unset(DYNAMIC_ARCH) endif () endif () diff --git a/cmake/fc.cmake b/cmake/fc.cmake index f54c989d4..adec28a91 100644 --- a/cmake/fc.cmake +++ b/cmake/fc.cmake @@ -44,10 +44,7 @@ endif () if (${F_COMPILER} STREQUAL "GFORTRAN") set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_GFORT") - # ensure reentrancy of lapack codes set(FCOMMON_OPT "${FCOMMON_OPT} -Wall -frecursive") - # work around ABI violation in passing string arguments from C - set(FCOMMON_OPT "${FCOMMON_OPT} -fno-optimize-sibling-calls") #Don't include -lgfortran, when NO_LAPACK=1 or lsbcc if (NOT NO_LAPACK) set(EXTRALIB "{EXTRALIB} -lgfortran") diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 9b238f004..fad84de51 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -1,7 +1,7 @@ # helper functions for the kernel CMakeLists.txt -# Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file. +# Set the default filenames for L1 objects. Most of these will be overriden by the appropriate KERNEL file. macro(SetDefaultL1) set(SAMAXKERNEL amax.S) set(DAMAXKERNEL amax.S) @@ -107,12 +107,6 @@ macro(SetDefaultL1) set(DAXPBYKERNEL ../arm/axpby.c) set(CAXPBYKERNEL ../arm/zaxpby.c) set(ZAXPBYKERNEL ../arm/zaxpby.c) - set(SSUMKERNEL sum.S) - set(DSUMKERNEL sum.S) - set(CSUMKERNEL zsum.S) - set(ZSUMKERNEL zsum.S) - set(QSUMKERNEL sum.S) - set(XSUMKERNEL zsum.S) endmacro () macro(SetDefaultL2) @@ -168,4 +162,4 @@ macro(SetDefaultL3) set(DGEADD_KERNEL ../generic/geadd.c) set(CGEADD_KERNEL ../generic/zgeadd.c) set(ZGEADD_KERNEL ../generic/zgeadd.c) -endmacro () +endmacro () \ No newline at end of file diff --git a/cmake/os.cmake b/cmake/os.cmake index 2d25e7aaa..1321ef619 100644 --- a/cmake/os.cmake +++ b/cmake/os.cmake @@ -8,11 +8,6 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") set(NO_EXPRECISION 1) endif () -if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly") - set(EXTRALIB "${EXTRALIB} -lm") - set(NO_EXPRECISION 1) -endif () - if (${CMAKE_SYSTEM_NAME} STREQUAL "AIX") set(EXTRALIB "${EXTRALIB} -lm") endif () diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index e508a46c2..a67c44bf5 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -59,9 +59,6 @@ set(FU "") if (APPLE OR (MSVC AND NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")) set(FU "_") endif() -if(MINGW AND NOT MINGW64) - set(FU "_") -endif() set(COMPILER_ID ${CMAKE_C_COMPILER_ID}) if (${COMPILER_ID} STREQUAL "GNU") @@ -85,11 +82,6 @@ endif () # f_check if (NOT NOFORTRAN) include("${PROJECT_SOURCE_DIR}/cmake/f_check.cmake") -else () - file(APPEND ${TARGET_CONF_TEMP} - "#define BUNDERSCORE _\n" - "#define NEEDBUNDERSCORE 1\n") - set(BU "_") endif () # Cannot run getarch on target if we are cross-compiling diff --git a/cmake/system.cmake b/cmake/system.cmake index 1c2093efe..4cee7bd18 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -65,18 +65,6 @@ if (DEFINED TARGET) set(GETARCH_FLAGS "-DFORCE_${TARGET}") endif () -# On x86_64 build getarch with march=native. This is required to detect AVX512 support in getarch. -if (X86_64) - set(GETARCH_FLAGS "${GETARCH_FLAGS} -march=native") -endif () - -# On x86 no AVX support is available -if (X86 OR X86_64) -if ((DEFINED BINARY AND BINARY EQUAL 32) OR ("$CMAKE_SIZEOF_VOID_P}" EQUAL "4")) - set(GETARCH_FLAGS "${GETARCH_FLAGS} -DNO_AVX -DNO_AVX2 -DNO_AVX512") -endif () -endif () - if (INTERFACE64) message(STATUS "Using 64-bit integers.") set(GETARCH_FLAGS "${GETARCH_FLAGS} -DUSE64BITINT") @@ -148,16 +136,10 @@ endif () if (USE_THREAD) message(STATUS "Multi-threading enabled with ${NUM_THREADS} threads.") -else() - if (${USE_LOCKING}) - set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_LOCKING") - endif () endif () include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") -if (DEFINED BINARY) - message(STATUS "Compiling a ${BINARY}-bit binary.") -endif () + if (NOT DEFINED NEED_PIC) set(NEED_PIC 1) endif () @@ -174,9 +156,6 @@ include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake") if (NOT NOFORTRAN) # Fortran Compiler dependent settings include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake") -else () -set(NO_LAPACK 1) -set(NO_LAPACKE 1) endif () if (BINARY64) @@ -202,24 +181,12 @@ if (NEED_PIC) endif () if (DYNAMIC_ARCH) - if (X86 OR X86_64 OR ARM64 OR PPC) - set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") - if (DYNAMIC_OLDER) - set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") - endif () - else () - unset (DYNAMIC_ARCH) - message (STATUS "DYNAMIC_ARCH is not supported on the target architecture, removing") + set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") + if (DYNAMIC_OLDER) + set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") endif () endif () -if (DYNAMIC_LIST) - set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_LIST") - foreach(DCORE ${DYNAMIC_LIST}) - set(CCOMMON_OPT "${CCOMMON_OPT} -DDYN_${DCORE}") - endforeach () -endif () - if (NO_LAPACK) set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACK") #Disable LAPACK C interface @@ -309,7 +276,7 @@ endif () set(KERNELDIR "${PROJECT_SOURCE_DIR}/kernel/${ARCH}") -# TODO: need to convert these Makefiles +# TODO: nead to convert these Makefiles # include ${PROJECT_SOURCE_DIR}/cmake/${ARCH}.cmake if (${CORE} STREQUAL "PPC440") diff --git a/cmake/system_check.cmake b/cmake/system_check.cmake index 94d3ba643..6b602c1b0 100644 --- a/cmake/system_check.cmake +++ b/cmake/system_check.cmake @@ -39,21 +39,13 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") set(MIPS64 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") - if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") - set(X86_64 1) - else() - set(X86 1) - endif() + set(X86_64 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") set(X86 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") set(ARM 1) elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)") - if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") - set(ARM64 1) - else() - set(ARM 1) - endif() + set(ARM64 1) endif() if (X86_64) @@ -86,7 +78,7 @@ endif() if (X86_64 OR X86) file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include \n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") -execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -c -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) +execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) if (NO_AVX512 EQUAL 1) set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") endif() diff --git a/cmake/utils.cmake b/cmake/utils.cmake index fd93f8a70..28ef65f47 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -89,7 +89,7 @@ function(AllCombinations list_in absent_codes_in) set(CODES_OUT ${CODES_OUT} PARENT_SCOPE) endfunction () -# generates object files for each of the sources, using the BLAS naming scheme to pass the function name as a preprocessor definition +# generates object files for each of the sources, using the BLAS naming scheme to pass the funciton name as a preprocessor definition # @param sources_in the source files to build from # @param defines_in (optional) preprocessor definitions that will be applied to all objects # @param name_in (optional) if this is set this name will be used instead of the filename. Use a * to indicate where the float character should go, if no star the character will be prepended. diff --git a/common.h b/common.h index a9fe8d911..7fcd5e316 100644 --- a/common.h +++ b/common.h @@ -85,8 +85,6 @@ extern "C" { #if !defined(_MSC_VER) #include -#elif _MSC_VER < 1900 -#define snprintf _snprintf #endif #include @@ -131,7 +129,7 @@ extern "C" { #include #include #include -#if defined(SMP) || defined(USE_LOCKING) +#ifdef SMP #include #endif #endif @@ -200,7 +198,7 @@ extern "C" { #error "You can't specify both LOCK operation!" #endif -#if defined(SMP) || defined(USE_LOCKING) +#ifdef SMP #define USE_PTHREAD_LOCK #undef USE_PTHREAD_SPINLOCK #endif @@ -350,11 +348,6 @@ typedef int blasint; #endif #endif -#ifdef POWER9 -#ifndef YIELDING -#define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); -#endif -#endif /* #ifdef PILEDRIVER @@ -446,7 +439,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246 typedef char env_var_t[MAX_PATH]; #define readenv(p, n) 0 #else -#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) +#ifdef OS_WINDOWS typedef char env_var_t[MAX_PATH]; #define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p)) #else diff --git a/common_c.h b/common_c.h index 40ecf5b8b..ce0f2a5bd 100644 --- a/common_c.h +++ b/common_c.h @@ -19,7 +19,6 @@ #define CDOTC_K cdotc_k #define CNRM2_K cnrm2_k #define CSCAL_K cscal_k -#define CSUM_K csum_k #define CSWAP_K cswap_k #define CROT_K csrot_k @@ -250,7 +249,6 @@ #define CDOTC_K gotoblas -> cdotc_k #define CNRM2_K gotoblas -> cnrm2_k #define CSCAL_K gotoblas -> cscal_k -#define CSUM_K gotoblas -> csum_k #define CSWAP_K gotoblas -> cswap_k #define CROT_K gotoblas -> csrot_k diff --git a/common_d.h b/common_d.h index 94dc3eea8..ad9945186 100644 --- a/common_d.h +++ b/common_d.h @@ -19,7 +19,6 @@ #define DDOTC_K ddot_k #define DNRM2_K dnrm2_k #define DSCAL_K dscal_k -#define DSUM_K dsum_k #define DSWAP_K dswap_k #define DROT_K drot_k @@ -175,7 +174,6 @@ #define DDOTC_K gotoblas -> ddot_k #define DNRM2_K gotoblas -> dnrm2_k #define DSCAL_K gotoblas -> dscal_k -#define DSUM_K gotoblas -> dsum_k #define DSWAP_K gotoblas -> dswap_k #define DROT_K gotoblas -> drot_k diff --git a/common_interface.h b/common_interface.h index c350ac8ec..15f69e02f 100644 --- a/common_interface.h +++ b/common_interface.h @@ -122,13 +122,6 @@ xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *); double BLASFUNC(dzasum)(blasint *, double *, blasint *); xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *); -FLOATRET BLASFUNC(ssum) (blasint *, float *, blasint *); -FLOATRET BLASFUNC(scsum)(blasint *, float *, blasint *); -double BLASFUNC(dsum) (blasint *, double *, blasint *); -xdouble BLASFUNC(qsum) (blasint *, xdouble *, blasint *); -double BLASFUNC(dzsum)(blasint *, double *, blasint *); -xdouble BLASFUNC(qxsum)(blasint *, xdouble *, blasint *); - blasint BLASFUNC(isamax)(blasint *, float *, blasint *); blasint BLASFUNC(idamax)(blasint *, double *, blasint *); blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *); diff --git a/common_level1.h b/common_level1.h index 74cafb6db..32ffd6f18 100644 --- a/common_level1.h +++ b/common_level1.h @@ -100,13 +100,6 @@ float casum_k (BLASLONG, float *, BLASLONG); double zasum_k (BLASLONG, double *, BLASLONG); xdouble xasum_k (BLASLONG, xdouble *, BLASLONG); -float ssum_k (BLASLONG, float *, BLASLONG); -double dsum_k (BLASLONG, double *, BLASLONG); -xdouble qsum_k (BLASLONG, xdouble *, BLASLONG); -float csum_k (BLASLONG, float *, BLASLONG); -double zsum_k (BLASLONG, double *, BLASLONG); -xdouble xsum_k (BLASLONG, xdouble *, BLASLONG); - float samax_k (BLASLONG, float *, BLASLONG); double damax_k (BLASLONG, double *, BLASLONG); xdouble qamax_k (BLASLONG, xdouble *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index d2503aa65..15ba6f9db 100644 --- a/common_macro.h +++ b/common_macro.h @@ -66,7 +66,6 @@ #define DOTC_K QDOTC_K #define NRM2_K QNRM2_K #define SCAL_K QSCAL_K -#define SUM_K QSUM_K #define SWAP_K QSWAP_K #define ROT_K QROT_K @@ -357,7 +356,6 @@ #define DOTC_K DDOTC_K #define NRM2_K DNRM2_K #define SCAL_K DSCAL_K -#define SUM_K DSUM_K #define SWAP_K DSWAP_K #define ROT_K DROT_K @@ -660,7 +658,6 @@ #define DOTC_K SDOTC_K #define NRM2_K SNRM2_K #define SCAL_K SSCAL_K -#define SUM_K SSUM_K #define SWAP_K SSWAP_K #define ROT_K SROT_K @@ -965,7 +962,6 @@ #define DOTC_K XDOTC_K #define NRM2_K XNRM2_K #define SCAL_K XSCAL_K -#define SUM_K XSUM_K #define SWAP_K XSWAP_K #define ROT_K XROT_K @@ -1367,7 +1363,6 @@ #define DOTC_K ZDOTC_K #define NRM2_K ZNRM2_K #define SCAL_K ZSCAL_K -#define SUM_K ZSUM_K #define SWAP_K ZSWAP_K #define ROT_K ZROT_K @@ -1790,7 +1785,6 @@ #define DOTC_K CDOTC_K #define NRM2_K CNRM2_K #define SCAL_K CSCAL_K -#define SUM_K CSUM_K #define SWAP_K CSWAP_K #define ROT_K CROT_K diff --git a/common_param.h b/common_param.h index 574d5e176..8f162c01f 100644 --- a/common_param.h +++ b/common_param.h @@ -63,7 +63,6 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); float (*snrm2_k) (BLASLONG, float *, BLASLONG); float (*sasum_k) (BLASLONG, float *, BLASLONG); - float (*ssum_k) (BLASLONG, float *, BLASLONG); int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -155,7 +154,6 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); double (*dnrm2_k) (BLASLONG, double *, BLASLONG); double (*dasum_k) (BLASLONG, double *, BLASLONG); - double (*dsum_k) (BLASLONG, double *, BLASLONG); int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); @@ -247,7 +245,6 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*qsum_k) (BLASLONG, xdouble *, BLASLONG); int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); @@ -335,7 +332,6 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); float (*cnrm2_k) (BLASLONG, float *, BLASLONG); float (*casum_k) (BLASLONG, float *, BLASLONG); - float (*csum_k) (BLASLONG, float *, BLASLONG); int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -499,7 +495,6 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); double (*znrm2_k) (BLASLONG, double *, BLASLONG); double (*zasum_k) (BLASLONG, double *, BLASLONG); - double (*zsum_k) (BLASLONG, double *, BLASLONG); int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); @@ -665,7 +660,6 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG); xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG); - xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG); int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); diff --git a/common_power.h b/common_power.h index 5e15b7554..e3a1a7aef 100644 --- a/common_power.h +++ b/common_power.h @@ -39,7 +39,7 @@ #ifndef COMMON_POWER #define COMMON_POWER -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) #define MB __asm__ __volatile__ ("eieio":::"memory") #define WMB __asm__ __volatile__ ("eieio":::"memory") #else @@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define HAVE_PREFETCH #endif -#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && ( defined(OS_DARWIN) || defined(OS_FREEBSD) ) ) +#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) #define DCBT_ARG 0 #else #define DCBT_ARG 8 @@ -263,7 +263,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #define L1_PREFETCH dcbtst #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) #define L1_DUALFETCH #define L1_PREFETCHSIZE (16 + 128 * 100) #define L1_PREFETCH dcbtst @@ -499,7 +499,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ #if defined(ASSEMBLER) && !defined(NEEDPARAM) -#if defined(OS_LINUX) || defined(OS_FREEBSD) +#ifdef OS_LINUX #ifndef __64BIT__ #define PROLOGUE \ .section .text;\ @@ -598,14 +598,9 @@ REALNAME:;\ #ifndef __64BIT__ #define PROLOGUE \ .machine "any";\ - .toc;\ .globl .REALNAME;\ - .globl REALNAME;\ - .csect REALNAME[DS],3;\ -REALNAME:;\ - .long .REALNAME, TOC[tc0], 0;\ .csect .text[PR],5;\ -.REALNAME: +.REALNAME:; #define EPILOGUE \ _section_.text:;\ @@ -616,14 +611,9 @@ _section_.text:;\ #define PROLOGUE \ .machine "any";\ - .toc;\ .globl .REALNAME;\ - .globl REALNAME;\ - .csect REALNAME[DS],3;\ -REALNAME:;\ - .llong .REALNAME, TOC[tc0], 0;\ .csect .text[PR], 5;\ -.REALNAME: +.REALNAME:; #define EPILOGUE \ _section_.text:;\ @@ -784,7 +774,7 @@ Lmcount$lazy_ptr: #define HALT mfspr r0, 1023 -#if defined(OS_LINUX) || defined(OS_FREEBSD) +#ifdef OS_LINUX #if defined(PPC440) || defined(PPC440FP2) #undef MAX_CPU_NUMBER #define MAX_CPU_NUMBER 1 @@ -812,7 +802,7 @@ Lmcount$lazy_ptr: #define BUFFER_SIZE ( 2 << 20) #elif defined(PPC440FP2) #define BUFFER_SIZE ( 16 << 20) -#elif defined(POWER8) || defined(POWER9) +#elif defined(POWER8) #define BUFFER_SIZE ( 64 << 20) #else #define BUFFER_SIZE ( 16 << 20) @@ -829,7 +819,7 @@ Lmcount$lazy_ptr: #define MAP_ANONYMOUS MAP_ANON #endif -#if defined(OS_LINUX) || defined(OS_FREEBSD) +#ifdef OS_LINUX #ifndef __64BIT__ #define FRAMESLOT(X) (((X) * 4) + 8) #else diff --git a/common_q.h b/common_q.h index b4ace3a62..30ad3727a 100644 --- a/common_q.h +++ b/common_q.h @@ -19,7 +19,6 @@ #define QDOTC_K qdot_k #define QNRM2_K qnrm2_k #define QSCAL_K qscal_k -#define QSUM_K qsum_k #define QSWAP_K qswap_k #define QROT_K qrot_k @@ -162,7 +161,6 @@ #define QDOTC_K gotoblas -> qdot_k #define QNRM2_K gotoblas -> qnrm2_k #define QSCAL_K gotoblas -> qscal_k -#define QSUM_K gotoblas -> qsum_k #define QSWAP_K gotoblas -> qswap_k #define QROT_K gotoblas -> qrot_k diff --git a/common_s.h b/common_s.h index 23c432f7c..3c1600859 100644 --- a/common_s.h +++ b/common_s.h @@ -12,7 +12,6 @@ #define ISMAX_K ismax_k #define ISMIN_K ismin_k #define SASUM_K sasum_k -#define SSUM_K ssum_k #define SAXPYU_K saxpy_k #define SAXPYC_K saxpy_k #define SCOPY_K scopy_k @@ -171,7 +170,6 @@ #define ISMAX_K gotoblas -> ismax_k #define ISMIN_K gotoblas -> ismin_k #define SASUM_K gotoblas -> sasum_k -#define SSUM_K gotoblas -> ssum_k #define SAXPYU_K gotoblas -> saxpy_k #define SAXPYC_K gotoblas -> saxpy_k #define SCOPY_K gotoblas -> scopy_k diff --git a/common_stackalloc.h b/common_stackalloc.h index d3d54669c..ec0fa1611 100644 --- a/common_stackalloc.h +++ b/common_stackalloc.h @@ -45,7 +45,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * SIZE must be carefully chosen to be: * - as small as possible to maximize the number of stack allocation * - large enough to support all architectures and kernel - * Choosing a SIZE too small will lead to a stack smashing. + * Chosing a too small SIZE will lead to a stack smashing. */ #define STACK_ALLOC(SIZE, TYPE, BUFFER) \ /* make it volatile because some function (ex: dgemv_n.S) */ \ diff --git a/common_x.h b/common_x.h index 2ed525faa..03b98db4f 100644 --- a/common_x.h +++ b/common_x.h @@ -19,7 +19,6 @@ #define XDOTC_K xdotc_k #define XNRM2_K xnrm2_k #define XSCAL_K xscal_k -#define XSUM_K xsum_k #define XSWAP_K xswap_k #define XROT_K xqrot_k @@ -228,7 +227,6 @@ #define XDOTC_K gotoblas -> xdotc_k #define XNRM2_K gotoblas -> xnrm2_k #define XSCAL_K gotoblas -> xscal_k -#define XSUM_K gotoblas -> xsum_k #define XSWAP_K gotoblas -> xswap_k #define XROT_K gotoblas -> xqrot_k diff --git a/common_x86.h b/common_x86.h index 99adc9f5b..4f538c948 100644 --- a/common_x86.h +++ b/common_x86.h @@ -187,7 +187,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ y = blas_quick_divide_table[y]; - __asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x): "0" (y)); + __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); return result; #endif @@ -214,7 +214,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #endif #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) -//Enable some optimization for barcelona. +//Enable some optimazation for barcelona. #define BARCELONA_OPTIMIZATION #endif diff --git a/common_x86_64.h b/common_x86_64.h index c05998d58..f27c1e9be 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -129,13 +129,12 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ *ecx=cpuinfo[2]; *edx=cpuinfo[3]; #else - __asm__ __volatile__("mov $0, %%ecx;" - "cpuid" + __asm__ __volatile__("cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) - : "0" (op)); + : "0" (op), "c"(0)); #endif } @@ -211,7 +210,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ y = blas_quick_divide_table[y]; - __asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y)); + __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); return result; } @@ -277,7 +276,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #ifdef ASSEMBLER #if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) || defined(EXCAVATOR) -//Enable some optimization for barcelona. +//Enable some optimazation for barcelona. #define BARCELONA_OPTIMIZATION #endif diff --git a/common_z.h b/common_z.h index f1e78dd08..b4f58bb0c 100644 --- a/common_z.h +++ b/common_z.h @@ -19,7 +19,6 @@ #define ZDOTC_K zdotc_k #define ZNRM2_K znrm2_k #define ZSCAL_K zscal_k -#define ZSUM_K zsum_k #define ZSWAP_K zswap_k #define ZROT_K zdrot_k @@ -250,7 +249,6 @@ #define ZDOTC_K gotoblas -> zdotc_k #define ZNRM2_K gotoblas -> znrm2_k #define ZSCAL_K gotoblas -> zscal_k -#define ZSUM_K gotoblas -> zsum_k #define ZSWAP_K gotoblas -> zswap_k #define ZROT_K gotoblas -> zdrot_k diff --git a/cpp_thread_test/Makefile b/cpp_thread_test/Makefile deleted file mode 100644 index 81e3470ef..000000000 --- a/cpp_thread_test/Makefile +++ /dev/null @@ -1,14 +0,0 @@ -include ../Makefile.rule - -all :: dgemv_tester dgemm_tester - -dgemv_tester : - $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester - ./dgemv_tester - -dgemm_tester : dgemv_tester - $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester - ./dgemm_tester - -clean :: - rm -f dgemv_tester dgemm_tester diff --git a/cpp_thread_test/cpp_thread_safety_common.h b/cpp_thread_test/cpp_thread_safety_common.h deleted file mode 100644 index 60ab5bb2f..000000000 --- a/cpp_thread_test/cpp_thread_safety_common.h +++ /dev/null @@ -1,55 +0,0 @@ -inline void pauser(){ - /// a portable way to pause a program - std::string dummy; - std::cout << "Press enter to continue..."; - std::getline(std::cin, dummy); -} - -void FillMatrices(std::vector>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){ - for(uint32_t i=0; i(randomMatSize*randomMatSize); j++){ - matBlock[i][j] = rngdist(PRNG); - } - } - for(uint32_t i=numMat; i<(numConcurrentThreads*numMat); i+=numMat){ - for(uint32_t j=0; j>& vecBlock, std::mt19937_64& PRNG, std::uniform_real_distribution& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numVec){ - for(uint32_t i=0; i(randomMatSize); j++){ - vecBlock[i][j] = rngdist(PRNG); - } - } - for(uint32_t i=numVec; i<(numConcurrentThreads*numVec); i+=numVec){ - for(uint32_t j=0; j rngdist{-1.0, 1.0}; - //make sure the internal state of the PRNG is properly mixed by generating 10M random numbers - //PRNGs often have unreliable distribution uniformity and other statistical properties before their internal state is sufficiently mixed - for (uint32_t i=0;i<10000000;i++) rngdist(PRNG); - return PRNG; -} - -void PrintMatrices(const std::vector>& matBlock, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){ - for (uint32_t i=0;i(randomMatSize); j++){ - for (uint32_t k = 0; k < static_cast(randomMatSize); k++){ - std::cout< -#include -#include -#include -#include -#include "../cblas.h" -#include "cpp_thread_safety_common.h" - -void launch_cblas_dgemm(double* A, double* B, double* C, const blasint randomMatSize){ - cblas_dgemm(CblasColMajor, CblasNoTrans, CblasNoTrans, randomMatSize, randomMatSize, randomMatSize, 1.0, A, randomMatSize, B, randomMatSize, 0.1, C, randomMatSize); -} - -int main(int argc, char* argv[]){ - blasint randomMatSize = 1024; //dimension of the random square matrices used - uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested - uint32_t numTestRounds = 16; //number of testing rounds before success exit - - if (argc > 4){ - std::cout<<"ERROR: too many arguments for thread safety tester"< cliArgs; - for (int i = 1; i < argc; i++){ - cliArgs.push_back(argv[i]); - std::cout< rngdist{-1.0, 1.0}; - std::vector> matBlock(numConcurrentThreads*3); - std::vector> futureBlock(numConcurrentThreads); - - std::cout<<"*----------------------------*\n"; - std::cout<<"| DGEMM thread safety tester |\n"; - std::cout<<"*----------------------------*\n"; - std::cout<<"Size of random matrices(N=M=K): "<(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast(1024*1024)<<" MiB of RAM\n"<(randomMatSize*randomMatSize); j++){ - if (std::abs(matBlock[i+2][j] - matBlock[2][j]) > 1.0E-13){ //i+2 is the index of matrix C, for a given thread - std::cout<<"ERROR: one of the threads returned a different result! Index : "< -#include -#include -#include -#include -#include "../cblas.h" -#include "cpp_thread_safety_common.h" - -void launch_cblas_dgemv(double* A, double* x, double* y, const blasint randomMatSize){ - const blasint inc = 1; - cblas_dgemv(CblasColMajor, CblasNoTrans, randomMatSize, randomMatSize, 1.0, A, randomMatSize, x, inc, 0.1, y, inc); -} - -int main(int argc, char* argv[]){ - blasint randomMatSize = 1024; //dimension of the random square matrices and vectors being used - uint32_t numConcurrentThreads = 52; //number of concurrent calls of the functions being tested - uint32_t numTestRounds = 16; //number of testing rounds before success exit - - if (argc > 4){ - std::cout<<"ERROR: too many arguments for thread safety tester"< cliArgs; - for (int i = 1; i < argc; i++){ - cliArgs.push_back(argv[i]); - std::cout< rngdist{-1.0, 1.0}; - std::vector> matBlock(numConcurrentThreads); - std::vector> vecBlock(numConcurrentThreads*2); - std::vector> futureBlock(numConcurrentThreads); - - std::cout<<"*----------------------------*\n"; - std::cout<<"| DGEMV thread safety tester |\n"; - std::cout<<"*----------------------------*\n"; - std::cout<<"Size of random matrices and vectors(N=M): "<(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast(randomMatSize)*numConcurrentThreads*8*2))/static_cast(1024*1024)<<" MiB of RAM\n"<(randomMatSize); j++){ - if (std::abs(vecBlock[i+1][j] - vecBlock[1][j]) > 1.0E-13){ //i+1 is the index of vector y, for a given thread - std::cout<<"ERROR: one of the threads returned a different result! Index : "<> 16 ) { case 0x4e: // POWER9 - return CPUTYPE_POWER9; + return CPUTYPE_POWER8; break; case 0x4d: case 0x4b: // POWER8/8E diff --git a/cpuid_x86.c b/cpuid_x86.c index 884d4b78a..c45ddd968 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1359,8 +1359,6 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; case 12: // Apollo Lake - case 15: - // Denverton return CPUTYPE_NEHALEM; } break; @@ -1378,9 +1376,9 @@ int get_cpuname(void){ } break; case 9: - case 8: + case 8: switch (model) { - case 14: // Kaby Lake and refreshes + case 14: // Kaby Lake if(support_avx2()) return CPUTYPE_HASWELL; if(support_avx()) diff --git a/cpuid_zarch.c b/cpuid_zarch.c index 896ed94f5..e0d9221f3 100644 --- a/cpuid_zarch.c +++ b/cpuid_zarch.c @@ -27,9 +27,9 @@ #include -#define CPU_GENERIC 0 -#define CPU_Z13 1 -#define CPU_Z14 2 +#define CPU_GENERIC 0 +#define CPU_Z13 1 +#define CPU_Z14 2 static char *cpuname[] = { "ZARCH_GENERIC", @@ -64,8 +64,10 @@ int detect(void) if (strstr(p, "2964")) return CPU_Z13; if (strstr(p, "2965")) return CPU_Z13; - if (strstr(p, "3906")) return CPU_Z14; - if (strstr(p, "3907")) return CPU_Z14; + + /* detect z14, but fall back to z13 */ + if (strstr(p, "3906")) return CPU_Z13; + if (strstr(p, "3907")) return CPU_Z13; return CPU_GENERIC; } @@ -114,14 +116,7 @@ void get_cpuconfig(void) break; case CPU_Z14: printf("#define Z14\n"); - printf("#define L1_DATA_SIZE 131072\n"); - printf("#define L1_DATA_LINESIZE 256\n"); - printf("#define L1_DATA_ASSOCIATIVE 8\n"); - printf("#define L2_SIZE 4194304\n"); - printf("#define L2_LINESIZE 256\n"); - printf("#define L2_ASSOCIATIVE 8\n"); printf("#define DTB_DEFAULT_ENTRIES 64\n"); - printf("#define DTB_SIZE 4096\n"); break; } } diff --git a/ctest.c b/ctest.c index 5e869b901..0571e9e02 100644 --- a/ctest.c +++ b/ctest.c @@ -113,7 +113,7 @@ ARCH_X86 ARCH_X86_64 #endif -#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) || defined(__POWERPC__) +#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) ARCH_POWER #endif diff --git a/ctest/c_cblat1.f b/ctest/c_cblat1.f index 1a123d74d..c741ce506 100644 --- a/ctest/c_cblat1.f +++ b/ctest/c_cblat1.f @@ -577,7 +577,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/ctest/c_dblat1.f b/ctest/c_dblat1.f index 4a71b4dcf..c570a9140 100644 --- a/ctest/c_dblat1.f +++ b/ctest/c_dblat1.f @@ -653,7 +653,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/ctest/c_sblat1.f b/ctest/c_sblat1.f index 89902f12d..773787d6f 100644 --- a/ctest/c_sblat1.f +++ b/ctest/c_sblat1.f @@ -653,7 +653,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/ctest/c_zblat1.f b/ctest/c_zblat1.f index cd0c8541d..03753e782 100644 --- a/ctest/c_zblat1.f +++ b/ctest/c_zblat1.f @@ -577,7 +577,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/driver/level2/trmv_thread.c b/driver/level2/trmv_thread.c index 43eeb40d2..24b881a93 100644 --- a/driver/level2/trmv_thread.c +++ b/driver/level2/trmv_thread.c @@ -346,7 +346,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); - if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu; + if (range_n[num_cpu] > m) range_n[num_cpu] = m; queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; @@ -386,7 +386,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu range_m[num_cpu + 1] = range_m[num_cpu] + width; range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); - if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu; + if (range_n[num_cpu] > m) range_n[num_cpu] = m; queue[num_cpu].mode = mode; queue[num_cpu].routine = trmv_kernel; diff --git a/driver/others/Makefile b/driver/others/Makefile index d4b5c26d5..3dc2e7c1b 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -18,12 +18,8 @@ ifeq ($(DYNAMIC_ARCH), 1) ifeq ($(ARCH),arm64) COMMONOBJS += dynamic_arm64.$(SUFFIX) else -ifeq ($(ARCH),power) -COMMONOBJS += dynamic_power.$(SUFFIX) -else COMMONOBJS += dynamic.$(SUFFIX) endif -endif else COMMONOBJS += parameter.$(SUFFIX) endif @@ -82,12 +78,8 @@ ifeq ($(DYNAMIC_ARCH), 1) ifeq ($(ARCH),arm64) HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_arm64.$(SUFFIX) else -ifeq ($(ARCH),power) -HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_power.$(SUFFIX) -else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) endif -endif else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) endif diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 6f4e20610..e5db1804f 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -109,7 +109,7 @@ extern unsigned int openblas_thread_timeout(); /* equal to "OMP_NUM_THREADS - 1" and thread only wakes up when */ /* jobs is queued. */ -/* We need this global for checking if initialization is finished. */ +/* We need this grobal for cheking if initialization is finished. */ int blas_server_avail __attribute__((aligned(ATTRIBUTE_SIZE))) = 0; /* Local Variables */ @@ -150,8 +150,8 @@ static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT)); #ifdef MONITOR -/* Monitor is a function to see thread's status for every second. */ -/* Usually it turns off and it's for debugging. */ +/* Monitor is a function to see thread's status for every seconds. */ +/* Usually it turns off and it's for debugging. */ static pthread_t monitor_thread; static int main_status[MAX_CPU_NUMBER]; diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index bace54a23..bae344c59 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -50,7 +50,7 @@ /* This is a thread implementation for Win32 lazy implementation */ -/* Thread server common information */ +/* Thread server common infomation */ typedef struct{ CRITICAL_SECTION lock; HANDLE filled; @@ -61,7 +61,7 @@ typedef struct{ } blas_pool_t; -/* We need this global for checking if initialization is finished. */ +/* We need this global for cheking if initialization is finished. */ int blas_server_avail = 0; /* Local Variables */ @@ -461,18 +461,13 @@ int BLASFUNC(blas_thread_shutdown)(void){ SetEvent(pool.killed); for(i = 0; i < blas_num_threads - 1; i++){ - // Could also just use WaitForMultipleObjects WaitForSingleObject(blas_threads[i], 5); //INFINITE); #ifndef OS_WINDOWSSTORE // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP TerminateThread(blas_threads[i],0); #endif - CloseHandle(blas_threads[i]); } - CloseHandle(pool.filled); - CloseHandle(pool.killed); - blas_server_avail = 0; } diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 045fc65b8..99c9254ac 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -322,7 +322,7 @@ int support_avx2(){ } int support_avx512(){ -#if !defined(NO_AVX) && !defined(NO_AVX512) +#ifndef NO_AVX512 int eax, ebx, ecx, edx; int ret=0; @@ -566,8 +566,8 @@ static gotoblas_t *get_coretype(void){ return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } - //Apollo Lake or Denverton - if (model == 12 || model == 15) { + //Apollo Lake + if (model == 12) { return &gotoblas_NEHALEM; } return NULL; diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c deleted file mode 100644 index 0c4a87a5e..000000000 --- a/driver/others/dynamic_power.c +++ /dev/null @@ -1,102 +0,0 @@ - -#include "common.h" - -extern gotoblas_t gotoblas_POWER6; -extern gotoblas_t gotoblas_POWER8; -extern gotoblas_t gotoblas_POWER9; - -extern void openblas_warning(int verbose, const char *msg); - -static char *corename[] = { - "unknown", - "POWER6", - "POWER8", - "POWER9" -}; - -#define NUM_CORETYPES 4 - -char *gotoblas_corename(void) { - if (gotoblas == &gotoblas_POWER6) return corename[1]; - if (gotoblas == &gotoblas_POWER8) return corename[2]; - if (gotoblas == &gotoblas_POWER9) return corename[3]; - return corename[0]; -} - -static gotoblas_t *get_coretype(void) { - - if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x")) - return &gotoblas_POWER6; - if (__builtin_cpu_is("power8")) - return &gotoblas_POWER8; - if (__builtin_cpu_is("power9")) - return &gotoblas_POWER9; - return NULL; -} - -static gotoblas_t *force_coretype(char * coretype) { - - int i ; - int found = -1; - char message[128]; - - for ( i = 0 ; i < NUM_CORETYPES; i++) - { - if (!strncasecmp(coretype, corename[i], 20)) - { - found = i; - break; - } - } - - switch (found) - { - case 1: return (&gotoblas_POWER6); - case 2: return (&gotoblas_POWER8); - case 3: return (&gotoblas_POWER9); - default: return NULL; - } - snprintf(message, 128, "Core not found: %s\n", coretype); - openblas_warning(1, message); -} - -void gotoblas_dynamic_init(void) { - - char coremsg[128]; - char coren[22]; - char *p; - - - if (gotoblas) return; - - p = getenv("OPENBLAS_CORETYPE"); - if ( p ) - { - gotoblas = force_coretype(p); - } - else - { - gotoblas = get_coretype(); - } - - if (gotoblas == NULL) - { - snprintf(coremsg, 128, "Falling back to POWER8 core\n"); - openblas_warning(1, coremsg); - gotoblas = &gotoblas_POWER8; - } - - if (gotoblas && gotoblas -> init) { - strncpy(coren,gotoblas_corename(),20); - sprintf(coremsg, "Core: %s\n",coren); - openblas_warning(2, coremsg); - gotoblas -> init(); - } else { - openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); - exit(1); - } -} - -void gotoblas_dynamic_quit(void) { - gotoblas = NULL; -} diff --git a/driver/others/init.c b/driver/others/init.c index a29dce971..012ef6647 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -765,7 +765,7 @@ int gotoblas_set_affinity(int pos) { int mynode = 1; - /* if number of threads is larger than initial condition */ + /* if number of threads is larger than inital condition */ if (pos < 0) { sched_setaffinity(0, sizeof(cpu_orig_mask), &cpu_orig_mask[0]); return 0; @@ -857,14 +857,7 @@ void gotoblas_affinity_init(void) { common -> shmid = pshmid; if (common -> magic != SH_MAGIC) { - -#if defined(__GLIBC_PREREQ) -#if __GLIBC_PREREQ(2, 7) cpu_set_t *cpusetp; -#else - cpu_set_t cpuset; -#endif -#endif int nums; int ret; @@ -897,7 +890,7 @@ void gotoblas_affinity_init(void) { } CPU_FREE(cpusetp); #else - ret = sched_getaffinity(0,sizeof(cpu_set_t), &cpuset); + ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); if (ret!=0) { common->num_procs = nums; } else { @@ -905,11 +898,11 @@ void gotoblas_affinity_init(void) { int i; int n = 0; for (i=0;inum_procs = n; } #else - common->num_procs = CPU_COUNT(&cpuset); + common->num_procs = CPU_COUNT(sizeof(cpu_set_t),cpusetp); } #endif diff --git a/driver/others/memory.c b/driver/others/memory.c index f67cb01f4..72d3e173c 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -198,68 +198,45 @@ int get_num_procs(void); #else int get_num_procs(void) { static int nums = 0; - cpu_set_t cpuset,*cpusetp; - size_t size; - int ret; - -#if defined(__GLIBC_PREREQ) -#if !__GLIBC_PREREQ(2, 7) - int i; -#if !__GLIBC_PREREQ(2, 6) - int n; -#endif -#endif -#endif +cpu_set_t *cpusetp; +size_t size; +int ret; +int i,n; if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); #if !defined(OS_LINUX) - return nums; + return nums; #endif #if !defined(__GLIBC_PREREQ) - return nums; + return nums; #else #if !__GLIBC_PREREQ(2, 3) - return nums; + return nums; #endif #if !__GLIBC_PREREQ(2, 7) - ret = sched_getaffinity(0,sizeof(cpuset), &cpuset); + ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); if (ret!=0) return nums; n=0; #if !__GLIBC_PREREQ(2, 6) for (i=0;i= CPU_SETSIZE) { - cpusetp = CPU_ALLOC(nums); - if (cpusetp == NULL) { - return nums; - } - size = CPU_ALLOC_SIZE(nums); - ret = sched_getaffinity(0,size,cpusetp); - if (ret!=0) { - CPU_FREE(cpusetp); - return nums; - } - ret = CPU_COUNT_S(size,cpusetp); - if (ret > 0 && ret < nums) nums = ret; - CPU_FREE(cpusetp); - return nums; - } else { - ret = sched_getaffinity(0,sizeof(cpuset),&cpuset); - if (ret!=0) { - return nums; - } - ret = CPU_COUNT(&cpuset); - if (ret > 0 && ret < nums) nums = ret; - return nums; - } + cpusetp = CPU_ALLOC(nums); + if (cpusetp == NULL) return nums; + size = CPU_ALLOC_SIZE(nums); + ret = sched_getaffinity(0,size,cpusetp); + if (ret!=0) return nums; + ret = CPU_COUNT_S(size,cpusetp); + if (ret > 0 && ret < nums) nums = ret; + CPU_FREE(cpusetp); + return nums; #endif #endif } @@ -1313,13 +1290,6 @@ void blas_memory_free_nolock(void * map_address) { free(map_address); } -#ifdef SMP -void blas_thread_memory_cleanup(void) { - blas_memory_cleanup((void*)get_memory_table()); -} -#endif - - void blas_shutdown(void){ #ifdef SMP BLASFUNC(blas_thread_shutdown)(); @@ -1329,7 +1299,7 @@ void blas_shutdown(void){ /* Only cleanupIf we were built for threading and TLS was initialized */ if (local_storage_key) #endif - blas_thread_memory_cleanup(); + blas_memory_cleanup((void*)get_memory_table()); #ifdef SEEK_ADDRESS base_address = 0UL; @@ -1559,7 +1529,7 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser break; case DLL_THREAD_DETACH: #if defined(SMP) - blas_thread_memory_cleanup(); + blas_memory_cleanup((void*)get_memory_table()); #endif break; case DLL_PROCESS_DETACH: @@ -1622,7 +1592,6 @@ void gotoblas_dummy_for_PGI(void) { gotoblas_init(); gotoblas_quit(); -#if __PGIC__ < 19 #if 0 asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text"); asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text"); @@ -1630,16 +1599,13 @@ void gotoblas_dummy_for_PGI(void) { asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text"); asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text"); #endif -#endif } #endif #else -/* USE_TLS / COMPILE_TLS not set */ - #include -#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) +#ifdef OS_WINDOWS #define ALLOC_WINDOWS #ifndef MEM_LARGE_PAGES #define MEM_LARGE_PAGES 0x20000000 @@ -1653,7 +1619,7 @@ void gotoblas_dummy_for_PGI(void) { #include #include -#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT) +#ifndef OS_WINDOWS #include #ifndef NO_SYSV_IPC #include @@ -1673,7 +1639,7 @@ void gotoblas_dummy_for_PGI(void) { #include #endif -#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) +#if defined(OS_FREEBSD) || defined(OS_DARWIN) #include #include #endif @@ -1712,12 +1678,9 @@ void gotoblas_dummy_for_PGI(void) { #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) #define CONSTRUCTOR __attribute__ ((constructor)) #define DESTRUCTOR __attribute__ ((destructor)) -#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900)) +#else #define CONSTRUCTOR __attribute__ ((constructor(101))) #define DESTRUCTOR __attribute__ ((destructor(101))) -#else -#define CONSTRUCTOR __attribute__ ((constructor)) -#define DESTRUCTOR __attribute__ ((destructor)) #endif #ifdef DYNAMIC_ARCH @@ -1741,70 +1704,45 @@ void goto_set_num_threads(int num_threads) {}; int get_num_procs(void); #else int get_num_procs(void) { - static int nums = 0; - cpu_set_t cpuset,*cpusetp; - size_t size; - int ret; - -#if defined(__GLIBC_PREREQ) -#if !__GLIBC_PREREQ(2, 7) - int i; -#if !__GLIBC_PREREQ(2, 6) - int n; -#endif -#endif -#endif +cpu_set_t *cpusetp; +size_t size; +int ret; +int i,n; if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); #if !defined(OS_LINUX) - return nums; + return nums; #endif #if !defined(__GLIBC_PREREQ) - return nums; + return nums; #else #if !__GLIBC_PREREQ(2, 3) - return nums; + return nums; #endif #if !__GLIBC_PREREQ(2, 7) - ret = sched_getaffinity(0,sizeof(cpuset), &cpuset); + ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); if (ret!=0) return nums; n=0; #if !__GLIBC_PREREQ(2, 6) for (i=0;i= CPU_SETSIZE) { - cpusetp = CPU_ALLOC(nums); - if (cpusetp == NULL) { - return nums; - } - size = CPU_ALLOC_SIZE(nums); - ret = sched_getaffinity(0,size,cpusetp); - if (ret!=0) { - CPU_FREE(cpusetp); - return nums; - } - ret = CPU_COUNT_S(size,cpusetp); - if (ret > 0 && ret < nums) nums = ret; - CPU_FREE(cpusetp); - return nums; - } else { - ret = sched_getaffinity(0,sizeof(cpuset),&cpuset); - if (ret!=0) { - return nums; - } - ret = CPU_COUNT(&cpuset); - if (ret > 0 && ret < nums) nums = ret; - return nums; - } + cpusetp = CPU_ALLOC(nums); + if (cpusetp == NULL) return nums; + size = CPU_ALLOC_SIZE(nums); + ret = sched_getaffinity(0,size,cpusetp); + if (ret!=0) return nums; + nums = CPU_COUNT_S(size,cpusetp); + CPU_FREE(cpusetp); + return nums; #endif #endif } @@ -1818,7 +1756,7 @@ int get_num_procs(void) { return nums; } #endif - + #ifdef OS_HAIKU int get_num_procs(void) { static int nums = 0; @@ -1855,7 +1793,7 @@ int get_num_procs(void) { #endif -#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) +#if defined(OS_FREEBSD) int get_num_procs(void) { @@ -1932,7 +1870,7 @@ void openblas_fork_handler() // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035 // In the mean time build with USE_OPENMP=0 or link against another // implementation of OpenMP. -#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER) +#if !(defined(OS_WINDOWS) || defined(OS_ANDROID)) && defined(SMP_SERVER) int err; err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL); if(err != 0) @@ -1945,7 +1883,7 @@ extern int openblas_goto_num_threads_env(); extern int openblas_omp_num_threads_env(); int blas_get_cpu_number(void){ -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) int max_num; #endif int blas_goto_num = 0; @@ -1953,11 +1891,11 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) max_num = get_num_procs(); #endif - // blas_goto_num = 0; + blas_goto_num = 0; #ifndef USE_OPENMP blas_goto_num=openblas_num_threads_env(); if (blas_goto_num < 0) blas_goto_num = 0; @@ -1969,7 +1907,7 @@ int blas_get_cpu_number(void){ #endif - // blas_omp_num = 0; + blas_omp_num = 0; blas_omp_num=openblas_omp_num_threads_env(); if (blas_omp_num < 0) blas_omp_num = 0; @@ -1977,7 +1915,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif @@ -2064,15 +2002,11 @@ static void *alloc_mmap(void *address){ } if (map_address != (void *)-1) { -#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); -#endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; -#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); -#endif } #ifdef OS_LINUX @@ -2214,18 +2148,14 @@ static void *alloc_mmap(void *address){ #if defined(OS_LINUX) && !defined(NO_WARMUP) } #endif + LOCK_COMMAND(&alloc_lock); if (map_address != (void *)-1) { -#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) - LOCK_COMMAND(&alloc_lock); -#endif release_info[release_pos].address = map_address; release_info[release_pos].func = alloc_mmap_free; release_pos ++; -#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif } + UNLOCK_COMMAND(&alloc_lock); return map_address; } @@ -2593,7 +2523,7 @@ void *blas_memory_alloc(int procpos){ int position; #if defined(WHEREAMI) && !defined(USE_OPENMP) - int mypos = 0; + int mypos; #endif void *map_address; @@ -2624,11 +2554,6 @@ void *blas_memory_alloc(int procpos){ NULL, }; void *(**func)(void *address); - -#if defined(USE_OPENMP) - if (!memory_initialized) { -#endif - LOCK_COMMAND(&alloc_lock); if (!memory_initialized) { @@ -2664,9 +2589,6 @@ void *blas_memory_alloc(int procpos){ } UNLOCK_COMMAND(&alloc_lock); -#if defined(USE_OPENMP) - } -#endif #ifdef DEBUG printf("Alloc Start ...\n"); @@ -2681,17 +2603,13 @@ void *blas_memory_alloc(int procpos){ do { if (!memory[position].used && (memory[position].pos == mypos)) { -#if defined(SMP) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); -#else - blas_lock(&memory[position].lock); -#endif +// blas_lock(&memory[position].lock); + if (!memory[position].used) goto allocation; -#if defined(SMP) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); -#else - blas_unlock(&memory[position].lock); -#endif +// blas_unlock(&memory[position].lock); } position ++; @@ -2703,26 +2621,21 @@ void *blas_memory_alloc(int procpos){ position = 0; -#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); -#endif do { -#if defined(USE_OPENMP) - if (!memory[position].used) { - blas_lock(&memory[position].lock); -#endif +/* if (!memory[position].used) { */ +/* blas_lock(&memory[position].lock);*/ + if (!memory[position].used) goto allocation; -#if defined(USE_OPENMP) - blas_unlock(&memory[position].lock); - } -#endif +/* blas_unlock(&memory[position].lock);*/ +/* } */ + position ++; } while (position < NUM_BUFFERS); -#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) - UNLOCK_COMMAND(&alloc_lock); -#endif + UNLOCK_COMMAND(&alloc_lock); + goto error; allocation : @@ -2732,11 +2645,10 @@ void *blas_memory_alloc(int procpos){ #endif memory[position].used = 1; -#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) + UNLOCK_COMMAND(&alloc_lock); -#else - blas_unlock(&memory[position].lock); -#endif +/* blas_unlock(&memory[position].lock);*/ + if (!memory[position].addr) { do { #ifdef DEBUG @@ -2753,7 +2665,7 @@ void *blas_memory_alloc(int procpos){ #ifdef ALLOC_DEVICEDRIVER if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { - fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n"); + fprintf(stderr, "OpenBLAS Warning ... Physically contigous allocation was failed.\n"); } #endif @@ -2781,13 +2693,9 @@ void *blas_memory_alloc(int procpos){ } while ((BLASLONG)map_address == -1); -#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); -#endif memory[position].addr = map_address; -#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); -#endif #ifdef DEBUG printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); @@ -2841,9 +2749,8 @@ void blas_memory_free(void *free_area){ #endif position = 0; -#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) LOCK_COMMAND(&alloc_lock); -#endif + while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) position++; @@ -2857,9 +2764,7 @@ void blas_memory_free(void *free_area){ WMB; memory[position].used = 0; -#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); -#endif #ifdef DEBUG printf("Unmap Succeeded.\n\n"); @@ -2874,9 +2779,8 @@ void blas_memory_free(void *free_area){ for (position = 0; position < NUM_BUFFERS; position++) printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); #endif -#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) UNLOCK_COMMAND(&alloc_lock); -#endif + return; } @@ -2926,7 +2830,7 @@ void blas_shutdown(void){ #if defined(OS_LINUX) && !defined(NO_WARMUP) -#if defined(SMP) || defined(USE_LOCKING) +#ifdef SMP #if defined(USE_PTHREAD_LOCK) static pthread_mutex_t init_lock = PTHREAD_MUTEX_INITIALIZER; #elif defined(USE_PTHREAD_SPINLOCK) @@ -2951,7 +2855,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, if (hot_alloc != 2) { #endif -#if defined(SMP) || defined(USE_LOCKING) +#ifdef SMP LOCK_COMMAND(&init_lock); #endif @@ -2961,7 +2865,7 @@ static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n, size -= PAGESIZE; } -#if defined(SMP) || defined(USE_LOCKING) +#ifdef SMP UNLOCK_COMMAND(&init_lock); #endif @@ -3194,7 +3098,7 @@ void gotoblas_dummy_for_PGI(void) { gotoblas_init(); gotoblas_quit(); -#if __PGIC__ < 19 + #if 0 asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text"); asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text"); @@ -3202,7 +3106,6 @@ void gotoblas_dummy_for_PGI(void) { asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text"); asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text"); #endif -#endif } #endif diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c index 81648fb7c..eca494dca 100644 --- a/driver/others/openblas_get_config.c +++ b/driver/others/openblas_get_config.c @@ -35,6 +35,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#if defined(_WIN32) && defined(_MSC_VER) +#if _MSC_VER < 1900 +#define snprintf _snprintf +#endif +#endif + static char* openblas_config_str="" "OpenBLAS " VERSION diff --git a/exports/Makefile b/exports/Makefile index b1348bd4a..3a5f77db3 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -141,14 +141,6 @@ else $(OBJCOPY) --redefine-syms objcopy.def ../$(LIBNAME) ../$(LIBNAME).renamed ../$(LIBSONAME) : ../$(LIBNAME).renamed linktest.c endif - -ifeq ($(F_COMPILER), INTEL) - $(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ - -Wl,--whole-archive $< -Wl,--no-whole-archive \ - -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) - $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. -else - ifneq ($(C_COMPILER), LSB) $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive $< -Wl,--no-whole-archive \ @@ -160,7 +152,6 @@ else -Wl,--whole-archive $< -Wl,--no-whole-archive \ -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) $(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. -endif endif rm -f linktest diff --git a/exports/dllinit.c b/exports/dllinit.c index 4a05c0e14..02ff092e9 100644 --- a/exports/dllinit.c +++ b/exports/dllinit.c @@ -40,25 +40,15 @@ void gotoblas_init(void); void gotoblas_quit(void); -#if defined(SMP) && defined(USE_TLS) -void blas_thread_memory_cleanup(void); -#endif BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) { - switch(reason) { - case DLL_PROCESS_ATTACH: - gotoblas_init(); - break; - case DLL_PROCESS_DETACH: - gotoblas_quit(); - break; - case DLL_THREAD_ATTACH: - break; - case DLL_THREAD_DETACH: -#if defined(SMP) && defined(USE_TLS) - blas_thread_memory_cleanup(); -#endif - break; + + if (reason == DLL_PROCESS_ATTACH) { + gotoblas_init(); + } + + if (reason == DLL_PROCESS_DETACH) { + gotoblas_quit(); } return TRUE; diff --git a/f_check b/f_check index b05db85bd..34caa00be 100644 --- a/f_check +++ b/f_check @@ -125,7 +125,7 @@ if ($compiler eq "") { $openmp = "-openmp"; } - # for embedded underscore name, e.g. zho_ge, it may append 2 underscores. + # for embeded underscore name, e.g. zho_ge, it may append 2 underscores. $data = `$compiler -O2 -S ftest3.f > /dev/null 2>&1 && cat ftest3.s && rm -f ftest3.s`; if ($data =~ / zho_ge__/) { $need2bu = 1; diff --git a/getarch.c b/getarch.c index 4d960356c..d03ce6e98 100644 --- a/getarch.c +++ b/getarch.c @@ -637,18 +637,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "POWER8" #endif -#if defined(FORCE_POWER9) -#define FORCE -#define ARCHITECTURE "POWER" -#define SUBARCHITECTURE "POWER9" -#define SUBDIRNAME "power" -#define ARCHCONFIG "-DPOWER9 " \ - "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ - "-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \ - "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " -#define LIBNAME "power9" -#define CORENAME "POWER9" -#endif #ifdef FORCE_PPCG4 #define FORCE @@ -1077,23 +1065,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif -#ifdef FORCE_TSV110 -#define FORCE -#define ARCHITECTURE "ARM64" -#define SUBARCHITECTURE "TSV110" -#define SUBDIRNAME "arm64" -#define ARCHCONFIG "-DTSV110 " \ - "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ - "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ - "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ - "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ - "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" -#define LIBNAME "tsv110" -#define CORENAME "TSV110" -#else -#endif - - #ifdef FORCE_ZARCH_GENERIC #define FORCE #define ARCHITECTURE "ZARCH" @@ -1114,16 +1085,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "Z13" #endif -#ifdef FORCE_Z14 -#define FORCE -#define ARCHITECTURE "ZARCH" -#define SUBARCHITECTURE "Z14" -#define ARCHCONFIG "-DZ14 " \ - "-DDTB_DEFAULT_ENTRIES=64" -#define LIBNAME "z14" -#define CORENAME "Z14" -#endif - #ifndef FORCE #ifdef USER_TARGET diff --git a/interface/CMakeLists.txt b/interface/CMakeLists.txt index 5ea39f864..8b25344c0 100644 --- a/interface/CMakeLists.txt +++ b/interface/CMakeLists.txt @@ -12,7 +12,6 @@ set(BLAS1_REAL_ONLY_SOURCES rotm.c rotmg.c # N.B. these do not have complex counterparts rot.c asum.c - sum.c ) # these will have 'z' prepended for the complex version @@ -24,7 +23,7 @@ set(BLAS1_MANGLED_SOURCES axpby.c ) -# TODO: USE_NETLIB_GEMV should switch gemv.c to netlib/*gemv.f +# TODO: USE_NETLIB_GEMV shoudl switch gemv.c to netlib/*gemv.f # these all have 'z' sources for complex versions set(BLAS2_SOURCES gemv.c ger.c @@ -125,7 +124,6 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" ${CBLAS_FLAG} "" "" true "COMPLEX") GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX") GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true "COMPLEX") - GenerateNamedObjects("sum.c" "" "scsum" ${CBLAS_FLAG} "" "" true "COMPLEX") endif () if (${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX") @@ -134,7 +132,6 @@ foreach (float_type ${FLOAT_TYPES}) GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") - GenerateNamedObjects("sum.c" "" "dzsum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") endif () endforeach () diff --git a/interface/Makefile b/interface/Makefile index f0577796d..20ec74e9e 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -25,7 +25,7 @@ SBLAS1OBJS = \ saxpy.$(SUFFIX) sswap.$(SUFFIX) \ scopy.$(SUFFIX) sscal.$(SUFFIX) \ sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \ - sasum.$(SUFFIX) ssum.$(SUFFIX) snrm2.$(SUFFIX) \ + sasum.$(SUFFIX) snrm2.$(SUFFIX) \ smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \ smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \ srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \ @@ -51,7 +51,7 @@ DBLAS1OBJS = \ daxpy.$(SUFFIX) dswap.$(SUFFIX) \ dcopy.$(SUFFIX) dscal.$(SUFFIX) \ ddot.$(SUFFIX) \ - dasum.$(SUFFIX) dsum.$(SUFFIX) dnrm2.$(SUFFIX) \ + dasum.$(SUFFIX) dnrm2.$(SUFFIX) \ dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \ dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \ drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \ @@ -76,7 +76,7 @@ CBLAS1OBJS = \ caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \ ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \ cdotc.$(SUFFIX) cdotu.$(SUFFIX) \ - scasum.$(SUFFIX) scsum.$(SUFFIX) scnrm2.$(SUFFIX) \ + scasum.$(SUFFIX) scnrm2.$(SUFFIX) \ scamax.$(SUFFIX) icamax.$(SUFFIX) \ scamin.$(SUFFIX) icamin.$(SUFFIX) \ csrot.$(SUFFIX) crotg.$(SUFFIX) \ @@ -105,7 +105,7 @@ ZBLAS1OBJS = \ zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \ zdotc.$(SUFFIX) zdotu.$(SUFFIX) \ - dzasum.$(SUFFIX) dzsum.$(SUFFIX) dznrm2.$(SUFFIX) \ + dzasum.$(SUFFIX) dznrm2.$(SUFFIX) \ dzamax.$(SUFFIX) izamax.$(SUFFIX) \ dzamin.$(SUFFIX) izamin.$(SUFFIX) \ zdrot.$(SUFFIX) zrotg.$(SUFFIX) \ @@ -146,7 +146,7 @@ QBLAS1OBJS = \ qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ qcopy.$(SUFFIX) qscal.$(SUFFIX) \ qdot.$(SUFFIX) \ - qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \ + qasum.$(SUFFIX) qnrm2.$(SUFFIX) \ qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ @@ -168,7 +168,7 @@ XBLAS1OBJS = \ xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ xdotc.$(SUFFIX) xdotu.$(SUFFIX) \ - qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \ + qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \ qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ @@ -203,7 +203,7 @@ ifdef QUAD_PRECISION QBLAS1OBJS = \ qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ qcopy.$(SUFFIX) qscal.$(SUFFIX) \ - qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \ + qasum.$(SUFFIX) qnrm2.$(SUFFIX) \ qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ @@ -224,7 +224,7 @@ QBLAS3OBJS = \ XBLAS1OBJS = \ xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ - qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \ + qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \ qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ @@ -263,8 +263,7 @@ CSBLAS1OBJS = \ cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ - cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ - cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) + cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) CSBLAS2OBJS = \ cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ @@ -281,8 +280,7 @@ CDBLAS1OBJS = \ cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ - cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ - cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) + cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) CDBLAS2OBJS = \ cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ @@ -302,8 +300,7 @@ CCBLAS1OBJS = \ cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ - cblas_caxpby.$(SUFFIX) \ - cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) + cblas_caxpby.$(SUFFIX) CCBLAS2OBJS = \ cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ @@ -329,9 +326,7 @@ CZBLAS1OBJS = \ cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ - cblas_zaxpby.$(SUFFIX) \ - cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) - + cblas_zaxpby.$(SUFFIX) CZBLAS2OBJS = \ cblas_zgemv.$(SUFFIX) cblas_zgerc.$(SUFFIX) cblas_zgeru.$(SUFFIX) \ @@ -565,24 +560,6 @@ dzasum.$(SUFFIX) dzasum.$(PSUFFIX) : asum.c qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -c $< -o $(@F) -ssum.$(SUFFIX) ssum.$(PSUFFIX) : sum.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -dsum.$(SUFFIX) dsum.$(PSUFFIX) : sum.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -qsum.$(SUFFIX) qsum.$(PSUFFIX) : sum.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -scsum.$(SUFFIX) scsum.$(PSUFFIX) : sum.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -dzsum.$(SUFFIX) dzsum.$(PSUFFIX) : sum.c - $(CC) $(CFLAGS) -c $< -o $(@F) - -qxsum.$(SUFFIX) qxsum.$(PSUFFIX) : sum.c - $(CC) $(CFLAGS) -c $< -o $(@F) - snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c $(CC) $(CFLAGS) -c $< -o $(@F) @@ -1406,18 +1383,6 @@ cblas_ismin.$(SUFFIX) cblas_ismin.$(PSUFFIX) : imax.c cblas_idmin.$(SUFFIX) cblas_idmin.$(PSUFFIX) : imax.c $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) -cblas_icmax.$(SUFFIX) cblas_icmax.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) - -cblas_izmax.$(SUFFIX) cblas_izmax.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) - -cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) - -cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c - $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) - cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) @@ -1430,18 +1395,6 @@ cblas_scasum.$(SUFFIX) cblas_scasum.$(PSUFFIX) : asum.c cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) -cblas_ssum.$(SUFFIX) cblas_ssum.$(PSUFFIX) : sum.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_dsum.$(SUFFIX) cblas_dsum.$(PSUFFIX) : sum.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_scsum.$(SUFFIX) cblas_scsum.$(PSUFFIX) : sum.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - -cblas_dzsum.$(SUFFIX) cblas_dzsum.$(PSUFFIX) : sum.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) - cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) @@ -1449,7 +1402,7 @@ cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c - $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) + $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) cblas_ddot.$(SUFFIX) cblas_ddot.$(PSUFFIX) : dot.c $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) diff --git a/interface/axpy.c b/interface/axpy.c index eaa19f4df..9032946d2 100644 --- a/interface/axpy.c +++ b/interface/axpy.c @@ -91,7 +91,7 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. // - //Temporarily work-around the low performance issue with small input size & + //Temporarily work-around the low performance issue with small imput size & //multithreads. if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) nthreads = 1; diff --git a/interface/sum.c b/interface/sum.c deleted file mode 100644 index dfdcc5dcc..000000000 --- a/interface/sum.c +++ /dev/null @@ -1,97 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#include -#include "common.h" -#ifdef FUNCTION_PROFILE -#include "functable.h" -#endif - -#ifndef CBLAS - -FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ - - BLASLONG n = *N; - BLASLONG incx = *INCX; - FLOATRET ret; - - PRINT_DEBUG_NAME; - - if (n <= 0) return 0; - - IDEBUG_START; - - FUNCTION_PROFILE_START(); - - ret = (FLOATRET)SUM_K(n, x, incx); - - FUNCTION_PROFILE_END(COMPSIZE, n, n); - - IDEBUG_END; - - return ret; -} - -#else -#ifdef COMPLEX -FLOAT CNAME(blasint n, void *vx, blasint incx){ - FLOAT *x = (FLOAT*) vx; -#else -FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ -#endif - - FLOAT ret; - - PRINT_DEBUG_CNAME; - - if (n <= 0) return 0; - - IDEBUG_START; - - FUNCTION_PROFILE_START(); - - ret = SUM_K(n, x, incx); - - FUNCTION_PROFILE_END(COMPSIZE, n, n); - - IDEBUG_END; - - return ret; -} - -#endif diff --git a/interface/trmv.c b/interface/trmv.c index 2e52527a3..7c40ae976 100644 --- a/interface/trmv.c +++ b/interface/trmv.c @@ -218,8 +218,11 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, buffer = (FLOAT *)blas_memory_alloc(1); #ifdef SMP - nthreads = num_cpu_avail(2); +/* nthreads = num_cpu_avail(2); +FIXME trmv_thread was found to be broken, see issue 1332 */ + nthreads = 1; + if (nthreads == 1) { #endif diff --git a/interface/trsm.c b/interface/trsm.c index 715c83a1f..f2da285de 100644 --- a/interface/trsm.c +++ b/interface/trsm.c @@ -204,7 +204,7 @@ void NAME(char *SIDE, char *UPLO, char *TRANS, char *DIAG, if (side < 0) info = 1; if (info != 0) { - BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)-1); + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); return; } diff --git a/interface/zaxpy.c b/interface/zaxpy.c index da3b48ead..dbd559628 100644 --- a/interface/zaxpy.c +++ b/interface/zaxpy.c @@ -99,7 +99,7 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in //disable multi-thread when incx==0 or incy==0 //In that case, the threads would be dependent. // - //Temporarily work-around the low performance issue with small input size & + //Temporarily work-around the low performance issue with small imput size & //multithreads. if (incx == 0 || incy == 0 || n <= MULTI_THREAD_MINIMAL) nthreads = 1; diff --git a/interface/ztrmv.c b/interface/ztrmv.c index 4c47e9e91..0e16632e0 100644 --- a/interface/ztrmv.c +++ b/interface/ztrmv.c @@ -239,6 +239,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, } else nthreads = 1; +/* FIXME TRMV multithreading appears to be broken, see issue 1332*/ + nthreads = 1; + if(nthreads > 1) { buffer_size = n > 16 ? 0 : n * 4 + 40; } diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index ad15b8f25..2a330df4e 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -65,7 +65,6 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type}) - GenerateNamedObjects("${KERNELDIR}/${${float_char}SUMKERNEL}" "" "sum_k" false "" "" false ${float_type}) if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "CONJ" "axpyc_k" false "" "" false ${float_type}) diff --git a/kernel/Makefile.L1 b/kernel/Makefile.L1 index 970703230..a8f9cf097 100644 --- a/kernel/Makefile.L1 +++ b/kernel/Makefile.L1 @@ -340,32 +340,6 @@ ifndef XSCALKERNEL XSCALKERNEL = zscal.S endif -### SUM ### - -ifndef SSUMKERNEL -SSUMKERNEL = sum.S -endif - -ifndef DSUMKERNEL -DSUMKERNEL = sum.S -endif - -ifndef CSUMKERNEL -CSUMKERNEL = zsum.S -endif - -ifndef ZSUMKERNEL -ZSUMKERNEL = zsum.S -endif - -ifndef QSUMKERNEL -QSUMKERNEL = sum.S -endif - -ifndef XSUMKERNEL -XSUMKERNEL = zsum.S -endif - ### SWAP ### ifndef SSWAPKERNEL @@ -479,7 +453,7 @@ endif SBLASOBJS += \ samax_k$(TSUFFIX).$(SUFFIX) samin_k$(TSUFFIX).$(SUFFIX) smax_k$(TSUFFIX).$(SUFFIX) smin_k$(TSUFFIX).$(SUFFIX) \ isamax_k$(TSUFFIX).$(SUFFIX) isamin_k$(TSUFFIX).$(SUFFIX) ismax_k$(TSUFFIX).$(SUFFIX) ismin_k$(TSUFFIX).$(SUFFIX) \ - sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ + sasum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \ snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \ saxpby_k$(TSUFFIX).$(SUFFIX) @@ -489,32 +463,31 @@ DBLASOBJS += \ idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \ dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \ dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \ - daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX) + daxpby_k$(TSUFFIX).$(SUFFIX) QBLASOBJS += \ qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \ iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \ qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \ - qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \ - qsum_k$(TSUFFIX).$(SUFFIX) + qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) CBLASOBJS += \ camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \ casum_k$(TSUFFIX).$(SUFFIX) caxpy_k$(TSUFFIX).$(SUFFIX) caxpyc_k$(TSUFFIX).$(SUFFIX) ccopy_k$(TSUFFIX).$(SUFFIX) \ cdotc_k$(TSUFFIX).$(SUFFIX) cdotu_k$(TSUFFIX).$(SUFFIX) cnrm2_k$(TSUFFIX).$(SUFFIX) csrot_k$(TSUFFIX).$(SUFFIX) \ - cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX) csum_k$(TSUFFIX).$(SUFFIX) + cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX) ZBLASOBJS += \ zamax_k$(TSUFFIX).$(SUFFIX) zamin_k$(TSUFFIX).$(SUFFIX) izamax_k$(TSUFFIX).$(SUFFIX) izamin_k$(TSUFFIX).$(SUFFIX) \ zasum_k$(TSUFFIX).$(SUFFIX) zaxpy_k$(TSUFFIX).$(SUFFIX) zaxpyc_k$(TSUFFIX).$(SUFFIX) zcopy_k$(TSUFFIX).$(SUFFIX) \ zdotc_k$(TSUFFIX).$(SUFFIX) zdotu_k$(TSUFFIX).$(SUFFIX) znrm2_k$(TSUFFIX).$(SUFFIX) zdrot_k$(TSUFFIX).$(SUFFIX) \ - zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX) zsum_k$(TSUFFIX).$(SUFFIX) + zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX) XBLASOBJS += \ xamax_k$(TSUFFIX).$(SUFFIX) xamin_k$(TSUFFIX).$(SUFFIX) ixamax_k$(TSUFFIX).$(SUFFIX) ixamin_k$(TSUFFIX).$(SUFFIX) \ xasum_k$(TSUFFIX).$(SUFFIX) xaxpy_k$(TSUFFIX).$(SUFFIX) xaxpyc_k$(TSUFFIX).$(SUFFIX) xcopy_k$(TSUFFIX).$(SUFFIX) \ xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \ - xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX) + xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) ### AMAX ### @@ -644,7 +617,7 @@ $(KDIR)idmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmin_k$(TPSUFFIX).$(PSUFFIX) : $(KE $(KDIR)iqmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMINKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ -### ASUM ### + $(KDIR)sasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)sasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SASUMKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ @@ -663,26 +636,6 @@ $(KDIR)zasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zasum_k$(TPSUFFIX).$(PSUFFIX) : $(KE $(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XASUMKERNEL) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ -### SUM ### -$(KDIR)ssum_k$(TSUFFIX).$(SUFFIX) $(KDIR)ssum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSUMKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ - -$(KDIR)dsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSUMKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ - -$(KDIR)qsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)qsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSUMKERNEL) - $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ - -$(KDIR)csum_k$(TSUFFIX).$(SUFFIX) $(KDIR)csum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSUMKERNEL) - $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@ - -$(KDIR)zsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSUMKERNEL) - $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@ - -$(KDIR)xsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSUMKERNEL) - $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ - -### AXPY ### $(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL) $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index f83def47b..9258f216d 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -24,7 +24,7 @@ ifeq ($(TARGET), LOONGSON3B) USE_TRMM = 1 endif -ifeq ($(CORE), GENERIC) +ifeq ($(TARGET), GENERIC) USE_TRMM = 1 endif @@ -44,18 +44,10 @@ ifeq ($(CORE), POWER8) USE_TRMM = 1 endif -ifeq ($(CORE), POWER9) -USE_TRMM = 1 -endif - ifeq ($(ARCH), zarch) USE_TRMM = 1 endif -ifeq ($(CORE), Z14) -USE_TRMM = 1 -endif - diff --git a/kernel/alpha/sum.S b/kernel/alpha/sum.S deleted file mode 100644 index 3902817a7..000000000 --- a/kernel/alpha/sum.S +++ /dev/null @@ -1,206 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#define ASSEMBLER -#include "common.h" -#include "version.h" - -#define PREFETCHSIZE 88 - -#define N $16 -#define X $17 -#define INCX $18 -#define I $19 - -#define s0 $f0 -#define s1 $f1 -#define s2 $f10 -#define s3 $f11 - -#define a0 $f12 -#define a1 $f13 -#define a2 $f14 -#define a3 $f15 -#define a4 $f16 -#define a5 $f17 -#define a6 $f18 -#define a7 $f19 - -#define t0 $f20 -#define t1 $f21 -#define t2 $f22 -#define t3 $f23 - - PROLOGUE - PROFCODE - - fclr s0 - unop - fclr t0 - ble N, $L999 - - sra N, 3, I - fclr s1 - fclr s2 - ble I, $L15 - - LD a0, 0 * SIZE(X) - fclr t1 - SXADDQ INCX, X, X - fclr t2 - - LD a1, 0 * SIZE(X) - fclr t3 - SXADDQ INCX, X, X - fclr s3 - - LD a2, 0 * SIZE(X) - SXADDQ INCX, X, X - LD a3, 0 * SIZE(X) - SXADDQ INCX, X, X - - LD a4, 0 * SIZE(X) - SXADDQ INCX, X, X - LD a5, 0 * SIZE(X) - SXADDQ INCX, X, X - - lda I, -1(I) - ble I, $L13 - .align 4 - -$L12: - ADD s0, t0, s0 - ldl $31, PREFETCHSIZE * 2 * SIZE(X) - fmov a0, t0 - lda I, -1(I) - - ADD s1, t1, s1 - LD a6, 0 * SIZE(X) - fmov a1, t1 - SXADDQ INCX, X, X - - ADD s2, t2, s2 - LD a7, 0 * SIZE(X) - fmov a2, t2 - SXADDQ INCX, X, X - - ADD s3, t3, s3 - LD a0, 0 * SIZE(X) - fmov a3, t3 - SXADDQ INCX, X, X - - ADD s0, t0, s0 - LD a1, 0 * SIZE(X) - fmov a4, t0 - SXADDQ INCX, X, X - - ADD s1, t1, s1 - LD a2, 0 * SIZE(X) - fmov a5, t1 - SXADDQ INCX, X, X - - ADD s2, t2, s2 - LD a3, 0 * SIZE(X) - fmov a6, t2 - SXADDQ INCX, X, X - - ADD s3, t3, s3 - LD a4, 0 * SIZE(X) - fmov a7, t3 - SXADDQ INCX, X, X - - LD a5, 0 * SIZE(X) - unop - SXADDQ INCX, X, X - bne I, $L12 - .align 4 - -$L13: - ADD s0, t0, s0 - LD a6, 0 * SIZE(X) - fmov a0, t0 - SXADDQ INCX, X, X - - ADD s1, t1, s1 - LD a7, 0 * SIZE(X) - fmov a1, t1 - SXADDQ INCX, X, X - - ADD s2, t2, s2 - fmov a2, t2 - ADD s3, t3, s3 - fmov a3, t3 - - ADD s0, t0, s0 - fmov a4, t0 - ADD s1, t1, s1 - fmov a5, t1 - ADD s2, t2, s2 - fmov a6, t2 - ADD s3, t3, s3 - fmov a7, t3 - - ADD s1, t1, s1 - ADD s2, t2, s2 - ADD s3, t3, s3 - - ADD s0, s1, s0 - ADD s2, s3, s2 - .align 4 - -$L15: - and N, 7, I - ADD s0, s2, s0 - unop - ble I, $L999 - .align 4 - -$L17: - ADD s0, t0, s0 - LD a0, 0 * SIZE(X) - SXADDQ INCX, X, X - fmov a0, t0 - - lda I, -1(I) - bne I, $L17 - .align 4 - -$L999: - ADD s0, t0, s0 - ret - EPILOGUE diff --git a/kernel/alpha/zsum.S b/kernel/alpha/zsum.S deleted file mode 100644 index 1ad0eb137..000000000 --- a/kernel/alpha/zsum.S +++ /dev/null @@ -1,208 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#define ASSEMBLER -#include "common.h" -#include "version.h" - -#define PREFETCHSIZE 88 - -#define N $16 -#define X $17 -#define INCX $18 -#define I $19 - -#define s0 $f0 -#define s1 $f1 -#define s2 $f10 -#define s3 $f11 - -#define a0 $f12 -#define a1 $f13 -#define a2 $f14 -#define a3 $f15 -#define a4 $f16 -#define a5 $f17 -#define a6 $f18 -#define a7 $f19 - -#define t0 $f20 -#define t1 $f21 -#define t2 $f22 -#define t3 $f23 - - PROLOGUE - PROFCODE - - fclr s0 - unop - fclr t0 - addq INCX, INCX, INCX - - fclr s1 - unop - fclr t1 - ble N, $L999 - - fclr s2 - sra N, 2, I - fclr s3 - ble I, $L15 - - LD a0, 0 * SIZE(X) - fclr t2 - LD a1, 1 * SIZE(X) - SXADDQ INCX, X, X - - LD a2, 0 * SIZE(X) - fclr t3 - LD a3, 1 * SIZE(X) - SXADDQ INCX, X, X - - LD a4, 0 * SIZE(X) - LD a5, 1 * SIZE(X) - SXADDQ INCX, X, X - lda I, -1(I) - - ble I, $L13 - .align 4 - -$L12: - ADD s0, t0, s0 - ldl $31, PREFETCHSIZE * SIZE(X) - fmov a0, t0 - lda I, -1(I) - - ADD s1, t1, s1 - LD a6, 0 * SIZE(X) - fmov a1, t1 - unop - - ADD s2, t2, s2 - LD a7, 1 * SIZE(X) - fmov a2, t2 - SXADDQ INCX, X, X - - ADD s3, t3, s3 - LD a0, 0 * SIZE(X) - fmov a3, t3 - unop - - ADD s0, t0, s0 - LD a1, 1 * SIZE(X) - fmov a4, t0 - SXADDQ INCX, X, X - - ADD s1, t1, s1 - LD a2, 0 * SIZE(X) - fmov a5, t1 - unop - - ADD s2, t2, s2 - LD a3, 1 * SIZE(X) - fmov a6, t2 - SXADDQ INCX, X, X - - ADD s3, t3, s3 - LD a4, 0 * SIZE(X) - fmov a7, t3 - unop - - LD a5, 1 * SIZE(X) - unop - SXADDQ INCX, X, X - bne I, $L12 - .align 4 - -$L13: - ADD s0, t0, s0 - LD a6, 0 * SIZE(X) - fmov a0, t0 - - ADD s1, t1, s1 - LD a7, 1 * SIZE(X) - fmov a1, t1 - SXADDQ INCX, X, X - - ADD s2, t2, s2 - fmov a2, t2 - ADD s3, t3, s3 - fmov a3, t3 - - ADD s0, t0, s0 - fmov a4, t0 - ADD s1, t1, s1 - fmov a5, t1 - ADD s2, t2, s2 - fmov a6, t2 - ADD s3, t3, s3 - fmov a7, t3 - - ADD s2, t2, s2 - ADD s3, t3, s3 - - .align 4 - -$L15: - ADD s0, s2, s0 - and N, 3, I - ADD s1, s3, s1 - ble I, $L999 - .align 4 - -$L17: - ADD s0, t0, s0 - LD a0, 0 * SIZE(X) - fmov a0, t0 - lda I, -1(I) - - ADD s1, t1, s1 - LD a1, 1 * SIZE(X) - fmov a1, t1 - SXADDQ INCX, X, X - - bne I, $L17 - .align 4 - -$L999: - ADD s0, t0, s0 - ADD s1, t1, s1 - - ADD s0, s1, s0 - ret - EPILOGUE diff --git a/kernel/arm/KERNEL.ARMV5 b/kernel/arm/KERNEL.ARMV5 index e977dda3a..10808e2d9 100644 --- a/kernel/arm/KERNEL.ARMV5 +++ b/kernel/arm/KERNEL.ARMV5 @@ -35,11 +35,6 @@ DASUMKERNEL = ../arm/asum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = ../arm/zasum.c -SSUMKERNEL = ../arm/sum.c -DSUMKERNEL = ../arm/sum.c -CSUMKERNEL = ../arm/zsum.c -ZSUMKERNEL = ../arm/zsum.c - SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = ../arm/axpy.c CAXPYKERNEL = ../arm/zaxpy.c diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6 index 344a71885..960dae67b 100644 --- a/kernel/arm/KERNEL.ARMV6 +++ b/kernel/arm/KERNEL.ARMV6 @@ -1,30 +1,30 @@ include $(KERNELDIR)/KERNEL.ARMV5 -SAMAXKERNEL = amax_vfp.S -DAMAXKERNEL = amax_vfp.S -#CAMAXKERNEL = amax_vfp.S -#ZAMAXKERNEL = amax_vfp.S +SAMAXKERNEL = iamax_vfp.S +DAMAXKERNEL = iamax_vfp.S +CAMAXKERNEL = iamax_vfp.S +ZAMAXKERNEL = iamax_vfp.S -SAMINKERNEL = amax_vfp.S -DAMINKERNEL = amax_vfp.S -#CAMINKERNEL = amax_vfp.S -#ZAMINKERNEL = amax_vfp.S +SAMINKERNEL = iamax_vfp.S +DAMINKERNEL = iamax_vfp.S +CAMINKERNEL = iamax_vfp.S +ZAMINKERNEL = iamax_vfp.S -SMAXKERNEL = amax_vfp.S -DMAXKERNEL = amax_vfp.S +SMAXKERNEL = iamax_vfp.S +DMAXKERNEL = iamax_vfp.S -SMINKERNEL = amax_vfp.S -DMINKERNEL = amax_vfp.S +SMINKERNEL = iamax_vfp.S +DMINKERNEL = iamax_vfp.S ISAMAXKERNEL = iamax_vfp.S IDAMAXKERNEL = iamax_vfp.S -#ICAMAXKERNEL = iamax_vfp.S -#IZAMAXKERNEL = iamax_vfp.S +ICAMAXKERNEL = iamax_vfp.S +IZAMAXKERNEL = iamax_vfp.S ISAMINKERNEL = iamax_vfp.S IDAMINKERNEL = iamax_vfp.S -#ICAMINKERNEL = iamax_vfp.S -#IZAMINKERNEL = iamax_vfp.S +ICAMINKERNEL = iamax_vfp.S +IZAMINKERNEL = iamax_vfp.S ISMAXKERNEL = iamax_vfp.S IDMAXKERNEL = iamax_vfp.S @@ -37,9 +37,6 @@ DASUMKERNEL = asum_vfp.S CASUMKERNEL = asum_vfp.S ZASUMKERNEL = asum_vfp.S -SSUMKERNEL = sum_vfp.S -DSUMKERNEL = sum_vfp.S - SAXPYKERNEL = axpy_vfp.S DAXPYKERNEL = axpy_vfp.S CAXPYKERNEL = axpy_vfp.S diff --git a/kernel/arm/amax_vfp.S b/kernel/arm/amax_vfp.S deleted file mode 100644 index d3770ea1e..000000000 --- a/kernel/arm/amax_vfp.S +++ /dev/null @@ -1,445 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* 2013/11/14 Saar -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* -**************************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define STACKSIZE 256 - -#define N r0 -#define X r1 -#define INC_X r2 - -#define I r12 - -#define X_PRE 512 - -/************************************************************************************** -* Macro definitions -**************************************************************************************/ - -#if defined(USE_ABS) - -#if defined(DOUBLE) - -#define VABS(x0,x1) vabs.f64 x0, x1 - -#else - -#define VABS(x0,x1) vabs.f32 x0, x1 - -#endif - -#else - -#define VABS(x0,x1) nop - -#endif - -/*****************************************************************************************/ - -#if defined(USE_MIN) - -#define MOVCOND movlt - -#if defined(DOUBLE) - -#define VMOVCOND vmovlt.f64 - -#else - -#define VMOVCOND vmovlt.f32 - -#endif - -#else - -#define MOVCOND movgt - -#if defined(DOUBLE) - -#define VMOVCOND vmovgt.f64 - -#else - -#define VMOVCOND vmovgt.f32 - -#endif - - -#endif - - -/*****************************************************************************************/ - - - -#if !defined(COMPLEX) - -#if defined(DOUBLE) - -.macro INIT_F - - vldmia.f64 X!, { d0 } - VABS( d0, d0 ) - -.endm - -.macro KERNEL_F1 - - vldmia.f64 X!, { d4 } - VABS( d4, d4 ) - vcmpe.f64 d4, d0 - vmrs APSR_nzcv, fpscr - VMOVCOND d0, d4 - -.endm - -.macro INIT_S - - vldmia.f64 X, { d0 } - VABS( d0, d0 ) - add X, X, INC_X - -.endm - - -.macro KERNEL_S1 - - vldmia.f64 X, { d4 } - VABS( d4, d4 ) - vcmpe.f64 d4, d0 - vmrs APSR_nzcv, fpscr - VMOVCOND d0, d4 - add X, X, INC_X - -.endm - -#else - -.macro INIT_F - - vldmia.f32 X!, { s0 } - VABS( s0, s0 ) - -.endm - -.macro KERNEL_F1 - - vldmia.f32 X!, { s4 } - VABS( s4, s4 ) - vcmpe.f32 s4, s0 - vmrs APSR_nzcv, fpscr - VMOVCOND s0, s4 - -.endm - -.macro INIT_S - - vldmia.f32 X, { s0 } - VABS( s0, s0 ) - add X, X, INC_X - -.endm - - -.macro KERNEL_S1 - - vldmia.f32 X, { s4 } - VABS( s4, s4 ) - vcmpe.f32 s4, s0 - vmrs APSR_nzcv, fpscr - VMOVCOND s0, s4 - add X, X, INC_X - -.endm - - - - -#endif - -#else - -#if defined(DOUBLE) - -.macro INIT_F - - vldmia.f64 X!, { d0 -d1 } - vabs.f64 d0, d0 - vabs.f64 d1, d1 - vadd.f64 d0 , d0, d1 -.endm - - -.macro KERNEL_F1 - - vldmia.f64 X!, { d4 - d5 } - vabs.f64 d4, d4 - vabs.f64 d5, d5 - vadd.f64 d4 , d4, d5 - vcmpe.f64 d4, d0 - vmrs APSR_nzcv, fpscr - VMOVCOND d0, d4 - -.endm - -.macro INIT_S - - vldmia.f64 X, { d0 -d1 } - vabs.f64 d0, d0 - vabs.f64 d1, d1 - vadd.f64 d0 , d0, d1 - add X, X, INC_X - -.endm - - - -.macro KERNEL_S1 - - vldmia.f64 X, { d4 - d5 } - vabs.f64 d4, d4 - vabs.f64 d5, d5 - vadd.f64 d4 , d4, d5 - vcmpe.f64 d4, d0 - vmrs APSR_nzcv, fpscr - VMOVCOND d0, d4 - add X, X, INC_X - -.endm - -#else - -.macro INIT_F - - vldmia.f32 X!, { s0 -s1 } - vabs.f32 s0, s0 - vabs.f32 s1, s1 - vadd.f32 s0 , s0, s1 - -.endm - - -.macro KERNEL_F1 - - vldmia.f32 X!, { s4 - s5 } - vabs.f32 s4, s4 - vabs.f32 s5, s5 - vadd.f32 s4 , s4, s5 - vcmpe.f32 s4, s0 - vmrs APSR_nzcv, fpscr - VMOVCOND s0, s4 - -.endm - -.macro INIT_S - - vldmia.f32 X, { s0 -s1 } - vabs.f32 s0, s0 - vabs.f32 s1, s1 - vadd.f32 s0 , s0, s1 - add X, X, INC_X - -.endm - - - -.macro KERNEL_S1 - - vldmia.f32 X, { s4 - s5 } - vabs.f32 s4, s4 - vabs.f32 s5, s5 - vadd.f32 s4 , s4, s5 - vcmpe.f32 s4, s0 - vmrs APSR_nzcv, fpscr - VMOVCOND s0, s4 - add X, X, INC_X - -.endm - - - - -#endif - -#endif - -/************************************************************************************** -* End of macro definitions -**************************************************************************************/ - - PROLOGUE - - .align 5 - - movs r12, #0 // clear floating point register - vmov s0, r12 -#if defined(DOUBLE) - vcvt.f64.f32 d0, s0 -#endif - - - cmp N, #0 - ble amax_kernel_L999 - - cmp INC_X, #0 - beq amax_kernel_L999 - - - cmp INC_X, #1 - bne amax_kernel_S_BEGIN - - -amax_kernel_F_BEGIN: - - INIT_F - - subs N, N , #1 - ble amax_kernel_L999 - - asrs I, N, #2 // I = N / 4 - ble amax_kernel_F1 - - .align 5 - -amax_kernel_F4: - - pld [ X, #X_PRE ] - KERNEL_F1 - KERNEL_F1 -#if defined(COMPLEX) && defined(DOUBLE) - pld [ X, #X_PRE ] -#endif - KERNEL_F1 - KERNEL_F1 - - subs I, I, #1 - ble amax_kernel_F1 - - -#if defined(COMPLEX) || defined(DOUBLE) - pld [ X, #X_PRE ] -#endif - KERNEL_F1 - KERNEL_F1 -#if defined(COMPLEX) && defined(DOUBLE) - pld [ X, #X_PRE ] -#endif - KERNEL_F1 - KERNEL_F1 - - subs I, I, #1 - bne amax_kernel_F4 - -amax_kernel_F1: - - ands I, N, #3 - ble amax_kernel_L999 - -amax_kernel_F10: - - KERNEL_F1 - - subs I, I, #1 - bne amax_kernel_F10 - - b amax_kernel_L999 - -amax_kernel_S_BEGIN: - -#if defined(COMPLEX) - -#if defined(DOUBLE) - lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 -#else - lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 -#endif - -#else - -#if defined(DOUBLE) - lsl INC_X, INC_X, #3 // INC_X * SIZE -#else - lsl INC_X, INC_X, #2 // INC_X * SIZE -#endif - -#endif - - INIT_S - - subs N, N , #1 - ble amax_kernel_L999 - - asrs I, N, #2 // I = N / 4 - ble amax_kernel_S1 - - .align 5 - -amax_kernel_S4: - - KERNEL_S1 - KERNEL_S1 - KERNEL_S1 - KERNEL_S1 - - subs I, I, #1 - bne amax_kernel_S4 - -amax_kernel_S1: - - ands I, N, #3 - ble amax_kernel_L999 - -amax_kernel_S10: - - KERNEL_S1 - - subs I, I, #1 - bne amax_kernel_S10 - - -amax_kernel_L999: -#if !defined(__ARM_PCS_VFP) -#if defined(DOUBLE) - vmov r0, r1, d0 -#else - vmov r0, s0 -#endif -#endif - bx lr - - EPILOGUE - diff --git a/kernel/arm/imin.c b/kernel/arm/imin.c index ffc65226e..598cba387 100644 --- a/kernel/arm/imin.c +++ b/kernel/arm/imin.c @@ -53,7 +53,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( x[ix] < minf ) + if( x[ix] > minf ) { min = i; minf = x[ix]; diff --git a/kernel/arm/sum.c b/kernel/arm/sum.c deleted file mode 100644 index 7b78ec61a..000000000 --- a/kernel/arm/sum.c +++ /dev/null @@ -1,51 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* trivial copy of asum.c with the ABS() removed * -**************************************************************************************/ - - -#include "common.h" -#include - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - FLOAT sumf = 0.0; - if (n <= 0 || inc_x <= 0) return(sumf); - - n *= inc_x; - while(i < n) - { - sumf += x[i]; - i += inc_x; - } - return(sumf); -} - - diff --git a/kernel/arm/sum_vfp.S b/kernel/arm/sum_vfp.S deleted file mode 100644 index d33d99ed3..000000000 --- a/kernel/arm/sum_vfp.S +++ /dev/null @@ -1,425 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* trivial copy of asum_vfp.S with the in-place vabs.f64 calls removed * -**************************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define STACKSIZE 256 - -#define N r0 -#define X r1 -#define INC_X r2 - - -#define I r12 - -#define X_PRE 512 - -/************************************************************************************** -* Macro definitions -**************************************************************************************/ - -#if !defined(COMPLEX) - -#if defined(DOUBLE) - -.macro KERNEL_F4 - - pld [ X, #X_PRE ] - vldmia.f64 X!, { d4 - d5 } - vadd.f64 d0 , d0, d4 - vldmia.f64 X!, { d6 - d7 } - vadd.f64 d1 , d1, d5 - vadd.f64 d0 , d0, d6 - vadd.f64 d1 , d1, d7 - -.endm - -.macro KERNEL_F1 - - vldmia.f64 X!, { d4 } - vadd.f64 d0 , d0, d4 - -.endm - - -.macro KERNEL_S4 - - vldmia.f64 X, { d4 } - vadd.f64 d0 , d0, d4 - add X, X, INC_X - - vldmia.f64 X, { d4 } - vadd.f64 d0 , d0, d4 - add X, X, INC_X - - vldmia.f64 X, { d4 } - vadd.f64 d0 , d0, d4 - add X, X, INC_X - - vldmia.f64 X, { d4 } - vadd.f64 d0 , d0, d4 - add X, X, INC_X - -.endm - - -.macro KERNEL_S1 - - vldmia.f64 X, { d4 } - vadd.f64 d0 , d0, d4 - add X, X, INC_X - -.endm - -#else - -.macro KERNEL_F4 - - vldmia.f32 X!, { s4 - s5 } - vadd.f32 s0 , s0, s4 - vldmia.f32 X!, { s6 - s7 } - vadd.f32 s1 , s1, s5 - vadd.f32 s0 , s0, s6 - vadd.f32 s1 , s1, s7 - -.endm - -.macro KERNEL_F1 - - vldmia.f32 X!, { s4 } - vadd.f32 s0 , s0, s4 - -.endm - - -.macro KERNEL_S4 - - vldmia.f32 X, { s4 } - vadd.f32 s0 , s0, s4 - add X, X, INC_X - - vldmia.f32 X, { s4 } - vadd.f32 s0 , s0, s4 - add X, X, INC_X - - vldmia.f32 X, { s4 } - vadd.f32 s0 , s0, s4 - add X, X, INC_X - - vldmia.f32 X, { s4 } - vadd.f32 s0 , s0, s4 - add X, X, INC_X - -.endm - - -.macro KERNEL_S1 - - vldmia.f32 X, { s4 } - vadd.f32 s0 , s0, s4 - add X, X, INC_X - -.endm - - -#endif - -#else - -#if defined(DOUBLE) - -.macro KERNEL_F4 - - pld [ X, #X_PRE ] - vldmia.f64 X!, { d4 - d5 } - vadd.f64 d0 , d0, d4 - vldmia.f64 X!, { d6 - d7 } - vadd.f64 d1 , d1, d5 - vadd.f64 d0 , d0, d6 - vadd.f64 d1 , d1, d7 - - pld [ X, #X_PRE ] - vldmia.f64 X!, { d4 - d5 } - vadd.f64 d0 , d0, d4 - vldmia.f64 X!, { d6 - d7 } - vadd.f64 d1 , d1, d5 - vadd.f64 d0 , d0, d6 - vadd.f64 d1 , d1, d7 - - -.endm - -.macro KERNEL_F1 - - vldmia.f64 X!, { d4 } - vadd.f64 d0 , d0, d4 - - vldmia.f64 X!, { d4 } - vadd.f64 d0 , d0, d4 - - -.endm - - -.macro KERNEL_S4 - - vldmia.f64 X, { d4 -d5 } - vadd.f64 d0 , d0, d4 - vadd.f64 d0 , d0, d5 - add X, X, INC_X - - vldmia.f64 X, { d4 -d5 } - vadd.f64 d0 , d0, d4 - vadd.f64 d0 , d0, d5 - add X, X, INC_X - - vldmia.f64 X, { d4 -d5 } - vadd.f64 d0 , d0, d4 - vadd.f64 d0 , d0, d5 - add X, X, INC_X - - vldmia.f64 X, { d4 -d5 } - vadd.f64 d0 , d0, d4 - vadd.f64 d0 , d0, d5 - add X, X, INC_X - -.endm - - -.macro KERNEL_S1 - - vldmia.f64 X, { d4 -d5 } - vadd.f64 d0 , d0, d4 - vadd.f64 d0 , d0, d5 - add X, X, INC_X - -.endm - -#else - -.macro KERNEL_F4 - - pld [ X, #X_PRE ] - vldmia.f32 X!, { s4 - s5 } - vadd.f32 s0 , s0, s4 - vldmia.f32 X!, { s6 - s7 } - vadd.f32 s1 , s1, s5 - vadd.f32 s0 , s0, s6 - vadd.f32 s1 , s1, s7 - - vldmia.f32 X!, { s4 - s5 } - vadd.f32 s0 , s0, s4 - vldmia.f32 X!, { s6 - s7 } - vadd.f32 s1 , s1, s5 - vadd.f32 s0 , s0, s6 - vadd.f32 s1 , s1, s7 - - -.endm - -.macro KERNEL_F1 - - vldmia.f32 X!, { s4 } - vadd.f32 s0 , s0, s4 - - vldmia.f32 X!, { s4 } - vadd.f32 s0 , s0, s4 - -.endm - - -.macro KERNEL_S4 - - vldmia.f32 X, { s4 -s5 } - vadd.f32 s0 , s0, s4 - vadd.f32 s0 , s0, s5 - add X, X, INC_X - - vldmia.f32 X, { s4 -s5 } - vadd.f32 s0 , s0, s4 - vadd.f32 s0 , s0, s5 - add X, X, INC_X - - vldmia.f32 X, { s4 -s5 } - vadd.f32 s0 , s0, s4 - vadd.f32 s0 , s0, s5 - add X, X, INC_X - - vldmia.f32 X, { s4 -s5 } - vadd.f32 s0 , s0, s4 - vadd.f32 s0 , s0, s5 - add X, X, INC_X - -.endm - - -.macro KERNEL_S1 - - vldmia.f32 X, { s4 -s5 } - vadd.f32 s0 , s0, s4 - vadd.f32 s0 , s0, s5 - add X, X, INC_X - -.endm - -#endif - -#endif - -/************************************************************************************** -* End of macro definitions -**************************************************************************************/ - - PROLOGUE - - .align 5 - - movs r12, #0 // clear floating point register - vmov s0, r12 - vmov s1, r12 -#if defined(DOUBLE) - vcvt.f64.f32 d0, s0 - vcvt.f64.f32 d1, s1 -#endif - - cmp N, #0 - ble asum_kernel_L999 - - cmp INC_X, #0 - beq asum_kernel_L999 - - cmp INC_X, #1 - bne asum_kernel_S_BEGIN - - -asum_kernel_F_BEGIN: - - asrs I, N, #2 // I = N / 4 - ble asum_kernel_F1 - - .align 5 - -asum_kernel_F4: - -#if !defined(DOUBLE) && !defined(COMPLEX) - pld [ X, #X_PRE ] -#endif - KERNEL_F4 - - subs I, I, #1 - ble asum_kernel_F1 - - KERNEL_F4 - - subs I, I, #1 - bne asum_kernel_F4 - -asum_kernel_F1: - - ands I, N, #3 - ble asum_kernel_L999 - -asum_kernel_F10: - - KERNEL_F1 - - subs I, I, #1 - bne asum_kernel_F10 - - b asum_kernel_L999 - -asum_kernel_S_BEGIN: - -#if defined(COMPLEX) - -#if defined(DOUBLE) - lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 -#else - lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 -#endif - -#else - -#if defined(DOUBLE) - lsl INC_X, INC_X, #3 // INC_X * SIZE -#else - lsl INC_X, INC_X, #2 // INC_X * SIZE -#endif - -#endif - - asrs I, N, #2 // I = N / 4 - ble asum_kernel_S1 - - .align 5 - -asum_kernel_S4: - - KERNEL_S4 - - subs I, I, #1 - bne asum_kernel_S4 - -asum_kernel_S1: - - ands I, N, #3 - ble asum_kernel_L999 - -asum_kernel_S10: - - KERNEL_S1 - - subs I, I, #1 - bne asum_kernel_S10 - - -asum_kernel_L999: - - -#if defined(DOUBLE) - vadd.f64 d0 , d0, d1 // set return value -#else - vadd.f32 s0 , s0, s1 // set return value -#endif - -#if !defined(__ARM_PCS_VFP) -#if !defined(DOUBLE) - vmov r0, s0 -#else - vmov r0, r1, d0 -#endif -#endif - - bx lr - - EPILOGUE - diff --git a/kernel/arm/zsum.c b/kernel/arm/zsum.c deleted file mode 100644 index cd24f9995..000000000 --- a/kernel/arm/zsum.c +++ /dev/null @@ -1,57 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* trivial copy of zasum.c with the ABS() removed * -**************************************************************************************/ - - -#include "common.h" -#include - -#define CSUM1(x,i) x[i]+x[i+1] - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - FLOAT sumf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return(sumf); - - inc_x2 = 2 * inc_x; - - n *= inc_x2; - while(i < n) - { - sumf += CSUM1(x,i); - i += inc_x2; - } - return(sumf); -} - - diff --git a/kernel/arm64/KERNEL.TSV110 b/kernel/arm64/KERNEL.TSV110 deleted file mode 100644 index 04d6940d7..000000000 --- a/kernel/arm64/KERNEL.TSV110 +++ /dev/null @@ -1,175 +0,0 @@ -SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = ../arm/amin.c -CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = ../arm/zamin.c - -SMAXKERNEL = ../arm/max.c -DMAXKERNEL = ../arm/max.c - -SMINKERNEL = ../arm/min.c -DMINKERNEL = ../arm/min.c - -ISAMINKERNEL = ../arm/iamin.c -IDAMINKERNEL = ../arm/iamin.c -ICAMINKERNEL = ../arm/izamin.c -IZAMINKERNEL = ../arm/izamin.c - -ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = ../arm/imax.c - -ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = ../arm/imin.c - -STRMMKERNEL = ../generic/trmmkernel_4x4.c -DTRMMKERNEL = ../generic/trmmkernel_2x2.c -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -SAMAXKERNEL = amax.S -DAMAXKERNEL = amax.S -CAMAXKERNEL = zamax.S -ZAMAXKERNEL = zamax.S - -ISAMAXKERNEL = iamax.S -IDAMAXKERNEL = iamax.S -ICAMAXKERNEL = izamax.S -IZAMAXKERNEL = izamax.S - -SASUMKERNEL = asum.S -DASUMKERNEL = asum.S -CASUMKERNEL = casum.S -ZASUMKERNEL = zasum.S - -SAXPYKERNEL = axpy.S -DAXPYKERNEL = axpy.S -CAXPYKERNEL = zaxpy.S -ZAXPYKERNEL = zaxpy.S - -SCOPYKERNEL = copy.S -DCOPYKERNEL = copy.S -CCOPYKERNEL = copy.S -ZCOPYKERNEL = copy.S - -SDOTKERNEL = dot.S -DDOTKERNEL = dot.S -CDOTKERNEL = zdot.S -ZDOTKERNEL = zdot.S -DSDOTKERNEL = dot.S - -SNRM2KERNEL = nrm2.S -DNRM2KERNEL = nrm2.S -CNRM2KERNEL = znrm2.S -ZNRM2KERNEL = znrm2.S - -SROTKERNEL = rot.S -DROTKERNEL = rot.S -CROTKERNEL = zrot.S -ZROTKERNEL = zrot.S - -SSCALKERNEL = scal.S -DSCALKERNEL = scal.S -CSCALKERNEL = zscal.S -ZSCALKERNEL = zscal.S - -SSWAPKERNEL = swap.S -DSWAPKERNEL = swap.S -CSWAPKERNEL = swap.S -ZSWAPKERNEL = swap.S - -SGEMVNKERNEL = gemv_n.S -DGEMVNKERNEL = gemv_n.S -CGEMVNKERNEL = zgemv_n.S -ZGEMVNKERNEL = zgemv_n.S - -SGEMVTKERNEL = gemv_t.S -DGEMVTKERNEL = gemv_t.S -CGEMVTKERNEL = zgemv_t.S -ZGEMVTKERNEL = zgemv_t.S - -SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S -ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) -SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c -SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c -SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S -DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S - -ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) - -ifeq ($(DGEMM_UNROLL_M), 8) -DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S -DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S -else -DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c -DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c -endif - -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif - -ifeq ($(DGEMM_UNROLL_N), 4) -DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S -DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S -else -DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c -DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c -endif - -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S -ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) -CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c -CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c -CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) - -ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S -ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) -ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c -ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -endif -ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) - diff --git a/kernel/arm64/csum.S b/kernel/arm64/csum.S deleted file mode 100644 index 90746bc39..000000000 --- a/kernel/arm64/csum.S +++ /dev/null @@ -1,164 +0,0 @@ -/******************************************************************************* -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*******************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define N x0 /* vector length */ -#define X x1 /* X vector address */ -#define INC_X x2 /* X stride */ -#define I x5 /* loop variable */ - -/******************************************************************************* -* Macro definitions -*******************************************************************************/ - -#define REG0 wzr -#define SUMF s0 -#define TMPF s1 -#define TMPVF {v1.s}[0] -#define SZ 4 - -/******************************************************************************/ - -.macro KERNEL_F1 - ld1 {v1.2s}, [X], #8 - ext v2.8b, v1.8b, v1.8b, #4 - fadd TMPF, TMPF, s2 - fadd SUMF, SUMF, TMPF -.endm - -.macro KERNEL_F8 - ld1 {v1.4s, v2.4s, v3.4s, v4.4s}, [X] - add X, X, #64 - - PRFM PLDL1KEEP, [X, #1024] - - fadd v1.4s, v1.4s, v2.4s - fadd v3.4s, v3.4s, v4.4s - fadd v0.4s, v0.4s, v1.4s - fadd v0.4s, v0.4s, v3.4s -.endm - -.macro KERNEL_F8_FINALIZE - ext v1.16b, v0.16b, v0.16b, #8 - fadd v0.2s, v0.2s, v1.2s - faddp SUMF, v0.2s -.endm - -.macro INIT_S - lsl INC_X, INC_X, #3 -.endm - -.macro KERNEL_S1 - ld1 {v1.2s}, [X], INC_X - ext v2.8b, v1.8b, v1.8b, #4 - fadd TMPF, TMPF, s2 - fadd SUMF, SUMF, TMPF - -.endm - -/******************************************************************************* -* End of macro definitions -*******************************************************************************/ - - PROLOGUE - - fmov SUMF, REG0 - fmov s1, SUMF - - cmp N, xzr - ble .Lcsum_kernel_L999 - cmp INC_X, xzr - ble .Lcsum_kernel_L999 - - cmp INC_X, #1 - bne .Lcsum_kernel_S_BEGIN - -.Lcsum_kernel_F_BEGIN: - - asr I, N, #3 - cmp I, xzr - beq .Lcsum_kernel_F1 - -.Lcsum_kernel_F8: - - KERNEL_F8 - - subs I, I, #1 - bne .Lcsum_kernel_F8 - - KERNEL_F8_FINALIZE - -.Lcsum_kernel_F1: - - ands I, N, #7 - ble .Lcsum_kernel_L999 - -.Lcsum_kernel_F10: - - KERNEL_F1 - - subs I, I, #1 - bne .Lcsum_kernel_F10 - -.Lcsum_kernel_L999: - ret - -.Lcsum_kernel_S_BEGIN: - - INIT_S - - asr I, N, #2 - cmp I, xzr - ble .Lcsum_kernel_S1 - -.Lcsum_kernel_S4: - - KERNEL_S1 - KERNEL_S1 - KERNEL_S1 - KERNEL_S1 - - subs I, I, #1 - bne .Lcsum_kernel_S4 - -.Lcsum_kernel_S1: - - ands I, N, #3 - ble .Lcsum_kernel_L999 - -.Lcsum_kernel_S10: - - KERNEL_S1 - - subs I, I, #1 - bne .Lcsum_kernel_S10 - - ret - - EPILOGUE diff --git a/kernel/arm64/sum.S b/kernel/arm64/sum.S deleted file mode 100644 index 16d0dc4e4..000000000 --- a/kernel/arm64/sum.S +++ /dev/null @@ -1,186 +0,0 @@ -/******************************************************************************* -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*******************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define N x0 /* vector length */ -#define X x1 /* X vector address */ -#define INC_X x2 /* X stride */ -#define I x5 /* loop variable */ - -/******************************************************************************* -* Macro definitions -*******************************************************************************/ - -#if !defined(DOUBLE) -#define REG0 wzr -#define SUMF s0 -#define TMPF s1 -#define TMPVF {v1.s}[0] -#define SZ 4 -#else -#define REG0 xzr -#define SUMF d0 -#define TMPF d1 -#define TMPVF {v1.d}[0] -#define SZ 8 -#endif - -/******************************************************************************/ - -.macro KERNEL_F1 - ldr TMPF, [X], #SZ - fadd SUMF, SUMF, TMPF -.endm - -.macro KERNEL_F8 -#if !defined(DOUBLE) - ld1 {v1.4s, v2.4s}, [X], #32 // Load [X3, X2, X1, X0] - fadd v1.4s, v1.4s, v2.4s // [X3+X1, X2+X0] - fadd v0.4s, v0.4s, v1.4s // [X3+X1, X2+X0] - PRFM PLDL1KEEP, [X, #1024] -#else // DOUBLE - ld1 {v2.2d, v3.2d, v4.2d, v5.2d}, [X] - add X, X, #64 - - PRFM PLDL1KEEP, [X, #1024] - - fadd v2.2d, v2.2d, v3.2d - fadd v4.2d, v4.2d, v5.2d - fadd v0.2d, v0.2d, v2.2d - fadd v0.2d, v0.2d, v4.2d -#endif -.endm - -.macro KERNEL_F8_FINALIZE -#if !defined(DOUBLE) - ext v1.16b, v0.16b, v0.16b, #8 - fadd v0.2s, v0.2s, v1.2s - faddp SUMF, v0.2s -#else - faddp SUMF, v0.2d -#endif -.endm - -.macro INIT_S -#if !defined(DOUBLE) - lsl INC_X, INC_X, #2 -#else - lsl INC_X, INC_X, #3 -#endif -.endm - -.macro KERNEL_S1 - ld1 TMPVF, [X], INC_X - fadd SUMF, SUMF, TMPF -.endm - -/******************************************************************************* -* End of macro definitions -*******************************************************************************/ - - PROLOGUE - - fmov SUMF, REG0 -#if !defined(DOUBLE) - fmov s1, SUMF -#else - fmov d1, SUMF -#endif - - cmp N, xzr - ble .Lsum_kernel_L999 - cmp INC_X, xzr - ble .Lsum_kernel_L999 - - cmp INC_X, #1 - bne .Lsum_kernel_S_BEGIN - -.Lsum_kernel_F_BEGIN: - - asr I, N, #3 - cmp I, xzr - beq .Lsum_kernel_F1 - -.Lsum_kernel_F8: - - KERNEL_F8 - - subs I, I, #1 - bne .Lsum_kernel_F8 - - KERNEL_F8_FINALIZE - -.Lsum_kernel_F1: - - ands I, N, #7 - ble .Lsum_kernel_L999 - -.Lsum_kernel_F10: - - KERNEL_F1 - - subs I, I, #1 - bne .Lsum_kernel_F10 - -.Lsum_kernel_L999: - ret - -.Lsum_kernel_S_BEGIN: - - INIT_S - - asr I, N, #2 - cmp I, xzr - ble .Lsum_kernel_S1 - -.Lsum_kernel_S4: - - KERNEL_S1 - KERNEL_S1 - KERNEL_S1 - KERNEL_S1 - - subs I, I, #1 - bne .Lsum_kernel_S4 - -.Lsum_kernel_S1: - - ands I, N, #3 - ble .Lsum_kernel_L999 - -.Lsum_kernel_S10: - - KERNEL_S1 - - subs I, I, #1 - bne .Lsum_kernel_S10 - - ret - - EPILOGUE diff --git a/kernel/arm64/zsum.S b/kernel/arm64/zsum.S deleted file mode 100644 index 67ea3cb4d..000000000 --- a/kernel/arm64/zsum.S +++ /dev/null @@ -1,158 +0,0 @@ -/******************************************************************************* -Copyright (c) 2015, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*******************************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define N x0 /* vector length */ -#define X x1 /* X vector address */ -#define INC_X x2 /* X stride */ -#define I x5 /* loop variable */ - -/******************************************************************************* -* Macro definitions -*******************************************************************************/ - -#define REG0 xzr -#define SUMF d0 -#define TMPF d1 -#define TMPVF {v1.d}[0] -#define SZ 8 - -/******************************************************************************/ - -.macro KERNEL_F1 - ld1 {v1.2d}, [X], #16 - faddp TMPF, v1.2d - fadd SUMF, SUMF, TMPF -.endm - -.macro KERNEL_F4 - ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64 - - fadd v1.2d, v1.2d, v2.2d - fadd v3.2d, v3.2d, v4.2d - - fadd v0.2d, v0.2d, v1.2d - fadd v0.2d, v0.2d, v3.2d - - PRFM PLDL1KEEP, [X, #1024] -.endm - -.macro KERNEL_F4_FINALIZE - faddp SUMF, v0.2d -.endm - -.macro INIT_S - lsl INC_X, INC_X, #4 -.endm - -.macro KERNEL_S1 - ld1 {v1.2d}, [X], INC_X - faddp TMPF, v1.2d - fadd SUMF, SUMF, TMPF -.endm - -/******************************************************************************* -* End of macro definitions -*******************************************************************************/ - - PROLOGUE - - fmov SUMF, REG0 - - cmp N, xzr - ble .Lzsum_kernel_L999 - cmp INC_X, xzr - ble .Lzsum_kernel_L999 - - cmp INC_X, #1 - bne .Lzsum_kernel_S_BEGIN - -.Lzsum_kernel_F_BEGIN: - - asr I, N, #2 - cmp I, xzr - beq .Lzsum_kernel_F1 - -.Lzsum_kernel_F4: - - KERNEL_F4 - - subs I, I, #1 - bne .Lzsum_kernel_F4 - - KERNEL_F4_FINALIZE - -.Lzsum_kernel_F1: - - ands I, N, #3 - ble .Lzsum_kernel_L999 - -.Lzsum_kernel_F10: - - KERNEL_F1 - - subs I, I, #1 - bne .Lzsum_kernel_F10 - -.Lzsum_kernel_L999: - ret - -.Lzsum_kernel_S_BEGIN: - - INIT_S - - asr I, N, #2 - cmp I, xzr - ble .Lzsum_kernel_S1 - -.Lzsum_kernel_S4: - - KERNEL_S1 - KERNEL_S1 - KERNEL_S1 - KERNEL_S1 - - subs I, I, #1 - bne .Lzsum_kernel_S4 - -.Lzsum_kernel_S1: - - ands I, N, #3 - ble .Lzsum_kernel_L999 - -.Lzsum_kernel_S10: - - KERNEL_S1 - - subs I, I, #1 - bne .Lzsum_kernel_S10 - - ret - - EPILOGUE diff --git a/kernel/ia64/KERNEL b/kernel/ia64/KERNEL index 870aac473..10a7e61e2 100644 --- a/kernel/ia64/KERNEL +++ b/kernel/ia64/KERNEL @@ -60,10 +60,6 @@ CASUMKERNEL = asum.S ZASUMKERNEL = asum.S XASUMKERNEL = asum.S -CSUMKERNEL = sum.S -ZSUMKERNEL = sum.S -XSUMKERNEL = sum.S - CNRM2KERNEL = nrm2.S ZNRM2KERNEL = nrm2.S XNRM2KERNEL = nrm2.S diff --git a/kernel/ia64/sum.S b/kernel/ia64/sum.S deleted file mode 100644 index 561d5d771..000000000 --- a/kernel/ia64/sum.S +++ /dev/null @@ -1,358 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* Copyright 2019, The OpenBLAS project */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#ifdef XDOUBLE -#define PREFETCH_SIZE ( 8 * 16 + 4) -#elif defined(DOUBLE) -#define PREFETCH_SIZE (16 * 16 + 8) -#else -#define PREFETCH_SIZE (32 * 16 + 16) -#endif - -#ifndef COMPLEX -#define COMPADD 0 -#define STRIDE INCX -#else -#define COMPADD 1 -#define STRIDE SIZE -#endif - -#define PRE1 r2 - -#define I r17 -#define J r18 -#define INCX16 r21 - -#define PR r30 -#define ARLC r31 - -#define N r32 -#define X r33 -#define INCX r34 - - - PROLOGUE - .prologue - PROFCODE - { .mfi - adds PRE1 = PREFETCH_SIZE * SIZE, X - mov f8 = f0 - .save ar.lc, ARLC - mov ARLC = ar.lc - } - ;; - .body -#ifdef F_INTERFACE - { .mmi - LDINT N = [N] - LDINT INCX = [INCX] - nop.i 0 - } - ;; -#ifndef USE64BITINT - { .mii - nop.m 0 - sxt4 N = N - sxt4 INCX = INCX - } - ;; -#endif -#endif - { .mmi - cmp.lt p0, p6 = r0, INCX - cmp.lt p0, p7 = r0, N - shr I = N, (4 - COMPADD) - } - { .mbb - and J = ((1 << (4 - COMPADD)) - 1), N - (p6) br.ret.sptk.many b0 - (p7) br.ret.sptk.many b0 - } - ;; - { .mfi - adds I = -1, I - mov f10 = f0 - mov PR = pr - } - { .mfi - cmp.eq p9, p0 = r0, J - mov f9 = f0 - tbit.z p0, p12 = N, 3 - COMPADD - } - ;; - { .mmi - cmp.eq p16, p0 = r0, r0 - cmp.ne p17, p0 = r0, r0 - mov ar.ec= 3 - } - { .mfi - cmp.ne p18, p0 = r0, r0 - mov f11 = f0 - shl INCX = INCX, BASE_SHIFT + COMPADD - } - ;; - { .mmi -#ifdef XDOUBLE - shladd INCX16 = INCX, (3 - COMPADD), r0 -#else - shladd INCX16 = INCX, (4 - COMPADD), r0 -#endif - cmp.ne p19, p0 = r0, r0 - mov ar.lc = I - } - { .mmb - cmp.gt p8 ,p0 = r0, I -#ifdef COMPLEX - adds INCX = - SIZE, INCX -#else - nop.m 0 -#endif - (p8) br.cond.dpnt .L55 - } - ;; - .align 32 - -.L52: - { .mmf - (p16) lfetch.nt1 [PRE1], INCX16 - (p16) LDFD f32 = [X], STRIDE - } - { .mfb - (p19) FADD f8 = f8, f71 - } - ;; - { .mmf - (p16) LDFD f35 = [X], INCX - } - { .mfb - (p19) FADD f9 = f9, f74 - } - ;; - { .mmf - (p16) LDFD f38 = [X], STRIDE - } - { .mfb - (p19) FADD f10 = f10, f77 - } - ;; - { .mmf - (p16) LDFD f41 = [X], INCX - } - { .mfb - (p19) FADD f11 = f11, f80 - } - ;; - { .mmf - (p16) LDFD f44 = [X], STRIDE - } - { .mfb - (p18) FADD f8 = f8, f34 - } - ;; - { .mmf - (p16) LDFD f47 = [X], INCX - } - { .mfb - (p18) FADD f9 = f9, f37 - } - ;; - { .mmf - (p16) LDFD f50 = [X], STRIDE - } - { .mfb - (p18) FADD f10 = f10, f40 - } - ;; - { .mmf - (p16) LDFD f53 = [X], INCX - } - { .mfb - (p18) FADD f11 = f11, f43 - } - ;; - { .mmf -#ifdef XDOUBLE - (p16) lfetch.nt1 [PRE1], INCX16 -#endif - (p16) LDFD f56 = [X], STRIDE - } - { .mfb - (p18) FADD f8 = f8, f46 - } - ;; - { .mmf - (p16) LDFD f59 = [X], INCX - } - { .mfb - (p18) FADD f9 = f9, f49 - } - ;; - { .mmf - (p16) LDFD f62 = [X], STRIDE - } - { .mfb - (p18) FADD f10 = f10, f52 - } - ;; - { .mmf - (p16) LDFD f65 = [X], INCX - } - { .mfb - (p18) FADD f11 = f11, f55 - } - ;; - { .mmf - (p16) LDFD f68 = [X], STRIDE - } - { .mfb - (p18) FADD f8 = f8, f58 - } - ;; - { .mmf - (p16) LDFD f71 = [X], INCX - } - { .mfb - (p18) FADD f9 = f9, f61 - } - ;; - { .mmf - (p16) LDFD f74 = [X], STRIDE - } - { .mfb - (p18) FADD f10 = f10, f64 - } - ;; - { .mmf - (p16) LDFD f77 = [X], INCX - } - { .mfb - (p18) FADD f11 = f11, f67 - br.ctop.sptk.few .L52 - } - ;; - FADD f8 = f8, f71 - FADD f9 = f9, f74 - FADD f10 = f10, f77 - FADD f11 = f11, f80 - .align 32 - ;; -.L55: - (p12) LDFD f32 = [X], STRIDE - (p9) br.cond.dptk .L998 - ;; - (p12) LDFD f33 = [X], INCX - ;; - (p12) LDFD f34 = [X], STRIDE - ;; - (p12) LDFD f35 = [X], INCX - tbit.z p0, p13 = N, (2 - COMPADD) - ;; - (p12) LDFD f36 = [X], STRIDE - tbit.z p0, p14 = N, (1 - COMPADD) - ;; - (p12) LDFD f37 = [X], INCX -#ifndef COMPLEX - tbit.z p0, p15 = N, 0 -#endif - ;; - (p12) LDFD f38 = [X], STRIDE - ;; - (p12) LDFD f39 = [X], INCX - ;; - (p13) LDFD f40 = [X], STRIDE - ;; - (p13) LDFD f41 = [X], INCX - ;; - (p13) LDFD f42 = [X], STRIDE - (p12) FADD f8 = f8, f32 - ;; - (p13) LDFD f43 = [X], INCX - (p12) FADD f9 = f9, f33 - ;; - (p14) LDFD f44 = [X], STRIDE - (p12) FADD f10 = f10, f34 - ;; - (p14) LDFD f45 = [X], INCX - (p12) FADD f11 = f11, f35 - ;; -#ifndef COMPLEX - (p15) LDFD f46 = [X] -#endif - (p12) FADD f8 = f8, f36 - ;; - (p12) FADD f9 = f9, f37 - (p12) FADD f10 = f10, f38 - (p12) FADD f11 = f11, f39 - ;; - (p13) FADD f8 = f8, f40 - (p13) FADD f9 = f9, f41 -#ifndef COMPLEX -#endif - (p13) FADD f10 = f10, f42 - ;; - (p13) FADD f11 = f11, f43 - (p14) FADD f8 = f8, f44 - (p14) FADD f9 = f9, f45 -#ifndef COMPLEX - (p15) FADD f10 = f10, f46 -#endif - ;; - .align 32 - -.L998: - { .mfi - FADD f8 = f8, f9 - mov ar.lc = ARLC - } - { .mmf - FADD f10 = f10, f11 - } - ;; - { .mii - mov pr = PR, -65474 - } - ;; - { .mfb - FADD f8 = f8, f10 - br.ret.sptk.many b0 - } - EPILOGUE diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index 9a6e06d67..1ab193069 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -30,11 +30,6 @@ IDMAXKERNEL = ../mips/imax.c ISMINKERNEL = ../mips/imin.c IDMINKERNEL = ../mips/imin.c -SSUMKERNEL = ../mips/sum.c -DSUMKERNEL = ../mips/sum.c -CSUMKERNEL = ../mips/zsum.c -ZSUMKERNEL = ../mips/zsum.c - ifdef HAVE_MSA SASUMKERNEL = ../mips/sasum_msa.c DASUMKERNEL = ../mips/dasum_msa.c diff --git a/kernel/mips/imin.c b/kernel/mips/imin.c index bf130613b..d9b283d2d 100644 --- a/kernel/mips/imin.c +++ b/kernel/mips/imin.c @@ -45,7 +45,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) while(i < n) { - if( x[ix] < minf ) + if( x[ix] > minf ) { min = i; minf = x[ix]; diff --git a/kernel/mips/sum.c b/kernel/mips/sum.c deleted file mode 100644 index 8ce3812a1..000000000 --- a/kernel/mips/sum.c +++ /dev/null @@ -1,47 +0,0 @@ -/*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - FLOAT sumf = 0.0; - if (n <= 0 || inc_x <= 0) return(sumf); - - n *= inc_x; - while(i < n) - { - sumf += x[i]; - i += inc_x; - } - return(sumf); -} - - diff --git a/kernel/mips/zsum.c b/kernel/mips/zsum.c deleted file mode 100644 index 01f8ced7c..000000000 --- a/kernel/mips/zsum.c +++ /dev/null @@ -1,52 +0,0 @@ -/*************************************************************************** -Copyright (c) 2016, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -#define CSUM1(x,i) x[i]+x[i+1] - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) -{ - BLASLONG i=0; - FLOAT sumf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) return(sumf); - - inc_x2 = 2 * inc_x; - - n *= inc_x2; - while(i < n) - { - sumf += CSUM1(x,i); - i += inc_x2; - } - return(sumf); -} - - diff --git a/kernel/mips64/sum.S b/kernel/mips64/sum.S deleted file mode 100644 index 261630d49..000000000 --- a/kernel/mips64/sum.S +++ /dev/null @@ -1,332 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define N $4 -#define X $5 -#define INCX $6 - -#define I $2 -#define TEMP $3 - -#define a1 $f2 -#define a2 $f3 -#define a3 $f4 -#define a4 $f5 -#define a5 $f6 -#define a6 $f7 -#define a7 $f8 -#define a8 $f9 - -#define t1 $f10 -#define t2 $f11 -#define t3 $f12 -#define t4 $f13 - -#define s1 $f0 -#define s2 $f1 - - PROLOGUE - -#ifdef F_INTERFACE - LDINT N, 0(N) - LDINT INCX, 0(INCX) -#endif - - MTC $0, s1 - - MTC $0, s2 - dsll INCX, INCX, BASE_SHIFT - - blez N, .L999 - li TEMP, SIZE - - bne INCX, TEMP, .L20 - dsra I, N, 3 - - blez I, .L15 - NOP - - LD a1, 0 * SIZE(X) - LD a2, 1 * SIZE(X) - LD a3, 2 * SIZE(X) - LD a4, 3 * SIZE(X) - - LD a5, 4 * SIZE(X) - MOV t1, a1 - LD a6, 5 * SIZE(X) - MOV t2, a2 - LD a7, 6 * SIZE(X) - MOV t3, a3 - - MOV t4, a4 - daddiu I, I, -1 - - blez I, .L13 - LD a8, 7 * SIZE(X) - .align 3 - -.L12: - ADD s1, s1, t1 - LD a1, 8 * SIZE(X) - - MOV t1, a5 - daddiu I, I, -1 - - ADD s2, s2, t2 - LD a2, 9 * SIZE(X) - - MOV t2, a6 - NOP - - ADD s1, s1, t3 - LD a3, 10 * SIZE(X) - - MOV t3, a7 - NOP - - ADD s2, s2, t4 - LD a4, 11 * SIZE(X) - - MOV t4, a8 - daddiu X, X, 8 * SIZE - - ADD s1, s1, t1 - LD a5, 4 * SIZE(X) - - MOV t1, a1 - NOP - - ADD s2, s2, t2 - LD a6, 5 * SIZE(X) - - MOV t2, a2 - NOP - - ADD s1, s1, t3 - LD a7, 6 * SIZE(X) - - MOV t3, a3 - NOP - - ADD s2, s2, t4 - LD a8, 7 * SIZE(X) - - bgtz I, .L12 - MOV t4, a4 - .align 3 - -.L13: - ADD s1, s1, t1 - daddiu X, X, 8 * SIZE - - MOV t1, a5 - NOP - - ADD s2, s2, t2 - MOV t2, a6 - - ADD s1, s1, t3 - MOV t3, a7 - - ADD s2, s2, t4 - MOV t4, a8 - - ADD s1, s1, t1 - ADD s2, s2, t2 - ADD s1, s1, t3 - ADD s2, s2, t4 - .align 3 - -.L15: - andi I, N, 7 - - blez I, .L999 - NOP - .align 3 - -.L16: - LD a1, 0 * SIZE(X) - daddiu I, I, -1 - - MOV t1, a1 - - ADD s1, s1, t1 - - bgtz I, .L16 - daddiu X, X, SIZE - - j .L999 - NOP - .align 3 - -.L20: - blez I, .L25 - NOP - - LD a1, 0 * SIZE(X) - daddu X, X, INCX - - LD a2, 0 * SIZE(X) - daddu X, X, INCX - - LD a3, 0 * SIZE(X) - daddu X, X, INCX - - LD a4, 0 * SIZE(X) - daddu X, X, INCX - - LD a5, 0 * SIZE(X) - daddu X, X, INCX - - LD a6, 0 * SIZE(X) - daddu X, X, INCX - - MOV t1, a1 - LD a7, 0 * SIZE(X) - - MOV t2, a2 - daddu X, X, INCX - - MOV t3, a3 - LD a8, 0 * SIZE(X) - - MOV t4, a4 - daddiu I, I, -1 - - blez I, .L24 - daddu X, X, INCX - .align 3 - -.L23: - ADD s1, s1, t1 - LD a1, 0 * SIZE(X) - - MOV t1, a5 - daddu X, X, INCX - - ADD s2, s2, t2 - LD a2, 0 * SIZE(X) - - MOV t2, a6 - daddu X, X, INCX - - ADD s1, s1, t3 - LD a3, 0 * SIZE(X) - - MOV t3, a7 - daddu X, X, INCX - - ADD s2, s2, t4 - LD a4, 0 * SIZE(X) - - MOV t4, a8 - daddu X, X, INCX - - ADD s1, s1, t1 - LD a5, 0 * SIZE(X) - - MOV t1, a1 - daddu X, X, INCX - - ADD s2, s2, t2 - LD a6, 0 * SIZE(X) - - MOV t2, a2 - daddu X, X, INCX - - ADD s1, s1, t3 - LD a7, 0 * SIZE(X) - - MOV t3, a3 - daddu X, X, INCX - - ADD s2, s2, t4 - LD a8, 0 * SIZE(X) - - MOV t4, a4 - daddiu I, I, -1 - - bgtz I, .L23 - daddu X, X, INCX - .align 3 - -.L24: - ADD s1, s1, t1 - MOV t1, a5 - - ADD s2, s2, t2 - MOV t2, a6 - - ADD s1, s1, t3 - MOV t3, a7 - - ADD s2, s2, t4 - MOV t4, a8 - - ADD s1, s1, t1 - ADD s2, s2, t2 - ADD s1, s1, t3 - ADD s2, s2, t4 - .align 3 - -.L25: - andi I, N, 7 - - blez I, .L999 - NOP - .align 3 - -.L26: - LD a1, 0 * SIZE(X) - daddiu I, I, -1 - - MOV t1, a1 - daddu X, X, INCX - - bgtz I, .L26 - ADD s1, s1, t1 - .align 3 - -.L999: - j $31 - ADD s1, s1, s2 - - EPILOGUE diff --git a/kernel/mips64/zsum.S b/kernel/mips64/zsum.S deleted file mode 100644 index 129b97900..000000000 --- a/kernel/mips64/zsum.S +++ /dev/null @@ -1,204 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define N $4 -#define X $5 -#define INCX $6 - -#define I $2 -#define TEMP $3 - -#define a1 $f2 -#define a2 $f3 -#define a3 $f4 -#define a4 $f5 -#define a5 $f6 -#define a6 $f7 -#define a7 $f8 -#define a8 $f9 - -#define t1 $f10 -#define t2 $f11 -#define t3 $f12 -#define t4 $f13 - -#define s1 $f0 -#define s2 $f1 - - PROLOGUE - -#ifdef F_INTERFACE - LDINT N, 0(N) - LDINT INCX, 0(INCX) -#endif - - MTC $0, s1 - - MTC $0, s2 - dsll INCX, INCX, ZBASE_SHIFT - - blez N, .L999 - dsra I, N, 2 - - blez I, .L25 - NOP - - LD a1, 0 * SIZE(X) - LD a2, 1 * SIZE(X) - daddu X, X, INCX - - LD a3, 0 * SIZE(X) - LD a4, 1 * SIZE(X) - daddu X, X, INCX - - LD a5, 0 * SIZE(X) - LD a6, 1 * SIZE(X) - daddu X, X, INCX - - MOV t1, a1 - MOV t2, a2 - - LD a7, 0 * SIZE(X) - LD a8, 1 * SIZE(X) - - MOV t3, a3 - MOV t4, a4 - daddiu I, I, -1 - - blez I, .L24 - daddu X, X, INCX - .align 3 - -.L23: - ADD s1, s1, t1 - LD a1, 0 * SIZE(X) - - MOV t1, a5 - daddiu I, I, -1 - - ADD s2, s2, t2 - LD a2, 1 * SIZE(X) - - MOV t2, a6 - daddu X, X, INCX - - ADD s1, s1, t3 - LD a3, 0 * SIZE(X) - - MOV t3, a7 - NOP - - ADD s2, s2, t4 - LD a4, 1 * SIZE(X) - - MOV t4, a8 - daddu X, X, INCX - - ADD s1, s1, t1 - LD a5, 0 * SIZE(X) - - MOV t1, a1 - NOP - - ADD s2, s2, t2 - LD a6, 1 * SIZE(X) - - MOV t2, a2 - daddu X, X, INCX - - ADD s1, s1, t3 - LD a7, 0 * SIZE(X) - - MOV t3, a3 - LD a8, 1 * SIZE(X) - - ADD s2, s2, t4 - daddu X, X, INCX - - bgtz I, .L23 - MOV t4, a4 - .align 3 - -.L24: - ADD s1, s1, t1 - MOV t1, a5 - - ADD s2, s2, t2 - MOV t2, a6 - - ADD s1, s1, t3 - MOV t3, a7 - - ADD s2, s2, t4 - MOV t4, a8 - - ADD s1, s1, t1 - ADD s2, s2, t2 - ADD s1, s1, t3 - ADD s2, s2, t4 - .align 3 - -.L25: - andi I, N, 3 - - blez I, .L999 - NOP - .align 3 - -.L26: - LD a1, 0 * SIZE(X) - LD a2, 1 * SIZE(X) - - MOV t1, a1 - daddiu I, I, -1 - MOV t2, a2 - daddu X, X, INCX - - ADD s1, s1, t1 - bgtz I, .L26 - ADD s2, s2, t2 - .align 3 - -.L999: - j $31 - ADD s1, s1, s2 - - EPILOGUE diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index 43f004fbb..cbcffb8fe 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -13,40 +13,40 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c SGEMMITCOPY = sgemm_tcopy_16_power8.S SGEMMONCOPY = ../generic/gemm_ncopy_8.c SGEMMOTCOPY = sgemm_tcopy_8_power8.S -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +SGEMMINCOPYOBJ = sgemm_incopy.o +SGEMMITCOPYOBJ = sgemm_itcopy.o +SGEMMONCOPYOBJ = sgemm_oncopy.o +SGEMMOTCOPYOBJ = sgemm_otcopy.o DGEMMKERNEL = dgemm_kernel_16x4_power8.S DGEMMINCOPY = ../generic/gemm_ncopy_16.c DGEMMITCOPY = dgemm_tcopy_16_power8.S DGEMMONCOPY = dgemm_ncopy_4_power8.S DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy.o +DGEMMOTCOPYOBJ = dgemm_otcopy.o CGEMMKERNEL = cgemm_kernel_8x4_power8.S CGEMMINCOPY = ../generic/zgemm_ncopy_8.c CGEMMITCOPY = cgemm_tcopy_8_power8.S CGEMMONCOPY = ../generic/zgemm_ncopy_4.c CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy.o +CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMINCOPYOBJ = cgemm_incopy.o +CGEMMITCOPYOBJ = cgemm_itcopy.o ZGEMMKERNEL = zgemm_kernel_8x2_power8.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c ZGEMMITCOPY = zgemm_tcopy_8_power8.S -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy.o +ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMINCOPYOBJ = zgemm_incopy.o +ZGEMMITCOPYOBJ = zgemm_itcopy.o STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -147,14 +147,14 @@ CSWAPKERNEL = cswap.c ZSWAPKERNEL = zswap.c # -SGEMVNKERNEL = sgemv_n.c +#SGEMVNKERNEL = ../arm/gemv_n.c DGEMVNKERNEL = dgemv_n.c -CGEMVNKERNEL = cgemv_n.c +#CGEMVNKERNEL = ../arm/zgemv_n.c ZGEMVNKERNEL = zgemv_n_4.c # -SGEMVTKERNEL = sgemv_t.c +#SGEMVTKERNEL = ../arm/gemv_t.c DGEMVTKERNEL = dgemv_t.c -CGEMVTKERNEL = cgemv_t.c +#CGEMVTKERNEL = ../arm/zgemv_t.c ZGEMVTKERNEL = zgemv_t_4.c diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 deleted file mode 100644 index a570a903a..000000000 --- a/kernel/power/KERNEL.POWER9 +++ /dev/null @@ -1,184 +0,0 @@ -#SGEMM_BETA = ../generic/gemm_beta.c -#DGEMM_BETA = ../generic/gemm_beta.c -#CGEMM_BETA = ../generic/zgemm_beta.c -#ZGEMM_BETA = ../generic/zgemm_beta.c - -STRMMKERNEL = sgemm_kernel_power9.S -DTRMMKERNEL = dgemm_kernel_power9.S -CTRMMKERNEL = cgemm_kernel_power9.S -ZTRMMKERNEL = zgemm_kernel_power9.S - -SGEMMKERNEL = sgemm_kernel_power9.S -SGEMMINCOPY = ../generic/gemm_ncopy_16.c -SGEMMITCOPY = sgemm_tcopy_16_power8.S -SGEMMONCOPY = ../generic/gemm_ncopy_8.c -SGEMMOTCOPY = sgemm_tcopy_8_power8.S -SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) -SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) -SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) -SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) - -DGEMMKERNEL = dgemm_kernel_power9.S -DGEMMINCOPY = ../generic/gemm_ncopy_16.c -DGEMMITCOPY = dgemm_tcopy_16_power8.S -DGEMMONCOPY = dgemm_ncopy_4_power8.S -DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) -DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) -DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) -DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) - -CGEMMKERNEL = cgemm_kernel_power9.S -CGEMMINCOPY = ../generic/zgemm_ncopy_8.c -CGEMMITCOPY = ../generic/zgemm_tcopy_8.c -CGEMMONCOPY = ../generic/zgemm_ncopy_4.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c -CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) -CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) -CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) -CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) - -ZGEMMKERNEL = zgemm_kernel_power9.S -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c -ZGEMMITCOPY = zgemm_tcopy_8_power8.S -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) -ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) -ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -#Todo: CGEMM3MKERNEL should be 4x4 blocksizes. -#CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S -#ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S - -#Pure C for other kernels -#SAMAXKERNEL = ../arm/amax.c -#DAMAXKERNEL = ../arm/amax.c -#CAMAXKERNEL = ../arm/zamax.c -#ZAMAXKERNEL = ../arm/zamax.c -# -#SAMINKERNEL = ../arm/amin.c -#DAMINKERNEL = ../arm/amin.c -#CAMINKERNEL = ../arm/zamin.c -#ZAMINKERNEL = ../arm/zamin.c -# -#SMAXKERNEL = ../arm/max.c -#DMAXKERNEL = ../arm/max.c -# -#SMINKERNEL = ../arm/min.c -#DMINKERNEL = ../arm/min.c -# -ISAMAXKERNEL = isamax.c -IDAMAXKERNEL = idamax.c -ICAMAXKERNEL = icamax.c -IZAMAXKERNEL = izamax.c -# -ISAMINKERNEL = isamin.c -IDAMINKERNEL = idamin.c -ICAMINKERNEL = icamin.c -IZAMINKERNEL = izamin.c -# -#ISMAXKERNEL = ../arm/imax.c -#IDMAXKERNEL = ../arm/imax.c -# -#ISMINKERNEL = ../arm/imin.c -#IDMINKERNEL = ../arm/imin.c -# -SASUMKERNEL = sasum.c -DASUMKERNEL = dasum.c -CASUMKERNEL = casum.c -ZASUMKERNEL = zasum.c -# -SAXPYKERNEL = saxpy.c -DAXPYKERNEL = daxpy.c -CAXPYKERNEL = caxpy.c -ZAXPYKERNEL = zaxpy.c -# -SCOPYKERNEL = scopy.c -DCOPYKERNEL = dcopy.c -CCOPYKERNEL = ccopy.c -ZCOPYKERNEL = zcopy.c -# -SDOTKERNEL = sdot.c -DDOTKERNEL = ddot.c -DSDOTKERNEL = sdot.c -CDOTKERNEL = cdot.c -ZDOTKERNEL = zdot.c -# -SNRM2KERNEL = ../arm/nrm2.c -DNRM2KERNEL = ../arm/nrm2.c -CNRM2KERNEL = ../arm/znrm2.c -ZNRM2KERNEL = ../arm/znrm2.c -# -SROTKERNEL = srot.c -DROTKERNEL = drot.c -CROTKERNEL = crot.c -ZROTKERNEL = zrot.c -# -SSCALKERNEL = sscal.c -DSCALKERNEL = dscal.c -CSCALKERNEL = zscal.c -ZSCALKERNEL = zscal.c -# -SSWAPKERNEL = sswap.c -DSWAPKERNEL = dswap.c -CSWAPKERNEL = cswap.c -ZSWAPKERNEL = zswap.c -# - -SGEMVNKERNEL = sgemv_n.c -DGEMVNKERNEL = dgemv_n.c -CGEMVNKERNEL = cgemv_n.c -ZGEMVNKERNEL = zgemv_n_4.c -# -SGEMVTKERNEL = sgemv_t.c -DGEMVTKERNEL = dgemv_t.c -CGEMVTKERNEL = cgemv_t.c -ZGEMVTKERNEL = zgemv_t_4.c - - -#SSYMV_U_KERNEL = ../generic/symv_k.c -#SSYMV_L_KERNEL = ../generic/symv_k.c -#DSYMV_U_KERNEL = ../generic/symv_k.c -#DSYMV_L_KERNEL = ../generic/symv_k.c -#QSYMV_U_KERNEL = ../generic/symv_k.c -#QSYMV_L_KERNEL = ../generic/symv_k.c -#CSYMV_U_KERNEL = ../generic/zsymv_k.c -#CSYMV_L_KERNEL = ../generic/zsymv_k.c -#ZSYMV_U_KERNEL = ../generic/zsymv_k.c -#ZSYMV_L_KERNEL = ../generic/zsymv_k.c -#XSYMV_U_KERNEL = ../generic/zsymv_k.c -#XSYMV_L_KERNEL = ../generic/zsymv_k.c - -#ZHEMV_U_KERNEL = ../generic/zhemv_k.c -#ZHEMV_L_KERNEL = ../generic/zhemv_k.c - -LSAME_KERNEL = ../generic/lsame.c -SCABS_KERNEL = ../generic/cabs.c -DCABS_KERNEL = ../generic/cabs.c -QCABS_KERNEL = ../generic/cabs.c - -#Dump kernel -CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c -ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c diff --git a/kernel/power/axpy.S b/kernel/power/axpy.S index 238771826..fb9789da4 100644 --- a/kernel/power/axpy.S +++ b/kernel/power/axpy.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define N r3 #define X r6 diff --git a/kernel/power/axpy_ppc440.S b/kernel/power/axpy_ppc440.S index 7733e46e7..81a660e4d 100644 --- a/kernel/power/axpy_ppc440.S +++ b/kernel/power/axpy_ppc440.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define N r3 #define X r6 diff --git a/kernel/power/casum.c b/kernel/power/casum.c index a9ece0768..d1108581d 100644 --- a/kernel/power/casum.c +++ b/kernel/power/casum.c @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) #include "casum_microk_power8.c" #endif diff --git a/kernel/power/ccopy.c b/kernel/power/ccopy.c index 50df84cc5..ce7d67475 100644 --- a/kernel/power/ccopy.c +++ b/kernel/power/ccopy.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) #include "ccopy_microk_power8.c" #endif diff --git a/kernel/power/cgemm_kernel_8x4_power8.S b/kernel/power/cgemm_kernel_8x4_power8.S index 2bc99974f..8dbb6011d 100644 --- a/kernel/power/cgemm_kernel_8x4_power8.S +++ b/kernel/power/cgemm_kernel_8x4_power8.S @@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -265,7 +265,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stfs f2, ALPHA_I_SP // stw r0, FZERO -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif @@ -286,7 +286,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif diff --git a/kernel/power/cgemm_kernel_power9.S b/kernel/power/cgemm_kernel_power9.S deleted file mode 100644 index 4b5c2fa31..000000000 --- a/kernel/power/cgemm_kernel_power9.S +++ /dev/null @@ -1,293 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@gmail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - - -#define LOAD ld -#define STACKSIZE (512 ) -#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ -#define M r3 -#define N r4 -#define K r5 - - -#define A r8 -#define B r9 -#define C r10 -#define LDC r6 -#define OFFSET r7 - - -#define alpha_r vs19 -#define alpha_i vs20 -#define save_permute_1 vs21 -#define permute_mask vs22 -#define o0 0 - - -#define T1 r11 -#define T2 r12 -#define T3 r14 -#define T4 r15 -#define T5 r16 -#define T6 r17 -#define L r18 -#define T7 r19 -#define T8 r20 -#define TEMP_REG r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define T9 r27 -#define T10 r28 -#define PRE r29 - -#define T12 r30 -#define T13 r31 - -#include "cgemm_macros_power9.S" - -.equ perm_const1, 0x0405060700010203 -.equ perm_const2, 0x0c0d0e0f08090a0b -.equ save_permute_12, 0x0c0d0e0f1c1d1e1f -.equ save_permute_11, 0x0405060714151617 - - - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - - addi SP, SP, -STACKSIZE - mflr r0 - - - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - std r0, FLINK_SAVE(SP) - - - - ld LDC, FRAMESLOT(0) + STACKSIZE(SP) - - - -#ifdef TRMMKERNEL - ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) -#endif - slwi LDC, LDC, ZBASE_SHIFT - - - - /*alpha is stored in f1. convert to single and splat*/ - xscvdpspn alpha_r,vs1 - xscvdpspn alpha_i,vs2 - xxspltw alpha_r,alpha_r,0 - xxspltw alpha_i,alpha_i,0 -/*load reverse permute mask for big endian - uint128 = 0xc0d0e0f08090a0b0405060700010203 -*/ - - lis T2, perm_const2@highest - lis T1, perm_const1@highest - lis T3, save_permute_12@highest - lis T4, save_permute_11@highest - - - ori T2, T2, perm_const2@higher - ori T1, T1, perm_const1@higher - ori T3, T3, save_permute_12@higher - ori T4, T4, save_permute_11@higher - - - rldicr T2, T2, 32, 31 - rldicr T1, T1, 32, 31 - rldicr T3, T3, 32, 31 - rldicr T4, T4, 32, 31 - - oris T2, T2, perm_const2@h - oris T1, T1, perm_const1@h - oris T3, T3, save_permute_12@h - oris T4, T4, save_permute_11@h - - - ori T2, T2, perm_const2@l - ori T1, T1, perm_const1@l - ori T3, T3, save_permute_12@l - ori T4, T4, save_permute_11@l - - - li r0,0 - li PRE,512 - -#if defined(CC) || defined(CR) || defined(RC) || defined(RR) -/*negate for this case as we will use addition -1*(a+b) */ - xvnegsp alpha_r,alpha_r - xvnegsp alpha_i,alpha_i -#endif - - mtvsrdd permute_mask,T2,T1 - mtvsrdd save_permute_1,T3,T4 - - /*mask is reverse permute so we have to make it inner permute */ - xxpermdi permute_mask, permute_mask, permute_mask,2 - -#include "cgemm_logic_power9.S" - -.L999: - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - ld r0, FLINK_SAVE(SP) - - lxv vs52, 288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - mtlr r0 - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE - blr - - - EPILOGUE -#endif diff --git a/kernel/power/cgemm_logic_power9.S b/kernel/power/cgemm_logic_power9.S deleted file mode 100644 index b4f937e90..000000000 --- a/kernel/power/cgemm_logic_power9.S +++ /dev/null @@ -1,2816 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@gmail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ -#define MY_ALIGN .align 3 -b CGEMM_L4 -/* MINI SUBROUTINES */ -/* 4x8 MAIN 128x+2 LOOP */ - - -CGEMM_L4x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x8_2 - MY_ALIGN -CGEMM_L4x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL4x8_L2 128,64,0,0 -CGEMM_L4x8_K128: -/*----------------------------------------*/ - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_L2 128,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL4x8_L2 128,64,8,0 - KERNEL4x8_L2 128,64,9,0 - KERNEL4x8_L2 128,64,10,0 - KERNEL4x8_L2 128,64,11,0 - dcbt BO, T4 - KERNEL4x8_L2 128,64,12,0 - KERNEL4x8_L2 128,64,13,0 - KERNEL4x8_L2 128,64,14,0 - KERNEL4x8_L2 128,64,15,0 - KERNEL4x8_L2 128,64,16,0 - KERNEL4x8_L2 128,64,17,0 - KERNEL4x8_L2 128,64,18,0 - KERNEL4x8_L2 128,64,19,0 - KERNEL4x8_L2 128,64,20,0 - KERNEL4x8_L2 128,64,21,0 - KERNEL4x8_L2 128,64,22,0 - KERNEL4x8_L2 128,64,23,0 - KERNEL4x8_L2 128,64,24,0 - KERNEL4x8_L2 128,64,25,0 - KERNEL4x8_L2 128,64,26,0 - KERNEL4x8_L2 128,64,27,0 - KERNEL4x8_L2 128,64,28,0 - KERNEL4x8_L2 128,64,29,0 - KERNEL4x8_L2 128,64,30,0 - KERNEL4x8_L2 128,64,31,0 - KERNEL4x8_L2 128,64,32,0 - KERNEL4x8_L2 128,64,33,0 - KERNEL4x8_L2 128,64,34,0 - KERNEL4x8_L2 128,64,35,0 - KERNEL4x8_L2 128,64,36,0 - KERNEL4x8_L2 128,64,37,0 - KERNEL4x8_L2 128,64,38,0 - KERNEL4x8_L2 128,64,39,0 - KERNEL4x8_L2 128,64,40,0 - KERNEL4x8_L2 128,64,41,0 - KERNEL4x8_L2 128,64,42,0 - KERNEL4x8_L2 128,64,43,0 - KERNEL4x8_L2 128,64,44,0 - KERNEL4x8_L2 128,64,45,0 - KERNEL4x8_L2 128,64,46,0 - KERNEL4x8_L2 128,64,47,0 - KERNEL4x8_L2 128,64,48,0 - KERNEL4x8_L2 128,64,49,0 - KERNEL4x8_L2 128,64,50,0 - KERNEL4x8_L2 128,64,51,0 - KERNEL4x8_L2 128,64,52,0 - KERNEL4x8_L2 128,64,53,0 - KERNEL4x8_L2 128,64,54,0 - KERNEL4x8_L2 128,64,55,0 - KERNEL4x8_L2 128,64,56,0 - KERNEL4x8_L2 128,64,57,0 - KERNEL4x8_L2 128,64,58,0 - KERNEL4x8_L2 128,64,59,0 - KERNEL4x8_L2 128,64,60,0 - KERNEL4x8_L2 128,64,61,0 - KERNEL4x8_L2 128,64,62,0 - KERNEL4x8_L2 128,64,63,1 - bdnz CGEMM_L4x8_LOOP - MY_ALIGN -CGEMM_L4x8_LOOP_END: -/*----------------------------------------*/ - END4x8_2 - blr - MY_ALIGN - - -CGEMM_4x8_L64_SUB: -/*----------------------------------------*/ - LOAD4x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL4x8_L2 128,64,0,0 - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_L2 128,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL4x8_L2 128,64,8,0 - KERNEL4x8_L2 128,64,9,0 - KERNEL4x8_L2 128,64,10,0 - KERNEL4x8_L2 128,64,11,0 - dcbt BO, T4 - KERNEL4x8_L2 128,64,12,0 - KERNEL4x8_L2 128,64,13,0 - KERNEL4x8_L2 128,64,14,0 - KERNEL4x8_L2 128,64,15,0 - KERNEL4x8_L2 128,64,16,0 - KERNEL4x8_L2 128,64,17,0 - KERNEL4x8_L2 128,64,18,0 - KERNEL4x8_L2 128,64,19,0 - KERNEL4x8_L2 128,64,20,0 - KERNEL4x8_L2 128,64,21,0 - KERNEL4x8_L2 128,64,22,0 - KERNEL4x8_L2 128,64,23,0 - KERNEL4x8_L2 128,64,24,0 - KERNEL4x8_L2 128,64,25,0 - KERNEL4x8_L2 128,64,26,0 - KERNEL4x8_L2 128,64,27,0 - KERNEL4x8_L2 128,64,28,0 - KERNEL4x8_L2 128,64,29,0 - KERNEL4x8_L2 128,64,30,0 - KERNEL4x8_E2 128,64,31,1 - blr - MY_ALIGN - - -CGEMM_4x8_L32_SUB: -/*----------------------------------------*/ - LOAD4x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL4x8_L2 128,64,0,0 - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_L2 128,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL4x8_L2 128,64,8,0 - KERNEL4x8_L2 128,64,9,0 - KERNEL4x8_L2 128,64,10,0 - KERNEL4x8_L2 128,64,11,0 - dcbt BO, T4 - KERNEL4x8_L2 128,64,12,0 - KERNEL4x8_L2 128,64,13,0 - KERNEL4x8_L2 128,64,14,0 - KERNEL4x8_E2 128,64,15,1 - blr - MY_ALIGN - - -CGEMM_4x8_L16_SUB: -/*----------------------------------------*/ - LOAD4x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL4x8_L2 128,64,0,0 - KERNEL4x8_L2 128,64,1,0 - dcbt AO, T2 - KERNEL4x8_L2 128,64,2,0 - KERNEL4x8_L2 128,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL4x8_L2 128,64,4,0 - KERNEL4x8_L2 128,64,5,0 - dcbt AO, T4 - KERNEL4x8_L2 128,64,6,0 - KERNEL4x8_E2 128,64,7,1 - blr - MY_ALIGN - - -CGEMM_4x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x4_2 - MY_ALIGN -CGEMM_L4x4_LOOP: -/*----------------------------------------*/ - KERNEL4x4_L2 64,64,0,0 -CGEMM_L4x4_K32: -/*----------------------------------------*/ - KERNEL4x4_L2 64,64,1,0 - KERNEL4x4_L2 64,64,2,0 - KERNEL4x4_L2 64,64,3,0 - KERNEL4x4_L2 64,64,4,0 - KERNEL4x4_L2 64,64,5,0 - KERNEL4x4_L2 64,64,6,0 - KERNEL4x4_L2 64,64,7,0 - KERNEL4x4_L2 64,64,8,0 - KERNEL4x4_L2 64,64,9,0 - KERNEL4x4_L2 64,64,10,0 - KERNEL4x4_L2 64,64,11,0 - KERNEL4x4_L2 64,64,12,0 - KERNEL4x4_L2 64,64,13,0 - KERNEL4x4_L2 64,64,14,0 - KERNEL4x4_L2 64,64,15,1 - bdnz CGEMM_L4x4_LOOP - MY_ALIGN -CGEMM_L4x4_LOOP_END: -/*----------------------------------------*/ - END4x4_2 - blr - MY_ALIGN - - -CGEMM_4x4_L16_SUB: -/*----------------------------------------*/ - LOAD4x4_2 - KERNEL4x4_L2 64,64,0,0 - KERNEL4x4_L2 64,64,1,0 - KERNEL4x4_L2 64,64,2,0 - KERNEL4x4_L2 64,64,3,0 - KERNEL4x4_L2 64,64,4,0 - KERNEL4x4_L2 64,64,5,0 - KERNEL4x4_L2 64,64,6,0 - KERNEL4x4_E2 64,64,7,1 - blr - MY_ALIGN - - -CGEMM_4x4_L8_SUB: -/*----------------------------------------*/ - LOAD4x4_2 - KERNEL4x4_L2 64,64,0,0 - KERNEL4x4_L2 64,64,1,0 - KERNEL4x4_L2 64,64,2,0 - KERNEL4x4_E2 64,64,3,1 - blr - - -CGEMM_4x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x2_2 - MY_ALIGN -CGEMM_L4x2_LOOP: -/*----------------------------------------*/ - KERNEL4x2_L2 32,64,0,0 -CGEMM_L4x2_K32: -/*----------------------------------------*/ - KERNEL4x2_L2 32,64,1,0 - KERNEL4x2_L2 32,64,2,0 - KERNEL4x2_L2 32,64,3,0 - KERNEL4x2_L2 32,64,4,0 - KERNEL4x2_L2 32,64,5,0 - KERNEL4x2_L2 32,64,6,0 - KERNEL4x2_L2 32,64,7,0 - KERNEL4x2_L2 32,64,8,0 - KERNEL4x2_L2 32,64,9,0 - KERNEL4x2_L2 32,64,10,0 - KERNEL4x2_L2 32,64,11,0 - KERNEL4x2_L2 32,64,12,0 - KERNEL4x2_L2 32,64,13,0 - KERNEL4x2_L2 32,64,14,0 - KERNEL4x2_L2 32,64,15,1 - bdnz CGEMM_L4x2_LOOP - MY_ALIGN - - -CGEMM_L4x2_LOOP_END: -/*----------------------------------------*/ - END4x2_2 - blr - MY_ALIGN -CGEMM_4x2_L16_SUB: -/*----------------------------------------*/ - LOAD4x2_2 - KERNEL4x2_L2 32,64,0,0 - KERNEL4x2_L2 32,64,1,0 - KERNEL4x2_L2 32,64,2,0 - KERNEL4x2_L2 32,64,3,0 - KERNEL4x2_L2 32,64,4,0 - KERNEL4x2_L2 32,64,5,0 - KERNEL4x2_L2 32,64,6,0 - KERNEL4x2_E2 32,64,7,1 - blr - MY_ALIGN -CGEMM_4x2_L8_SUB: -/*----------------------------------------*/ - LOAD4x2_2 - KERNEL4x2_L2 32,64,0,0 - KERNEL4x2_L2 32,64,1,0 - KERNEL4x2_L2 32,64,2,0 - KERNEL4x2_E2 32,64,3,1 - blr - - -CGEMM_4x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD4x1_2 - MY_ALIGN -CGEMM_L4x1_LOOP: -/*----------------------------------------*/ - KERNEL4x1_L2 16,64,0,0 -CGEMM_L4x1_K32: -/*----------------------------------------*/ - KERNEL4x1_L2 16,64,1,0 - KERNEL4x1_L2 16,64,2,0 - KERNEL4x1_L2 16,64,3,0 - KERNEL4x1_L2 16,64,4,0 - KERNEL4x1_L2 16,64,5,0 - KERNEL4x1_L2 16,64,6,0 - KERNEL4x1_L2 16,64,7,0 - KERNEL4x1_L2 16,64,8,0 - KERNEL4x1_L2 16,64,9,0 - KERNEL4x1_L2 16,64,10,0 - KERNEL4x1_L2 16,64,11,0 - KERNEL4x1_L2 16,64,12,0 - KERNEL4x1_L2 16,64,13,0 - KERNEL4x1_L2 16,64,14,0 - KERNEL4x1_L2 16,64,15,1 - bdnz CGEMM_L4x1_LOOP - MY_ALIGN -CGEMM_L4x1_LOOP_END: -/*----------------------------------------*/ - END4x1_2 - blr - - MY_ALIGN -CGEMM_4x1_L16_SUB: -/*----------------------------------------*/ - LOAD4x1_2 - KERNEL4x1_L2 16,64,0,0 - KERNEL4x1_L2 16,64,1,0 - KERNEL4x1_L2 16,64,2,0 - KERNEL4x1_L2 16,64,3,0 - KERNEL4x1_L2 16,64,4,0 - KERNEL4x1_L2 16,64,5,0 - KERNEL4x1_L2 16,64,6,0 - KERNEL4x1_E2 16,64,7,1 - blr - MY_ALIGN - - -CGEMM_4x1_L8_SUB: -/*----------------------------------------*/ - LOAD4x1_2 - KERNEL4x1_L2 16,64,0,0 - KERNEL4x1_L2 16,64,1,0 - KERNEL4x1_L2 16,64,2,0 - KERNEL4x1_E2 16,64,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -CGEMM_L4: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - srawi. J, N, 2 - ble CGEMM_L4_END - - -CGEMM_L4_BEGIN: -/*----------------------------------------*/ - mr CO, C - slwi T1, LDC , 2 - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble CGEMM_L4x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -CGEMM_L4x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,4 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T1-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO4x8 - ble CGEMM_L4x8_SUB0 - bl CGEMM_L4x8_LMAIN_SUB - andi. L, T1, 127 - ble CGEMM_L4x8_SAVE - b CGEMM_L4x8_SUB2 - - -CGEMM_L4x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP4x8_128K - addi BO,BO,-32 - addi AO,AO,-64 - LOAD4x8O 64,32 - END4x8_WITHOUT_ADD - LOAD4x8_2O 128, 64 - mtctr T8 - bl CGEMM_L4x8_K128 - b CGEMM_L4x8_SAVE - CMP4x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne CGEMM_L4x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-128 - LOAD4x8_2O 128,64 - bl CGEMM_L4x8_K128 - b CGEMM_L4x8_SAVE - MY_ALIGN - - -CGEMM_L4x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble CGEMM_L4x8_SUB2_32 - bl CGEMM_4x8_L64_SUB - MY_ALIGN - - -CGEMM_L4x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble CGEMM_L4x8_SUB2_16 - bl CGEMM_4x8_L32_SUB - MY_ALIGN - - -CGEMM_L4x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L4x8_SUB2_8 - bl CGEMM_4x8_L16_SUB - MY_ALIGN - - -CGEMM_L4x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x8_SUB2_4 - LOAD4x8_2 - KERNEL4x8_L2 128,64, 0,0 - KERNEL4x8_L2 128,64, 1,0 - KERNEL4x8_L2 128,64, 2,0 - KERNEL4x8_E2 128,64, 3,1 - MY_ALIGN - - -CGEMM_L4x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x8_SUB2_2 - LOAD4x8_2 - KERNEL4x8_L2 128,64, 0,0 - KERNEL4x8_E2 128,64, 1,1 - MY_ALIGN - - -CGEMM_L4x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x8_SUB2_1 - LOAD4x8_2 - KERNEL4x8_E2 128,64, 0,1 - MY_ALIGN - - -CGEMM_L4x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x8_SAVE - KERNEL4x8 - - MY_ALIGN -CGEMM_L4x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - MY_ALIGN - SAVE4x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,4 -#endif - bgt CGEMM_L4x8_BEGIN - andi. T2, M, 7 - ble CGEMM_L4x1_END - andi. T1, M, 4 - ble CGEMM_L4x4_END - b CGEMM_L4x4_BEGIN - MY_ALIGN - - -CGEMM_L4x8_END: -/*----------------------------------------*/ - - -CGEMM_L4x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble CGEMM_L4x1_END - andi. T1, M, 4 - ble CGEMM_L4x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,4 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO4x4 - ble CGEMM_L4x4_SUB0 - bl CGEMM_4x4_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L4x4_SAVE - b CGEMM_L4x4_SUB2 - - -CGEMM_L4x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP4x4_32K - addi BO,BO,-32 - addi AO,AO,-32 - LOAD4x4O 32,32 - END4x4_WITHOUT_ADD - LOAD4x4_2O 64, 64 - mtctr T8 - bl CGEMM_L4x4_K32 - b CGEMM_L4x4_SAVE - CMP4x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L4x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-64 - LOAD4x4_2O 64,64 - bl CGEMM_L4x4_K32 - b CGEMM_L4x4_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L4x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L4x4_SUB2_8 - bl CGEMM_4x4_L16_SUB - MY_ALIGN - - -CGEMM_L4x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x4_SUB2_4 - bl CGEMM_4x4_L8_SUB - MY_ALIGN - - -CGEMM_L4x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x4_SUB2_2 - LOAD4x4_2 - KERNEL4x4_L2 64,64, 0,0 - KERNEL4x4_E2 64,64, 1,1 - MY_ALIGN - - -CGEMM_L4x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x4_SUB2_1 - LOAD4x4_2 - KERNEL4x4_E2 64,64, 0,1 - MY_ALIGN - - -CGEMM_L4x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x4_SAVE - KERNEL4x4 - - -CGEMM_L4x4_SAVE: -/*----------------------------------------*/ - SAVE4x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,4 -#endif - - -CGEMM_L4x4_END: -/*----------------------------------------*/ - - -CGEMM_L4x2_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 2 - ble CGEMM_L4x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,4 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO4x2 - ble CGEMM_L4x2_SUB0 - bl CGEMM_4x2_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L4x2_SAVE - b CGEMM_L4x2_SUB2 - - -CGEMM_L4x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP4x2_32K - addi BO,BO,-32 - addi AO,AO,-16 - LOAD4x2O 16,32 - END4x2_WITHOUT_ADD - LOAD4x2_2O 32, 64 - mtctr T8 - bl CGEMM_L4x2_K32 - b CGEMM_L4x2_SAVE - CMP4x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L4x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-32 - LOAD4x2_2O 32,64 - bl CGEMM_L4x2_K32 - b CGEMM_L4x2_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L4x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L4x2_SUB2_8 - bl CGEMM_4x2_L16_SUB - MY_ALIGN - - -CGEMM_L4x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x2_SUB2_4 - bl CGEMM_4x2_L8_SUB - MY_ALIGN - - -CGEMM_L4x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x2_SUB2_2 - LOAD4x2_2 - KERNEL4x2_L2 32,64, 0,0 - KERNEL4x2_E2 32,64, 1,1 - MY_ALIGN - - -CGEMM_L4x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x2_SUB2_1 - LOAD4x2_2 - KERNEL4x2_E2 32,64, 0,1 - MY_ALIGN - - -CGEMM_L4x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x2_SAVE - KERNEL4x2 - - MY_ALIGN -CGEMM_L4x2_SAVE: -/*----------------------------------------*/ - SAVE4x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,4 -#endif - - -CGEMM_L4x2_END: -/*----------------------------------------*/ - - -CGEMM_L4x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble CGEMM_L4x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,4 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO4x1 - ble CGEMM_L4x1_SUB0 - bl CGEMM_4x1_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L4x1_SAVE - b CGEMM_L4x1_SUB2 - - -CGEMM_L4x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP4x1_32K - addi BO,BO,-32 - addi AO,AO,-8 - LOAD4x1O 8,32 - END4x1_WITHOUT_ADD - LOAD4x1_2O 16, 64 - mtctr T8 - bl CGEMM_L4x1_K32 - b CGEMM_L4x1_SAVE - CMP4x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L4x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-16 - LOAD4x1_2O 16,64 - bl CGEMM_L4x1_K32 - b CGEMM_L4x1_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L4x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L4x1_SUB2_8 - bl CGEMM_4x1_L16_SUB - MY_ALIGN - - -CGEMM_L4x1_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L4x1_SUB2_4 - bl CGEMM_4x1_L8_SUB - MY_ALIGN - - -CGEMM_L4x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L4x1_SUB2_2 - LOAD4x1_2 - KERNEL4x1_L2 16,64, 0,0 - KERNEL4x1_E2 16,64, 1,1 - MY_ALIGN - - -CGEMM_L4x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L4x1_SUB2_1 - LOAD4x1_2 - KERNEL4x1_E2 16,64, 0,1 - MY_ALIGN - - -CGEMM_L4x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L4x1_SAVE - KERNEL4x1 - - MY_ALIGN -CGEMM_L4x1_SAVE: -/*----------------------------------------*/ - - SAVE4x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,4 -#endif - - -CGEMM_L4x1_END: -/*----------------------------------------*/ - slwi T1, K, 5 - addic. J, J, -1 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 4 -#endif - bgt CGEMM_L4_BEGIN - - -CGEMM_L4_END: - -b CGEMM_L2 -/* MINI SUBROUTINES */ -/* 2x8 MAIN 128x+2 LOOP */ - - -CGEMM_L2x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x8_2 - MY_ALIGN -CGEMM_L2x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 -CGEMM_L2x8_K128: -/*----------------------------------------*/ - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_L2 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 128,32,8,0 - KERNEL2x8_L2 128,32,9,0 - KERNEL2x8_L2 128,32,10,0 - KERNEL2x8_L2 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L2 128,32,12,0 - KERNEL2x8_L2 128,32,13,0 - KERNEL2x8_L2 128,32,14,0 - KERNEL2x8_L2 128,32,15,0 - KERNEL2x8_L2 128,32,16,0 - KERNEL2x8_L2 128,32,17,0 - KERNEL2x8_L2 128,32,18,0 - KERNEL2x8_L2 128,32,19,0 - KERNEL2x8_L2 128,32,20,0 - KERNEL2x8_L2 128,32,21,0 - KERNEL2x8_L2 128,32,22,0 - KERNEL2x8_L2 128,32,23,0 - KERNEL2x8_L2 128,32,24,0 - KERNEL2x8_L2 128,32,25,0 - KERNEL2x8_L2 128,32,26,0 - KERNEL2x8_L2 128,32,27,0 - KERNEL2x8_L2 128,32,28,0 - KERNEL2x8_L2 128,32,29,0 - KERNEL2x8_L2 128,32,30,0 - KERNEL2x8_L2 128,32,31,0 - KERNEL2x8_L2 128,32,32,0 - KERNEL2x8_L2 128,32,33,0 - KERNEL2x8_L2 128,32,34,0 - KERNEL2x8_L2 128,32,35,0 - KERNEL2x8_L2 128,32,36,0 - KERNEL2x8_L2 128,32,37,0 - KERNEL2x8_L2 128,32,38,0 - KERNEL2x8_L2 128,32,39,0 - KERNEL2x8_L2 128,32,40,0 - KERNEL2x8_L2 128,32,41,0 - KERNEL2x8_L2 128,32,42,0 - KERNEL2x8_L2 128,32,43,0 - KERNEL2x8_L2 128,32,44,0 - KERNEL2x8_L2 128,32,45,0 - KERNEL2x8_L2 128,32,46,0 - KERNEL2x8_L2 128,32,47,0 - KERNEL2x8_L2 128,32,48,0 - KERNEL2x8_L2 128,32,49,0 - KERNEL2x8_L2 128,32,50,0 - KERNEL2x8_L2 128,32,51,0 - KERNEL2x8_L2 128,32,52,0 - KERNEL2x8_L2 128,32,53,0 - KERNEL2x8_L2 128,32,54,0 - KERNEL2x8_L2 128,32,55,0 - KERNEL2x8_L2 128,32,56,0 - KERNEL2x8_L2 128,32,57,0 - KERNEL2x8_L2 128,32,58,0 - KERNEL2x8_L2 128,32,59,0 - KERNEL2x8_L2 128,32,60,0 - KERNEL2x8_L2 128,32,61,0 - KERNEL2x8_L2 128,32,62,0 - KERNEL2x8_L2 128,32,63,1 - bdnz CGEMM_L2x8_LOOP - MY_ALIGN -CGEMM_L2x8_LOOP_END: -/*----------------------------------------*/ - END2x8_2 - blr - MY_ALIGN - - -CGEMM_2x8_L64_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_L2 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 128,32,8,0 - KERNEL2x8_L2 128,32,9,0 - KERNEL2x8_L2 128,32,10,0 - KERNEL2x8_L2 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L2 128,32,12,0 - KERNEL2x8_L2 128,32,13,0 - KERNEL2x8_L2 128,32,14,0 - KERNEL2x8_L2 128,32,15,0 - KERNEL2x8_L2 128,32,16,0 - KERNEL2x8_L2 128,32,17,0 - KERNEL2x8_L2 128,32,18,0 - KERNEL2x8_L2 128,32,19,0 - KERNEL2x8_L2 128,32,20,0 - KERNEL2x8_L2 128,32,21,0 - KERNEL2x8_L2 128,32,22,0 - KERNEL2x8_L2 128,32,23,0 - KERNEL2x8_L2 128,32,24,0 - KERNEL2x8_L2 128,32,25,0 - KERNEL2x8_L2 128,32,26,0 - KERNEL2x8_L2 128,32,27,0 - KERNEL2x8_L2 128,32,28,0 - KERNEL2x8_L2 128,32,29,0 - KERNEL2x8_L2 128,32,30,0 - KERNEL2x8_E2 128,32,31,1 - blr - MY_ALIGN - - -CGEMM_2x8_L32_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_L2 128,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 128,32,8,0 - KERNEL2x8_L2 128,32,9,0 - KERNEL2x8_L2 128,32,10,0 - KERNEL2x8_L2 128,32,11,0 - dcbt BO, T4 - KERNEL2x8_L2 128,32,12,0 - KERNEL2x8_L2 128,32,13,0 - KERNEL2x8_L2 128,32,14,0 - KERNEL2x8_E2 128,32,15,1 - blr - MY_ALIGN - - -CGEMM_2x8_L16_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 128,32,0,0 - KERNEL2x8_L2 128,32,1,0 - dcbt AO, T2 - KERNEL2x8_L2 128,32,2,0 - KERNEL2x8_L2 128,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 128,32,4,0 - KERNEL2x8_L2 128,32,5,0 - dcbt AO, T4 - KERNEL2x8_L2 128,32,6,0 - KERNEL2x8_E2 128,32,7,1 - blr - MY_ALIGN - - -CGEMM_2x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x4_2 - MY_ALIGN -CGEMM_L2x4_LOOP: -/*----------------------------------------*/ - KERNEL2x4_L2 64,32,0,0 -CGEMM_L2x4_K32: -/*----------------------------------------*/ - KERNEL2x4_L2 64,32,1,0 - KERNEL2x4_L2 64,32,2,0 - KERNEL2x4_L2 64,32,3,0 - KERNEL2x4_L2 64,32,4,0 - KERNEL2x4_L2 64,32,5,0 - KERNEL2x4_L2 64,32,6,0 - KERNEL2x4_L2 64,32,7,0 - KERNEL2x4_L2 64,32,8,0 - KERNEL2x4_L2 64,32,9,0 - KERNEL2x4_L2 64,32,10,0 - KERNEL2x4_L2 64,32,11,0 - KERNEL2x4_L2 64,32,12,0 - KERNEL2x4_L2 64,32,13,0 - KERNEL2x4_L2 64,32,14,0 - KERNEL2x4_L2 64,32,15,1 - bdnz CGEMM_L2x4_LOOP - MY_ALIGN -CGEMM_L2x4_LOOP_END: -/*----------------------------------------*/ - END2x4_2 - blr - MY_ALIGN - - -CGEMM_2x4_L16_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 64,32,0,0 - KERNEL2x4_L2 64,32,1,0 - KERNEL2x4_L2 64,32,2,0 - KERNEL2x4_L2 64,32,3,0 - KERNEL2x4_L2 64,32,4,0 - KERNEL2x4_L2 64,32,5,0 - KERNEL2x4_L2 64,32,6,0 - KERNEL2x4_E2 64,32,7,1 - blr - MY_ALIGN - - -CGEMM_2x4_L8_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 64,32,0,0 - KERNEL2x4_L2 64,32,1,0 - KERNEL2x4_L2 64,32,2,0 - KERNEL2x4_E2 64,32,3,1 - blr - - -CGEMM_2x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x2_2 - MY_ALIGN -CGEMM_L2x2_LOOP: -/*----------------------------------------*/ - KERNEL2x2_L2 32,32,0,0 -CGEMM_L2x2_K32: -/*----------------------------------------*/ - KERNEL2x2_L2 32,32,1,0 - KERNEL2x2_L2 32,32,2,0 - KERNEL2x2_L2 32,32,3,0 - KERNEL2x2_L2 32,32,4,0 - KERNEL2x2_L2 32,32,5,0 - KERNEL2x2_L2 32,32,6,0 - KERNEL2x2_L2 32,32,7,0 - KERNEL2x2_L2 32,32,8,0 - KERNEL2x2_L2 32,32,9,0 - KERNEL2x2_L2 32,32,10,0 - KERNEL2x2_L2 32,32,11,0 - KERNEL2x2_L2 32,32,12,0 - KERNEL2x2_L2 32,32,13,0 - KERNEL2x2_L2 32,32,14,0 - KERNEL2x2_L2 32,32,15,1 - bdnz CGEMM_L2x2_LOOP - MY_ALIGN - - -CGEMM_L2x2_LOOP_END: -/*----------------------------------------*/ - END2x2_2 - blr - MY_ALIGN -CGEMM_2x2_L16_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 32,32,0,0 - KERNEL2x2_L2 32,32,1,0 - KERNEL2x2_L2 32,32,2,0 - KERNEL2x2_L2 32,32,3,0 - KERNEL2x2_L2 32,32,4,0 - KERNEL2x2_L2 32,32,5,0 - KERNEL2x2_L2 32,32,6,0 - KERNEL2x2_E2 32,32,7,1 - blr - MY_ALIGN -CGEMM_2x2_L8_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 32,32,0,0 - KERNEL2x2_L2 32,32,1,0 - KERNEL2x2_L2 32,32,2,0 - KERNEL2x2_E2 32,32,3,1 - blr - - -CGEMM_2x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x1_2 - MY_ALIGN -CGEMM_L2x1_LOOP: -/*----------------------------------------*/ - KERNEL2x1_L2 16,32,0,0 -CGEMM_L2x1_K32: -/*----------------------------------------*/ - KERNEL2x1_L2 16,32,1,0 - KERNEL2x1_L2 16,32,2,0 - KERNEL2x1_L2 16,32,3,0 - KERNEL2x1_L2 16,32,4,0 - KERNEL2x1_L2 16,32,5,0 - KERNEL2x1_L2 16,32,6,0 - KERNEL2x1_L2 16,32,7,0 - KERNEL2x1_L2 16,32,8,0 - KERNEL2x1_L2 16,32,9,0 - KERNEL2x1_L2 16,32,10,0 - KERNEL2x1_L2 16,32,11,0 - KERNEL2x1_L2 16,32,12,0 - KERNEL2x1_L2 16,32,13,0 - KERNEL2x1_L2 16,32,14,0 - KERNEL2x1_L2 16,32,15,1 - bdnz CGEMM_L2x1_LOOP - MY_ALIGN -CGEMM_L2x1_LOOP_END: -/*----------------------------------------*/ - END2x1_2 - blr - - MY_ALIGN -CGEMM_2x1_L16_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 16,32,0,0 - KERNEL2x1_L2 16,32,1,0 - KERNEL2x1_L2 16,32,2,0 - KERNEL2x1_L2 16,32,3,0 - KERNEL2x1_L2 16,32,4,0 - KERNEL2x1_L2 16,32,5,0 - KERNEL2x1_L2 16,32,6,0 - KERNEL2x1_E2 16,32,7,1 - blr - MY_ALIGN - - -CGEMM_2x1_L8_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 16,32,0,0 - KERNEL2x1_L2 16,32,1,0 - KERNEL2x1_L2 16,32,2,0 - KERNEL2x1_E2 16,32,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -CGEMM_L2: -/*----------------------------------------*/ - - andi. J, N, 2 - ble CGEMM_L2_END - - -CGEMM_L2_BEGIN: -/*----------------------------------------*/ - mr CO, C - slwi T1, LDC , 1 - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble CGEMM_L2x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -CGEMM_L2x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T1-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO2x8 - ble CGEMM_L2x8_SUB0 - bl CGEMM_L2x8_LMAIN_SUB - andi. L, T1, 127 - ble CGEMM_L2x8_SAVE - b CGEMM_L2x8_SUB2 - - -CGEMM_L2x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP2x8_128K - addi BO,BO,-16 - addi AO,AO,-64 - LOAD2x8O 64,16 - END2x8_WITHOUT_ADD - LOAD2x8_2O 128, 32 - mtctr T8 - bl CGEMM_L2x8_K128 - b CGEMM_L2x8_SAVE - CMP2x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne CGEMM_L2x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-128 - LOAD2x8_2O 128,32 - bl CGEMM_L2x8_K128 - b CGEMM_L2x8_SAVE - MY_ALIGN - - -CGEMM_L2x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble CGEMM_L2x8_SUB2_32 - bl CGEMM_2x8_L64_SUB - MY_ALIGN - - -CGEMM_L2x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble CGEMM_L2x8_SUB2_16 - bl CGEMM_2x8_L32_SUB - MY_ALIGN - - -CGEMM_L2x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x8_SUB2_8 - bl CGEMM_2x8_L16_SUB - MY_ALIGN - - -CGEMM_L2x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x8_SUB2_4 - LOAD2x8_2 - KERNEL2x8_L2 128,32, 0,0 - KERNEL2x8_L2 128,32, 1,0 - KERNEL2x8_L2 128,32, 2,0 - KERNEL2x8_E2 128,32, 3,1 - MY_ALIGN - - -CGEMM_L2x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x8_SUB2_2 - LOAD2x8_2 - KERNEL2x8_L2 128,32, 0,0 - KERNEL2x8_E2 128,32, 1,1 - MY_ALIGN - - -CGEMM_L2x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x8_SUB2_1 - LOAD2x8_2 - KERNEL2x8_E2 128,32, 0,1 - MY_ALIGN - - -CGEMM_L2x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L2x8_SAVE - KERNEL2x8 - - MY_ALIGN -CGEMM_L2x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - MY_ALIGN - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 -#endif - bgt CGEMM_L2x8_BEGIN - andi. T2, M, 7 - ble CGEMM_L2x1_END - andi. T1, M, 4 - ble CGEMM_L2x4_END - b CGEMM_L2x4_BEGIN - MY_ALIGN - - -CGEMM_L2x8_END: -/*----------------------------------------*/ - - -CGEMM_L2x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble CGEMM_L2x1_END - andi. T1, M, 4 - ble CGEMM_L2x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x4 - ble CGEMM_L2x4_SUB0 - bl CGEMM_2x4_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L2x4_SAVE - b CGEMM_L2x4_SUB2 - - -CGEMM_L2x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x4_32K - addi BO,BO,-16 - addi AO,AO,-32 - LOAD2x4O 32,16 - END2x4_WITHOUT_ADD - LOAD2x4_2O 64, 32 - mtctr T8 - bl CGEMM_L2x4_K32 - b CGEMM_L2x4_SAVE - CMP2x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L2x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-64 - LOAD2x4_2O 64,32 - bl CGEMM_L2x4_K32 - b CGEMM_L2x4_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L2x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x4_SUB2_8 - bl CGEMM_2x4_L16_SUB - MY_ALIGN - - -CGEMM_L2x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x4_SUB2_4 - bl CGEMM_2x4_L8_SUB - MY_ALIGN - - -CGEMM_L2x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x4_SUB2_2 - LOAD2x4_2 - KERNEL2x4_L2 64,32, 0,0 - KERNEL2x4_E2 64,32, 1,1 - MY_ALIGN - - -CGEMM_L2x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x4_SUB2_1 - LOAD2x4_2 - KERNEL2x4_E2 64,32, 0,1 - MY_ALIGN - - -CGEMM_L2x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L2x4_SAVE - KERNEL2x4 - - -CGEMM_L2x4_SAVE: -/*----------------------------------------*/ - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 -#endif - - -CGEMM_L2x4_END: -/*----------------------------------------*/ - - -CGEMM_L2x2_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 2 - ble CGEMM_L2x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x2 - ble CGEMM_L2x2_SUB0 - bl CGEMM_2x2_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L2x2_SAVE - b CGEMM_L2x2_SUB2 - - -CGEMM_L2x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x2_32K - addi BO,BO,-16 - addi AO,AO,-16 - LOAD2x2O 16,16 - END2x2_WITHOUT_ADD - LOAD2x2_2O 32, 32 - mtctr T8 - bl CGEMM_L2x2_K32 - b CGEMM_L2x2_SAVE - CMP2x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L2x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-32 - LOAD2x2_2O 32,32 - bl CGEMM_L2x2_K32 - b CGEMM_L2x2_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L2x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x2_SUB2_8 - bl CGEMM_2x2_L16_SUB - MY_ALIGN - - -CGEMM_L2x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x2_SUB2_4 - bl CGEMM_2x2_L8_SUB - MY_ALIGN - - -CGEMM_L2x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x2_SUB2_2 - LOAD2x2_2 - KERNEL2x2_L2 32,32, 0,0 - KERNEL2x2_E2 32,32, 1,1 - MY_ALIGN - - -CGEMM_L2x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x2_SUB2_1 - LOAD2x2_2 - KERNEL2x2_E2 32,32, 0,1 - MY_ALIGN - - -CGEMM_L2x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L2x2_SAVE - KERNEL2x2 - - MY_ALIGN -CGEMM_L2x2_SAVE: -/*----------------------------------------*/ - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 -#endif - - -CGEMM_L2x2_END: -/*----------------------------------------*/ - - -CGEMM_L2x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble CGEMM_L2x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x1 - ble CGEMM_L2x1_SUB0 - bl CGEMM_2x1_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L2x1_SAVE - b CGEMM_L2x1_SUB2 - - -CGEMM_L2x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x1_32K - addi BO,BO,-16 - addi AO,AO,-8 - LOAD2x1O 8,16 - END2x1_WITHOUT_ADD - LOAD2x1_2O 16, 32 - mtctr T8 - bl CGEMM_L2x1_K32 - b CGEMM_L2x1_SAVE - CMP2x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L2x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-16 - LOAD2x1_2O 16,32 - bl CGEMM_L2x1_K32 - b CGEMM_L2x1_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L2x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L2x1_SUB2_8 - bl CGEMM_2x1_L16_SUB - MY_ALIGN - - -CGEMM_L2x1_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L2x1_SUB2_4 - bl CGEMM_2x1_L8_SUB - MY_ALIGN - - -CGEMM_L2x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L2x1_SUB2_2 - LOAD2x1_2 - KERNEL2x1_L2 16,32, 0,0 - KERNEL2x1_E2 16,32, 1,1 - MY_ALIGN - - -CGEMM_L2x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L2x1_SUB2_1 - LOAD2x1_2 - KERNEL2x1_E2 16,32, 0,1 - MY_ALIGN - - -CGEMM_L2x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L2x1_SAVE - KERNEL2x1 - - MY_ALIGN -CGEMM_L2x1_SAVE: -/*----------------------------------------*/ - - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 -#endif - - -CGEMM_L2x1_END: -/*----------------------------------------*/ - slwi T1, K, 4 - - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif - -CGEMM_L2_END: - - -b CGEMM_L1 -/* MINI SUBROUTINES */ -/* 1x8 MAIN 128x+2 LOOP */ - - -CGEMM_L1x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x8_2 - MY_ALIGN -CGEMM_L1x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 -CGEMM_L1x8_K128: -/*----------------------------------------*/ - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_L2 128,16,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 128,16,8,0 - KERNEL1x8_L2 128,16,9,0 - KERNEL1x8_L2 128,16,10,0 - KERNEL1x8_L2 128,16,11,0 - dcbt BO, T4 - KERNEL1x8_L2 128,16,12,0 - KERNEL1x8_L2 128,16,13,0 - KERNEL1x8_L2 128,16,14,0 - KERNEL1x8_L2 128,16,15,0 - KERNEL1x8_L2 128,16,16,0 - KERNEL1x8_L2 128,16,17,0 - KERNEL1x8_L2 128,16,18,0 - KERNEL1x8_L2 128,16,19,0 - KERNEL1x8_L2 128,16,20,0 - KERNEL1x8_L2 128,16,21,0 - KERNEL1x8_L2 128,16,22,0 - KERNEL1x8_L2 128,16,23,0 - KERNEL1x8_L2 128,16,24,0 - KERNEL1x8_L2 128,16,25,0 - KERNEL1x8_L2 128,16,26,0 - KERNEL1x8_L2 128,16,27,0 - KERNEL1x8_L2 128,16,28,0 - KERNEL1x8_L2 128,16,29,0 - KERNEL1x8_L2 128,16,30,0 - KERNEL1x8_L2 128,16,31,0 - KERNEL1x8_L2 128,16,32,0 - KERNEL1x8_L2 128,16,33,0 - KERNEL1x8_L2 128,16,34,0 - KERNEL1x8_L2 128,16,35,0 - KERNEL1x8_L2 128,16,36,0 - KERNEL1x8_L2 128,16,37,0 - KERNEL1x8_L2 128,16,38,0 - KERNEL1x8_L2 128,16,39,0 - KERNEL1x8_L2 128,16,40,0 - KERNEL1x8_L2 128,16,41,0 - KERNEL1x8_L2 128,16,42,0 - KERNEL1x8_L2 128,16,43,0 - KERNEL1x8_L2 128,16,44,0 - KERNEL1x8_L2 128,16,45,0 - KERNEL1x8_L2 128,16,46,0 - KERNEL1x8_L2 128,16,47,0 - KERNEL1x8_L2 128,16,48,0 - KERNEL1x8_L2 128,16,49,0 - KERNEL1x8_L2 128,16,50,0 - KERNEL1x8_L2 128,16,51,0 - KERNEL1x8_L2 128,16,52,0 - KERNEL1x8_L2 128,16,53,0 - KERNEL1x8_L2 128,16,54,0 - KERNEL1x8_L2 128,16,55,0 - KERNEL1x8_L2 128,16,56,0 - KERNEL1x8_L2 128,16,57,0 - KERNEL1x8_L2 128,16,58,0 - KERNEL1x8_L2 128,16,59,0 - KERNEL1x8_L2 128,16,60,0 - KERNEL1x8_L2 128,16,61,0 - KERNEL1x8_L2 128,16,62,0 - KERNEL1x8_L2 128,16,63,1 - bdnz CGEMM_L1x8_LOOP - MY_ALIGN -CGEMM_L1x8_LOOP_END: -/*----------------------------------------*/ - END1x8_2 - blr - MY_ALIGN - - -CGEMM_1x8_L64_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_L2 128,16,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 128,16,8,0 - KERNEL1x8_L2 128,16,9,0 - KERNEL1x8_L2 128,16,10,0 - KERNEL1x8_L2 128,16,11,0 - dcbt BO, T4 - KERNEL1x8_L2 128,16,12,0 - KERNEL1x8_L2 128,16,13,0 - KERNEL1x8_L2 128,16,14,0 - KERNEL1x8_L2 128,16,15,0 - KERNEL1x8_L2 128,16,16,0 - KERNEL1x8_L2 128,16,17,0 - KERNEL1x8_L2 128,16,18,0 - KERNEL1x8_L2 128,16,19,0 - KERNEL1x8_L2 128,16,20,0 - KERNEL1x8_L2 128,16,21,0 - KERNEL1x8_L2 128,16,22,0 - KERNEL1x8_L2 128,16,23,0 - KERNEL1x8_L2 128,16,24,0 - KERNEL1x8_L2 128,16,25,0 - KERNEL1x8_L2 128,16,26,0 - KERNEL1x8_L2 128,16,27,0 - KERNEL1x8_L2 128,16,28,0 - KERNEL1x8_L2 128,16,29,0 - KERNEL1x8_L2 128,16,30,0 - KERNEL1x8_E2 128,16,31,1 - blr - MY_ALIGN - - -CGEMM_1x8_L32_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_L2 128,16,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 128,16,8,0 - KERNEL1x8_L2 128,16,9,0 - KERNEL1x8_L2 128,16,10,0 - KERNEL1x8_L2 128,16,11,0 - dcbt BO, T4 - KERNEL1x8_L2 128,16,12,0 - KERNEL1x8_L2 128,16,13,0 - KERNEL1x8_L2 128,16,14,0 - KERNEL1x8_E2 128,16,15,1 - blr - MY_ALIGN - - -CGEMM_1x8_L16_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 128,16,0,0 - KERNEL1x8_L2 128,16,1,0 - dcbt AO, T2 - KERNEL1x8_L2 128,16,2,0 - KERNEL1x8_L2 128,16,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 128,16,4,0 - KERNEL1x8_L2 128,16,5,0 - dcbt AO, T4 - KERNEL1x8_L2 128,16,6,0 - KERNEL1x8_E2 128,16,7,1 - blr - MY_ALIGN - - -CGEMM_1x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x4_2 - MY_ALIGN -CGEMM_L1x4_LOOP: -/*----------------------------------------*/ - KERNEL1x4_L2 64,16,0,0 -CGEMM_L1x4_K32: -/*----------------------------------------*/ - KERNEL1x4_L2 64,16,1,0 - KERNEL1x4_L2 64,16,2,0 - KERNEL1x4_L2 64,16,3,0 - KERNEL1x4_L2 64,16,4,0 - KERNEL1x4_L2 64,16,5,0 - KERNEL1x4_L2 64,16,6,0 - KERNEL1x4_L2 64,16,7,0 - KERNEL1x4_L2 64,16,8,0 - KERNEL1x4_L2 64,16,9,0 - KERNEL1x4_L2 64,16,10,0 - KERNEL1x4_L2 64,16,11,0 - KERNEL1x4_L2 64,16,12,0 - KERNEL1x4_L2 64,16,13,0 - KERNEL1x4_L2 64,16,14,0 - KERNEL1x4_L2 64,16,15,1 - bdnz CGEMM_L1x4_LOOP - MY_ALIGN -CGEMM_L1x4_LOOP_END: -/*----------------------------------------*/ - END1x4_2 - blr - MY_ALIGN - - -CGEMM_1x4_L16_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 64,16,0,0 - KERNEL1x4_L2 64,16,1,0 - KERNEL1x4_L2 64,16,2,0 - KERNEL1x4_L2 64,16,3,0 - KERNEL1x4_L2 64,16,4,0 - KERNEL1x4_L2 64,16,5,0 - KERNEL1x4_L2 64,16,6,0 - KERNEL1x4_E2 64,16,7,1 - blr - MY_ALIGN - - -CGEMM_1x4_L8_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 64,16,0,0 - KERNEL1x4_L2 64,16,1,0 - KERNEL1x4_L2 64,16,2,0 - KERNEL1x4_E2 64,16,3,1 - blr - - -CGEMM_1x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x2_2 - MY_ALIGN -CGEMM_L1x2_LOOP: -/*----------------------------------------*/ - KERNEL1x2_L2 32,16,0,0 -CGEMM_L1x2_K32: -/*----------------------------------------*/ - KERNEL1x2_L2 32,16,1,0 - KERNEL1x2_L2 32,16,2,0 - KERNEL1x2_L2 32,16,3,0 - KERNEL1x2_L2 32,16,4,0 - KERNEL1x2_L2 32,16,5,0 - KERNEL1x2_L2 32,16,6,0 - KERNEL1x2_L2 32,16,7,0 - KERNEL1x2_L2 32,16,8,0 - KERNEL1x2_L2 32,16,9,0 - KERNEL1x2_L2 32,16,10,0 - KERNEL1x2_L2 32,16,11,0 - KERNEL1x2_L2 32,16,12,0 - KERNEL1x2_L2 32,16,13,0 - KERNEL1x2_L2 32,16,14,0 - KERNEL1x2_L2 32,16,15,1 - bdnz CGEMM_L1x2_LOOP - MY_ALIGN - - -CGEMM_L1x2_LOOP_END: -/*----------------------------------------*/ - END1x2_2 - blr - MY_ALIGN -CGEMM_1x2_L16_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 32,16,0,0 - KERNEL1x2_L2 32,16,1,0 - KERNEL1x2_L2 32,16,2,0 - KERNEL1x2_L2 32,16,3,0 - KERNEL1x2_L2 32,16,4,0 - KERNEL1x2_L2 32,16,5,0 - KERNEL1x2_L2 32,16,6,0 - KERNEL1x2_E2 32,16,7,1 - blr - MY_ALIGN -CGEMM_1x2_L8_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 32,16,0,0 - KERNEL1x2_L2 32,16,1,0 - KERNEL1x2_L2 32,16,2,0 - KERNEL1x2_E2 32,16,3,1 - blr - - -CGEMM_1x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x1_2 - MY_ALIGN -CGEMM_L1x1_LOOP: -/*----------------------------------------*/ - KERNEL1x1_L2 16,16,0,0 -CGEMM_L1x1_K32: -/*----------------------------------------*/ - KERNEL1x1_L2 16,16,1,0 - KERNEL1x1_L2 16,16,2,0 - KERNEL1x1_L2 16,16,3,0 - KERNEL1x1_L2 16,16,4,0 - KERNEL1x1_L2 16,16,5,0 - KERNEL1x1_L2 16,16,6,0 - KERNEL1x1_L2 16,16,7,0 - KERNEL1x1_L2 16,16,8,0 - KERNEL1x1_L2 16,16,9,0 - KERNEL1x1_L2 16,16,10,0 - KERNEL1x1_L2 16,16,11,0 - KERNEL1x1_L2 16,16,12,0 - KERNEL1x1_L2 16,16,13,0 - KERNEL1x1_L2 16,16,14,0 - KERNEL1x1_L2 16,16,15,1 - bdnz CGEMM_L1x1_LOOP - MY_ALIGN -CGEMM_L1x1_LOOP_END: -/*----------------------------------------*/ - END1x1_2 - blr - - MY_ALIGN -CGEMM_1x1_L16_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 16,16,0,0 - KERNEL1x1_L2 16,16,1,0 - KERNEL1x1_L2 16,16,2,0 - KERNEL1x1_L2 16,16,3,0 - KERNEL1x1_L2 16,16,4,0 - KERNEL1x1_L2 16,16,5,0 - KERNEL1x1_L2 16,16,6,0 - KERNEL1x1_E2 16,16,7,1 - blr - MY_ALIGN - - -CGEMM_1x1_L8_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 16,16,0,0 - KERNEL1x1_L2 16,16,1,0 - KERNEL1x1_L2 16,16,2,0 - KERNEL1x1_E2 16,16,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -CGEMM_L1: -/*----------------------------------------*/ - - andi. J, N, 1 - ble CGEMM_L1_END - -CGEMM_L1_BEGIN: -/*----------------------------------------*/ - mr CO, C - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble CGEMM_L1x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -CGEMM_L1x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T1-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO1x8 - ble CGEMM_L1x8_SUB0 - bl CGEMM_L1x8_LMAIN_SUB - andi. L, T1, 127 - ble CGEMM_L1x8_SAVE - b CGEMM_L1x8_SUB2 - - -CGEMM_L1x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP1x8_128K - addi BO,BO,-8 - addi AO,AO,-64 - LOAD1x8O 64,8 - END1x8_WITHOUT_ADD - LOAD1x8_2O 128, 16 - mtctr T8 - bl CGEMM_L1x8_K128 - b CGEMM_L1x8_SAVE - CMP1x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne CGEMM_L1x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-128 - LOAD1x8_2O 128,16 - bl CGEMM_L1x8_K128 - b CGEMM_L1x8_SAVE - MY_ALIGN - - -CGEMM_L1x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble CGEMM_L1x8_SUB2_32 - bl CGEMM_1x8_L64_SUB - MY_ALIGN - - -CGEMM_L1x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble CGEMM_L1x8_SUB2_16 - bl CGEMM_1x8_L32_SUB - MY_ALIGN - - -CGEMM_L1x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x8_SUB2_8 - bl CGEMM_1x8_L16_SUB - MY_ALIGN - - -CGEMM_L1x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L1x8_SUB2_4 - LOAD1x8_2 - KERNEL1x8_L2 128,16, 0,0 - KERNEL1x8_L2 128,16, 1,0 - KERNEL1x8_L2 128,16, 2,0 - KERNEL1x8_E2 128,16, 3,1 - MY_ALIGN - - -CGEMM_L1x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x8_SUB2_2 - LOAD1x8_2 - KERNEL1x8_L2 128,16, 0,0 - KERNEL1x8_E2 128,16, 1,1 - MY_ALIGN - - -CGEMM_L1x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x8_SUB2_1 - LOAD1x8_2 - KERNEL1x8_E2 128,16, 0,1 - MY_ALIGN - - -CGEMM_L1x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x8_SAVE - KERNEL1x8 - - MY_ALIGN -CGEMM_L1x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - MY_ALIGN - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 -#endif - bgt CGEMM_L1x8_BEGIN - andi. T2, M, 7 - ble CGEMM_L1x1_END - andi. T1, M, 4 - ble CGEMM_L1x4_END - b CGEMM_L1x4_BEGIN - MY_ALIGN - - -CGEMM_L1x8_END: -/*----------------------------------------*/ - - -CGEMM_L1x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble CGEMM_L1x1_END - andi. T1, M, 4 - ble CGEMM_L1x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 31x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 31x */ -#endif - ZERO1x4 - ble CGEMM_L1x4_SUB0 - bl CGEMM_1x4_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L1x4_SAVE - b CGEMM_L1x4_SUB2 - - -CGEMM_L1x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x4_32K - addi BO,BO,-8 - addi AO,AO,-32 - LOAD1x4O 32,8 - END1x4_WITHOUT_ADD - LOAD1x4_2O 64, 16 - mtctr T8 - bl CGEMM_L1x4_K32 - b CGEMM_L1x4_SAVE - CMP1x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L1x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-64 - LOAD1x4_2O 64,16 - bl CGEMM_L1x4_K32 - b CGEMM_L1x4_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L1x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x4_SUB2_8 - bl CGEMM_1x4_L16_SUB - MY_ALIGN - - -CGEMM_L1x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L1x4_SUB2_4 - bl CGEMM_1x4_L8_SUB - MY_ALIGN - - -CGEMM_L1x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x4_SUB2_2 - LOAD1x4_2 - KERNEL1x4_L2 64,16, 0,0 - KERNEL1x4_E2 64,16, 1,1 - MY_ALIGN - - -CGEMM_L1x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x4_SUB2_1 - LOAD1x4_2 - KERNEL1x4_E2 64,16, 0,1 - MY_ALIGN - - -CGEMM_L1x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x4_SAVE - KERNEL1x4 - - -CGEMM_L1x4_SAVE: -/*----------------------------------------*/ - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 -#endif - - -CGEMM_L1x4_END: -/*----------------------------------------*/ - - -CGEMM_L1x2_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 2 - ble CGEMM_L1x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 31x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 31x */ -#endif - ZERO1x2 - ble CGEMM_L1x2_SUB0 - bl CGEMM_1x2_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L1x2_SAVE - b CGEMM_L1x2_SUB2 - - -CGEMM_L1x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x2_32K - addi BO,BO,-8 - addi AO,AO,-16 - LOAD1x2O 16,8 - END1x2_WITHOUT_ADD - LOAD1x2_2O 32, 16 - mtctr T8 - bl CGEMM_L1x2_K32 - b CGEMM_L1x2_SAVE - CMP1x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L1x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-32 - LOAD1x2_2O 32,16 - bl CGEMM_L1x2_K32 - b CGEMM_L1x2_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L1x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x2_SUB2_8 - bl CGEMM_1x2_L16_SUB - MY_ALIGN - - -CGEMM_L1x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L1x2_SUB2_4 - bl CGEMM_1x2_L8_SUB - MY_ALIGN - - -CGEMM_L1x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x2_SUB2_2 - LOAD1x2_2 - KERNEL1x2_L2 32,16, 0,0 - KERNEL1x2_E2 32,16, 1,1 - MY_ALIGN - - -CGEMM_L1x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x2_SUB2_1 - LOAD1x2_2 - KERNEL1x2_E2 32,16, 0,1 - MY_ALIGN - - -CGEMM_L1x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x2_SAVE - KERNEL1x2 - - MY_ALIGN -CGEMM_L1x2_SAVE: -/*----------------------------------------*/ - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 -#endif - - -CGEMM_L1x2_END: -/*----------------------------------------*/ - - -CGEMM_L1x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble CGEMM_L1x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T1-2) % 31x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 31x */ -#endif - ZERO1x1 - ble CGEMM_L1x1_SUB0 - bl CGEMM_1x1_LMAIN_SUB - andi. L, T1, 31 - ble CGEMM_L1x1_SAVE - b CGEMM_L1x1_SUB2 - - -CGEMM_L1x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x1_32K - addi BO,BO,-8 - addi AO,AO,-8 - LOAD1x1O 8,8 - END1x1_WITHOUT_ADD - LOAD1x1_2O 16, 16 - mtctr T8 - bl CGEMM_L1x1_K32 - b CGEMM_L1x1_SAVE - CMP1x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne CGEMM_L1x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-16 - addi AO,AO,-16 - LOAD1x1_2O 16,16 - bl CGEMM_L1x1_K32 - b CGEMM_L1x1_SAVE - MY_ALIGN - MY_ALIGN - - -CGEMM_L1x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble CGEMM_L1x1_SUB2_8 - bl CGEMM_1x1_L16_SUB - MY_ALIGN - - -CGEMM_L1x1_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble CGEMM_L1x1_SUB2_4 - bl CGEMM_1x1_L8_SUB - MY_ALIGN - - -CGEMM_L1x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble CGEMM_L1x1_SUB2_2 - LOAD1x1_2 - KERNEL1x1_L2 16,16, 0,0 - KERNEL1x1_E2 16,16, 1,1 - MY_ALIGN - - -CGEMM_L1x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble CGEMM_L1x1_SUB2_1 - LOAD1x1_2 - KERNEL1x1_E2 16,16, 0,1 - MY_ALIGN - - -CGEMM_L1x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble CGEMM_L1x1_SAVE - KERNEL1x1 - - MY_ALIGN -CGEMM_L1x1_SAVE: -/*----------------------------------------*/ - - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 -#endif - - -CGEMM_L1x1_END: -/*----------------------------------------*/ - slwi T1, K, 3 - - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif - -CGEMM_L1_END: - - - - diff --git a/kernel/power/cgemm_macros_power9.S b/kernel/power/cgemm_macros_power9.S deleted file mode 100644 index a256e1a01..000000000 --- a/kernel/power/cgemm_macros_power9.S +++ /dev/null @@ -1,3019 +0,0 @@ - -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@gmail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ -#define unit_size 8 -#define DISP32(ind,disp) (ind*unit_size*32+disp) -#define DISP16(ind,disp) (ind*unit_size*16+disp) -#define DISP8(ind,disp) (ind*unit_size*8+disp) -#define DISP4(ind,disp) (ind*unit_size*4+disp) -#define DISP2(ind,disp) (ind*unit_size*2+disp) -#define DISP1(ind,disp) (ind*unit_size+disp) -#define DISPX(disp) (disp) - -.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2 -#else // CC || CR || RC || RR - /*we will assume {-alpha_r,-alpha_i} for this case */ - /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ - xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1 - /*we will negate alpha image instead to fix sign*/ - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#endif -.endm - - -.macro AGGREGATE_REALS_IMAGES_A_PERMUTE VSINR_OUT1,VSINR,VSINI_OUT2,VSINI -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - xvsubsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI,\VSINI_OUT2 -#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) - xvaddsp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#else // CC || CR || RC || RR - /*we will assume {-alpha_r,-alpha_i} for this case */ - /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ - xvsubsp \VSINR_OUT1,\VSINR,\VSINR_OUT1 - /*we will negate alpha image instead to fix sign*/ - xvaddsp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#endif -.endm - -/* {i0,i1} * {alpha_i,alpha_i} [- VSOUT1] ;[VSOUT2 +] {r0,r1}*{alpha_i,alpha_i} */ - -.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 - xvmulsp \VSOUT1,\VSINII, alpha_i - xvmulsp \VSOUT2,\VSINRR, alpha_i -.endm - -/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ - -.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 - xvmsubasp \VSOUT1,\VSINRR, alpha_r - xvmaddasp \VSOUT2,\VSINII, alpha_r -.endm - -/* macros for N=4 and M=8 -**********************************************************************************************/ - -.macro Zero4x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endm - - -.macro LOAD4x8 - LOAD4x8O 0,0 -.endm - - -.macro LOAD4x8O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - lxv vs28, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x8_NORMAL - END4x8 AO,BO,64,32 -.endm - - -.macro END4x8_WITHOUT_ADD - END4x8 AO,BO,0,0 -.endm - - -.macro END4x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 -.endm - - -.macro LOAD4x8_2 - LOAD4x8_2O 0,0 -.endm - - -.macro LOAD4x8_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs12, (16+\OffsetB)(BO) - lxv vs24, (32+\OffsetB)(BO) - lxv vs28, (32+16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - lxv vs6, (32+\OffsetA)(AO) - lxv vs7, (48+\OffsetA)(AO) - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - lxv vs0, (64+\OffsetA)(AO) - lxv vs1, (64+16+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - lxv vs2, (64+32+\OffsetA)(AO) - lxv vs3, (64+48+\OffsetA)(AO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x8_2 - /*for load2 offset will be 128 and 64*/ - KERNEL4x8_2 AO,BO, 128,64,0 ,1,1 -.endm - - -.macro KERNEL4x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 -.if \Complete==0 - lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - xvmaddasp vs50, vs6,vs12 - xvmaddasp vs51, vs7,vs12 -.if \Complete==0 - lxv vs8, DISP8(\Index,\OffsetB)(\BREG) - lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - xvmaddasp vs58, vs6,vs14 - xvmaddasp vs59, vs7,vs14 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask -.endif - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 - xvmaddasp vs54, vs6,vs13 - xvmaddasp vs55, vs7,vs13 -.if \Complete==0 - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 -.endif - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 - xvmaddasp vs62, vs6,vs15 - xvmaddasp vs63, vs7,vs15 -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 -.endif -.if \Complete==0 - lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs50, vs2,vs28 - xvmaddasp vs51, vs3,vs28 -.if \Complete==0 - lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs58, vs2,vs30 - xvmaddasp vs59, vs3,vs30 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs54, vs2,vs29 - xvmaddasp vs55, vs3,vs29 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 - xvmaddasp vs62, vs2,vs31 - xvmaddasp vs63, vs3,vs31 -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endif - -.if \Complete==0 - lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP8(\Index,\OffsetB) - addi \AREG, \AREG, DISP16(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP8(\Index,64) - addi \AREG, \AREG, DISP16(\Index,128) -.endif - -.endif -.endm - - -.macro KERNEL4x8 - LOAD4x8 - END4x8 AO, BO, 64,32 -.endm - - -.macro SAVE4x8 - add T4, LDC,LDC - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask -#ifndef TRMMKERNEL - lxv vs26 , 32(CO) - lxv vs27 , 48(CO) -#endif - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask -#ifndef TRMMKERNEL - lxv vs28 , 0(T1) - lxv vs29 , 16(T1) -#endif - xxperm vs2,vs34,permute_mask - xxperm vs6,vs42,permute_mask -#ifndef TRMMKERNEL - lxv vs30 , 32(T1) - lxv vs31 , 48(T1) -#endif - xxperm vs3,vs35,permute_mask - xxperm vs7,vs43,permute_mask - add T2,CO,T4 - add T3,T1,T4 - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 - xxperm vs10,vs38,permute_mask - xxperm vs14,vs46,permute_mask - AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 - xxperm vs11,vs39,permute_mask - xxperm vs15,vs47,permute_mask - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - xxperm vs0,vs48,permute_mask - xxperm vs4,vs56,permute_mask - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - xxperm vs1,vs49,permute_mask - xxperm vs5,vs57,permute_mask - AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 - xxperm vs2,vs50,permute_mask - xxperm vs6,vs58,permute_mask - AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 - xxperm vs3,vs51,permute_mask - xxperm vs7,vs59,permute_mask - AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 - xxperm vs8,vs52,permute_mask - xxperm vs12,vs60,permute_mask - AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 - xxperm vs9,vs53,permute_mask - xxperm vs13,vs61,permute_mask - AGGREGATE_REALS_IMAGES vs50,vs2,vs58,vs6 - xxperm vs10,vs54,permute_mask - xxperm vs14,vs62,permute_mask - AGGREGATE_REALS_IMAGES vs51,vs3,vs59,vs7 - xxperm vs11,vs55,permute_mask - xxperm vs15,vs63,permute_mask - AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 - AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - AGGREGATE_REALS_IMAGES vs54,vs10,vs62,vs14 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - AGGREGATE_REALS_IMAGES vs55,vs11,vs63,vs15 - MULT_APLHA_PART1 vs34,vs42,vs4,vs5 - MULT_APLHA_PART1 vs35,vs43,vs6,vs7 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs34,vs42,vs4,vs5 - MULT_APLHA_PART2 vs35,vs43,vs6,vs7 - #ifndef TRMMKERNEL - lxv vs32 , 0(T2) - lxv vs40 , 16(T2) -#endif - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 -#ifndef TRMMKERNEL - lxv vs33 , 32(T2) - lxv vs41 , 48(T2) -#endif - MULT_APLHA_PART1 vs38,vs46,vs12,vs13 - MULT_APLHA_PART1 vs39,vs47,vs14,vs15 -#ifndef TRMMKERNEL - lxv vs34 , 0(T3) - lxv vs42 , 16(T3) -#endif - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 -#ifndef TRMMKERNEL - lxv vs35 , 32(T3) - lxv vs43 , 48(T3) -#endif - MULT_APLHA_PART2 vs38,vs46,vs12,vs13 - MULT_APLHA_PART2 vs39,vs47,vs14,vs15 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs26,vs26,vs5 - xvaddsp vs27,vs27,vs7 - xvaddsp vs28,vs28,vs9 - xvaddsp vs29,vs29,vs11 - xvaddsp vs30,vs30,vs13 - xvaddsp vs31,vs31,vs15 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs12,vs4,2 - xxpermdi vs27,vs14,vs6,2 - xxpermdi vs28,vs0,vs8,2 - xxpermdi vs29,vs2,vs10,2 - xxpermdi vs30,vs4,vs12,2 - xxpermdi vs31,vs6,vs14,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - MULT_APLHA_PART1 vs48,vs56,vs0,vs1 - MULT_APLHA_PART1 vs49,vs57,vs2,vs3 - stxv vs26 , 32(CO) - stxv vs27 , 48(CO) - MULT_APLHA_PART1 vs50,vs58,vs4,vs5 - MULT_APLHA_PART1 vs51,vs59,vs6,vs7 - stxv vs28 , 0(T1) - stxv vs29 , 16(T1) - MULT_APLHA_PART2 vs48,vs56,vs0,vs1 - MULT_APLHA_PART2 vs49,vs57,vs2,vs3 - stxv vs30 , 32(T1) - stxv vs31 , 48(T1) - MULT_APLHA_PART2 vs50,vs58,vs4,vs5 - MULT_APLHA_PART2 vs51,vs59,vs6,vs7 - MULT_APLHA_PART1 vs52,vs60,vs8,vs9 - MULT_APLHA_PART1 vs53,vs61,vs10,vs11 - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - MULT_APLHA_PART1 vs54,vs62,vs12,vs13 - MULT_APLHA_PART1 vs55,vs63,vs14,vs15 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - MULT_APLHA_PART2 vs52,vs60,vs8,vs9 - MULT_APLHA_PART2 vs53,vs61,vs10,vs11 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - MULT_APLHA_PART2 vs54,vs62,vs12,vs13 - MULT_APLHA_PART2 vs55,vs63,vs14,vs15 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs32,vs32,vs1 - xvaddsp vs40,vs40,vs3 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs33,vs33,vs5 - xvaddsp vs41,vs41,vs7 - xvaddsp vs34,vs34,vs9 - xvaddsp vs42,vs42,vs11 - xvaddsp vs35,vs35,vs13 - xvaddsp vs43,vs43,vs15 -#else - xxpermdi vs32,vs8,vs0,2 - xxpermdi vs40,vs10,vs2,2 - xxpermdi vs33,vs12,vs4,2 - xxpermdi vs41,vs14,vs6,2 - xxpermdi vs34,vs0,vs8,2 - xxpermdi vs42,vs2,vs10,2 - xxpermdi vs35,vs4,vs12,2 - xxpermdi vs43,vs6,vs14,2 -#endif - stxv vs32 , 0(T2) - stxv vs40 , 16(T2) - stxv vs33 , 32(T2) - stxv vs41 , 48(T2) - stxv vs34 , 0(T3) - stxv vs42 , 16(T3) - stxv vs35 , 32(T3) - stxv vs43 , 48(T3) - addi CO, CO, 64 -.endm - -/* macros for N=4 and M=4 -**********************************************************************************************/ - -.macro Zero4x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 -.endm - - -.macro LOAD4x4 - LOAD4x4O 0,0 -.endm - - -.macro LOAD4x4O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - lxv vs28, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x4_NORMAL - END4x4 AO,BO,32,32 -.endm - - -.macro END4x4_WITHOUT_ADD - END4x4 AO,BO,0,0 -.endm - - -.macro END4x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.endm - - -.macro LOAD4x4_2 - LOAD4x4_2O 0,0 -.endm - - -.macro LOAD4x4_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs12, (16+\OffsetB)(BO) - lxv vs24, (32+\OffsetB)(BO) - lxv vs28, (32+16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 - lxv vs0, (32+\OffsetA)(AO) - lxv vs1, (32+16+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endm - - -.macro END4x4_2 - /*for load2 offset will be 64 and 64*/ - KERNEL4x4_2 AO,BO, 64,64,0 ,1,1 -.endm - - -.macro KERNEL4x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs48, vs4,vs12 - xvmaddasp vs49, vs5,vs12 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs56, vs4,vs14 - xvmaddasp vs57, vs5,vs14 -.if \Complete==0 - lxv vs8, DISP8(\Index,\OffsetB)(\BREG) - lxv vs12, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs52, vs4,vs13 - xvmaddasp vs53, vs5,vs13 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxperm vs14, vs12, permute_mask -.endif - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 - xvmaddasp vs60, vs4,vs15 - xvmaddasp vs61, vs5,vs15 -.if \Complete==0 - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs13, vs12, vs12,2 -.endif -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs15, vs14, vs14,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs48, vs0,vs28 - xvmaddasp vs49, vs1,vs28 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs56, vs0,vs30 - xvmaddasp vs57, vs1,vs30 -.if \Complete==0 - lxv vs24, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs28, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs52, vs0,vs29 - xvmaddasp vs53, vs1,vs29 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxperm vs30, vs28, permute_mask -.endif - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs60, vs0,vs31 - xvmaddasp vs61, vs1,vs31 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs29, vs28, vs28,2 -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 - xxpermdi vs31, vs30, vs30,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP8(\Index,\OffsetB) - addi \AREG, \AREG, DISP8(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP8(\Index,64) - addi \AREG, \AREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL4x4 - LOAD4x4 - END4x4 AO, BO, 32,32 -.endm - - -.macro SAVE4x4 - add T4, LDC,LDC - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - add T2,CO,T4 - add T3,T1,T4 -#ifndef TRMMKERNEL - lxv vs26 , 0(T1) - lxv vs27 , 16(T1) -#endif - #ifndef TRMMKERNEL - lxv vs28 , 0(T2) - lxv vs29 , 16(T2) -#endif -#ifndef TRMMKERNEL - lxv vs30 , 0(T3) - lxv vs31 , 16(T3) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - xxperm vs0,vs48,permute_mask - xxperm vs4,vs56,permute_mask - xxperm vs1,vs49,permute_mask - xxperm vs5,vs57,permute_mask - xxperm vs8,vs52,permute_mask - xxperm vs12,vs60,permute_mask - xxperm vs9,vs53,permute_mask - xxperm vs13,vs61,permute_mask - AGGREGATE_REALS_IMAGES vs48,vs0,vs56,vs4 - AGGREGATE_REALS_IMAGES vs49,vs1,vs57,vs5 - AGGREGATE_REALS_IMAGES vs52,vs8,vs60,vs12 - AGGREGATE_REALS_IMAGES vs53,vs9,vs61,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART1 vs48,vs56,vs4,vs5 - MULT_APLHA_PART1 vs49,vs57,vs6,vs7 - MULT_APLHA_PART1 vs52,vs60,vs12,vs13 - MULT_APLHA_PART1 vs53,vs61,vs14,vs15 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs48,vs56,vs4,vs5 - MULT_APLHA_PART2 vs49,vs57,vs6,vs7 - MULT_APLHA_PART2 vs52,vs60,vs12,vs13 - MULT_APLHA_PART2 vs53,vs61,vs14,vs15 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xvaddsp vs26,vs26,vs9 - xvaddsp vs27,vs27,vs11 - xvaddsp vs28,vs28,vs5 - xvaddsp vs29,vs29,vs7 - xvaddsp vs30,vs30,vs13 - xvaddsp vs31,vs31,vs15 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs0,vs8,2 - xxpermdi vs27,vs2,vs10,2 - xxpermdi vs28,vs12,vs4,2 - xxpermdi vs29,vs14,vs6,2 - xxpermdi vs30,vs4,vs12,2 - xxpermdi vs31,vs6,vs14,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 0(T1) - stxv vs27 , 16(T1) - stxv vs28 , 0(T2) - stxv vs29 , 16(T2) - stxv vs30 , 0(T3) - stxv vs31 , 16(T3) - addi CO, CO, 32 -.endm - -/* macros for N=4 and M=2 -**********************************************************************************************/ - -.macro Zero4x2 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 -.endm - - -.macro LOAD4x2 - LOAD4x2O 0,0 -.endm - - -.macro LOAD4x2O OffsetA,OffsetB - lxv vs24, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - lxv vs1, (\OffsetB+16)(BO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END4x2_NORMAL - END4x2 AO,BO,16,32 -.endm - - -.macro END4x2_WITHOUT_ADD - END4x2 AO,BO,0,0 -.endm - - -.macro END4x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.endm - - -.macro LOAD4x2_2 - LOAD4x2_2O 0,0 -.endm - - -.macro LOAD4x2_2O OffsetA,OffsetB - lxv vs8, (\OffsetA)(AO) - lxv vs24, (16+\OffsetA)(AO) - lxv vs4, (0+\OffsetB)(BO) - lxv vs5, (16+\OffsetB)(BO) - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - lxv vs0, (32+\OffsetB)(BO) - lxv vs1, (32+16+\OffsetB)(BO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END4x2_2 - /*for load2 offset will be 32 and 64*/ - KERNEL4x2_2 AO,BO, 32,64,0 ,1,1 -.endm - - -.macro KERNEL4x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) - lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP8(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,32) - addi \BREG, \BREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL4x2 - LOAD4x2 - END4x2 AO, BO, 16,32 -.endm - - -.macro SAVE4x2 - add T4, LDC,LDC - add T1, CO ,LDC - add T2,CO,T4 - add T3,T1,T4 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxv vs25 , 0(T1) -#endif -#ifndef TRMMKERNEL - lxv vs26 , 0(T2) -#endif -#ifndef TRMMKERNEL - lxv vs27 , 0(T3) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs37,vs9,vs45,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,0 - xxpermdi vs9,vs10,vs2,0 - xxpermdi vs3,vs0,vs8,3 - xxpermdi vs11,vs2,vs10,3 - xvaddsp vs24,vs24,vs1 - xvaddsp vs26,vs26,vs9 - xvaddsp vs25,vs25,vs3 - xvaddsp vs27,vs27,vs11 -#else - xxpermdi vs24,vs8,vs0,0 - xxpermdi vs26,vs10,vs2,0 - xxpermdi vs25,vs0,vs8,3 - xxpermdi vs27,vs2,vs10,3 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 0(T1) - stxv vs26 , 0(T2) - stxv vs27 , 0(T3) - addi CO, CO, 16 -.endm - -/* macros for N=4 and M=2 -**********************************************************************************************/ - -.macro Zero4x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 -.endm - - -.macro LOAD4x1 - LOAD4x1O 0,0 -.endm - - -.macro LOAD4x1O OffsetA,OffsetB - lxsd v4, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - lxv vs1, (\OffsetB+16)(BO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END4x1_NORMAL - END4x1 AO,BO,8,32 -.endm - - -.macro END4x1_WITHOUT_ADD - END4x1 AO,BO,0,0 -.endm - - -.macro END4x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.endm - - -.macro LOAD4x1_2 - LOAD4x1_2O 0,0 -.endm - - -.macro LOAD4x1_2O OffsetA,OffsetB - lxv vs27, (\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - lxv vs4, (0+\OffsetB)(BO) - lxv vs5, (16+\OffsetB)(BO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask - lxv vs0, (32+\OffsetB)(BO) - lxv vs1, (32+16+\OffsetB)(BO) -.endm - - -.macro END4x1_2 - /*for load2 offset will be 16 and 64*/ - KERNEL4x1_2 AO,BO, 16,64,0 ,1,1 -.endm - - -.macro KERNEL4x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL4x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL4x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetA)(\AREG) - xxspltd vs8,vs27,1 -.endif -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetB)(\BREG) - lxv vs5, DISP8(\Index,16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetB)(\BREG) - lxv vs1, DISP8(\Index,32+16+\OffsetB)(\BREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP8(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,16) - addi \BREG, \BREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL4x1 - LOAD4x1 - END4x1 AO, BO, 8,32 -.endm - - -.macro SAVE4x1 - add T4, LDC,LDC - add T1, CO ,LDC - add T2,CO,T4 - add T3,T1,T4 -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxsd v5 , 0(T1) -#endif -#ifndef TRMMKERNEL - lxsd v6 , 0(T2) -#endif -#ifndef TRMMKERNEL - lxsd v7 , 0(T3) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxspltd vs1,vs0,0 - xxspltd vs3,vs0,1 - xxspltd vs9,vs2,0 - xxspltd vs11,vs2,1 - /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ - xvaddsp vs36,vs36,vs1 - xvaddsp vs37,vs37,vs3 - xvaddsp vs38,vs38,vs9 - xvaddsp vs39,vs39,vs11 -#else - /*--v4==vs36 v5==vs37 v6==vs38 v7==vs39---*/ - xxspltd vs36,vs0,0 - xxspltd vs37,vs0,1 - xxspltd vs38,vs2,0 - xxspltd vs39,vs2,1 -#endif - stxsd v4 , 0(CO) - stxsd v5 , 0(T1) - stxsd v6 , 0(T2) - stxsd v7 , 0(T3) - addi CO, CO, 8 -.endm - -/* macros for N=2 and M=8 -**********************************************************************************************/ - -.macro Zero2x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 -.endm - - -.macro LOAD2x8 - LOAD2x8O 0,0 -.endm - - -.macro LOAD2x8O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - xxperm vs26, vs24, permute_mask - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x8_NORMAL - END2x8 AO,BO,64,16 -.endm - - -.macro END2x8_WITHOUT_ADD - END2x8 AO,BO,0,0 -.endm - - -.macro END2x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 -.endm - - -.macro LOAD2x8_2 - LOAD2x8_2O 0,0 -.endm - - -.macro LOAD2x8_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs24, (16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask - lxv vs6, (32+\OffsetA)(AO) - lxv vs7, (48+\OffsetA)(AO) - lxv vs0, (64+\OffsetA)(AO) - lxv vs1, (64+16+\OffsetA)(AO) - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs25, vs24, vs24,2 - lxv vs2, (64+32+\OffsetA)(AO) - lxv vs3, (64+48+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x8_2 - /*for load2 offset will be 128 and 32*/ - KERNEL2x8_2 AO,BO, 128,32,0 ,1,1 -.endm - - -.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 -.if \Complete==0 - lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 - xvmaddasp vs38, vs6,vs9 - xvmaddasp vs39, vs7,vs9 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif - xvmaddasp vs46, vs6,vs11 - xvmaddasp vs47, vs7,vs11 -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif -.if \Complete==0 - lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 - xvmaddasp vs38, vs2,vs25 - xvmaddasp vs39, vs3,vs25 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif - xvmaddasp vs46, vs2,vs27 - xvmaddasp vs47, vs3,vs27 -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \Complete==0 - lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP4(\Index,\OffsetB) - addi \AREG, \AREG, DISP16(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP4(\Index,32) - addi \AREG, \AREG, DISP16(\Index,128) -.endif - -.endif -.endm - - -.macro KERNEL2x8 - LOAD2x8 - END2x8 AO, BO, 64,16 -.endm - - -.macro SAVE2x8 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask -#ifndef TRMMKERNEL - lxv vs26 , 32(CO) - lxv vs27 , 48(CO) -#endif - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask -#ifndef TRMMKERNEL - lxv vs28 , 0(T1) - lxv vs29 , 16(T1) -#endif - xxperm vs2,vs34,permute_mask - xxperm vs6,vs42,permute_mask -#ifndef TRMMKERNEL - lxv vs30 , 32(T1) - lxv vs31 , 48(T1) -#endif - xxperm vs3,vs35,permute_mask - xxperm vs7,vs43,permute_mask - add T2,CO,T4 - add T3,T1,T4 - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 - xxperm vs10,vs38,permute_mask - xxperm vs14,vs46,permute_mask - AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 - xxperm vs11,vs39,permute_mask - xxperm vs15,vs47,permute_mask - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - AGGREGATE_REALS_IMAGES vs38,vs10,vs46,vs14 - AGGREGATE_REALS_IMAGES vs39,vs11,vs47,vs15 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs34,vs42,vs4,vs5 - MULT_APLHA_PART1 vs35,vs43,vs6,vs7 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs34,vs42,vs4,vs5 - MULT_APLHA_PART2 vs35,vs43,vs6,vs7 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART1 vs38,vs46,vs12,vs13 - MULT_APLHA_PART1 vs39,vs47,vs14,vs15 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs38,vs46,vs12,vs13 - MULT_APLHA_PART2 vs39,vs47,vs14,vs15 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs4,vs5, save_permute_1 - xxperm vs6,vs7, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 - xxperm vs12,vs13, save_permute_1 - xxperm vs14,vs15, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs5,vs12,vs4,2 - xxpermdi vs7,vs14,vs6,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xxpermdi vs13,vs4,vs12,2 - xxpermdi vs15,vs6,vs14,2 - xvaddsp vs26,vs26,vs5 - xvaddsp vs27,vs27,vs7 - xvaddsp vs28,vs28,vs9 - xvaddsp vs29,vs29,vs11 - xvaddsp vs30,vs30,vs13 - xvaddsp vs31,vs31,vs15 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs12,vs4,2 - xxpermdi vs27,vs14,vs6,2 - xxpermdi vs28,vs0,vs8,2 - xxpermdi vs29,vs2,vs10,2 - xxpermdi vs30,vs4,vs12,2 - xxpermdi vs31,vs6,vs14,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 32(CO) - stxv vs27 , 48(CO) - stxv vs28 , 0(T1) - stxv vs29 , 16(T1) - stxv vs30 , 32(T1) - stxv vs31 , 48(T1) - addi CO, CO, 64 -.endm - -/* macros for N=2 and M=4 -**********************************************************************************************/ - -.macro Zero2x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 -.endm - - -.macro LOAD2x4 - LOAD2x4O 0,0 -.endm - - -.macro LOAD2x4O OffsetA,OffsetB - lxv vs24, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x4_NORMAL - END2x4 AO,BO,32,16 -.endm - - -.macro END2x4_WITHOUT_ADD - END2x4 AO,BO,0,0 -.endm - - -.macro END2x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.endm - - -.macro LOAD2x4_2 - LOAD2x4_2O 0,0 -.endm - - -.macro LOAD2x4_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs24, (16+\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxpermdi vs25, vs24, vs24,2 - lxv vs0, (32+\OffsetA)(AO) - lxv vs1, (32+16+\OffsetA)(AO) - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x4_2 - /*for load2 offset will be 64 and 32*/ - KERNEL2x4_2 AO,BO, 64,32,0 ,1,1 -.endm - - -.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs37, vs5,vs9 - xvmaddasp vs44, vs4,vs11 - xvmaddasp vs45, vs5,vs11 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs37, vs1,vs25 - xvmaddasp vs44, vs0,vs27 - xvmaddasp vs45, vs1,vs27 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP4(\Index,\OffsetB) - addi \AREG, \AREG, DISP8(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP4(\Index,32) - addi \AREG, \AREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL2x4 - LOAD2x4 - END2x4 AO, BO, 32,16 -.endm - - -.macro SAVE2x4 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif -#ifndef TRMMKERNEL - lxv vs26 , 0(T1) - lxv vs27 , 16(T1) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - xxperm vs9,vs37,permute_mask - xxperm vs13,vs45,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES vs36,vs8,vs44,vs12 - AGGREGATE_REALS_IMAGES vs37,vs9,vs45,vs13 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART1 vs37,vs45,vs10,vs11 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs37,vs45,vs10,vs11 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs2,vs3, save_permute_1 - xxperm vs8,vs9, save_permute_1 - xxperm vs10,vs11, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,2 - xxpermdi vs3,vs10,vs2,2 - xxpermdi vs9,vs0,vs8,2 - xxpermdi vs11,vs2,vs10,2 - xvaddsp vs24,vs24,vs1 - xvaddsp vs25,vs25,vs3 - xvaddsp vs26,vs26,vs9 - xvaddsp vs27,vs27,vs11 -#else - xxpermdi vs24,vs8,vs0,2 - xxpermdi vs25,vs10,vs2,2 - xxpermdi vs26,vs0,vs8,2 - xxpermdi vs27,vs2,vs10,2 -#endif - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 0(T1) - stxv vs27 , 16(T1) - addi CO, CO, 32 -.endm - -/* macros for N=2 and M=2 -**********************************************************************************************/ - -.macro Zero2x2 - xxlxor vs32, vs32, vs32 - xxlxor vs36, vs36, vs36 - xxlxor vs40, vs40, vs40 - xxlxor vs44, vs44, vs44 -.endm - - -.macro LOAD2x2 - LOAD2x2O 0,0 -.endm - - -.macro LOAD2x2O OffsetA,OffsetB - lxv vs24, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x2_NORMAL - END2x2 AO,BO,16,16 -.endm - - -.macro END2x2_WITHOUT_ADD - END2x2 AO,BO,0,0 -.endm - - -.macro END2x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs44, vs0,vs27 -.endm - - -.macro LOAD2x2_2 - LOAD2x2_2O 0,0 -.endm - - -.macro LOAD2x2_2O OffsetA,OffsetB - lxv vs8, (\OffsetA)(AO) - lxv vs24, (16+\OffsetA)(AO) - lxv vs4, (0+\OffsetB)(BO) - lxv vs0, (16+\OffsetB)(BO) - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs11, vs10, vs10,2 - xxpermdi vs27, vs26, vs26,2 -.endm - - -.macro END2x2_2 - /*for load2 offset will be 32 and 32*/ - KERNEL2x2_2 AO,BO, 32,32,0 ,1,1 -.endm - - -.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs8, DISP4(\Index,\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs4,vs9 - xvmaddasp vs44, vs4,vs11 -.if \Complete==0 - xxperm vs10, vs8, permute_mask - xxpermdi vs9, vs8, vs8,2 -.endif -.if \Complete==0 - lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs11, vs10, vs10,2 -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.if \Complete==0 - lxv vs24, DISP4(\Index,16+\OffsetA)(\AREG) -.endif - xvmaddasp vs36, vs0,vs25 - xvmaddasp vs44, vs0,vs27 -.if \Complete==0 - xxperm vs26, vs24, permute_mask - xxpermdi vs25, vs24, vs24,2 -.endif -.if \Complete==0 - lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxpermdi vs27, vs26, vs26,2 -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,32) - addi \BREG, \BREG, DISP4(\Index,32) -.endif - -.endif -.endm - - -.macro KERNEL2x2 - LOAD2x2 - END2x2 AO, BO, 16,16 -.endm - - -.macro SAVE2x2 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxv vs26 , 0(T1) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs8,vs36,permute_mask - xxperm vs12,vs44,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs36,vs8,vs44,vs12 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs36,vs44,vs8,vs9 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs36,vs44,vs8,vs9 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 - xxperm vs8,vs9, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxpermdi vs1,vs8,vs0,0 - xxpermdi vs9,vs0,vs8,3 - xvaddsp vs24,vs24,vs1 - xvaddsp vs26,vs26,vs9 -#else - xxpermdi vs24,vs8,vs0,0 - xxpermdi vs26,vs0,vs8,3 -#endif - stxv vs24 , 0(CO) - stxv vs26 , 0(T1) - addi CO, CO, 16 -.endm - -/* macros for N=2 and M=1 -**********************************************************************************************/ - -.macro Zero2x1 - xxlxor vs32, vs32, vs32 - xxlxor vs40, vs40, vs40 -.endm - - -.macro LOAD2x1 - LOAD2x1O 0,0 -.endm - - -.macro LOAD2x1O OffsetA,OffsetB - lxsd v4, (\OffsetA+0)(AO) - lxv vs0, (\OffsetB+0)(BO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END2x1_NORMAL - END2x1 AO,BO,8,16 -.endm - - -.macro END2x1_WITHOUT_ADD - END2x1 AO,BO,0,0 -.endm - - -.macro END2x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.endm - - -.macro LOAD2x1_2 - LOAD2x1_2O 0,0 -.endm - - -.macro LOAD2x1_2O OffsetA,OffsetB - lxv vs27, (\OffsetA)(AO) - lxv vs4, (0+\OffsetB)(BO) - lxv vs0, (16+\OffsetB)(BO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END2x1_2 - /*for load2 offset will be 16 and 32*/ - KERNEL2x1_2 AO,BO, 16,32,0 ,1,1 -.endm - - -.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetA)(\AREG) - xxspltd vs8,vs27,1 -.endif -.if \Complete==0 - lxv vs4, DISP4(\Index,0+\OffsetB)(\BREG) -.endif - -.if \Complete==0 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \Complete==0 - lxv vs0, DISP4(\Index,16+\OffsetB)(\BREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,16) - addi \BREG, \BREG, DISP4(\Index,32) -.endif - -.endif -.endm - - -.macro KERNEL2x1 - LOAD2x1 - END2x1 AO, BO, 8,16 -.endm - - -.macro SAVE2x1 - add T1, CO ,LDC -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) -#endif -#ifndef TRMMKERNEL - lxsd v5 , 0(T1) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - AGGREGATE_REALS_IMAGES_A_PERMUTE vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES_A_PERMUTE vs33,vs1,vs41,vs5 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, save_permute_1 -#ifndef TRMMKERNEL - /* add */ - xxspltd vs1,vs0,0 - xxspltd vs3,vs0,1 - /*--v4==vs36 v5==vs37---*/ - xvaddsp vs36,vs36,vs1 - xvaddsp vs37,vs37,vs3 -#else - /*--v4==vs36 v5==vs37---*/ - xxspltd vs36,vs0,0 - xxspltd vs37,vs0,1 -#endif - stxsd v4 , 0(CO) - stxsd v5 , 0(T1) - addi CO, CO, 8 -.endm - -/* macros for N=1 and M=8 -**********************************************************************************************/ - -.macro Zero1x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 -.endm - - -.macro LOAD1x8 - LOAD1x8O 0,0 -.endm - - -.macro LOAD1x8O OffsetA,OffsetB - lxsd vs4, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - lxv vs2, (\OffsetA+32)(AO) - lxv vs3, (\OffsetA+48)(AO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x8_NORMAL - END1x8 AO,BO,64,8 -.endm - - -.macro END1x8_WITHOUT_ADD - END1x8 AO,BO,0,0 -.endm - - -.macro END1x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 -.endm - - -.macro LOAD1x8_2 - LOAD1x8_2O 0,0 -.endm - - -.macro LOAD1x8_2O OffsetA,OffsetB - lxv vs27, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - lxv vs6, (32+\OffsetA)(AO) - lxv vs7, (48+\OffsetA)(AO) - lxv vs0, (64+\OffsetA)(AO) - lxv vs1, (64+16+\OffsetA)(AO) - lxv vs2, (64+32+\OffsetA)(AO) - lxv vs3, (64+48+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x8_2 - /*for load2 offset will be 128 and 16*/ - KERNEL1x8_2 AO,BO, 128,16,0 ,1,1 -.endm - - -.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs4, DISP16(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs6,vs8 - xvmaddasp vs35, vs7,vs8 - xvmaddasp vs42, vs6,vs10 - xvmaddasp vs43, vs7,vs10 -.if \Complete==0 - lxv vs6, DISP16(\Index,32+\OffsetA)(\AREG) - lxv vs7, DISP16(\Index,48+\OffsetA)(\AREG) -.endif -.if \Complete==0 - xxspltd vs8,vs27,1 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(\AREG) - lxv vs1, DISP16(\Index,64+16+\OffsetA)(\AREG) -.endif - - xvmaddasp vs34, vs2,vs24 - xvmaddasp vs35, vs3,vs24 - xvmaddasp vs42, vs2,vs26 - xvmaddasp vs43, vs3,vs26 -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \Complete==0 - lxv vs2, DISP16(\Index,64+32+\OffsetA)(\AREG) - lxv vs3, DISP16(\Index,64+48+\OffsetA)(\AREG) -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP16(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP16(\Index,128) -.endif - -.endif -.endm - - -.macro KERNEL1x8 - LOAD1x8 - END1x8 AO, BO, 64,8 -.endm - - -.macro SAVE1x8 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask -#ifndef TRMMKERNEL - lxv vs26 , 32(CO) - lxv vs27 , 48(CO) -#endif - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - xxperm vs2,vs34,permute_mask - xxperm vs6,vs42,permute_mask - xxperm vs3,vs35,permute_mask - xxperm vs7,vs43,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - AGGREGATE_REALS_IMAGES vs34,vs2,vs42,vs6 - AGGREGATE_REALS_IMAGES vs35,vs3,vs43,vs7 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART1 vs34,vs42,vs4,vs5 - MULT_APLHA_PART1 vs35,vs43,vs6,vs7 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs34,vs42,vs4,vs5 - MULT_APLHA_PART2 vs35,vs43,vs6,vs7 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, vs28 - xxperm vs2,vs3, vs28 - xxperm vs4,vs5, vs28 - xxperm vs6,vs7, vs28 -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs24,vs24,vs0 - xvaddsp vs25,vs25,vs2 - xvaddsp vs26,vs26,vs4 - xvaddsp vs27,vs27,vs6 - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) - stxv vs26 , 32(CO) - stxv vs27 , 48(CO) -#else -/* reconstruct r,i pairs*/ - stxv vs0 , 0(CO) - stxv vs2 , 16(CO) - stxv vs4 , 32(CO) - stxv vs6 , 48(CO) -#endif - addi CO, CO, 64 -.endm - -/* macros for N=1 and M=4 -**********************************************************************************************/ - -.macro Zero1x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 -.endm - - -.macro LOAD1x4 - LOAD1x4O 0,0 -.endm - - -.macro LOAD1x4O OffsetA,OffsetB - lxsd vs4, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - lxv vs1, (\OffsetA+16)(AO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x4_NORMAL - END1x4 AO,BO,32,8 -.endm - - -.macro END1x4_WITHOUT_ADD - END1x4 AO,BO,0,0 -.endm - - -.macro END1x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.endm - - -.macro LOAD1x4_2 - LOAD1x4_2O 0,0 -.endm - - -.macro LOAD1x4_2O OffsetA,OffsetB - lxv vs27, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs5, (16+\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - lxv vs0, (32+\OffsetA)(AO) - lxv vs1, (32+16+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x4_2 - /*for load2 offset will be 64 and 16*/ - KERNEL1x4_2 AO,BO, 64,16,0 ,1,1 -.endm - - -.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs33, vs5,vs8 - xvmaddasp vs40, vs4,vs10 - xvmaddasp vs41, vs5,vs10 -.if \Complete==0 - lxv vs4, DISP8(\Index,0+\OffsetA)(\AREG) - lxv vs5, DISP8(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs8,vs27,1 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs33, vs1,vs24 - xvmaddasp vs40, vs0,vs26 - xvmaddasp vs41, vs1,vs26 -.if \Complete==0 - lxv vs0, DISP8(\Index,32+\OffsetA)(\AREG) - lxv vs1, DISP8(\Index,32+16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP8(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP8(\Index,64) -.endif - -.endif -.endm - - -.macro KERNEL1x4 - LOAD1x4 - END1x4 AO, BO, 32,8 -.endm - - -.macro SAVE1x4 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) - lxv vs25 , 16(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - xxperm vs1,vs33,permute_mask - xxperm vs5,vs41,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - AGGREGATE_REALS_IMAGES vs33,vs1,vs41,vs5 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART1 vs33,vs41,vs2,vs3 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs33,vs41,vs2,vs3 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, vs28 - xxperm vs2,vs3, vs28 -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs24,vs24,vs0 - xvaddsp vs25,vs25,vs2 - stxv vs24 , 0(CO) - stxv vs25 , 16(CO) -#else -/* reconstruct r,i pairs*/ - stxv vs0 , 0(CO) - stxv vs2 , 16(CO) -#endif - addi CO, CO, 32 -.endm - -/* macros for N=1 and M=2 -**********************************************************************************************/ - -.macro Zero1x2 - xxlxor vs32, vs32, vs32 - xxlxor vs40, vs40, vs40 -.endm - - -.macro LOAD1x2 - LOAD1x2O 0,0 -.endm - - -.macro LOAD1x2O OffsetA,OffsetB - lxsd vs4, (\OffsetB+0)(BO) - lxv vs0, (\OffsetA+0)(AO) - xxspltd vs24,vs36,0 - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x2_NORMAL - END1x2 AO,BO,16,8 -.endm - - -.macro END1x2_WITHOUT_ADD - END1x2 AO,BO,0,0 -.endm - - -.macro END1x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.endm - - -.macro LOAD1x2_2 - LOAD1x2_2O 0,0 -.endm - - -.macro LOAD1x2_2O OffsetA,OffsetB - lxv vs27, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - lxv vs0, (16+\OffsetA)(AO) - xxspltd vs8,vs27,1 - xxspltd vs24,vs27,0 - xxperm vs10, vs8, permute_mask - xxperm vs26, vs24, permute_mask -.endm - - -.macro END1x2_2 - /*for load2 offset will be 32 and 16*/ - KERNEL1x2_2 AO,BO, 32,16,0 ,1,1 -.endm - - -.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete -.if \Complete==0 - lxv vs27, DISP2(\Index,\OffsetB)(\BREG) -.endif - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs4, DISP4(\Index,0+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs8,vs27,1 - xxperm vs10, vs8, permute_mask -.endif - xvmaddasp vs32, vs0,vs24 - xvmaddasp vs40, vs0,vs26 -.if \Complete==0 - lxv vs0, DISP4(\Index,16+\OffsetA)(\AREG) -.endif - -.if \Complete==0 - xxspltd vs24,vs27,0 - xxperm vs26, vs24, permute_mask -.endif -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP4(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP4(\Index,32) -.endif - -.endif -.endm - - -.macro KERNEL1x2 - LOAD1x2 - END1x2 AO, BO, 16,8 -.endm - - -.macro SAVE1x2 -#ifndef TRMMKERNEL - lxv vs24 , 0(CO) -#endif - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs0,vs1 - MULT_APLHA_PART2 vs32,vs40,vs0,vs1 -/* reconstruct r,i pairs*/ - xxperm vs0,vs1, vs28 -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs24,vs24,vs0 - stxv vs24 , 0(CO) -#else -/* reconstruct r,i pairs*/ - stxv vs0 , 0(CO) -#endif - addi CO, CO, 16 -.endm - -/* macros for N=1 and M=1 -**********************************************************************************************/ -.macro Zero1x1 - xxlxor vs32, vs32, vs32 - xxlxor vs40, vs40, vs40 -.endm - - -.macro LOAD1x1 - LOAD1x1O 0,0 -.endm - - -.macro LOAD1x1O OffsetA,OffsetB - lxsd v4, (\OffsetB+0)(BO) - lxsd v5, (\OffsetA+0)(AO) - xxperm vs38, vs36, permute_mask -.endm - - -.macro END1x1_NORMAL - END1x1 AO,BO,8,8 -.endm - - -.macro END1x1_WITHOUT_ADD - END1x1 AO,BO,0,0 -.endm - - -.macro END1x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif - -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - - xvmaddasp vs32, vs37,vs36 - xvmaddasp vs40, vs37,vs38 -.endm - - -.macro LOAD1x1_2 - LOAD1x1_2O 0,0 -.endm - - -.macro LOAD1x1_2O OffsetA,OffsetB - lxv vs8, (\OffsetB)(BO) - lxv vs4, (0+\OffsetA)(AO) - xxperm vs10, vs8, permute_mask -.endm - - -.macro END1x1_2 - /*for load2 offset will be 16 and 16*/ - KERNEL1x1_2 AO,BO, 16,16,0 ,1,1 -.endm - - -.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - - xvmaddasp vs32, vs4,vs8 - xvmaddasp vs40, vs4,vs10 -.if \Complete==0 - lxv vs8, DISP2(\Index,\OffsetB)(\BREG) - lxv vs4, DISP2(\Index,\OffsetB)(\AREG) - xxperm vs10, vs8, permute_mask -.endif - -.if \IsLast==1 -.if \Complete==1 - addi \BREG, \BREG, DISP2(\Index,\OffsetB) - addi \AREG, \AREG, DISP2(\Index,\OffsetA) -.else - addi \BREG, \BREG, DISP2(\Index,16) - addi \AREG, \AREG, DISP2(\Index,16) -.endif - -.endif -.endm - - -.macro KERNEL1x1 - LOAD1x1 - END1x1 AO, BO, 8,8 -.endm - - -.macro SAVE1x1 -#ifndef TRMMKERNEL - lxsd v4 , 0(CO) -#endif - /*aggregate x2*/ - xxpermdi vs33,vs32,vs32,2 - xxpermdi vs41,vs40,vs40,2 - xvaddsp vs32,vs32,vs33 - xvaddsp vs40,vs40,vs41 - - xxperm vs0,vs32,permute_mask - xxperm vs4,vs40,permute_mask - AGGREGATE_REALS_IMAGES vs32,vs0,vs40,vs4 - /*inner reverse save_permute and store vs28 */ - xxpermdi vs28,save_permute_1,save_permute_1,2 - /*VSINRR,VSINII,VSOUT1,VSOUT2*/ - MULT_APLHA_PART1 vs32,vs40,vs37,vs1 - MULT_APLHA_PART2 vs32,vs40,vs37,vs1 - -/* reconstruct r,i pairs*/ - xxperm vs37,vs1, vs28 - -#ifndef TRMMKERNEL - /* add */ - xvaddsp vs36,vs36,vs37 - stxsd v4 , 0(CO) -#else - -/* vs37 is v5 */ - stxsd v5 , 0(CO) -#endif - addi CO, CO, 8 -.endm - - - - -/****************************TRMM POINTER REFRESH MACROSES*************************/ - - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 7 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 4 - .elseif \SHIFT_VAL==1 - slwi \REG1, \REG2, 3 - .endif -.endm - -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*8; -// ptrbb = bb + off*4; -// #endif -*/ -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb = bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - - #else - /* - // ptrba =ptrba+ off*C_A; - // ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+8; // number of values in A -// #else -// temp = off+4; // number of values in B -// #endif -*/ -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - /* temp = bk-off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - - #elif defined(LEFT) - /* temp = off+INCR_A; // number of values in A */ - addi \TEMP_BK, \OFF_VAL, \INCR_A - #else - /* temp = off+INCR_B // number of values in B*/ - addi \TEMP_BK,\OFF_VAL, \INCR_B - #endif - -.endm -/* -// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// temp = bk - off; -// #ifdef LEFT -// temp -= 8; // number of values in A -// #else -// temp -= 4; // number of values in B -// #endif -// ptrba += temp*8; -// ptrbb += temp*4; -// #endif - -// #ifdef LEFT -// off += 8; // number of values in A -// #endif -*/ - - -.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B - - #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /*temp = bk - off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #ifdef LEFT - /*temp -= 8; // number of values in A*/ - addi \TEMP_BK,\TEMP_BK,-\C_A - #else - /*temp -= 4; // number of values in B*/ - addi \TEMP_BK,\TEMP_BK,-\C_B - #endif - /*ptrba += temp*C_A; - ptrbb += temp*C_B;*/ - SHIFT_REG T4,\TEMP_BK,\C_A - SHIFT_REG T2,\TEMP_BK,\C_B - add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ - add \PTR_B, \PTR_B,T2 - - #endif - - #ifdef LEFT - /*off += 8; // number of values in A*/ - addi \OFF_VAL,\OFF_VAL,\C_A - #endif -.endm \ No newline at end of file diff --git a/kernel/power/cgemv_n.c b/kernel/power/cgemv_n.c deleted file mode 100644 index cb01e196e..000000000 --- a/kernel/power/cgemv_n.c +++ /dev/null @@ -1,585 +0,0 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ - -#include -#include -#include "common.h" -#include -#define NBMAX 1024 - - -static const unsigned char swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; - - -static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; - register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; - register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; - register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; - register __vector float vx2_r = {x[4], x[4],x[4], x[4]}; - register __vector float vx2_i = {-x[5], x[5],-x[5], x[5]}; - register __vector float vx3_r = {x[6], x[6],x[6], x[6]}; - register __vector float vx3_i = {-x[7], x[7],-x[7], x[7]}; -#else - register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; - register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; - register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; - register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; - register __vector float vx2_r = {x[4], -x[4],x[4], -x[4]}; - register __vector float vx2_i = {x[5], x[5],x[5], x[5]}; - register __vector float vx3_r = {x[6], -x[6],x[6], -x[6]}; - register __vector float vx3_i = {x[7], x[7],x[7], x[7]}; -#endif - register __vector float *vy = (__vector float *) y; - register __vector float *vptr_a0 = (__vector float *) a0; - register __vector float *vptr_a1 = (__vector float *) a1; - register __vector float *vptr_a2 = (__vector float *) a2; - register __vector float *vptr_a3 = (__vector float *) a3; - BLASLONG i = 0; - for (;i< n / 2; i+=2) { - register __vector float vy_0 = vy[i]; - register __vector float vy_1 = vy[i + 1]; - register __vector float va0 = vptr_a0[i]; - register __vector float va1 = vptr_a1[i]; - register __vector float va2 = vptr_a2[i]; - register __vector float va3 = vptr_a3[i]; - register __vector float va0_1 = vptr_a0[i + 1]; - register __vector float va1_1 = vptr_a1[i + 1]; - register __vector float va2_1 = vptr_a2[i + 1]; - register __vector float va3_1 = vptr_a3[i + 1]; - - vy_0 += va0*vx0_r + va1*vx1_r + va2*vx2_r + va3*vx3_r; - vy_1 += va0_1*vx0_r + va1_1*vx1_r + va2_1*vx2_r + va3_1*vx3_r; - va0 = vec_perm(va0, va0,swap_mask); - va0_1 = vec_perm(va0_1, va0_1,swap_mask); - va1 = vec_perm(va1, va1,swap_mask); - va1_1 = vec_perm(va1_1, va1_1,swap_mask); - va2 = vec_perm(va2, va2,swap_mask); - va2_1 = vec_perm(va2_1, va2_1,swap_mask); - va3 = vec_perm(va3, va3,swap_mask); - va3_1 = vec_perm(va3_1, va3_1,swap_mask); - vy_0 += va0*vx0_i + va1*vx1_i + va2*vx2_i + va3*vx3_i; - vy_1 += va0_1*vx0_i + va1_1*vx1_i + va2_1*vx2_i + va3_1*vx3_i; - - vy[i] = vy_0; - vy[i + 1] = vy_1; - } - -} - - - -static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { - - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; - register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; - register __vector float vx1_r = {x[2], x[2],x[2], x[2]}; - register __vector float vx1_i = {-x[3], x[3],-x[3], x[3]}; -#else - register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; - register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; - register __vector float vx1_r = {x[2], -x[2],x[2], -x[2]}; - register __vector float vx1_i = {x[3], x[3],x[3], x[3]}; -#endif - register __vector float *vy = (__vector float *) y; - register __vector float *vptr_a0 = (__vector float *) a0; - register __vector float *vptr_a1 = (__vector float *) a1; - BLASLONG i = 0; - for (;i< n / 2; i+=2) { - register __vector float vy_0 = vy[i]; - register __vector float vy_1 = vy[i + 1]; - register __vector float va0 = vptr_a0[i]; - register __vector float va1 = vptr_a1[i]; - register __vector float va0_1 = vptr_a0[i + 1]; - register __vector float va1_1 = vptr_a1[i + 1]; - register __vector float va0x = vec_perm(va0, va0,swap_mask); - register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); - register __vector float va1x = vec_perm(va1, va1,swap_mask); - register __vector float va1x_1 = vec_perm(va1_1, va1_1,swap_mask); - vy_0 += va0*vx0_r + va1*vx1_r + va0x*vx0_i + va1x*vx1_i; - vy_1 += va0_1*vx0_r + va1_1*vx1_r + va0x_1*vx0_i + va1x_1*vx1_i; - - vy[i] = vy_0; - vy[i + 1] = vy_1; - } - -} - - - -static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { - - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - register __vector float vx0_r = {x[0], x[0],x[0], x[0]}; - register __vector float vx0_i = {-x[1], x[1],-x[1], x[1]}; -#else - register __vector float vx0_r = {x[0], -x[0],x[0], -x[0]}; - register __vector float vx0_i = {x[1], x[1],x[1], x[1]}; -#endif - register __vector float *vy = (__vector float *) y; - register __vector float *vptr_a0 = (__vector float *) ap; - BLASLONG i = 0; - for (;i< n / 2; i+=2) { - register __vector float vy_0 = vy[i]; - register __vector float vy_1 = vy[i + 1]; - register __vector float va0 = vptr_a0[i]; - register __vector float va0_1 = vptr_a0[i + 1]; - register __vector float va0x = vec_perm(va0, va0,swap_mask); - register __vector float va0x_1 = vec_perm(va0_1, va0_1,swap_mask); - vy_0 += va0*vx0_r + va0x*vx0_i; - vy_1 += va0_1*vx0_r + va0x_1*vx0_i; - - vy[i] = vy_0; - vy[i + 1] = vy_1; - } -} - - - - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - - - if (inc_dest != 2) { - FLOAT temp_r; - FLOAT temp_i; - for ( i=0; i -static const unsigned char swap_mask_arr[]={ 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; - -static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); - //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) - register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp2_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp2_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp3_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp3_r = {0.0, 0.0,0.0,0.0}; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* va2 = (__vector float*) a2; - __vector float* va3 = (__vector float*) a3; - __vector float* v_x = (__vector float*) x; - - for (i = 0; i < n / 2; i+=2) { - register __vector float vx_0 = v_x[i]; - register __vector float vx_1 = v_x[i+1]; - register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); - register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); - - vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ; - vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1]; - vtemp1_p += vx_0*va1[i] + vx_1*va1[i+1]; - vtemp1_r += vxr_0*va1[i] + vxr_1*va1[i+1]; - vtemp2_p += vx_0*va2[i] + vx_1*va2[i+1]; - vtemp2_r += vxr_0*va2[i] + vxr_1*va2[i+1]; - vtemp3_p += vx_0*va3[i] + vx_1*va3[i+1]; - vtemp3_r += vxr_0*va3[i] + vxr_1*va3[i+1]; - - } - -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; - - register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1] + vtemp2_p[2] - vtemp2_p[3]; - register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1] + vtemp2_r[2] + vtemp2_r[3]; - - register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1] + vtemp3_p[2] - vtemp3_p[3]; - register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1] + vtemp3_r[2] + vtemp3_r[3]; - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; - - register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1] + vtemp2_p[2] + vtemp2_p[3]; - register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1] + vtemp2_r[2] - vtemp2_r[3]; - - register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1] + vtemp3_p[2] + vtemp3_p[3]; - register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1] + vtemp3_r[2] - vtemp3_r[3]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; - y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; - y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; - -#else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; - y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; - y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; - y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; - -#endif - -} - - -static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); - //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) - register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp1_r = {0.0, 0.0,0.0,0.0}; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* v_x = (__vector float*) x; - - for (i = 0; i < n / 2; i+=2) { - register __vector float vx_0 = v_x[i]; - register __vector float vx_1 = v_x[i+1]; - register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); - register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); - - vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ; - vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1]; - vtemp1_p += vx_0*va1[i] + vx_1*va1[i+1]; - vtemp1_r += vxr_0*va1[i] + vxr_1*va1[i+1]; - - } - -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1] + vtemp1_p[2] - vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1] + vtemp1_r[2] + vtemp1_r[3]; - - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; - - register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1] + vtemp1_p[2] + vtemp1_p[3]; - register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1] + vtemp1_r[2] - vtemp1_r[3]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; - y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; - -#else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; - y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; - -#endif - -} - - -static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - __vector unsigned char swap_mask = *((__vector unsigned char*)swap_mask_arr); - //p for positive(real*real,image*image,real*real,image*image) r for image (real*image,image*real,real*image,image*real) - register __vector float vtemp0_p = {0.0, 0.0,0.0,0.0}; - register __vector float vtemp0_r = {0.0, 0.0,0.0,0.0}; - __vector float* va0 = (__vector float*) ap; - __vector float* v_x = (__vector float*) x; - - for (i = 0; i < n / 2; i+=2) { - register __vector float vx_0 = v_x[i]; - register __vector float vx_1 = v_x[i+1]; - register __vector float vxr_0 = vec_perm(vx_0, vx_0, swap_mask); - register __vector float vxr_1 = vec_perm(vx_1, vx_1, swap_mask); - - vtemp0_p += vx_0*va0[i] + vx_1*va0[i+1] ; - vtemp0_r += vxr_0*va0[i] + vxr_1*va0[i+1]; - - } - -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1] + vtemp0_p[2] - vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1] + vtemp0_r[2] + vtemp0_r[3]; - -#else - register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1] + vtemp0_p[2] + vtemp0_p[3]; - register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1] + vtemp0_r[2] - vtemp0_r[3]; - -#endif - -#if !defined(XCONJ) - - y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - -#else - - y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - -#endif - - -} - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest = *src; - *(dest + 1) = *(src + 1); - dest += 2; - src += inc_src; - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i; - BLASLONG j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - - FLOAT ybuffer[8], *xbuffer; - - if (m < 1) return (0); - if (n < 1) return (0); - - inc_x <<= 1; - inc_y <<= 1; - lda <<= 1; - - xbuffer = buffer; - - n1 = n >> 2; - n2 = n & 3; - - m3 = m & 3; - m1 = m - m3; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if (inc_x != 2) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - if (inc_y == 2) { - - for (i = 0; i < n1; i++) { - cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda << 2; - y_ptr += 8; - - } - - if (n2 & 2) { - cgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda << 1; - y_ptr += 4; - - } - - if (n2 & 1) { - cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); - a_ptr += lda; - y_ptr += 2; - - } - - } else { - - for (i = 0; i < n1; i++) { - memset(ybuffer, 0, sizeof (ybuffer)); - cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); - - a_ptr += lda << 2; - - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[2]; - y_ptr[1] += ybuffer[3]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[4]; - y_ptr[1] += ybuffer[5]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[6]; - y_ptr[1] += ybuffer[7]; - y_ptr += inc_y; - - } - - for (i = 0; i < n2; i++) { - memset(ybuffer, 0, sizeof (ybuffer)); - cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); - a_ptr += lda; - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - - } - - } - a += 2 * NB; - x += NB * inc_x; - } - - if (m3 == 0) return (0); - - x_ptr = x; - j = 0; - a_ptr = a; - y_ptr = y; - - if (m3 == 3) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x4 = x_ptr[0]; - FLOAT x5 = x_ptr[1]; - while (j < n) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return (0); - } - - if (m3 == 2) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - - while (j < (n & -2)) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } - - while (j < n) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j++; - } - - return (0); - } - - if (m3 == 1) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - - while (j < (n & -2)) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } - - while (j < n) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return (0); - } - - return (0); - -} - diff --git a/kernel/power/crot.c b/kernel/power/crot.c index 959a9eda0..40e350ba3 100644 --- a/kernel/power/crot.c +++ b/kernel/power/crot.c @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) static void crot_kernel_8 (long n, float *x, float *y, float c, float s) { diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c index 31e02fe5a..da97c896e 100644 --- a/kernel/power/cswap.c +++ b/kernel/power/cswap.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) #include "cswap_microk_power8.c" #endif diff --git a/kernel/power/ctrmm_kernel_8x4_power8.S b/kernel/power/ctrmm_kernel_8x4_power8.S index 822420dfd..26f49c663 100644 --- a/kernel/power/ctrmm_kernel_8x4_power8.S +++ b/kernel/power/ctrmm_kernel_8x4_power8.S @@ -98,7 +98,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -264,7 +264,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stfs f2, ALPHA_I_SP // stw r0, FZERO -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -285,7 +285,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c index d0e060977..73962c2f2 100644 --- a/kernel/power/dasum.c +++ b/kernel/power/dasum.c @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) #include "dasum_microk_power8.c" #endif diff --git a/kernel/power/daxpy.c b/kernel/power/daxpy.c index f09611ff0..df0572e8e 100644 --- a/kernel/power/daxpy.c +++ b/kernel/power/daxpy.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) #include "daxpy_microk_power8.c" #endif diff --git a/kernel/power/dcopy.c b/kernel/power/dcopy.c index 27b39144b..059c0e5a9 100644 --- a/kernel/power/dcopy.c +++ b/kernel/power/dcopy.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) #include "dcopy_microk_power8.c" #endif diff --git a/kernel/power/ddot.c b/kernel/power/ddot.c index f985df1c5..e43470e23 100644 --- a/kernel/power/ddot.c +++ b/kernel/power/ddot.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) #include "ddot_microk_power8.c" #endif diff --git a/kernel/power/dgemm_kernel_16x4_power8.S b/kernel/power/dgemm_kernel_16x4_power8.S index 651fd53fc..41958eab0 100644 --- a/kernel/power/dgemm_kernel_16x4_power8.S +++ b/kernel/power/dgemm_kernel_16x4_power8.S @@ -97,7 +97,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -271,7 +271,7 @@ li r11,0 slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/dgemm_kernel_power9.S b/kernel/power/dgemm_kernel_power9.S deleted file mode 100644 index 2fb1b27ef..000000000 --- a/kernel/power/dgemm_kernel_power9.S +++ /dev/null @@ -1,249 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - - -#define LOAD ld - - - - -#define STACKSIZE (512 ) -#define ALPHA_SP (296+192)(SP) -#define FZERO (304+192)(SP) - - - -#define M r3 -#define N r4 -#define K r5 - -#define A r7 -#define B r8 -#define C r9 -#define LDC r10 -#define OFFSET r6 - - - -#define alpha_r vs18 - -#define o0 0 - - -#define T4 r12 -#define T3 r11 -#define C4 r14 -#define o8 r15 -#define o24 r16 -#define C2 r17 -#define L r18 -#define T1 r19 -#define C3 r20 -#define TEMP_REG r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define o16 r27 -#define o32 r28 -#define o48 r29 - -#define PRE r30 -#define T2 r31 - -#include "dgemm_macros_power9.S" - - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - addi SP, SP, -STACKSIZE - li r0, 0 - - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - - - stfd f1, ALPHA_SP - stw r0, FZERO - - slwi LDC, LDC, BASE_SHIFT - -#if defined(TRMMKERNEL) - ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) -#endif - - - cmpwi cr0, M, 0 - ble .L999_H1 - cmpwi cr0, N, 0 - ble .L999_H1 - cmpwi cr0, K, 0 - ble .L999_H1 - - - - addi T1, SP, 296+192 - - - li PRE, 384 - li o8 , 8 - li o16, 16 - li o24, 24 - li o32, 32 - li o48, 48 - - - lxvdsx alpha_r, 0, T1 - -#include "dgemm_logic_power9.S" - -.L999: - addi r3, 0, 0 - - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - lxv vs52, 288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE - blr - - EPILOGUE -#endif diff --git a/kernel/power/dgemm_logic_power9.S b/kernel/power/dgemm_logic_power9.S deleted file mode 100644 index 251839d19..000000000 --- a/kernel/power/dgemm_logic_power9.S +++ /dev/null @@ -1,1981 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019 The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -#define MY_ALIGN .align 3 - -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - - srawi. J, N, 2 - ble LDGEMM_L4_END - -LDGEMM_L4_BEGIN: - - - li T1, 128 - li T2, 256 - - mr AO, A - mr CO, C - slwi T3, LDC , 2 - add C, C, T3 - - - dcbt A, T1 - dcbt A, T2 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 4 - ble LDGEMM_L4x16_END - - MY_ALIGN -LDGEMM_L4x16_BEGIN: - - li L, -128 - - - SAVE4x16_REGS - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,4 -#else - mr BO, B -#endif - - and T1, CO, L - and T2, C2, L - and T3, C3, L - and T4, C4, L - - dcbt T1, r0 - dcbt T2, r0 - dcbt T3, r0 - dcbt T4, r0 - - - addi T1, T1, 128 - addi T2, T2, 128 - addi T3, T3, 128 - addi T4, T4, 128 - - dcbt T1, r0 - dcbt T2, r0 - dcbt T3, r0 - dcbt T4, r0 - -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T3,K,TEMP_REG,16,4 - srawi. L, T3, 5 -#else - srawi. L, K, 5 -#endif - - ble LDGEMM_L4x16_SUB0 - - - MY_ALIGN -LDGEMM_L4x16_LOOP_START: - - li T2, 512 - - - LOAD4x16_1 - ##OffsetA=128 OffsetB=32 - addi AO,AO,2176 - # addi BO,BO,32 - addic. L, L, -1 - - ble LDGEMM_L4x16_LOOP_END - - - mtctr L - - MY_ALIGN - -LDGEMM_L4x16_LOOP: - - #dcbt AO, PRE - KERNEL4x16_I1_L2_2 -2048,32, 0,0 - KERNEL4x16_I1_L2_2 -2048,32, 1,0 - KERNEL4x16_I1_L2_2 -2048,32, 2,0 - KERNEL4x16_I1_L2_2 -2048,32, 3,0 - KERNEL4x16_I1_L2_2 -2048,32, 4,0 - KERNEL4x16_I1_L2_2 -2048,32, 5,0 - KERNEL4x16_I1_L2_2 -2048,32, 6,0 - KERNEL4x16_I1_L2_2 -2048,32, 7,0 - KERNEL4x16_I1_L2_2 -2048,32, 8,0 - KERNEL4x16_I1_L2_2 -2048,32, 9,0 - KERNEL4x16_I1_L2_2 -2048,32, 10,0 - KERNEL4x16_I1_L2_2 -2048,32, 11,0 - KERNEL4x16_I1_L2_2 -2048,32, 12,0 - KERNEL4x16_I1_L2_2 -2048,32, 13,0 - KERNEL4x16_I1_L2_2 -2048,32, 14,0 - KERNEL4x16_I1_L2_2 -2048,32, 15,1 - - - bdnz LDGEMM_L4x16_LOOP - - MY_ALIGN - MY_ALIGN -LDGEMM_L4x16_LOOP_END: - - KERNEL4x16_I1_L2_2 -2048,32, 0,0 - KERNEL4x16_I1_L2_2 -2048,32, 1,0 - KERNEL4x16_I1_L2_2 -2048,32, 2,0 - KERNEL4x16_I1_L2_2 -2048,32, 3,0 - KERNEL4x16_I1_L2_2 -2048,32, 4,0 - KERNEL4x16_I1_L2_2 -2048,32, 5,0 - KERNEL4x16_I1_L2_2 -2048,32, 6,0 - KERNEL4x16_I1_L2_2 -2048,32, 7,0 - KERNEL4x16_I1_L2_2 -2048,32, 8,0 - KERNEL4x16_I1_L2_2 -2048,32, 9,0 - KERNEL4x16_I1_L2_2 -2048,32, 10,0 - KERNEL4x16_I1_L2_2 -2048,32, 11,0 - KERNEL4x16_I1_L2_2 -2048,32, 12,0 - KERNEL4x16_I1_L2_2 -2048,32, 13,0 - KERNEL4x16_I1_L2_2 -2048,32, 14,0 - KERNEL4x16_I1_L2_3 -2048,32, 15,1 - b LDGEMM_L4x16_SUB1 - - - MY_ALIGN -LDGEMM_L4x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 31 -#else - andi. L, K, 31 -#endif - KERNEL4x16 1 - - addic. L, L, -1 - ble LDGEMM_L4x16_SAVE - b LDGEMM_L4x16_SUB2 - MY_ALIGN -LDGEMM_L4x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 31 -#else - andi. L, K, 31 -#endif - ble LDGEMM_L4x16_SAVE - MY_ALIGN -LDGEMM_L4x16_SUB2: - - andi. T1,L, 16 - ble LDGEMM_L4x16_SUB2_8 - LOAD4x16_0 - KERNEL4x16_I1_L2_2 128,32, 0,0 - KERNEL4x16_I1_L2_2 128,32, 1,0 - KERNEL4x16_I1_L2_2 128,32, 2,0 - KERNEL4x16_I1_L2_2 128,32, 3,0 - KERNEL4x16_I1_L2_2 128,32, 4,0 - KERNEL4x16_I1_L2_2 128,32, 5,0 - KERNEL4x16_I1_L2_2 128,32, 6,0 - KERNEL4x16_I1_L2_3 128,32, 7,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_8: - andi. T1,L, 8 - ble LDGEMM_L4x16_SUB2_4 - LOAD4x16_0 - KERNEL4x16_I1_L2_2 128,32, 0,0 - KERNEL4x16_I1_L2_2 128,32, 1,0 - KERNEL4x16_I1_L2_2 128,32, 2,0 - KERNEL4x16_I1_L2_3 128,32, 3,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_4: - andi. T1,L, 4 - ble LDGEMM_L4x16_SUB2_2 - LOAD4x16_0 - KERNEL4x16_I1_L2_2 128,32, 0,0 - KERNEL4x16_I1_L2_3 128,32, 1,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_2: - andi. T1,L, 2 - ble LDGEMM_L4x16_SUB2_1 - LOAD4x16_0 - KERNEL4x16_I1_L2_3 128,32, 0,1 - MY_ALIGN -LDGEMM_L4x16_SUB2_1: - andi. T1,L, 1 - ble LDGEMM_L4x16_SAVE - KERNEL4x16 0 -# addic. L, L, -1 -# bgt LDGEMM_L4x16_SUB2 - - MY_ALIGN -LDGEMM_L4x16_SAVE: - SAVE4x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,4 -#endif - addic. I, I, -1 - bgt+ LDGEMM_L4x16_BEGIN - -LDGEMM_L4x16_END: - -LDGEMM_L4x8_BEGIN: - - andi. T2, M, 15 - ble LDGEMM_L4x1_END - - andi. T1, M, 8 - ble LDGEMM_L4x8_END - - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,8,4 - srawi. L, T3, 4 -#else - mr BO, B - srawi. L, K, 4 -#endif - - - ble LDGEMM_L4x8_SUB0 - -LDGEMM_L4x8_LOOP_START: - - - LOAD4x8_1 - ##OffsetA=64 OffsetB=32 - - - addic. L, L, -1 - - ble LDGEMM_L4x8_LOOP_END - - mtctr L - MY_ALIGN - -LDGEMM_L4x8_LOOP: - - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_2 64,32, 1,0 - KERNEL4x8_I1_L2_2 64,32, 2,0 - KERNEL4x8_I1_L2_2 64,32, 3,0 - KERNEL4x8_I1_L2_2 64,32, 4,0 - KERNEL4x8_I1_L2_2 64,32, 5,0 - KERNEL4x8_I1_L2_2 64,32, 6,0 - KERNEL4x8_I1_L2_2 64,32, 7,1 - - bdnz LDGEMM_L4x8_LOOP - MY_ALIGN -LDGEMM_L4x8_LOOP_END: - - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_2 64,32, 1,0 - KERNEL4x8_I1_L2_2 64,32, 2,0 - KERNEL4x8_I1_L2_2 64,32, 3,0 - KERNEL4x8_I1_L2_2 64,32, 4,0 - KERNEL4x8_I1_L2_2 64,32, 5,0 - KERNEL4x8_I1_L2_2 64,32, 6,0 - KERNEL4x8_I1_L2_3 64,32, 7,1 - - b LDGEMM_L4x8_SUB1 - MY_ALIGN -LDGEMM_L4x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 15 -#else - andi. L, K, 15 -#endif - KERNEL4x8 1 - - addic. L, L, -1 - ble LDGEMM_L4x8_SAVE - b LDGEMM_L4x8_SUB2 - MY_ALIGN -LDGEMM_L4x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 15 -#else - andi. L, K, 15 -#endif - ble LDGEMM_L4x8_SAVE - MY_ALIGN -LDGEMM_L4x8_SUB2: - - andi. T1,L, 8 - ble LDGEMM_L4x8_SUB2_4 - LOAD4x8_0 - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_2 64,32, 1,0 - KERNEL4x8_I1_L2_2 64,32, 2,0 - KERNEL4x8_I1_L2_3 64,32, 3,1 - MY_ALIGN -LDGEMM_L4x8_SUB2_4: - andi. T1,L, 4 - ble LDGEMM_L4x8_SUB2_2 - LOAD4x8_0 - KERNEL4x8_I1_L2_2 64,32, 0,0 - KERNEL4x8_I1_L2_3 64,32, 1,1 - MY_ALIGN -LDGEMM_L4x8_SUB2_2: - andi. T1,L, 2 - ble LDGEMM_L4x8_SUB2_1 - LOAD4x8_0 - KERNEL4x8_I1_L2_3 64,32, 0,1 - MY_ALIGN -LDGEMM_L4x8_SUB2_1: - andi. T1,L, 1 - ble LDGEMM_L4x8_SAVE - KERNEL4x8 0 - - MY_ALIGN -LDGEMM_L4x8_SAVE: - SAVE4x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,4 -#endif -LDGEMM_L4x8_END: - -LDGEMM_L4x4_BEGIN: - - - andi. T1, M, 4 - ble LDGEMM_L4x4_END - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,4,4 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L4x4_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L4x4_SUB4 - -LDGEMM_L4x4_LOOP_START: - - #dcbt AO, PRE - LOAD4x4_1 - KERNEL4x4_I1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - addic. L, L, -2 - ble LDGEMM_L4x4_LOOP_END - - MY_ALIGN - -LDGEMM_L4x4_LOOP: - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - #dcbt AO, PRE - KERNEL4x4_2 - - addic. L, L, -1 - bgt LDGEMM_L4x4_LOOP - -LDGEMM_L4x4_LOOP_END: - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - KERNEL4x4_2 - - KERNEL4x4_1 - KERNEL4x4_2 - KERNEL4x4_1 - KERNEL4x4_E2 - - b LDGEMM_L4x4_SUB1 - -LDGEMM_L4x4_SUB4: - - KERNEL4x4_SUBI1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - KERNEL4x4_SUB1 - - b LDGEMM_L4x4_SUB1 - -LDGEMM_L4x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL4x4_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L4x4_SAVE - b LDGEMM_L4x4_SUB2 - -LDGEMM_L4x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L4x4_SAVE - -LDGEMM_L4x4_SUB2: - - KERNEL4x4_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L4x4_SUB2 - -LDGEMM_L4x4_SAVE: - - SAVE4x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,4 -#endif -LDGEMM_L4x4_END: - -LDGEMM_L4x2_BEGIN: - - - andi. T1, M, 2 - ble LDGEMM_L4x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,2,4 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L4x2_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L4x2_SUB4 - -LDGEMM_L4x2_LOOP_START: - - LOAD4x2_1 - KERNEL4x2_I1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - addic. L, L, -2 - ble LDGEMM_L4x2_LOOP_END - - MY_ALIGN - -LDGEMM_L4x2_LOOP: - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - addic. L, L, -1 - bgt LDGEMM_L4x2_LOOP - -LDGEMM_L4x2_LOOP_END: - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_2 - - KERNEL4x2_1 - KERNEL4x2_2 - KERNEL4x2_1 - KERNEL4x2_E2 - - b LDGEMM_L4x2_SUB1 - -LDGEMM_L4x2_SUB4: - - KERNEL4x2_SUBI1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - KERNEL4x2_SUB1 - - b LDGEMM_L4x2_SUB1 - -LDGEMM_L4x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL4x2_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L4x2_SAVE - b LDGEMM_L4x2_SUB2 - -LDGEMM_L4x2_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L4x2_SAVE - -LDGEMM_L4x2_SUB2: - - KERNEL4x2_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L4x2_SUB2 - -LDGEMM_L4x2_SAVE: - - SAVE4x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,4 -#endif -LDGEMM_L4x2_END: - -LDGEMM_L4x1_BEGIN: - - - andi. T1, M, 1 - ble LDGEMM_L4x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,4 - REFRESH_TEMP_BK T3,K,TEMP_REG,1,4 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L4x1_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L4x1_SUB4 - -LDGEMM_L4x1_LOOP_START: - - LOAD4x1_1 - KERNEL4x1_I1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - addic. L, L, -2 - ble LDGEMM_L4x1_LOOP_END - - MY_ALIGN - -LDGEMM_L4x1_LOOP: - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - addic. L, L, -1 - bgt LDGEMM_L4x1_LOOP - -LDGEMM_L4x1_LOOP_END: - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_2 - - KERNEL4x1_1 - KERNEL4x1_2 - KERNEL4x1_1 - KERNEL4x1_E2 - - b LDGEMM_L4x1_SUB1 - -LDGEMM_L4x1_SUB4: - - KERNEL4x1_SUBI1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - KERNEL4x1_SUB1 - - b LDGEMM_L4x1_SUB1 - -LDGEMM_L4x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL4x1_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L4x1_SAVE - b LDGEMM_L4x1_SUB2 - -LDGEMM_L4x1_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L4x1_SAVE - -LDGEMM_L4x1_SUB2: - - KERNEL4x1_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L4x1_SUB2 - -LDGEMM_L4x1_SAVE: - - SAVE4x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,4 -#endif -LDGEMM_L4x1_END: - - slwi T1, K, 5 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 4 -#endif - addic. J, J, -1 - bgt LDGEMM_L4_BEGIN - - andi. T2, N, 3 - ble .L999 - -LDGEMM_L4_END: - - b LDGEMM_L2_BEGIN - -.L999_H1: - - b .L999 - -LDGEMM_L2_BEGIN: - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - andi. T1, N, 2 - ble LDGEMM_L2_END - mr CO, C - mr AO, A - slwi T1, LDC , 1 - add C, C, T1 - srawi. I, M, 4 - ble LDGEMM_L2x16_END - -LDGEMM_L2x16_BEGIN: - - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,16,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L2x16_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x16_SUB4 - -LDGEMM_L2x16_LOOP_START: - - #dcbt AO, PRE - LOAD2x16_1 - #dcbt AO, PRE - KERNEL2x16_I1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - addic. L, L, -2 - ble LDGEMM_L2x16_LOOP_END - - MY_ALIGN - -LDGEMM_L2x16_LOOP: - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - addic. L, L, -1 - bgt LDGEMM_L2x16_LOOP - -LDGEMM_L2x16_LOOP_END: - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - - #dcbt AO, PRE - KERNEL2x16_1 - #dcbt AO, PRE - KERNEL2x16_2 - #dcbt AO, PRE - KERNEL2x16_1 - KERNEL2x16_E2 - - b LDGEMM_L2x16_SUB1 - -LDGEMM_L2x16_SUB4: - - #dcbt AO, PRE - KERNEL2x16_SUBI1 - #dcbt AO, PRE - KERNEL2x16_SUB1 - #dcbt AO, PRE - KERNEL2x16_SUB1 - #dcbt AO, PRE - KERNEL2x16_SUB1 - - KERNEL2x16_SUB1 - KERNEL2x16_SUB1 - KERNEL2x16_SUB1 - KERNEL2x16_SUB1 - - b LDGEMM_L2x16_SUB1 - -LDGEMM_L2x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x16_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x16_SAVE - b LDGEMM_L2x16_SUB2 - -LDGEMM_L2x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x16_SAVE - -LDGEMM_L2x16_SUB2: - - KERNEL2x16_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x16_SUB2 - -LDGEMM_L2x16_SAVE: - - SAVE2x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,2 -#endif - addic. I, I, -1 - bgt LDGEMM_L2x16_BEGIN - -LDGEMM_L2x16_END: - -LDGEMM_L2x8_BEGIN: - - andi. T2, M, 15 - ble LDGEMM_L2x1_END - - andi. T1, M, 8 - ble LDGEMM_L2x8_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,8,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L2x8_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x8_SUB4 - -LDGEMM_L2x8_LOOP_START: - - #dcbt AO, PRE - LOAD2x8_1 - KERNEL2x8_I1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - addic. L, L, -2 - ble LDGEMM_L2x8_LOOP_END - - MY_ALIGN - -LDGEMM_L2x8_LOOP: - - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - KERNEL2x8_1 - #dcbt AO, PRE - KERNEL2x8_2 - - addic. L, L, -1 - bgt LDGEMM_L2x8_LOOP - -LDGEMM_L2x8_LOOP_END: - - KERNEL2x8_1 - KERNEL2x8_2 - KERNEL2x8_1 - KERNEL2x8_2 - - KERNEL2x8_1 - KERNEL2x8_2 - KERNEL2x8_1 - KERNEL2x8_E2 - - b LDGEMM_L2x8_SUB1 - -LDGEMM_L2x8_SUB4: - - KERNEL2x8_SUBI1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - KERNEL2x8_SUB1 - - b LDGEMM_L2x8_SUB1 - -LDGEMM_L2x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x8_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x8_SAVE - b LDGEMM_L2x8_SUB2 - -LDGEMM_L2x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x8_SAVE - -LDGEMM_L2x8_SUB2: - - KERNEL2x8_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x8_SUB2 - -LDGEMM_L2x8_SAVE: - - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,2 -#endif -LDGEMM_L2x8_END: - -LDGEMM_L2x4_BEGIN: - - - andi. T1, M, 4 - ble LDGEMM_L2x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,4,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L2x4_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x4_SUB4 - -LDGEMM_L2x4_LOOP_START: - - LOAD2x4_1 - KERNEL2x4_I1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - addic. L, L, -2 - ble LDGEMM_L2x4_LOOP_END - - MY_ALIGN - -LDGEMM_L2x4_LOOP: - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - addic. L, L, -1 - bgt LDGEMM_L2x4_LOOP - -LDGEMM_L2x4_LOOP_END: - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_2 - - KERNEL2x4_1 - KERNEL2x4_2 - KERNEL2x4_1 - KERNEL2x4_E2 - - b LDGEMM_L2x4_SUB1 - -LDGEMM_L2x4_SUB4: - - KERNEL2x4_SUBI1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - KERNEL2x4_SUB1 - - b LDGEMM_L2x4_SUB1 - -LDGEMM_L2x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x4_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x4_SAVE - b LDGEMM_L2x4_SUB2 - -LDGEMM_L2x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x4_SAVE - -LDGEMM_L2x4_SUB2: - - KERNEL2x4_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x4_SUB2 - -LDGEMM_L2x4_SAVE: - - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,2 -#endif -LDGEMM_L2x4_END: - -LDGEMM_L2x2_BEGIN: - - - andi. T1, M, 2 - ble LDGEMM_L2x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,2,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L2x2_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x2_SUB4 - -LDGEMM_L2x2_LOOP_START: - - LOAD2x2_1 - KERNEL2x2_I1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - addic. L, L, -2 - ble LDGEMM_L2x2_LOOP_END - - MY_ALIGN - -LDGEMM_L2x2_LOOP: - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - addic. L, L, -1 - bgt LDGEMM_L2x2_LOOP - -LDGEMM_L2x2_LOOP_END: - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_2 - - KERNEL2x2_1 - KERNEL2x2_2 - KERNEL2x2_1 - KERNEL2x2_E2 - - b LDGEMM_L2x2_SUB1 - -LDGEMM_L2x2_SUB4: - - KERNEL2x2_SUBI1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - KERNEL2x2_SUB1 - - b LDGEMM_L2x2_SUB1 - -LDGEMM_L2x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x2_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x2_SAVE - b LDGEMM_L2x2_SUB2 - -LDGEMM_L2x2_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x2_SAVE - -LDGEMM_L2x2_SUB2: - - KERNEL2x2_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x2_SUB2 - -LDGEMM_L2x2_SAVE: - - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,2 -#endif -LDGEMM_L2x2_END: - -LDGEMM_L2x1_BEGIN: - - - andi. T1, M, 1 - ble LDGEMM_L2x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 - REFRESH_TEMP_BK T3,K,TEMP_REG,1,2 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L2x1_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L2x1_SUB4 - -LDGEMM_L2x1_LOOP_START: - - LOAD2x1_1 - KERNEL2x1_I1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - addic. L, L, -2 - ble LDGEMM_L2x1_LOOP_END - - MY_ALIGN - -LDGEMM_L2x1_LOOP: - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - addic. L, L, -1 - bgt LDGEMM_L2x1_LOOP - -LDGEMM_L2x1_LOOP_END: - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_2 - - KERNEL2x1_1 - KERNEL2x1_2 - KERNEL2x1_1 - KERNEL2x1_E2 - - b LDGEMM_L2x1_SUB1 - -LDGEMM_L2x1_SUB4: - - KERNEL2x1_SUBI1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - KERNEL2x1_SUB1 - - b LDGEMM_L2x1_SUB1 - -LDGEMM_L2x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL2x1_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L2x1_SAVE - b LDGEMM_L2x1_SUB2 - -LDGEMM_L2x1_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L2x1_SAVE - -LDGEMM_L2x1_SUB2: - - KERNEL2x1_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L2x1_SUB2 - -LDGEMM_L2x1_SAVE: - - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,2 -#endif -LDGEMM_L2x1_END: - - slwi T1, K, 4 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif -LDGEMM_L2_END: -LDGEMM_L1_BEGIN: - -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - andi. T1, N, 1 - ble LDGEMM_L1_END - mr CO, C - mr AO, A - srawi. I, M, 4 - ble LDGEMM_L1x16_END - -LDGEMM_L1x16_BEGIN: - - -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,16,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,16,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x16_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x16_SUB4 - -LDGEMM_L1x16_LOOP_START: - - #dcbt AO, PRE - LOAD1x16_1 - #dcbt AO, PRE - KERNEL1x16_I1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - addic. L, L, -2 - ble LDGEMM_L1x16_LOOP_END - - MY_ALIGN - -LDGEMM_L1x16_LOOP: - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - addic. L, L, -1 - bgt LDGEMM_L1x16_LOOP - -LDGEMM_L1x16_LOOP_END: - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - - #dcbt AO, PRE - KERNEL1x16_1 - #dcbt AO, PRE - KERNEL1x16_2 - #dcbt AO, PRE - KERNEL1x16_1 - KERNEL1x16_E2 - - b LDGEMM_L1x16_SUB1 - -LDGEMM_L1x16_SUB4: - - #dcbt AO, PRE - KERNEL1x16_SUBI1 - #dcbt AO, PRE - KERNEL1x16_SUB1 - #dcbt AO, PRE - KERNEL1x16_SUB1 - #dcbt AO, PRE - KERNEL1x16_SUB1 - - KERNEL1x16_SUB1 - KERNEL1x16_SUB1 - KERNEL1x16_SUB1 - KERNEL1x16_SUB1 - - b LDGEMM_L1x16_SUB1 - -LDGEMM_L1x16_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x16_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x16_SAVE - b LDGEMM_L1x16_SUB2 - -LDGEMM_L1x16_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x16_SAVE - -LDGEMM_L1x16_SUB2: - - KERNEL1x16_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x16_SUB2 - -LDGEMM_L1x16_SAVE: - - SAVE1x16 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,16,1 -#endif - addic. I, I, -1 - bgt LDGEMM_L1x16_BEGIN - -LDGEMM_L1x16_END: - -LDGEMM_L1x8_BEGIN: - - andi. T2, M, 15 - ble LDGEMM_L1x1_END - - andi. T1, M, 8 - ble LDGEMM_L1x8_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,8,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x8_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x8_SUB4 - -LDGEMM_L1x8_LOOP_START: - - #dcbt AO, PRE - LOAD1x8_1 - KERNEL1x8_I1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - addic. L, L, -2 - ble LDGEMM_L1x8_LOOP_END - - MY_ALIGN - -LDGEMM_L1x8_LOOP: - - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - KERNEL1x8_1 - #dcbt AO, PRE - KERNEL1x8_2 - - addic. L, L, -1 - bgt LDGEMM_L1x8_LOOP - -LDGEMM_L1x8_LOOP_END: - - KERNEL1x8_1 - KERNEL1x8_2 - KERNEL1x8_1 - KERNEL1x8_2 - - KERNEL1x8_1 - KERNEL1x8_2 - KERNEL1x8_1 - KERNEL1x8_E2 - - b LDGEMM_L1x8_SUB1 - -LDGEMM_L1x8_SUB4: - - KERNEL1x8_SUBI1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - KERNEL1x8_SUB1 - - b LDGEMM_L1x8_SUB1 - -LDGEMM_L1x8_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x8_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x8_SAVE - b LDGEMM_L1x8_SUB2 - -LDGEMM_L1x8_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x8_SAVE - -LDGEMM_L1x8_SUB2: - - KERNEL1x8_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x8_SUB2 - -LDGEMM_L1x8_SAVE: - - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,8,1 -#endif -LDGEMM_L1x8_END: - -LDGEMM_L1x4_BEGIN: - - - andi. T1, M, 4 - ble LDGEMM_L1x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,4,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x4_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x4_SUB4 - -LDGEMM_L1x4_LOOP_START: - - LOAD1x4_1 - KERNEL1x4_I1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - addic. L, L, -2 - ble LDGEMM_L1x4_LOOP_END - - MY_ALIGN - -LDGEMM_L1x4_LOOP: - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - addic. L, L, -1 - bgt LDGEMM_L1x4_LOOP - -LDGEMM_L1x4_LOOP_END: - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_2 - - KERNEL1x4_1 - KERNEL1x4_2 - KERNEL1x4_1 - KERNEL1x4_E2 - - b LDGEMM_L1x4_SUB1 - -LDGEMM_L1x4_SUB4: - - KERNEL1x4_SUBI1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - KERNEL1x4_SUB1 - - b LDGEMM_L1x4_SUB1 - -LDGEMM_L1x4_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x4_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x4_SAVE - b LDGEMM_L1x4_SUB2 - -LDGEMM_L1x4_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x4_SAVE - -LDGEMM_L1x4_SUB2: - - KERNEL1x4_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x4_SUB2 - -LDGEMM_L1x4_SAVE: - - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,4,1 -#endif -LDGEMM_L1x4_END: - -LDGEMM_L1x2_BEGIN: - - - andi. T1, M, 2 - ble LDGEMM_L1x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,2,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x2_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x2_SUB4 - -LDGEMM_L1x2_LOOP_START: - - LOAD1x2_1 - KERNEL1x2_I1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - addic. L, L, -2 - ble LDGEMM_L1x2_LOOP_END - - MY_ALIGN - -LDGEMM_L1x2_LOOP: - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - addic. L, L, -1 - bgt LDGEMM_L1x2_LOOP - -LDGEMM_L1x2_LOOP_END: - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_2 - - KERNEL1x2_1 - KERNEL1x2_2 - KERNEL1x2_1 - KERNEL1x2_E2 - - b LDGEMM_L1x2_SUB1 - -LDGEMM_L1x2_SUB4: - - KERNEL1x2_SUBI1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - KERNEL1x2_SUB1 - - b LDGEMM_L1x2_SUB1 - -LDGEMM_L1x2_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x2_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x2_SAVE - b LDGEMM_L1x2_SUB2 - -LDGEMM_L1x2_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x2_SAVE - -LDGEMM_L1x2_SUB2: - - KERNEL1x2_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x2_SUB2 - -LDGEMM_L1x2_SAVE: - - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,2,1 -#endif -LDGEMM_L1x2_END: - -LDGEMM_L1x1_BEGIN: - - - andi. T1, M, 1 - ble LDGEMM_L1x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 - REFRESH_TEMP_BK T3,K,TEMP_REG,1,1 - srawi. L, T3, 3 -#else - mr BO, B - srawi. L, K, 3 -#endif - ble LDGEMM_L1x1_SUB0 - cmpwi cr0, L, 1 - ble LDGEMM_L1x1_SUB4 - -LDGEMM_L1x1_LOOP_START: - - LOAD1x1_1 - KERNEL1x1_I1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - addic. L, L, -2 - ble LDGEMM_L1x1_LOOP_END - - MY_ALIGN - -LDGEMM_L1x1_LOOP: - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - addic. L, L, -1 - bgt LDGEMM_L1x1_LOOP - -LDGEMM_L1x1_LOOP_END: - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_2 - - KERNEL1x1_1 - KERNEL1x1_2 - KERNEL1x1_1 - KERNEL1x1_E2 - - b LDGEMM_L1x1_SUB1 - -LDGEMM_L1x1_SUB4: - - KERNEL1x1_SUBI1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - KERNEL1x1_SUB1 - - b LDGEMM_L1x1_SUB1 - -LDGEMM_L1x1_SUB0: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - - KERNEL1x1_SUBI1 - - addic. L, L, -1 - ble LDGEMM_L1x1_SAVE - b LDGEMM_L1x1_SUB2 - -LDGEMM_L1x1_SUB1: -#if defined(TRMMKERNEL) - andi. L, T3, 7 -#else - andi. L, K, 7 -#endif - ble LDGEMM_L1x1_SAVE - -LDGEMM_L1x1_SUB2: - - KERNEL1x1_SUB1 - - addic. L, L, -1 - bgt LDGEMM_L1x1_SUB2 - -LDGEMM_L1x1_SAVE: - - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T3,K,TEMP_REG,BO,AO,1,1 -#endif -LDGEMM_L1x1_END: -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif -LDGEMM_L1_END: diff --git a/kernel/power/dgemm_macros_power9.S b/kernel/power/dgemm_macros_power9.S deleted file mode 100644 index c4b8270b8..000000000 --- a/kernel/power/dgemm_macros_power9.S +++ /dev/null @@ -1,3623 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -/************************************************************************************** -* Abdelrauf(quickwritereader@googlemail.com) -* BLASTEST : OK -* CTEST : OK -* TEST : OK -* LAPACK-TEST : OK -**************************************************************************************/ - -/********************************************************************* -* Macros for N=4, M=16 * -*********************************************************************/ -.macro LOAD4x16_1 - LOAD4x16 1 -.endm - -.macro LOAD4x16_0 - LOAD4x16 0 -.endm -.macro LOAD4x16 Zero - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - - lxv vs4, 64(AO) - lxv vs5, 80(AO) - lxv vs6, 96(AO) - lxv vs7, 112(AO) -.if \Zero==1 - xxlxor vs32,vs32,vs32 - xxlxor vs33,vs33,vs33 - xxlxor vs34,vs34,vs34 - xxlxor vs35,vs35,vs35 - xxlxor vs36,vs36,vs36 - xxlxor vs37,vs37,vs37 - xxlxor vs38,vs38,vs38 - xxlxor vs39,vs39,vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endif -.endm - - -#define unit_size 8 -#define DISP32(ind,disp) (ind*unit_size*32+disp) -#define DISP16(ind,disp) (ind*unit_size*16+disp) -#define DISP8(ind,disp) (ind*unit_size*8+disp) -#define DISP4(ind,disp) (ind*unit_size*4+disp) -#define DISP2(ind,disp) (ind*unit_size*2+disp) -#define DISP1(ind,disp) (ind*unit_size+disp) - -.macro KERNEL4x16_L1_L2 Index,IsLast - KERNEL4x16_L1_L2_I AO,BO, 0,0,0, \Index,\IsLast,0 -.endm - - - -.macro KERNEL4x16_I1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I AO,BO,1,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L2_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I AO,BO, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x16_I2_L2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I \AREG,\BREG,1,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I2_L2_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x16_I2_L2_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast - KERNEL4x16_L1_L2_I \AREG,\BREG, 0,\OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x16_L1_L2_I AREG,BREG, First, OffsetA,OffsetB, Index,IsLast ,Complete - -.if \First ==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 -.endif - lxv vs8, DISP32(\Index,0+\OffsetA)(\AREG) - lxv vs9, DISP32(\Index,16+\OffsetA)(\AREG) - lxv vs10, DISP32(\Index,32+\OffsetA)(\AREG) - lxv vs11, DISP32(\Index,48+\OffsetA)(\AREG) -.if \First ==1 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 -.else - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 -.endif - lxv vs28, DISP8(\Index,0 +\OffsetB)(\BREG) - lxv vs30, DISP8(\Index,16 +\OffsetB)(\BREG) - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs31, vs30, vs30,2 -.if \First ==1 - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - - - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - - -.else - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - -.endif - lxv vs12, DISP32(\Index,64+\OffsetA)(\AREG) - lxv vs13, DISP32(\Index,80+\OffsetA)(\AREG) -.if \First ==1 - xvmuldp vs52, vs4, vs26 - xvmuldp vs53, vs5, vs26 - xvmuldp vs54, vs6, vs26 - xvmuldp vs55, vs7, vs26 - -.else - xvmaddadp vs52, vs4, vs26 - xvmaddadp vs53, vs5, vs26 - xvmaddadp vs54, vs6, vs26 - xvmaddadp vs55, vs7, vs26 -.endif - lxv vs14, DISP32(\Index,96+\OffsetA)(\AREG) - lxv vs15, DISP32(\Index,112+\OffsetA)(\AREG) -.if \First ==1 - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - - - - xvmuldp vs60, vs4, vs27 - xvmuldp vs61, vs5, vs27 - xvmuldp vs62, vs6, vs27 - xvmuldp vs63, vs7, vs27 - -.else - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - - - - xvmaddadp vs60, vs4, vs27 - xvmaddadp vs61, vs5, vs27 - xvmaddadp vs62, vs6, vs27 - xvmaddadp vs63, vs7, vs27 -.endif - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 -.if \Complete==0 - lxv vs0, DISP32(\Index,128+\OffsetA)(\AREG) - lxv vs1, DISP32(\Index,144+\OffsetA)(\AREG) -.endif - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 -.if \Complete==0 - lxv vs24, DISP8(\Index,32 +\OffsetB)(\BREG) - lxv vs26, DISP8(\Index,48 +\OffsetB)(\BREG) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endif - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 -.if \Complete==0 - lxv vs2, DISP32(\Index,160+\OffsetA)(\AREG) - lxv vs3, DISP32(\Index,176+\OffsetA)(\AREG) -.endif - xvmaddadp vs44, vs12, vs29 - xvmaddadp vs45, vs13, vs29 - xvmaddadp vs46, vs14, vs29 - xvmaddadp vs47, vs15, vs29 - - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - xvmaddadp vs50, vs10, vs30 - xvmaddadp vs51, vs11, vs30 -.if \Complete==0 - lxv vs4, DISP32(\Index,192+\OffsetA)(\AREG) - lxv vs5, DISP32(\Index,208+\OffsetA)(\AREG) -.endif - xvmaddadp vs52, vs12, vs30 - xvmaddadp vs53, vs13, vs30 - xvmaddadp vs54, vs14, vs30 - xvmaddadp vs55, vs15, vs30 -.if \Complete==0 - lxv vs6, DISP32(\Index,224+\OffsetA)(\AREG) - lxv vs7, DISP32(\Index,240+\OffsetA)(\AREG) -.endif - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - xvmaddadp vs58, vs10, vs31 - xvmaddadp vs59, vs11, vs31 - - - xvmaddadp vs60, vs12, vs31 - - xvmaddadp vs61, vs13, vs31 - xvmaddadp vs62, vs14, vs31 - - xvmaddadp vs63, vs15, vs31 - .if \IsLast==1 - .if \Complete==1 - addi \AREG, \AREG, DISP32(\Index,128+\OffsetA) - addi \BREG, \BREG, DISP8(\Index,32+\OffsetB) - .else - addi \AREG, \AREG, DISP32(\Index,256) - addi \BREG, \BREG, DISP8(\Index,64) - .endif - .endif - - -.endm - - - -.macro KERNEL4x16 First - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - lxv vs4, 64(AO) - lxv vs5, 80(AO) - lxv vs6, 96(AO) - lxv vs7, 112(AO) - - - - addi BO, BO, 32 - addi AO, AO, 128 - -.if \First==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - xvmuldp vs52, vs4, vs26 - xvmuldp vs53, vs5, vs26 - xvmuldp vs54, vs6, vs26 - xvmuldp vs55, vs7, vs26 - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - xvmuldp vs60, vs4, vs27 - xvmuldp vs61, vs5, vs27 - xvmuldp vs62, vs6, vs27 - xvmuldp vs63, vs7, vs27 -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - - xvmaddadp vs52, vs4, vs26 - xvmaddadp vs53, vs5, vs26 - xvmaddadp vs54, vs6, vs26 - xvmaddadp vs55, vs7, vs26 - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - xvmaddadp vs60, vs4, vs27 - xvmaddadp vs61, vs5, vs27 - xvmaddadp vs62, vs6, vs27 - xvmaddadp vs63, vs7, vs27 - -.endif -.endm - -.macro SAVE4x16_REGS - add C2, CO, LDC - add C3, C2, LDC - add C4, C3, LDC -.endm - -.macro SAVE4x16 -#ifndef TRMMKERNEL - lxv vs0, 0(CO) - lxv vs2, 16(CO) - lxv vs4, 32(CO) - lxv vs6, 48(CO) -#endif - xxpermdi vs8, vs40,vs32,1 - xxpermdi vs9 ,vs32,vs40,1 -#ifndef TRMMKERNEL - lxv vs24, 64(CO) - lxv vs26, 80(CO) - lxv vs28, 96(CO) - lxv vs30, 112(CO) -#endif - xxpermdi vs10, vs41,vs33,1 - xxpermdi vs11 ,vs33,vs41,1 -#ifndef TRMMKERNEL - lxv vs1, 0(C2) - lxv vs3, 16(C2) - lxv vs5, 32(C2) - lxv vs7, 48(C2) -#endif - xxpermdi vs12, vs42,vs34,1 - xxpermdi vs13 ,vs34,vs42,1 -#ifndef TRMMKERNEL - lxv vs25, 64(C2) - lxv vs27, 80(C2) -#endif - xxpermdi vs14, vs43,vs35,1 - xxpermdi vs15 ,vs35,vs43,1 -#ifndef TRMMKERNEL - lxv vs29, 96(C2) - lxv vs31, 112(C2) -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r - xvmuldp vs3, vs11, alpha_r - -#endif - xxpermdi vs8, vs44,vs36,1 - xxpermdi vs9 ,vs36,vs44,1 - xxpermdi vs10, vs45,vs37,1 - xxpermdi vs11 ,vs37,vs45,1 -#ifndef TRMMKERNEL - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r -#endif - xxpermdi vs12, vs46,vs38,1 - xxpermdi vs13 ,vs38,vs46,1 - xxpermdi vs14, vs47,vs39,1 - xxpermdi vs15 ,vs39,vs47,1 - -#ifndef TRMMKERNEL - xvmaddadp vs24, vs8, alpha_r - xvmaddadp vs25, vs9, alpha_r - xvmaddadp vs26, vs10, alpha_r - xvmaddadp vs27, vs11, alpha_r - - xvmaddadp vs28, vs12, alpha_r - xvmaddadp vs29, vs13, alpha_r - xvmaddadp vs30, vs14, alpha_r - xvmaddadp vs31, vs15, alpha_r -#else - xvmuldp vs24, vs8, alpha_r - xvmuldp vs25, vs9, alpha_r - xvmuldp vs26, vs10, alpha_r - xvmuldp vs27, vs11, alpha_r - - xvmuldp vs28, vs12, alpha_r - xvmuldp vs29, vs13, alpha_r - xvmuldp vs30, vs14, alpha_r - xvmuldp vs31, vs15, alpha_r - -#endif - stxv vs0, 0(CO) - stxv vs2, 16(CO) - stxv vs4, 32(CO) - stxv vs6, 48(CO) - - stxv vs24, 64(CO) - stxv vs26, 80(CO) - stxv vs28, 96(CO) - stxv vs30, 112(CO) - - stxv vs1, 0(C2) - stxv vs3, 16(C2) - stxv vs5, 32(C2) - stxv vs7, 48(C2) - - stxv vs25, 64(C2) - stxv vs27, 80(C2) - stxv vs29, 96(C2) - stxv vs31, 112(C2) -#ifndef TRMMKERNEL - lxv vs0, 0(C3) - lxv vs2, 16(C3) - lxv vs4, 32(C3) - lxv vs6, 48(C3) -#endif - xxpermdi vs8, vs56,vs48,1 - xxpermdi vs9 ,vs48,vs56,1 -#ifndef TRMMKERNEL - lxv vs24, 64(C3) - lxv vs26, 80(C3) -#endif - xxpermdi vs10, vs57,vs49,1 - xxpermdi vs11 ,vs49,vs57,1 -#ifndef TRMMKERNEL - lxv vs28, 96(C3) - lxv vs30, 112(C3) -#endif - xxpermdi vs12, vs58,vs50,1 - xxpermdi vs13 ,vs50,vs58,1 -#ifndef TRMMKERNEL - lxv vs1, 0(C4) - lxv vs3, 16(C4) -#endif - xxpermdi vs14, vs59,vs51,1 - xxpermdi vs15 ,vs51,vs59,1 -#ifndef TRMMKERNEL - lxv vs5, 32(C4) - lxv vs7, 48(C4) - - lxv vs25, 64(C4) - lxv vs27, 80(C4) - lxv vs29, 96(C4) - lxv vs31, 112(C4) -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r - xvmuldp vs3, vs11, alpha_r - -#endif - - xxpermdi vs8, vs60,vs52,1 - xxpermdi vs9 ,vs52,vs60,1 - xxpermdi vs10, vs61,vs53,1 - xxpermdi vs11 ,vs53,vs61,1 -#ifndef TRMMKERNEL - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r -#endif - - - xxpermdi vs12, vs62,vs54,1 - xxpermdi vs13 ,vs54,vs62,1 - xxpermdi vs14, vs63,vs55,1 - xxpermdi vs15 ,vs55,vs63,1 -#ifndef TRMMKERNEL - xvmaddadp vs24, vs8, alpha_r - xvmaddadp vs25, vs9, alpha_r - xvmaddadp vs26, vs10, alpha_r - xvmaddadp vs27, vs11, alpha_r - - xvmaddadp vs28, vs12, alpha_r - xvmaddadp vs29, vs13, alpha_r - xvmaddadp vs30, vs14, alpha_r - xvmaddadp vs31, vs15, alpha_r -#else - xvmuldp vs24, vs8, alpha_r - xvmuldp vs25, vs9, alpha_r - xvmuldp vs26, vs10, alpha_r - xvmuldp vs27, vs11, alpha_r - - xvmuldp vs28, vs12, alpha_r - xvmuldp vs29, vs13, alpha_r - xvmuldp vs30, vs14, alpha_r - xvmuldp vs31, vs15, alpha_r -#endif - stxv vs0, 0(C3) - stxv vs2, 16(C3) - stxv vs4, 32(C3) - stxv vs6, 48(C3) - - stxv vs24, 64(C3) - stxv vs26, 80(C3) - stxv vs28, 96(C3) - stxv vs30, 112(C3) - - stxv vs1, 0(C4) - stxv vs3, 16(C4) - stxv vs5, 32(C4) - stxv vs7, 48(C4) - - stxv vs25, 64(C4) - stxv vs27, 80(C4) - stxv vs29, 96(C4) - stxv vs31, 112(C4) - - addi CO, CO, 128 -.endm - -/********************************************************************* -* Macros for N=4, M=8 * -*********************************************************************/ - -.macro LOAD4x8_1 - LOAD4x8 1 -.endm - -.macro LOAD4x8_0 - LOAD4x8 0 -.endm -.macro LOAD4x8 Zero - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - - -.if \Zero==1 - xxlxor vs32,vs32,vs32 - xxlxor vs33,vs33,vs33 - xxlxor vs34,vs34,vs34 - xxlxor vs35,vs35,vs35 - - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - -.endif -.endm - - - -.macro KERNEL4x8_L1_L2 Index,IsLast - KERNEL4x8_L1_L2_I 0,0,0, \Index,\IsLast,0 -.endm - - - -.macro KERNEL4x8_I1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I 1,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L2_2 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,0 -.endm - -.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast - KERNEL4x8_L1_L2_I 0,\OffsetA,\OffsetB,\Index,\IsLast,1 -.endm - -.macro KERNEL4x8_L1_L2_I First, OffsetA,OffsetB, Index,IsLast ,Complete - - lxv vs8, DISP16(\Index,0+\OffsetA)(AO) - lxv vs9, DISP16(\Index,16+\OffsetA)(AO) -.if \First ==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 -.endif - - lxv vs10, DISP16(\Index,32+\OffsetA)(AO) - lxv vs11, DISP16(\Index,48+\OffsetA)(AO) - - - -.if \First ==1 - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - - -.else - - lxv vs28, DISP8(\Index,0 +\OffsetB)(BO) - lxv vs30, DISP8(\Index,16 +\OffsetB)(BO) - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - -.endif - xxpermdi vs29, vs28, vs28,2 - xxpermdi vs31, vs30, vs30,2 -.if \First ==1 - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - -.else - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - -.endif - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 -.if \Complete==0 - lxv vs0, DISP16(\Index,64+\OffsetA)(AO) - lxv vs1, DISP16(\Index,80+\OffsetA)(AO) -.endif - - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - -.if \Complete==0 - lxv vs2, DISP16(\Index,96+\OffsetA)(AO) - lxv vs3, DISP16(\Index,112+\OffsetA)(AO) -.endif - - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - xvmaddadp vs50, vs10, vs30 - xvmaddadp vs51, vs11, vs30 -.if \Complete==0 - lxv vs24, DISP8(\Index,32 +\OffsetB)(BO) - lxv vs26, DISP8(\Index,48 +\OffsetB)(BO) -.endif - - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - xvmaddadp vs58, vs10, vs31 - xvmaddadp vs59, vs11, vs31 -.if \Complete==0 - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 -.endif - - .if \IsLast==1 - .if \Complete==1 - addi AO, AO, DISP16(\Index,64+\OffsetA) - addi BO, BO, DISP8(\Index,32+\OffsetB) - .else - addi AO, AO, DISP16(\Index,128) - addi BO, BO, DISP8(\Index,64) - .endif - .endif - - -.endm - - - -.macro KERNEL4x8 First - - lxv vs24, 0(BO) - lxv vs26, 16(BO) - xxpermdi vs25, vs24, vs24,2 - xxpermdi vs27, vs26, vs26,2 - - lxv vs0, 0(AO) - lxv vs1, 16(AO) - lxv vs2, 32(AO) - lxv vs3, 48(AO) - - - - - addi BO, BO, 32 - addi AO, AO, 64 - -.if \First==1 - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - xvmuldp vs50, vs2, vs26 - xvmuldp vs51, vs3, vs26 - - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - xvmuldp vs58, vs2, vs27 - xvmuldp vs59, vs3, vs27 - -.else - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - - - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - xvmaddadp vs50, vs2, vs26 - xvmaddadp vs51, vs3, vs26 - - - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - xvmaddadp vs58, vs2, vs27 - xvmaddadp vs59, vs3, vs27 - - -.endif -.endm - - - -.macro SAVE4x8 - add T2, CO, LDC - add T3, T2, LDC - add T4, T3, LDC -#ifndef TRMMKERNEL - lxv vs0, 0(CO) - lxv vs2, 16(CO) -#endif - xxpermdi vs8, vs40,vs32,1 - xxpermdi vs9 ,vs32,vs40,1 -#ifndef TRMMKERNEL - lxv vs4, 32(CO) - lxv vs6, 48(CO) -#endif - xxpermdi vs10, vs41,vs33,1 - xxpermdi vs11 ,vs33,vs41,1 -#ifndef TRMMKERNEL - lxv vs1, 0(T2) - lxv vs3, 16(T2) -#endif - xxpermdi vs12, vs42,vs34,1 - xxpermdi vs13 ,vs34,vs42,1 -#ifndef TRMMKERNEL - lxv vs5, 32(T2) - lxv vs7, 48(T2) -#endif - xxpermdi vs14, vs43,vs35,1 - xxpermdi vs15 ,vs35,vs43,1 - - - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r - - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r - xvmuldp vs3, vs11, alpha_r - - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r - -#endif - - - stxv vs0, 0(CO) - stxv vs2, 16(CO) - stxv vs4, 32(CO) - stxv vs6, 48(CO) - - - stxv vs1, 0(T2) - stxv vs3, 16(T2) - stxv vs5, 32(T2) - stxv vs7, 48(T2) - - - xxpermdi vs8, vs56,vs48,1 - xxpermdi vs9 ,vs48,vs56,1 -#ifndef TRMMKERNEL - lxv vs0, 0(T3) - lxv vs2, 16(T3) -#endif - xxpermdi vs10, vs57,vs49,1 - xxpermdi vs11 ,vs49,vs57,1 -#ifndef TRMMKERNEL - lxv vs4, 32(T3) - lxv vs6, 48(T3) -#endif - xxpermdi vs12, vs58,vs50,1 - xxpermdi vs13 ,vs50,vs58,1 -#ifndef TRMMKERNEL - lxv vs1, 0(T4) - lxv vs3, 16(T4) -#endif - xxpermdi vs14, vs59,vs51,1 - xxpermdi vs15 ,vs51,vs59,1 -#ifndef TRMMKERNEL - lxv vs5, 32(T4) - lxv vs7, 48(T4) - - - xvmaddadp vs0, vs8, alpha_r - xvmaddadp vs1, vs9, alpha_r - xvmaddadp vs2, vs10, alpha_r - xvmaddadp vs3, vs11, alpha_r - - - - xvmaddadp vs4, vs12, alpha_r - xvmaddadp vs5, vs13, alpha_r - xvmaddadp vs6, vs14, alpha_r - xvmaddadp vs7, vs15, alpha_r -#else - xvmuldp vs0, vs8, alpha_r - xvmuldp vs1, vs9, alpha_r - xvmuldp vs2, vs10, alpha_r - xvmuldp vs3, vs11, alpha_r - - - - xvmuldp vs4, vs12, alpha_r - xvmuldp vs5, vs13, alpha_r - xvmuldp vs6, vs14, alpha_r - xvmuldp vs7, vs15, alpha_r - -#endif - - - stxv vs0, 0(T3) - stxv vs2, 16(T3) - stxv vs4, 32(T3) - stxv vs6, 48(T3) - - - stxv vs1, 0(T4) - stxv vs3, 16(T4) - stxv vs5, 32(T4) - stxv vs7, 48(T4) - - - - addi CO, CO, 64 -.endm - - -/********************************************************************* -* Macros for N=4, M=4 * -*********************************************************************/ - -.macro LOAD4x4_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - -.endm - -.macro KERNEL4x4_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - -.endm - -.macro KERNEL4x4_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - -.endm - -.macro KERNEL4x4_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - -.endm - -.macro KERNEL4x4_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - - xvmaddadp vs48, vs8, vs30 - xvmaddadp vs49, vs9, vs30 - - xvmaddadp vs56, vs8, vs31 - xvmaddadp vs57, vs9, vs31 - -.endm - -.macro KERNEL4x4_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - - xvmuldp vs48, vs0, vs26 - xvmuldp vs49, vs1, vs26 - - xvmuldp vs56, vs0, vs27 - xvmuldp vs57, vs1, vs27 - -.endm - -.macro KERNEL4x4_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 32 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - - xvmaddadp vs48, vs0, vs26 - xvmaddadp vs49, vs1, vs26 - - xvmaddadp vs56, vs0, vs27 - xvmaddadp vs57, vs1, vs27 - -.endm - -.macro SAVE4x4 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs48, alpha_r - xvmaddadp vs1, vs49, alpha_r -#else - xvmuldp vs0, vs48, alpha_r - xvmuldp vs1, vs49, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs56, alpha_r - xvmaddadp vs9, vs57, alpha_r -#else - xvmuldp vs8, vs56, alpha_r - xvmuldp vs9, vs57, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - - addi CO, CO, 32 - -.endm - -/********************************************************************* -* Macros for N=4, M=2 * -*********************************************************************/ - -.macro LOAD4x2_1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - -.endm - -.macro KERNEL4x2_I1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - - xvmuldp vs48, vs0, vs26 - - xvmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x2_1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - lxvdsx vs30, o16, BO - lxvdsx vs31, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - - xvmaddadp vs48, vs0, vs26 - - xvmaddadp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x2_2 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmaddadp vs32, vs8, vs28 - - xvmaddadp vs40, vs8, vs29 - - xvmaddadp vs48, vs8, vs30 - - xvmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x2_E2 - - - xvmaddadp vs32, vs8, vs28 - - xvmaddadp vs40, vs8, vs29 - - xvmaddadp vs48, vs8, vs30 - - xvmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x2_SUBI1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - - xvmuldp vs48, vs0, vs26 - - xvmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x2_SUB1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - lxvdsx vs26, o16, BO - lxvdsx vs27, o24, BO - - addi AO, AO, 16 - addi BO, BO, 32 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - - xvmaddadp vs48, vs0, vs26 - - xvmaddadp vs56, vs0, vs27 - -.endm - -.macro SAVE4x2 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r -#else - xvmuldp vs0, vs32, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r -#else - xvmuldp vs8, vs40, alpha_r -#endif - - stxvd2x vs8, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs48, alpha_r -#else - xvmuldp vs0, vs48, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs56, alpha_r -#else - xvmuldp vs8, vs56, alpha_r -#endif - - stxvd2x vs8, 0, T1 - - addi CO, CO, 16 - -.endm - -/********************************************************************* -* Macros for N=4, M=1 * -*********************************************************************/ - -.macro LOAD4x1_1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - -.endm - -.macro KERNEL4x1_I1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - lxsdx vs30, o16, BO - lxsdx vs31, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - - xsmuldp vs48, vs0, vs26 - - xsmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x1_1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - lxsdx vs30, o16, BO - lxsdx vs31, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - - xsmaddadp vs48, vs0, vs26 - - xsmaddadp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x1_2 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - - xsmaddadp vs48, vs8, vs30 - - xsmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x1_E2 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - - xsmaddadp vs48, vs8, vs30 - - xsmaddadp vs56, vs8, vs31 - -.endm - -.macro KERNEL4x1_SUBI1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - - xsmuldp vs48, vs0, vs26 - - xsmuldp vs56, vs0, vs27 - -.endm - -.macro KERNEL4x1_SUB1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - lxsdx vs26, o16, BO - lxsdx vs27, o24, BO - - addi AO, AO, 8 - addi BO, BO, 32 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - - xsmaddadp vs48, vs0, vs26 - - xsmaddadp vs56, vs0, vs27 - -.endm - -.macro SAVE4x1 - - mr T1, CO - -#ifndef TRMMKERNEL - lxsdx vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs32, alpha_r -#else - xsmuldp vs0, vs32, alpha_r -#endif - - stxsdx vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs8, vs40, alpha_r -#else - xsmuldp vs8, vs40, alpha_r -#endif - - stxsdx vs8, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs48, alpha_r -#else - xsmuldp vs0, vs48, alpha_r -#endif - - stxsdx vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs8, vs56, alpha_r -#else - xsmuldp vs8, vs56, alpha_r -#endif - - stxsdx vs8, 0, T1 - - addi CO, CO, 8 - -.endm - -/********************************************************************* -* Macros for N=2, M=16 * -*********************************************************************/ - -.macro LOAD2x16_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - -.endm - -.macro KERNEL2x16_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - -.endm - -.macro KERNEL2x16_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - -.endm - -.macro KERNEL2x16_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - xvmaddadp vs44, vs12, vs29 - xvmaddadp vs45, vs13, vs29 - xvmaddadp vs46, vs14, vs29 - xvmaddadp vs47, vs15, vs29 - -.endm - -.macro KERNEL2x16_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - xvmaddadp vs44, vs12, vs29 - xvmaddadp vs45, vs13, vs29 - xvmaddadp vs46, vs14, vs29 - xvmaddadp vs47, vs15, vs29 - -.endm - -.macro KERNEL2x16_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - xvmuldp vs44, vs4, vs25 - xvmuldp vs45, vs5, vs25 - xvmuldp vs46, vs6, vs25 - xvmuldp vs47, vs7, vs25 - -.endm - -.macro KERNEL2x16_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - xvmaddadp vs44, vs4, vs25 - xvmaddadp vs45, vs5, vs25 - xvmaddadp vs46, vs6, vs25 - xvmaddadp vs47, vs7, vs25 - -.endm - -.macro SAVE2x16 - - mr T1, CO - addi T2, T1, 64 - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 - - lxvd2x vs4, 0, T2 - lxvd2x vs5, o16, T2 - lxvd2x vs6, o32, T2 - lxvd2x vs7, o48, T2 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r - xvmaddadp vs4, vs36, alpha_r - xvmaddadp vs5, vs37, alpha_r - xvmaddadp vs6, vs38, alpha_r - xvmaddadp vs7, vs39, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r - xvmuldp vs4, vs36, alpha_r - xvmuldp vs5, vs37, alpha_r - xvmuldp vs6, vs38, alpha_r - xvmuldp vs7, vs39, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - stxvd2x vs4, 0, T2 - stxvd2x vs5, o16, T2 - stxvd2x vs6, o32, T2 - stxvd2x vs7, o48, T2 - - add T1, T1, LDC - add T2, T2, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 - lxvd2x vs10, o32, T1 - lxvd2x vs11, o48, T1 - - lxvd2x vs12, 0, T2 - lxvd2x vs13, o16, T2 - lxvd2x vs14, o32, T2 - lxvd2x vs15, o48, T2 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r - xvmaddadp vs10, vs42, alpha_r - xvmaddadp vs11, vs43, alpha_r - xvmaddadp vs12, vs44, alpha_r - xvmaddadp vs13, vs45, alpha_r - xvmaddadp vs14, vs46, alpha_r - xvmaddadp vs15, vs47, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r - xvmuldp vs10, vs42, alpha_r - xvmuldp vs11, vs43, alpha_r - xvmuldp vs12, vs44, alpha_r - xvmuldp vs13, vs45, alpha_r - xvmuldp vs14, vs46, alpha_r - xvmuldp vs15, vs47, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - stxvd2x vs10, o32, T1 - stxvd2x vs11, o48, T1 - - stxvd2x vs12, 0, T2 - stxvd2x vs13, o16, T2 - stxvd2x vs14, o32, T2 - stxvd2x vs15, o48, T2 - - addi CO, CO, 128 - -.endm - -/********************************************************************* -* Macros for N=4, M=8 * -*********************************************************************/ - -.macro LOAD2x8_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x8_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - -.endm - -.macro KERNEL2x8_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - -.endm - -.macro KERNEL2x8_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - -.endm - -.macro KERNEL2x8_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - xvmaddadp vs42, vs10, vs29 - xvmaddadp vs43, vs11, vs29 - -.endm - -.macro KERNEL2x8_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - xvmuldp vs42, vs2, vs25 - xvmuldp vs43, vs3, vs25 - -.endm - -.macro KERNEL2x8_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 64 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - xvmaddadp vs42, vs2, vs25 - xvmaddadp vs43, vs3, vs25 - -.endm - -.macro SAVE2x8 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 - lxvd2x vs10, o32, T1 - lxvd2x vs11, o48, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r - xvmaddadp vs10, vs42, alpha_r - xvmaddadp vs11, vs43, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r - xvmuldp vs10, vs42, alpha_r - xvmuldp vs11, vs43, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - stxvd2x vs10, o32, T1 - stxvd2x vs11, o48, T1 - - addi CO, CO, 64 - -.endm - -/********************************************************************* -* Macros for N=2, M=4 * -*********************************************************************/ - -.macro LOAD2x4_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x4_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - -.endm - -.macro KERNEL2x4_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - -.endm - -.macro KERNEL2x4_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - -.endm - -.macro KERNEL2x4_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - - xvmaddadp vs40, vs8, vs29 - xvmaddadp vs41, vs9, vs29 - -.endm - -.macro KERNEL2x4_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - - xvmuldp vs40, vs0, vs25 - xvmuldp vs41, vs1, vs25 - -.endm - -.macro KERNEL2x4_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 32 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - - xvmaddadp vs40, vs0, vs25 - xvmaddadp vs41, vs1, vs25 - -.endm - -.macro SAVE2x4 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 - lxvd2x vs9, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r - xvmaddadp vs9, vs41, alpha_r -#else - xvmuldp vs8, vs40, alpha_r - xvmuldp vs9, vs41, alpha_r -#endif - - stxvd2x vs8, 0, T1 - stxvd2x vs9, o16, T1 - - addi CO, CO, 32 - -.endm - -/********************************************************************* -* Macros for N=2, M=2 * -*********************************************************************/ - -.macro LOAD2x2_1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x2_I1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x2_1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - lxvdsx vs29, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x2_2 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmaddadp vs32, vs8, vs28 - - xvmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x2_E2 - - - xvmaddadp vs32, vs8, vs28 - - xvmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x2_SUBI1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmuldp vs32, vs0, vs24 - - xvmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x2_SUB1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - lxvdsx vs25, o8, BO - - addi AO, AO, 16 - addi BO, BO, 16 - - - xvmaddadp vs32, vs0, vs24 - - xvmaddadp vs40, vs0, vs25 - -.endm - -.macro SAVE2x2 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r -#else - xvmuldp vs0, vs32, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxvd2x vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs8, vs40, alpha_r -#else - xvmuldp vs8, vs40, alpha_r -#endif - - stxvd2x vs8, 0, T1 - - addi CO, CO, 16 - -.endm - -/********************************************************************* -* Macros for N=2, M=1 * -*********************************************************************/ - -.macro LOAD2x1_1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - -.endm - -.macro KERNEL2x1_I1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x1_1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - lxsdx vs29, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x1_2 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x1_E2 - - - xsmaddadp vs32, vs8, vs28 - - xsmaddadp vs40, vs8, vs29 - -.endm - -.macro KERNEL2x1_SUBI1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmuldp vs32, vs0, vs24 - - xsmuldp vs40, vs0, vs25 - -.endm - -.macro KERNEL2x1_SUB1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - lxsdx vs25, o8, BO - - addi AO, AO, 8 - addi BO, BO, 16 - - - xsmaddadp vs32, vs0, vs24 - - xsmaddadp vs40, vs0, vs25 - -.endm - -.macro SAVE2x1 - - mr T1, CO - -#ifndef TRMMKERNEL - lxsdx vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs32, alpha_r -#else - xsmuldp vs0, vs32, alpha_r -#endif - - stxsdx vs0, 0, T1 - - add T1, T1, LDC - -#ifndef TRMMKERNEL - lxsdx vs8, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs8, vs40, alpha_r -#else - xsmuldp vs8, vs40, alpha_r -#endif - - stxsdx vs8, 0, T1 - - addi CO, CO, 8 - -.endm - -/********************************************************************* -* Macros for N=1, M=16 * -*********************************************************************/ - -.macro LOAD1x16_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - -.endm - -.macro KERNEL1x16_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - -.endm - -.macro KERNEL1x16_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs12, 0, AO - lxvd2x vs13, o16, AO - lxvd2x vs14, o32, AO - lxvd2x vs15, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - -.endm - -.macro KERNEL1x16_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - -.endm - -.macro KERNEL1x16_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - xvmaddadp vs36, vs12, vs28 - xvmaddadp vs37, vs13, vs28 - xvmaddadp vs38, vs14, vs28 - xvmaddadp vs39, vs15, vs28 - -.endm - -.macro KERNEL1x16_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - xvmuldp vs36, vs4, vs24 - xvmuldp vs37, vs5, vs24 - xvmuldp vs38, vs6, vs24 - xvmuldp vs39, vs7, vs24 - -.endm - -.macro KERNEL1x16_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - lxvd2x vs4, 0, AO - lxvd2x vs5, o16, AO - lxvd2x vs6, o32, AO - lxvd2x vs7, o48, AO - - addi AO, AO, 64 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - xvmaddadp vs36, vs4, vs24 - xvmaddadp vs37, vs5, vs24 - xvmaddadp vs38, vs6, vs24 - xvmaddadp vs39, vs7, vs24 - -.endm - -.macro SAVE1x16 - - mr T1, CO - addi T2, T1, 64 - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 - - lxvd2x vs4, 0, T2 - lxvd2x vs5, o16, T2 - lxvd2x vs6, o32, T2 - lxvd2x vs7, o48, T2 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r - xvmaddadp vs4, vs36, alpha_r - xvmaddadp vs5, vs37, alpha_r - xvmaddadp vs6, vs38, alpha_r - xvmaddadp vs7, vs39, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r - xvmuldp vs4, vs36, alpha_r - xvmuldp vs5, vs37, alpha_r - xvmuldp vs6, vs38, alpha_r - xvmuldp vs7, vs39, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - stxvd2x vs4, 0, T2 - stxvd2x vs5, o16, T2 - stxvd2x vs6, o32, T2 - stxvd2x vs7, o48, T2 - - addi CO, CO, 128 - -.endm - -/********************************************************************* -* Macros for N=4, M=8 * -*********************************************************************/ - -.macro LOAD1x8_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x8_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - -.endm - -.macro KERNEL1x8_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - lxvd2x vs10, o32, AO - lxvd2x vs11, o48, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - -.endm - -.macro KERNEL1x8_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - -.endm - -.macro KERNEL1x8_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - xvmaddadp vs34, vs10, vs28 - xvmaddadp vs35, vs11, vs28 - -.endm - -.macro KERNEL1x8_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - xvmuldp vs34, vs2, vs24 - xvmuldp vs35, vs3, vs24 - -.endm - -.macro KERNEL1x8_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - lxvd2x vs2, o32, AO - lxvd2x vs3, o48, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 64 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - xvmaddadp vs34, vs2, vs24 - xvmaddadp vs35, vs3, vs24 - -.endm - -.macro SAVE1x8 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 - lxvd2x vs2, o32, T1 - lxvd2x vs3, o48, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r - xvmaddadp vs2, vs34, alpha_r - xvmaddadp vs3, vs35, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r - xvmuldp vs2, vs34, alpha_r - xvmuldp vs3, vs35, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - stxvd2x vs2, o32, T1 - stxvd2x vs3, o48, T1 - - addi CO, CO, 64 - -.endm - -/********************************************************************* -* Macros for N=1, M=4 * -*********************************************************************/ - -.macro LOAD1x4_1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x4_I1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - -.endm - -.macro KERNEL1x4_1 - - lxvd2x vs8, 0, AO - lxvd2x vs9, o16, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - -.endm - -.macro KERNEL1x4_2 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - -.endm - -.macro KERNEL1x4_E2 - - - xvmaddadp vs32, vs8, vs28 - xvmaddadp vs33, vs9, vs28 - -.endm - -.macro KERNEL1x4_SUBI1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - xvmuldp vs33, vs1, vs24 - -.endm - -.macro KERNEL1x4_SUB1 - - lxvd2x vs0, 0, AO - lxvd2x vs1, o16, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 32 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - xvmaddadp vs33, vs1, vs24 - -.endm - -.macro SAVE1x4 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 - lxvd2x vs1, o16, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r - xvmaddadp vs1, vs33, alpha_r -#else - xvmuldp vs0, vs32, alpha_r - xvmuldp vs1, vs33, alpha_r -#endif - - stxvd2x vs0, 0, T1 - stxvd2x vs1, o16, T1 - - addi CO, CO, 32 - -.endm - -/********************************************************************* -* Macros for N=1, M=2 * -*********************************************************************/ - -.macro LOAD1x2_1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x2_I1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x2_1 - - lxvd2x vs8, 0, AO - - lxvdsx vs28, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x2_2 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x2_E2 - - - xvmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x2_SUBI1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x2_SUB1 - - lxvd2x vs0, 0, AO - - lxvdsx vs24, 0, BO - - addi AO, AO, 16 - addi BO, BO, 8 - - - xvmaddadp vs32, vs0, vs24 - -.endm - -.macro SAVE1x2 - - mr T1, CO - -#ifndef TRMMKERNEL - lxvd2x vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xvmaddadp vs0, vs32, alpha_r -#else - xvmuldp vs0, vs32, alpha_r -#endif - - stxvd2x vs0, 0, T1 - - addi CO, CO, 16 - -.endm - -/********************************************************************* -* Macros for N=1, M=1 * -*********************************************************************/ - -.macro LOAD1x1_1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - -.endm - -.macro KERNEL1x1_I1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x1_1 - - lxsdx vs8, 0, AO - - lxsdx vs28, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmaddadp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x1_2 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x1_E2 - - - xsmaddadp vs32, vs8, vs28 - -.endm - -.macro KERNEL1x1_SUBI1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmuldp vs32, vs0, vs24 - -.endm - -.macro KERNEL1x1_SUB1 - - lxsdx vs0, 0, AO - - lxsdx vs24, 0, BO - - addi AO, AO, 8 - addi BO, BO, 8 - - - xsmaddadp vs32, vs0, vs24 - -.endm - -.macro SAVE1x1 - - mr T1, CO - -#ifndef TRMMKERNEL - lxsdx vs0, 0, T1 -#endif - -#ifndef TRMMKERNEL - xsmaddadp vs0, vs32, alpha_r -#else - xsmuldp vs0, vs32, alpha_r -#endif - - stxsdx vs0, 0, T1 - - addi CO, CO, 8 - -.endm - - - - -/****************************TRMM POINTER REFRESH MACROSES*************************/ - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 7 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 4 - .elseif \SHIFT_VAL==1 - slwi \REG1, \REG2, 3 - .endif -.endm - -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*16; -// ptrbb = bb + off*2; -// #endif -*/ -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb = bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - - #else - /* - // ptrba =ptrba+ off*C_A; - // ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+16; // number of values in A -// #else -// temp = off+2; // number of values in B -// #endif -*/ -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - /* temp = bk-off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - - #elif defined(LEFT) - /* temp = off+INCR_A; // number of values in A */ - addi \TEMP_BK, \OFF_VAL, \INCR_A - #else - /* temp = off+INCR_B // number of values in B*/ - addi \TEMP_BK,\OFF_VAL, \INCR_B - #endif - -.endm -/* -// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// temp = bk - off; -// #ifdef LEFT -// temp -= 16; // number of values in A -// #else -// temp -= 2; // number of values in B -// #endif -// ptrba += temp*16; -// ptrbb += temp*2; -// #endif - -// #ifdef LEFT -// off += 16; // number of values in A -// #endif -*/ - - -.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B - - #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /*temp = bk - off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #ifdef LEFT - /*temp -= 8; // number of values in A*/ - addi \TEMP_BK,\TEMP_BK,-\C_A - #else - /*temp -= 4; // number of values in B*/ - addi \TEMP_BK,\TEMP_BK,-\C_B - #endif - /*ptrba += temp*C_A; - ptrbb += temp*C_B;*/ - SHIFT_REG T4,\TEMP_BK,\C_A - SHIFT_REG T2,\TEMP_BK,\C_B - add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ - add \PTR_B, \PTR_B,T2 - - #endif - - #ifdef LEFT - /*off += 8; // number of values in A*/ - addi \OFF_VAL,\OFF_VAL,\C_A - #endif -.endm \ No newline at end of file diff --git a/kernel/power/dgemv_n.c b/kernel/power/dgemv_n.c index b458e11fc..57f9f9e72 100644 --- a/kernel/power/dgemv_n.c +++ b/kernel/power/dgemv_n.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) #include "dgemv_n_microk_power8.c" #endif diff --git a/kernel/power/dgemv_t.c b/kernel/power/dgemv_t.c index b8589a131..3974ed62d 100644 --- a/kernel/power/dgemv_t.c +++ b/kernel/power/dgemv_t.c @@ -27,8 +27,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#define NBMAX 1024 -//#define PREFETCH 1 +#define NBMAX 8192 +#define PREFETCH 1 #include #define HAVE_KERNEL4x8_ASM 1 diff --git a/kernel/power/drot.c b/kernel/power/drot.c index baeb54205..3e107486f 100644 --- a/kernel/power/drot.c +++ b/kernel/power/drot.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) #include "drot_microk_power8.c" #endif diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c index 779a08e9c..f32dc4bad 100644 --- a/kernel/power/dscal.c +++ b/kernel/power/dscal.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) #include "dscal_microk_power8.c" #endif diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c index 52b7f50da..fd2dec9c4 100644 --- a/kernel/power/dswap.c +++ b/kernel/power/dswap.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) #include "dswap_microk_power8.c" #endif diff --git a/kernel/power/dtrmm_kernel_16x4_power8.S b/kernel/power/dtrmm_kernel_16x4_power8.S index 84c65f503..47e703a3a 100644 --- a/kernel/power/dtrmm_kernel_16x4_power8.S +++ b/kernel/power/dtrmm_kernel_16x4_power8.S @@ -96,7 +96,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -257,6 +257,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stvx v31, r11, r0 li r11,0 + stw r31, 144(SP) + stfd f1, ALPHA_SP stw r0, FZERO @@ -269,7 +271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/dtrsm_kernel_LT_16x4_power8.S b/kernel/power/dtrsm_kernel_LT_16x4_power8.S index 8a423f181..7a4a30390 100644 --- a/kernel/power/dtrsm_kernel_LT_16x4_power8.S +++ b/kernel/power/dtrsm_kernel_LT_16x4_power8.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -217,7 +217,7 @@ li r11,0 #endif -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/gemm_beta.S b/kernel/power/gemm_beta.S index 81457b698..62d7761ec 100644 --- a/kernel/power/gemm_beta.S +++ b/kernel/power/gemm_beta.S @@ -62,7 +62,7 @@ stfd f31, 16(SP) stw r0, 24(SP) -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #else @@ -129,7 +129,7 @@ LL(12): STFD f0, 14 * SIZE(CO1) STFD f0, 15 * SIZE(CO1) - dcbtst PRE, CO1 + dcbst PRE, CO1 addi CO1, CO1, 16 * SIZE bdnz LL(12) .align 4 diff --git a/kernel/power/gemm_kernel.S b/kernel/power/gemm_kernel.S index 37ff9c9e7..e5e9ec346 100644 --- a/kernel/power/gemm_kernel.S +++ b/kernel/power/gemm_kernel.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -186,7 +186,7 @@ slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -228,7 +228,7 @@ #else -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/gemm_kernel_altivec.S b/kernel/power/gemm_kernel_altivec.S index 2dae49cb8..6c7e78319 100644 --- a/kernel/power/gemm_kernel_altivec.S +++ b/kernel/power/gemm_kernel_altivec.S @@ -58,7 +58,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 diff --git a/kernel/power/gemm_kernel_altivec_cell.S b/kernel/power/gemm_kernel_altivec_cell.S index 0823420dd..b7445a1f6 100644 --- a/kernel/power/gemm_kernel_altivec_cell.S +++ b/kernel/power/gemm_kernel_altivec_cell.S @@ -58,7 +58,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 diff --git a/kernel/power/gemm_kernel_altivec_g4.S b/kernel/power/gemm_kernel_altivec_g4.S index 3a214b248..548150143 100644 --- a/kernel/power/gemm_kernel_altivec_g4.S +++ b/kernel/power/gemm_kernel_altivec_g4.S @@ -58,7 +58,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 diff --git a/kernel/power/gemm_kernel_cell.S b/kernel/power/gemm_kernel_cell.S index 26f9cb023..f3d3b8325 100644 --- a/kernel/power/gemm_kernel_cell.S +++ b/kernel/power/gemm_kernel_cell.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -192,7 +192,7 @@ slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -226,7 +226,7 @@ li PREC, 4 * SIZE #endif -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/gemm_kernel_g4.S b/kernel/power/gemm_kernel_g4.S index a5c4d3a43..259f04c4e 100644 --- a/kernel/power/gemm_kernel_g4.S +++ b/kernel/power/gemm_kernel_g4.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -184,7 +184,7 @@ slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/gemm_kernel_hummer.S b/kernel/power/gemm_kernel_hummer.S index 6ecbeb3e0..3a8e1edfa 100644 --- a/kernel/power/gemm_kernel_hummer.S +++ b/kernel/power/gemm_kernel_hummer.S @@ -46,7 +46,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/gemm_kernel_power3.S b/kernel/power/gemm_kernel_power3.S index f88bc291c..4a6b5da62 100644 --- a/kernel/power/gemm_kernel_power3.S +++ b/kernel/power/gemm_kernel_power3.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -187,7 +187,7 @@ li PREC, 4 * SIZE #else -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/gemm_kernel_power6.S b/kernel/power/gemm_kernel_power6.S index b274f7655..1a412c4fb 100644 --- a/kernel/power/gemm_kernel_power6.S +++ b/kernel/power/gemm_kernel_power6.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -183,7 +183,7 @@ slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/gemm_kernel_ppc440.S b/kernel/power/gemm_kernel_ppc440.S index c5ef6e4e5..b128beb38 100644 --- a/kernel/power/gemm_kernel_ppc440.S +++ b/kernel/power/gemm_kernel_ppc440.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -183,7 +183,7 @@ slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/gemv_n.S b/kernel/power/gemv_n.S index abc61b62e..02160bd61 100644 --- a/kernel/power/gemv_n.S +++ b/kernel/power/gemv_n.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define M r3 #define N r4 @@ -252,7 +252,7 @@ stw r27, 196(SP) #endif -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/gemv_n_ppc440.S b/kernel/power/gemv_n_ppc440.S index 18d804520..beb21200a 100644 --- a/kernel/power/gemv_n_ppc440.S +++ b/kernel/power/gemv_n_ppc440.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define M r3 #define N r4 @@ -199,7 +199,7 @@ stw r23, 180(SP) #endif -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/gemv_t.S b/kernel/power/gemv_t.S index 25a4dd01b..457753065 100644 --- a/kernel/power/gemv_t.S +++ b/kernel/power/gemv_t.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define M r3 #define N r4 @@ -260,7 +260,7 @@ stw r29, 220(SP) #endif -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/gemv_t_ppc440.S b/kernel/power/gemv_t_ppc440.S index 7d12b07a4..6e560db6c 100644 --- a/kernel/power/gemv_t_ppc440.S +++ b/kernel/power/gemv_t_ppc440.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define M r3 #define N r4 @@ -190,7 +190,7 @@ stw r22, 192(SP) #endif -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/ger.S b/kernel/power/ger.S index d83546b0d..fd397ce8c 100644 --- a/kernel/power/ger.S +++ b/kernel/power/ger.S @@ -47,7 +47,7 @@ #endif #endif -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define M r3 #define N r4 @@ -224,7 +224,7 @@ stw r27, 196(SP) #endif -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ lwz LDA, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/icamax.c b/kernel/power/icamax.c index bd74d20e5..aa0531dc6 100644 --- a/kernel/power/icamax.c +++ b/kernel/power/icamax.c @@ -36,34 +36,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) -#define USE_MASK_PERMUTATIONS 1 //with this type of permutation gcc output a little faster code -#if !defined(USE_MASK_PERMUTATIONS) - -static inline __attribute__((always_inline)) __vector float mvec_mergee(__vector float a,__vector float b ){ - __vector float result; - __asm__ ( - "vmrgew %0,%1,%2;\n" - : "=v" (result) - : "v" (a), - "v" (b) - : ); - return result; -} - -static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector float a,__vector float b ){ - __vector float result; - __asm__ ( - "vmrgow %0,%1,%2;\n" - : "=v" (result) - : "v" (a), - "v" (b) - : ); - return result; -} - -#endif + /** * Find maximum index * Warning: requirements n>0 and n % 32 == 0 @@ -75,17 +50,13 @@ static inline __attribute__((always_inline)) __vector float mvec_mergeo(__vector static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { BLASLONG index; - BLASLONG i=0; -#if defined(USE_MASK_PERMUTATIONS) + BLASLONG i; register __vector unsigned int static_index0 = {0,1,2,3}; -#else - register __vector unsigned int static_index0 = {2,0,3,1}; -#endif register __vector unsigned int temp0 = {4,4,4, 4}; //temporary vector register register __vector unsigned int temp1= temp0<<1; //{8,8,8,8} - register __vector unsigned int static_index1=static_index0 +temp0; - register __vector unsigned int static_index2=static_index0 +temp1; - register __vector unsigned int static_index3=static_index1 +temp1; + register __vector unsigned int static_index1=static_index0 +temp0;//{4,5,6,7}; + register __vector unsigned int static_index2=static_index0 +temp1;//{8,9,10,11}; + register __vector unsigned int static_index3=static_index1 +temp1; //{12,13,14,15}; temp0=vec_xor(temp0,temp0); temp1=temp1 <<1 ; //{16,16,16,16} register __vector unsigned int temp_add=temp1 <<1; //{32,32,32,32} @@ -93,11 +64,9 @@ static BLASLONG ciamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *maxf) { register __vector float quadruple_values={0,0,0,0}; register __vector float * v_ptrx=(__vector float *)x; -#if defined(USE_MASK_PERMUTATIONS) register __vector unsigned char real_pack_mask = { 0,1,2,3,8,9,10,11,16,17,18,19, 24,25,26,27}; register __vector unsigned char image_pack_mask= {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31}; -#endif - for(; i> 3 ; - n2 = n & 7 ; - } - else - { - n1 = n >> 2 ; - n2 = n & 3 ; - - } - - m3 = m & 3 ; - m1 = m & -4 ; - m2 = (m & (NBMAX-1)) - m3 ; - - - y_ptr = y; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( inc_y != 1 ) - memset(ybuffer,0,NB*4); - else - ybuffer = y_ptr; - - if ( inc_x == 1 ) - { - - - for( i = 0; i < n1 ; i++) - { - sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); - ap[0] += lda8; - ap[1] += lda8; - ap[2] += lda8; - ap[3] += lda8; - a_ptr += lda8; - x_ptr += 8; - } - - - if ( n2 & 4 ) - { - sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr += 4; - } - - if ( n2 & 2 ) - { - sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); - a_ptr += lda*2; - x_ptr += 2; - } - - - if ( n2 & 1 ) - { - sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - a_ptr += lda; - x_ptr += 1; - } - - - } - else - { - - for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for( i = 0; i < n2 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); - a_ptr += lda; - - } - - } - - a += NB; - if ( inc_y != 1 ) - { - add_y(NB,ybuffer,y_ptr,inc_y); - y_ptr += NB * inc_y; - } - else - y_ptr += NB ; - - } - - if ( m3 == 0 ) return(0); - - if ( m3 == 3 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - if ( lda == 3 && inc_x ==1 ) - { - - for( i = 0; i < ( n & -4 ); i+=4 ) - { - - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - - temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; - temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; - temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; - - a_ptr += 12; - x_ptr += 4; - } - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += 3; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - return(0); - } - - - if ( m3 == 2 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - if ( lda == 2 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4) ; i+=4 ) - { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - return(0); - } - - if ( m3 == 1 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if ( lda == 1 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4); i+=4 ) - { - temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; - - } - - for( ; i < n; i++ ) - { - temp += a_ptr[i] * x_ptr[i]; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp += a_ptr[0] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } - - } - y_ptr[0] += alpha * temp; - return(0); - } - - - return(0); -} - - diff --git a/kernel/power/sgemv_n_8.c b/kernel/power/sgemv_n_8.c deleted file mode 100644 index 9bc93ced6..000000000 --- a/kernel/power/sgemv_n_8.c +++ /dev/null @@ -1,513 +0,0 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - - -/****Note*** -UnUsed kernel -This kernel works. But it was not competitive enough to be added in production -It could be used and tested in future or could provide barebone for switching to inline assembly -*/ - -#include "common.h" - -#define NBMAX 4096 - -static void sgemv_kernel_8x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT *a0,*a1,*a2,*a3,*b0,*b1,*b2,*b3; - FLOAT x0,x1,x2,x3,x4,x5,x6,x7; - a0 = ap[0]; - a1 = ap[1]; - a2 = ap[2]; - a3 = ap[3]; - b0 = a0 + lda4 ; - b1 = a1 + lda4 ; - b2 = a2 + lda4 ; - b3 = a3 + lda4 ; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - x4 = xo[4] * *alpha; - x5 = xo[5] * *alpha; - x6 = xo[6] * *alpha; - x7 = xo[7] * *alpha; - __vector float* va0 = (__vector float*)a0; - __vector float* va1 = (__vector float*)a1; - __vector float* va2 = (__vector float*)a2; - __vector float* va3 = (__vector float*)a3; - __vector float* vb0 = (__vector float*)b0; - __vector float* vb1 = (__vector float*)b1; - __vector float* vb2 = (__vector float*)b2; - __vector float* vb3 = (__vector float*)b3; - - register __vector float v_x0 = {x0,x0,x0,x0}; - register __vector float v_x1 = {x1,x1,x1,x1}; - register __vector float v_x2 = {x2,x2,x2,x2}; - register __vector float v_x3 = {x3,x3,x3,x3}; - register __vector float v_x4 = {x4,x4,x4,x4}; - register __vector float v_x5 = {x5,x5,x5,x5}; - register __vector float v_x6 = {x6,x6,x6,x6}; - register __vector float v_x7 = {x7,x7,x7,x7}; - __vector float* v_y =(__vector float*)y; - - for ( i=0; i< n/4; i+=2) - { - register __vector float vy_1=v_y[i]; - register __vector float vy_2=v_y[i+1]; - register __vector float va0_1=va0[i] ; - register __vector float va0_2=va0[i+1] ; - register __vector float va1_1=va1[i] ; - register __vector float va1_2=va1[i+1] ; - register __vector float va2_1=va2[i] ; - register __vector float va2_2=va2[i+1] ; - register __vector float va3_1=va3[i] ; - register __vector float va3_2=va3[i+1] ; - register __vector float vb0_1=vb0[i] ; - register __vector float vb0_2=vb0[i+1] ; - register __vector float vb1_1=vb1[i] ; - register __vector float vb1_2=vb1[i+1] ; - register __vector float vb2_1=vb2[i] ; - register __vector float vb2_2=vb2[i+1] ; - register __vector float vb3_1=vb3[i] ; - register __vector float vb3_2=vb3[i+1] ; - vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; - vy_1 += v_x4 * vb0_1 + v_x5 * vb1_1 + v_x6 * vb2_1 + v_x7 * vb3_1 ; - vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; - vy_2 += v_x4 * vb0_2 + v_x5 * vb1_2 + v_x6 * vb2_2 + v_x7 * vb3_2 ; - v_y[i] =vy_1; - v_y[i+1] =vy_2; - } - -} - -static void sgemv_kernel_8x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) -{ - BLASLONG i; - FLOAT x0,x1,x2,x3; - x0 = xo[0] * *alpha; - x1 = xo[1] * *alpha; - x2 = xo[2] * *alpha; - x3 = xo[3] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float v_x2 = {x2,x2,x2,x2}; - __vector float v_x3 = {x3,x3,x3,x3}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - __vector float* va2 = (__vector float*)ap[2]; - __vector float* va3 = (__vector float*)ap[3]; - - for ( i=0; i< n/4; i+=2 ) - { - register __vector float vy_1=v_y[i]; - register __vector float vy_2=v_y[i+1]; - register __vector float va0_1=va0[i] ; - register __vector float va0_2=va0[i+1] ; - register __vector float va1_1=va1[i] ; - register __vector float va1_2=va1[i+1] ; - register __vector float va2_1=va2[i] ; - register __vector float va2_2=va2[i+1] ; - register __vector float va3_1=va3[i] ; - register __vector float va3_2=va3[i+1] ; - vy_1 += v_x0 * va0_1 + v_x1 * va1_1 + v_x2 * va2_1 + v_x3 * va3_1 ; - vy_2 += v_x0 * va0_2 + v_x1 * va1_2 + v_x2 * va2_2 + v_x3 * va3_2 ; - v_y[i] =vy_1; - v_y[i+1] =vy_2; - } - -} - -static void sgemv_kernel_8x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0,x1; - x0 = x[0] * *alpha; - x1 = x[1] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float v_x1 = {x1,x1,x1,x1}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap[0]; - __vector float* va1 = (__vector float*)ap[1]; - - for ( i=0; i< n/4; i+=2 ) - { - v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; - v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ; - } - -} - - -static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -{ - - BLASLONG i; - FLOAT x0 ; - x0 = x[0] * *alpha; - __vector float v_x0 = {x0,x0,x0,x0}; - __vector float* v_y =(__vector float*)y; - __vector float* va0 = (__vector float*)ap; - - for ( i=0; i< n/4; i+=2 ) - { - v_y[i] += v_x0 * va0[i] ; - v_y[i+1] += v_x0 * va0[i+1] ; - } - -} - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) -{ - BLASLONG i; - - for ( i=0; i> 3 ; - n2 = n & 7 ; - } - else - { - n1 = n >> 2 ; - n2 = n & 3 ; - - } - - m3 = m & 7 ; - m1 = m - m3; - m2 = (m & (NBMAX-1)) - m3 ; - - - y_ptr = y; - - BLASLONG NB = NBMAX; - - while ( NB == NBMAX ) - { - - m1 -= NB; - if ( m1 < 0) - { - if ( m2 == 0 ) break; - NB = m2; - } - - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if ( inc_y != 1 ) - memset(ybuffer,0,NB*4); - else - ybuffer = y_ptr; - - if ( inc_x == 1 ) - { - - - for( i = 0; i < n1 ; i++) - { - sgemv_kernel_8x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); - ap[0] += lda8; - ap[1] += lda8; - ap[2] += lda8; - ap[3] += lda8; - a_ptr += lda8; - x_ptr += 8; - } - - - if ( n2 & 4 ) - { - sgemv_kernel_8x4(NB,ap,x_ptr,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr += 4; - } - - if ( n2 & 2 ) - { - sgemv_kernel_8x2(NB,ap,x_ptr,ybuffer,&alpha); - a_ptr += lda*2; - x_ptr += 2; - } - - - if ( n2 & 1 ) - { - sgemv_kernel_8x1(NB,a_ptr,x_ptr,ybuffer,&alpha); - a_ptr += lda; - x_ptr += 1; - } - - - } - else - { - - for( i = 0; i < n1 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_8x4(NB,ap,xbuffer,ybuffer,&alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for( i = 0; i < n2 ; i++) - { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_8x1(NB,a_ptr,xbuffer,ybuffer,&alpha); - a_ptr += lda; - - } - - } - - a += NB; - if ( inc_y != 1 ) - { - add_y(NB,ybuffer,y_ptr,inc_y); - y_ptr += NB * inc_y; - } - else - y_ptr += NB ; - - } - - - if ( m3 & 4 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - FLOAT temp3 = 0.0; - if ( lda == 4 && inc_x ==1 ) - { - - for( i = 0; i < ( n & -4 ); i+=4 ) - { - - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + a_ptr[6] * x_ptr[1]; - temp3 += a_ptr[3] * x_ptr[0] + a_ptr[7] * x_ptr[1]; - - temp0 += a_ptr[8] * x_ptr[2] + a_ptr[12] * x_ptr[3]; - temp1 += a_ptr[9] * x_ptr[2] + a_ptr[13] * x_ptr[3]; - temp2 += a_ptr[10] * x_ptr[2] + a_ptr[14] * x_ptr[3]; - temp3 += a_ptr[11] * x_ptr[2] + a_ptr[15] * x_ptr[3]; - - a_ptr += 16; - x_ptr += 4; - } - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - temp3 += a_ptr[3] * x_ptr[0] ; - a_ptr +=4; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - temp3 += a_ptr[3] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - y_ptr += inc_y; - y_ptr[0] += alpha * temp3; - y_ptr += inc_y; - a += 4; - } - - - if ( m3 & 2 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - if ( lda == 2 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4) ; i+=4 ) - { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - - for( ; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr ++; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - a += 2; - } - - if ( m3 & 1 ) - { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if ( lda == 1 && inc_x ==1 ) - { - - for( i = 0; i < (n & -4); i+=4 ) - { - temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; - - } - - for( ; i < n; i++ ) - { - temp += a_ptr[i] * x_ptr[i]; - } - - } - else - { - - for( i = 0; i < n; i++ ) - { - temp += a_ptr[0] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } - - } - y_ptr[0] += alpha * temp; - - - } - - - return(0); -} - - diff --git a/kernel/power/sgemv_t.c b/kernel/power/sgemv_t.c deleted file mode 100644 index 96434a13f..000000000 --- a/kernel/power/sgemv_t.c +++ /dev/null @@ -1,480 +0,0 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ - -#include "common.h" - -#define NBMAX 2048 - -#include - -static void sgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; - __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - register __vector float temp4 = {0,0,0,0}; - register __vector float temp5 = {0,0,0,0}; - register __vector float temp6 = {0,0,0,0}; - register __vector float temp7 = {0,0,0,0}; - - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - a4 = a3 + lda; - a5 = a4 + lda; - a6 = a5 + lda; - a7 = a6 + lda; - va0 = (__vector float*) a0; - va1 = (__vector float*) a1; - va2 = (__vector float*) a2; - va3 = (__vector float*) a3; - va4 = (__vector float*) a4; - va5 = (__vector float*) a5; - va6 = (__vector float*) a6; - va7 = (__vector float*) a7; - v_x = (__vector float*) x; - - - for (i = 0; i < n/4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; - temp2 += v_x[i] * va2[i]; - temp3 += v_x[i] * va3[i]; - temp4 += v_x[i] * va4[i]; - temp5 += v_x[i] * va5[i]; - temp6 += v_x[i] * va6[i]; - temp7 += v_x[i] * va7[i]; - } - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - - y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); - y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); - y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); - y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); - -} - - -static void sgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i = 0; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* va2 = (__vector float*) a2; - __vector float* va3 = (__vector float*) a3; - __vector float* v_x = (__vector float*) x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; - temp2 += v_x[i] * va2[i]; - temp3 += v_x[i] * va3[i]; - } - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - -} - - -static void sgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { - - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - __vector float temp1 = {0,0,0,0}; - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i]; - temp1 += v_x[i] * va1[i]; - } - - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); -} - -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - - BLASLONG i; - FLOAT *a0; - a0 = ap; - __vector float* va0 = (__vector float*) a0; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - for (i = 0; i < n / 4; i ++) { - temp0 += v_x[i] * va0[i] ; - } - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - -} - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest++ = *src; - src += inc_src; - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i; - BLASLONG j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - - FLOAT ybuffer[8], *xbuffer; - - if (m < 1) return (0); - if (n < 1) return (0); - - xbuffer = buffer; - - n1 = n >> 3; - n2 = n & 7; - - m3 = m & 3; - m1 = m - m3; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if (inc_x != 1) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - BLASLONG lda8 = lda << 3; - - - if (inc_y == 1) { - - for (i = 0; i < n1; i++) { - - sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); - - y_ptr += 8; - a_ptr += lda8; - - } - - } else { - - for (i = 0; i < n1; i++) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - ybuffer[4] = 0; - ybuffer[5] = 0; - ybuffer[6] = 0; - ybuffer[7] = 0; - sgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - - *y_ptr += ybuffer[4]; - y_ptr += inc_y; - *y_ptr += ybuffer[5]; - y_ptr += inc_y; - *y_ptr += ybuffer[6]; - y_ptr += inc_y; - *y_ptr += ybuffer[7]; - y_ptr += inc_y; - - a_ptr += lda8; - } - - } - - - if (n2 & 4) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - sgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - a_ptr += lda<<2; - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - } - - if (n2 & 2) { - sgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); - a_ptr += lda << 1; - y_ptr += 2 * inc_y; - - } - - if (n2 & 1) { - sgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); - a_ptr += lda; - y_ptr += inc_y; - - } - - a += NB; - x += NB * inc_x; - - - } - - if (m3 == 0) return (0); - - x_ptr = x; - a_ptr = a; - if (m3 == 3) { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if (lda == 3 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; - y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; - y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; - aj += 12; - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - aj += 3; - } - - } else { - - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; - aj += lda; - } - - } else { - - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; - y_ptr += inc_y; - aj += lda; - } - - } - - } - return (0); - } - - if (m3 == 2) { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if (lda == 2 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; - y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; - y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; - aj += 8; - - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - aj += 2; - } - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr += inc_y; - aj += lda; - } - } - - } - return (0); - - } - - FLOAT xtemp = *x_ptr * alpha; - FLOAT *aj = a_ptr; - y_ptr = y; - if (lda == 1 && inc_y == 1) { - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[j] * xtemp; - y_ptr[j + 1] += aj[j + 1] * xtemp; - y_ptr[j + 2] += aj[j + 2] * xtemp; - y_ptr[j + 3] += aj[j + 3] * xtemp; - } - for (; j < n; j++) { - y_ptr[j] += aj[j] * xtemp; - } - - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += *aj * xtemp; - y_ptr[j + 1] += *(aj + lda) * xtemp; - y_ptr[j + 2] += *(aj + lda2) * xtemp; - y_ptr[j + 3] += *(aj + lda3) * xtemp; - aj += lda4; - } - - for (; j < n; j++) { - y_ptr[j] += *aj * xtemp; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp; - y_ptr += inc_y; - aj += lda; - } - - } - } - - return (0); - -} - diff --git a/kernel/power/sgemv_t_8.c b/kernel/power/sgemv_t_8.c deleted file mode 100644 index 5e9cd63ac..000000000 --- a/kernel/power/sgemv_t_8.c +++ /dev/null @@ -1,508 +0,0 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ - - -/****Note*** -UnUsed kernel -This kernel works. But it was not competitive enough to be added in production -It could be used and tested in future or could be used as base for switching to inline assembly -*/ - -#include "common.h" -#include -#define NBMAX 4096 - -#include - -static void sgemv_kernel_8x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i; - FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; - __vector float *va0, *va1, *va2, *va3, *va4, *va5, *va6, *va7, *v_x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - register __vector float temp4 = {0,0,0,0}; - register __vector float temp5 = {0,0,0,0}; - register __vector float temp6 = {0,0,0,0}; - register __vector float temp7 = {0,0,0,0}; - - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - a4 = a3 + lda; - a5 = a4 + lda; - a6 = a5 + lda; - a7 = a6 + lda; - va0 = (__vector float*) a0; - va1 = (__vector float*) a1; - va2 = (__vector float*) a2; - va3 = (__vector float*) a3; - va4 = (__vector float*) a4; - va5 = (__vector float*) a5; - va6 = (__vector float*) a6; - va7 = (__vector float*) a7; - v_x = (__vector float*) x; - - - for (i = 0; i < n/4; i +=2) { - register __vector float vx1=v_x[i] ; - register __vector float vx2=v_x[i+1] ; - register __vector float va0_1=va0[i] ; - register __vector float va0_2=va0[i+1] ; - register __vector float va1_1=va1[i] ; - register __vector float va1_2=va1[i+1] ; - register __vector float va2_1=va2[i] ; - register __vector float va2_2=va2[i+1] ; - register __vector float va3_1=va3[i] ; - register __vector float va3_2=va3[i+1] ; - register __vector float va4_1=va4[i] ; - register __vector float va4_2=va4[i+1] ; - register __vector float va5_1=va5[i] ; - register __vector float va5_2=va5[i+1] ; - register __vector float va6_1=va6[i] ; - register __vector float va6_2=va6[i+1] ; - register __vector float va7_1=va7[i] ; - register __vector float va7_2=va7[i+1] ; - temp0 += vx1* va0_1 + vx2 * va0_2; - temp1 += vx1* va1_1 + vx2 * va1_2; - temp2 += vx1* va2_1 + vx2 * va2_2; - temp3 += vx1* va3_1 + vx2 * va3_2; - temp4 += vx1* va4_1 + vx2 * va4_2; - temp5 += vx1* va5_1 + vx2 * va5_2; - temp6 += vx1* va6_1 + vx2 * va6_2; - temp7 += vx1* va7_1 + vx2 * va7_2; - } - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - - y[4] += alpha * (temp4[0] + temp4[1]+temp4[2] + temp4[3]); - y[5] += alpha * (temp5[0] + temp5[1]+temp5[2] + temp5[3]); - y[6] += alpha * (temp6[0] + temp6[1]+temp6[2] + temp6[3]); - y[7] += alpha * (temp7[0] + temp7[1]+temp7[2] + temp7[3]); - -} - - -static void sgemv_kernel_8x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - BLASLONG i = 0; - FLOAT *a0, *a1, *a2, *a3; - a0 = ap; - a1 = ap + lda; - a2 = a1 + lda; - a3 = a2 + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* va2 = (__vector float*) a2; - __vector float* va3 = (__vector float*) a3; - __vector float* v_x = (__vector float*) x; - register __vector float temp0 = {0,0,0,0}; - register __vector float temp1 = {0,0,0,0}; - register __vector float temp2 = {0,0,0,0}; - register __vector float temp3 = {0,0,0,0}; - - for (i = 0; i < n / 4; i +=2) { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; - temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; - temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1]; - temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1]; - } - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[1] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); - y[2] += alpha * (temp2[0] + temp2[1]+temp2[2] + temp2[3]); - y[3] += alpha * (temp3[0] + temp3[1]+temp3[2] + temp3[3]); - -} - - -static void sgemv_kernel_8x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha, BLASLONG inc_y) { - - BLASLONG i; - FLOAT *a0, *a1; - a0 = ap; - a1 = ap + lda; - __vector float* va0 = (__vector float*) a0; - __vector float* va1 = (__vector float*) a1; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - __vector float temp1 = {0,0,0,0}; - for (i = 0; i < n / 4; i +=2) { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; - temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1]; - } - - - - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - y[inc_y] += alpha * (temp1[0] + temp1[1]+temp1[2] + temp1[3]); -} - -static void sgemv_kernel_8x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha) { - - BLASLONG i; - FLOAT *a0; - a0 = ap; - __vector float* va0 = (__vector float*) a0; - __vector float* v_x = (__vector float*) x; - __vector float temp0 = {0,0,0,0}; - for (i = 0; i < n / 4; i +=2) { - temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1]; - } - y[0] += alpha * (temp0[0] + temp0[1]+temp0[2] + temp0[3]); - -} - - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest++ = *src; - src += inc_src; - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i; - BLASLONG j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - - FLOAT ybuffer[8], *xbuffer; - - if (m < 1) return (0); - if (n < 1) return (0); - - xbuffer = buffer; - - n1 = n >> 3; - n2 = n & 7; - - m3 = m & 7; - m1 = m - m3; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if (inc_x != 1) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - BLASLONG lda8 = lda << 3; - - - if (inc_y == 1) { - - for (i = 0; i < n1; i++) { - - sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, y_ptr, alpha); - - y_ptr += 8; - a_ptr += lda8; - - } - - } else { - - for (i = 0; i < n1; i++) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - ybuffer[4] = 0; - ybuffer[5] = 0; - ybuffer[6] = 0; - ybuffer[7] = 0; - sgemv_kernel_8x8(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - - *y_ptr += ybuffer[4]; - y_ptr += inc_y; - *y_ptr += ybuffer[5]; - y_ptr += inc_y; - *y_ptr += ybuffer[6]; - y_ptr += inc_y; - *y_ptr += ybuffer[7]; - y_ptr += inc_y; - - a_ptr += lda8; - } - - } - - - if (n2 & 4) { - ybuffer[0] = 0; - ybuffer[1] = 0; - ybuffer[2] = 0; - ybuffer[3] = 0; - sgemv_kernel_8x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha); - - a_ptr += lda<<2; - - *y_ptr += ybuffer[0]; - y_ptr += inc_y; - *y_ptr += ybuffer[1]; - y_ptr += inc_y; - *y_ptr += ybuffer[2]; - y_ptr += inc_y; - *y_ptr += ybuffer[3]; - y_ptr += inc_y; - } - - if (n2 & 2) { - sgemv_kernel_8x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha, inc_y); - a_ptr += lda << 1; - y_ptr += 2 * inc_y; - - } - - if (n2 & 1) { - sgemv_kernel_8x1(NB, a_ptr, xbuffer, y_ptr, alpha); - a_ptr += lda; - y_ptr += inc_y; - - } - - a += NB; - x += NB * inc_x; - - - } - - if (m3 == 0) return (0); - - x_ptr = x; - a_ptr = a; - if (m3 & 4) { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp3 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT *aj = a_ptr; - y_ptr = y; - if (lda == 4 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; - y_ptr[j + 1] += aj[4] * xtemp0 + aj[5] * xtemp1 + aj[6] * xtemp2 + aj[7] * xtemp3; - y_ptr[j + 2] += aj[8] * xtemp0 + aj[9] * xtemp1 + aj[10] * xtemp2 + aj[11] * xtemp3; - y_ptr[j + 3] += aj[12] * xtemp0 + aj[13] * xtemp1 + aj[14] * xtemp2 + aj[15] * xtemp3; - aj += 16; - - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2 + aj[3] * xtemp3; - aj += 4; - } - - } else if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2 + *(aj + 3) * xtemp3; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + 2) * xtemp2 + *(aj + lda +3) * xtemp3; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + 2) * xtemp2 + *(aj + lda2 +3) * xtemp3; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + 2) * xtemp2 + *(aj + lda3+3) * xtemp3; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+*(aj + 3) * xtemp3; - aj += lda; - } - - } else { - - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2+ *(aj + 3) * xtemp3; - y_ptr += inc_y; - aj += lda; - } - - } - if (m3==4) return (0); - a_ptr += 4; - } - - if (m3 & 2 ) { - - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT *aj = a_ptr; - y_ptr = y; - - if (lda == 2 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; - y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; - y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; - aj += 8; - - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - aj += 2; - } - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr += inc_y; - aj += lda; - } - } - - } - if (m3==2) return (0); - a_ptr += 2; - } - if (m3 & 1) { - - FLOAT xtemp = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT *aj = a_ptr; - y_ptr = y; - if (lda == 1 && inc_y == 1) { - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[j] * xtemp; - y_ptr[j + 1] += aj[j + 1] * xtemp; - y_ptr[j + 2] += aj[j + 2] * xtemp; - y_ptr[j + 3] += aj[j + 3] * xtemp; - } - for (; j < n; j++) { - y_ptr[j] += aj[j] * xtemp; - } - - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += *aj * xtemp; - y_ptr[j + 1] += *(aj + lda) * xtemp; - y_ptr[j + 2] += *(aj + lda2) * xtemp; - y_ptr[j + 3] += *(aj + lda3) * xtemp; - aj += lda4; - } - - for (; j < n; j++) { - y_ptr[j] += *aj * xtemp; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp; - y_ptr += inc_y; - aj += lda; - } - - } - - } - a_ptr += 1; - } - return (0); - -} - diff --git a/kernel/power/srot.c b/kernel/power/srot.c index 6af813c16..d2910ff87 100644 --- a/kernel/power/srot.c +++ b/kernel/power/srot.c @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) #include "srot_microk_power8.c" #endif diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c index 4f3ba5698..bd5cdc43f 100644 --- a/kernel/power/sscal.c +++ b/kernel/power/sscal.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) #include "sscal_microk_power8.c" #endif diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c index 23d13280f..932652b37 100644 --- a/kernel/power/sswap.c +++ b/kernel/power/sswap.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) #include "sswap_microk_power8.c" #endif diff --git a/kernel/power/strmm_kernel_16x8_power8.S b/kernel/power/strmm_kernel_16x8_power8.S index 78e539231..f9b8a0bb8 100644 --- a/kernel/power/strmm_kernel_16x8_power8.S +++ b/kernel/power/strmm_kernel_16x8_power8.S @@ -96,7 +96,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -271,7 +271,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. slwi LDC, LDC, BASE_SHIFT #if defined(TRMMKERNEL) -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/sum.S b/kernel/power/sum.S deleted file mode 100644 index eda2c5f2c..000000000 --- a/kernel/power/sum.S +++ /dev/null @@ -1,446 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define N r3 -#define X r4 -#define INCX r5 - -#define PREA r8 - -#define FZERO f0 - -#define STACKSIZE 160 - - PROLOGUE - PROFCODE - - addi SP, SP, -STACKSIZE - li r0, 0 - - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - stw r0, 144(SP) - lfs FZERO,144(SP) - -#ifdef F_INTERFACE - LDINT N, 0(N) - LDINT INCX, 0(INCX) -#endif - - slwi INCX, INCX, BASE_SHIFT - - fmr f1, FZERO - fmr f2, FZERO - fmr f3, FZERO - fmr f4, FZERO - fmr f5, FZERO - fmr f6, FZERO - fmr f7, FZERO - - li PREA, L1_PREFETCHSIZE - - cmpwi cr0, N, 0 - ble- LL(999) - - cmpwi cr0, INCX, 0 - ble- LL(999) - - cmpwi cr0, INCX, SIZE - bne- cr0, LL(100) - - srawi. r0, N, 4 - mtspr CTR, r0 - beq- cr0, LL(50) - .align 4 - - LFD f8, 0 * SIZE(X) - LFD f9, 1 * SIZE(X) - LFD f10, 2 * SIZE(X) - LFD f11, 3 * SIZE(X) - LFD f12, 4 * SIZE(X) - LFD f13, 5 * SIZE(X) - LFD f14, 6 * SIZE(X) - LFD f15, 7 * SIZE(X) - - LFD f24, 8 * SIZE(X) - LFD f25, 9 * SIZE(X) - LFD f26, 10 * SIZE(X) - LFD f27, 11 * SIZE(X) - LFD f28, 12 * SIZE(X) - LFD f29, 13 * SIZE(X) - LFD f30, 14 * SIZE(X) - LFD f31, 15 * SIZE(X) - - fmr f16, f8 - fmr f17, f9 - fmr f18, f10 - fmr f19, f11 - - fmr f20, f12 - fmr f21, f13 - fmr f22, f14 - fmr f23, f15 - bdz LL(20) - .align 4 - -LL(10): - FADD f0, f0, f16 - fmr f16, f24 - FADD f1, f1, f17 - fmr f17, f25 - - FADD f2, f2, f18 - fmr f18, f26 - FADD f3, f3, f19 - fmr f19, f27 - - LFD f8, 16 * SIZE(X) - LFD f9, 17 * SIZE(X) - LFD f10, 18 * SIZE(X) - LFD f11, 19 * SIZE(X) - - FADD f4, f4, f20 - fmr f20, f28 - FADD f5, f5, f21 - fmr f21, f29 - - FADD f6, f6, f22 - fmr f22, f30 - FADD f7, f7, f23 - fmr f23, f31 - - LFD f12, 20 * SIZE(X) - LFD f13, 21 * SIZE(X) - LFD f14, 22 * SIZE(X) - LFD f15, 23 * SIZE(X) - - FADD f0, f0, f16 - fmr f16, f8 - FADD f1, f1, f17 - fmr f17, f9 - - FADD f2, f2, f18 - fmr f18, f10 - FADD f3, f3, f19 - fmr f19, f11 - - LFD f24, 24 * SIZE(X) - LFD f25, 25 * SIZE(X) - LFD f26, 26 * SIZE(X) - LFD f27, 27 * SIZE(X) - - FADD f4, f4, f20 - fmr f20, f12 - FADD f5, f5, f21 - fmr f21, f13 - - FADD f6, f6, f22 - fmr f22, f14 - FADD f7, f7, f23 - fmr f23, f15 - - LFD f28, 28 * SIZE(X) - LFD f29, 29 * SIZE(X) - LFD f30, 30 * SIZE(X) - LFD f31, 31 * SIZE(X) - -#ifndef POWER6 - L1_PREFETCH X, PREA -#endif - addi X, X, 16 * SIZE -#ifdef POWER6 - L1_PREFETCH X, PREA -#endif - - bdnz LL(10) - .align 4 - -LL(20): - FADD f0, f0, f16 - fmr f16, f24 - FADD f1, f1, f17 - fmr f17, f25 - - FADD f2, f2, f18 - fmr f18, f26 - FADD f3, f3, f19 - fmr f19, f27 - - FADD f4, f4, f20 - fmr f20, f28 - FADD f5, f5, f21 - fmr f21, f29 - - FADD f6, f6, f22 - fmr f22, f30 - FADD f7, f7, f23 - fmr f23, f31 - - FADD f0, f0, f16 - FADD f1, f1, f17 - FADD f2, f2, f18 - FADD f3, f3, f19 - - FADD f4, f4, f20 - FADD f5, f5, f21 - FADD f6, f6, f22 - FADD f7, f7, f23 - addi X, X, 16 * SIZE - .align 4 - -LL(50): - andi. r0, N, 15 - mtspr CTR, r0 - beq LL(999) - .align 4 - -LL(60): - LFD f8, 0 * SIZE(X) - addi X, X, 1 * SIZE - - FADD f0, f0, f8 - - bdnz LL(60) - b LL(999) - .align 4 - -LL(100): - sub X, X, INCX - - srawi. r0, N, 4 - mtspr CTR, r0 - beq- LL(150) - - LFDUX f8, X, INCX - LFDUX f9, X, INCX - LFDUX f10, X, INCX - LFDUX f11, X, INCX - LFDUX f12, X, INCX - LFDUX f13, X, INCX - LFDUX f14, X, INCX - LFDUX f15, X, INCX - - LFDUX f24, X, INCX - LFDUX f25, X, INCX - LFDUX f26, X, INCX - LFDUX f27, X, INCX - LFDUX f28, X, INCX - LFDUX f29, X, INCX - LFDUX f30, X, INCX - LFDUX f31, X, INCX - - fmr f16, f8 - fmr f17, f9 - fmr f18, f10 - fmr f19, f11 - - fmr f20, f12 - fmr f21, f13 - fmr f22, f14 - fmr f23, f15 - bdz LL(120) - .align 4 - -LL(110): - FADD f0, f0, f16 - fmr f16, f24 - FADD f1, f1, f17 - fmr f17, f25 - - FADD f2, f2, f18 - fmr f18, f26 - FADD f3, f3, f19 - fmr f19, f27 - - LFDUX f8, X, INCX - LFDUX f9, X, INCX - LFDUX f10, X, INCX - LFDUX f11, X, INCX - - FADD f4, f4, f20 - fmr f20, f28 - FADD f5, f5, f21 - fmr f21, f29 - - FADD f6, f6, f22 - fmr f22, f30 - FADD f7, f7, f23 - fmr f23, f31 - - LFDUX f12, X, INCX - LFDUX f13, X, INCX - LFDUX f14, X, INCX - LFDUX f15, X, INCX - - FADD f0, f0, f16 - fmr f16, f8 - FADD f1, f1, f17 - fmr f17, f9 - - FADD f2, f2, f18 - fmr f18, f10 - FADD f3, f3, f19 - fmr f19, f11 - - LFDUX f24, X, INCX - LFDUX f25, X, INCX - LFDUX f26, X, INCX - LFDUX f27, X, INCX - - FADD f4, f4, f20 - fmr f20, f12 - FADD f5, f5, f21 - fmr f21, f13 - - FADD f6, f6, f22 - fmr f22, f14 - FADD f7, f7, f23 - fmr f23, f15 - - LFDUX f28, X, INCX - LFDUX f29, X, INCX - LFDUX f30, X, INCX - LFDUX f31, X, INCX - bdnz LL(110) - .align 4 - -LL(120): - FADD f0, f0, f16 - fmr f16, f24 - FADD f1, f1, f17 - fmr f17, f25 - - FADD f2, f2, f18 - fmr f18, f26 - FADD f3, f3, f19 - fmr f19, f27 - - FADD f4, f4, f20 - fmr f20, f28 - FADD f5, f5, f21 - fmr f21, f29 - - FADD f6, f6, f22 - fmr f22, f30 - FADD f7, f7, f23 - fmr f23, f31 - - FADD f0, f0, f16 - FADD f1, f1, f17 - FADD f2, f2, f18 - FADD f3, f3, f19 - - FADD f4, f4, f20 - FADD f5, f5, f21 - FADD f6, f6, f22 - FADD f7, f7, f23 - .align 4 - -LL(150): - andi. r0, N, 15 - mtspr CTR, r0 - beq LL(999) - .align 4 - -LL(160): - LFDUX f8, X, INCX - FADD f0, f0, f8 - bdnz LL(160) - .align 4 - -LL(999): - FADD f0, f0, f1 - FADD f2, f2, f3 - FADD f4, f4, f5 - FADD f6, f6, f7 - - FADD f0, f0, f2 - FADD f4, f4, f6 - FADD f1, f0, f4 - - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - addi SP, SP, STACKSIZE - blr - - EPILOGUE diff --git a/kernel/power/swap.S b/kernel/power/swap.S index c9c0f86b0..e862b17bb 100644 --- a/kernel/power/swap.S +++ b/kernel/power/swap.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define N r3 #define X r6 diff --git a/kernel/power/symv_L.S b/kernel/power/symv_L.S index a4ff703e2..f7d768c50 100644 --- a/kernel/power/symv_L.S +++ b/kernel/power/symv_L.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define M r3 #define N r4 @@ -248,7 +248,7 @@ stw r27, 196(SP) #endif -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/symv_U.S b/kernel/power/symv_U.S index c3063e077..d8e082397 100644 --- a/kernel/power/symv_U.S +++ b/kernel/power/symv_U.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define M r3 #define IS r4 @@ -247,7 +247,7 @@ stw r27, 196(SP) #endif -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/trsm_kernel_LN.S b/kernel/power/trsm_kernel_LN.S index 8319d5ed8..7983c573b 100644 --- a/kernel/power/trsm_kernel_LN.S +++ b/kernel/power/trsm_kernel_LN.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -236,7 +236,7 @@ #else -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_LT.S b/kernel/power/trsm_kernel_LT.S index 30f25e015..c561fd014 100644 --- a/kernel/power/trsm_kernel_LT.S +++ b/kernel/power/trsm_kernel_LT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -257,7 +257,7 @@ #else -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_RT.S b/kernel/power/trsm_kernel_RT.S index d39d3a6e2..07b88402c 100644 --- a/kernel/power/trsm_kernel_RT.S +++ b/kernel/power/trsm_kernel_RT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -254,7 +254,7 @@ #else -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_cell_LN.S b/kernel/power/trsm_kernel_cell_LN.S index f656015a8..803530cbb 100644 --- a/kernel/power/trsm_kernel_cell_LN.S +++ b/kernel/power/trsm_kernel_cell_LN.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -231,7 +231,7 @@ li PREC, -4 * SIZE #else -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_cell_LT.S b/kernel/power/trsm_kernel_cell_LT.S index 083af7289..105e7d43c 100644 --- a/kernel/power/trsm_kernel_cell_LT.S +++ b/kernel/power/trsm_kernel_cell_LT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -257,7 +257,7 @@ #else -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_cell_RT.S b/kernel/power/trsm_kernel_cell_RT.S index 5a5b67e77..a54a261cb 100644 --- a/kernel/power/trsm_kernel_cell_RT.S +++ b/kernel/power/trsm_kernel_cell_RT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -231,7 +231,7 @@ li PREC, -4 * SIZE #else -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ mr PREA, r10 lwz PREB, FRAMESLOT(0) + STACKSIZE(SP) diff --git a/kernel/power/trsm_kernel_hummer_LN.S b/kernel/power/trsm_kernel_hummer_LN.S index 35ffab427..109dacb8c 100644 --- a/kernel/power/trsm_kernel_hummer_LN.S +++ b/kernel/power/trsm_kernel_hummer_LN.S @@ -46,7 +46,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/trsm_kernel_hummer_LT.S b/kernel/power/trsm_kernel_hummer_LT.S index f7a09dbd8..1ad062a7c 100644 --- a/kernel/power/trsm_kernel_hummer_LT.S +++ b/kernel/power/trsm_kernel_hummer_LT.S @@ -46,7 +46,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/trsm_kernel_hummer_RT.S b/kernel/power/trsm_kernel_hummer_RT.S index 0e563e5cc..94b3c0c85 100644 --- a/kernel/power/trsm_kernel_hummer_RT.S +++ b/kernel/power/trsm_kernel_hummer_RT.S @@ -46,7 +46,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/trsm_kernel_power6_LN.S b/kernel/power/trsm_kernel_power6_LN.S index 83594c772..937a6761a 100644 --- a/kernel/power/trsm_kernel_power6_LN.S +++ b/kernel/power/trsm_kernel_power6_LN.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -179,7 +179,7 @@ slwi LDC, LDC, BASE_SHIFT -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/trsm_kernel_power6_LT.S b/kernel/power/trsm_kernel_power6_LT.S index 54a8547b0..924f00ec0 100644 --- a/kernel/power/trsm_kernel_power6_LT.S +++ b/kernel/power/trsm_kernel_power6_LT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -180,7 +180,7 @@ slwi LDC, LDC, BASE_SHIFT -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/trsm_kernel_power6_RT.S b/kernel/power/trsm_kernel_power6_RT.S index b2b27613c..40ee5e28d 100644 --- a/kernel/power/trsm_kernel_power6_RT.S +++ b/kernel/power/trsm_kernel_power6_RT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -179,7 +179,7 @@ slwi LDC, LDC, BASE_SHIFT -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/trsm_kernel_ppc440_LN.S b/kernel/power/trsm_kernel_ppc440_LN.S index a708a084d..6b7312101 100644 --- a/kernel/power/trsm_kernel_ppc440_LN.S +++ b/kernel/power/trsm_kernel_ppc440_LN.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -191,7 +191,7 @@ slwi LDC, LDC, BASE_SHIFT -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/trsm_kernel_ppc440_LT.S b/kernel/power/trsm_kernel_ppc440_LT.S index 31f82de2c..28b109b96 100644 --- a/kernel/power/trsm_kernel_ppc440_LT.S +++ b/kernel/power/trsm_kernel_ppc440_LT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -176,7 +176,7 @@ slwi LDC, LDC, BASE_SHIFT -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/trsm_kernel_ppc440_RT.S b/kernel/power/trsm_kernel_ppc440_RT.S index f5005403c..df80cd393 100644 --- a/kernel/power/trsm_kernel_ppc440_RT.S +++ b/kernel/power/trsm_kernel_ppc440_RT.S @@ -59,7 +59,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -191,7 +191,7 @@ slwi LDC, LDC, BASE_SHIFT -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/zasum.c b/kernel/power/zasum.c index f61c62e75..0b6b87d46 100644 --- a/kernel/power/zasum.c +++ b/kernel/power/zasum.c @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) #include "zasum_microk_power8.c" #endif diff --git a/kernel/power/zaxpy.S b/kernel/power/zaxpy.S index b001f42d1..ac5b249bb 100644 --- a/kernel/power/zaxpy.S +++ b/kernel/power/zaxpy.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define N r3 #define X r6 @@ -123,7 +123,7 @@ stfd f24, 80(SP) stfd f25, 88(SP) -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/zaxpy.c b/kernel/power/zaxpy.c index f0f8c6910..dd7ab6c3c 100644 --- a/kernel/power/zaxpy.c +++ b/kernel/power/zaxpy.c @@ -36,18 +36,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) #include "zaxpy_microk_power8.c" #endif #ifndef HAVE_KERNEL_4 -static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) +static void zaxpy_kernel_4(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { BLASLONG register i = 0; BLASLONG register ix = 0; - + FLOAT da_r = alpha[0]; + FLOAT da_i = alpha[1]; while(i < n) diff --git a/kernel/power/zaxpy_ppc440.S b/kernel/power/zaxpy_ppc440.S index 848a0135f..b5c604e91 100644 --- a/kernel/power/zaxpy_ppc440.S +++ b/kernel/power/zaxpy_ppc440.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define N r3 #define X r6 @@ -112,7 +112,7 @@ stfd f24, 80(SP) stfd f25, 88(SP) -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/zcopy.c b/kernel/power/zcopy.c index b21d6ef15..a7658f7ab 100644 --- a/kernel/power/zcopy.c +++ b/kernel/power/zcopy.c @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) #include "zcopy_microk_power8.c" #endif diff --git a/kernel/power/zdot.c b/kernel/power/zdot.c index fd36c7f44..b83f832b1 100644 --- a/kernel/power/zdot.c +++ b/kernel/power/zdot.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) #include "zdot_microk_power8.c" #endif diff --git a/kernel/power/zgemm_beta.S b/kernel/power/zgemm_beta.S index 57c3bed50..43b72ca15 100644 --- a/kernel/power/zgemm_beta.S +++ b/kernel/power/zgemm_beta.S @@ -62,7 +62,7 @@ stfd f31, 8(SP) stw r0, 16(SP) -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ lwz LDC, FRAMESLOT(0) + STACKSIZE(SP) #else @@ -134,7 +134,7 @@ LL(12): STFD f0, 14 * SIZE(CO1) STFD f0, 15 * SIZE(CO1) - dcbtst PRE, CO1 + dcbst PRE, CO1 addi CO1, CO1, 16 * SIZE bdnz LL(12) .align 4 diff --git a/kernel/power/zgemm_kernel.S b/kernel/power/zgemm_kernel.S index ae8a93e89..8ec8b674a 100644 --- a/kernel/power/zgemm_kernel.S +++ b/kernel/power/zgemm_kernel.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -169,7 +169,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -190,7 +190,7 @@ #endif #ifdef TRMMKERNEL -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -231,7 +231,7 @@ #endif #else -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/zgemm_kernel_8x2_power8.S b/kernel/power/zgemm_kernel_8x2_power8.S index dfe2d9dc6..5526b91c9 100644 --- a/kernel/power/zgemm_kernel_8x2_power8.S +++ b/kernel/power/zgemm_kernel_8x2_power8.S @@ -132,7 +132,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -296,7 +296,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stfd f2, ALPHA_I_SP stw r0, FZERO -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) #endif @@ -317,7 +317,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) #endif diff --git a/kernel/power/zgemm_kernel_altivec.S b/kernel/power/zgemm_kernel_altivec.S index 2525a8e58..2b650cd02 100644 --- a/kernel/power/zgemm_kernel_altivec.S +++ b/kernel/power/zgemm_kernel_altivec.S @@ -62,7 +62,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -238,7 +238,7 @@ #endif -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -264,7 +264,7 @@ #endif #else -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/zgemm_kernel_altivec_cell.S b/kernel/power/zgemm_kernel_altivec_cell.S index 47a79064d..642d1f2e7 100644 --- a/kernel/power/zgemm_kernel_altivec_cell.S +++ b/kernel/power/zgemm_kernel_altivec_cell.S @@ -62,7 +62,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -244,7 +244,7 @@ #endif -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -270,7 +270,7 @@ #endif #else -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ lwz PREB, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/zgemm_kernel_altivec_g4.S b/kernel/power/zgemm_kernel_altivec_g4.S index c305270bd..0f7a6f9aa 100644 --- a/kernel/power/zgemm_kernel_altivec_g4.S +++ b/kernel/power/zgemm_kernel_altivec_g4.S @@ -62,7 +62,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -238,7 +238,7 @@ #endif -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/zgemm_kernel_cell.S b/kernel/power/zgemm_kernel_cell.S index 3d179378b..8fd6b0afb 100644 --- a/kernel/power/zgemm_kernel_cell.S +++ b/kernel/power/zgemm_kernel_cell.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -175,7 +175,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -196,7 +196,7 @@ #endif #ifdef TRMMKERNEL -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -230,7 +230,7 @@ li PREA, 16 * 12 * SIZE #else -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/zgemm_kernel_g4.S b/kernel/power/zgemm_kernel_g4.S index b92fb4225..bf6bf77e8 100644 --- a/kernel/power/zgemm_kernel_g4.S +++ b/kernel/power/zgemm_kernel_g4.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -185,7 +185,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -206,7 +206,7 @@ #endif #ifdef TRMMKERNEL -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/zgemm_kernel_hummer.S b/kernel/power/zgemm_kernel_hummer.S index 5546dd2f6..991a64373 100644 --- a/kernel/power/zgemm_kernel_hummer.S +++ b/kernel/power/zgemm_kernel_hummer.S @@ -48,7 +48,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/zgemm_kernel_power3.S b/kernel/power/zgemm_kernel_power3.S index d14cb1cd9..471d3b9ae 100644 --- a/kernel/power/zgemm_kernel_power3.S +++ b/kernel/power/zgemm_kernel_power3.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -161,7 +161,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -202,7 +202,7 @@ #endif #else -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/zgemm_kernel_power6.S b/kernel/power/zgemm_kernel_power6.S index 9b47b9fc1..3c28649bc 100644 --- a/kernel/power/zgemm_kernel_power6.S +++ b/kernel/power/zgemm_kernel_power6.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -199,7 +199,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -220,7 +220,7 @@ #endif #ifdef TRMMKERNEL -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/zgemm_kernel_power9.S b/kernel/power/zgemm_kernel_power9.S deleted file mode 100644 index d1e60da6c..000000000 --- a/kernel/power/zgemm_kernel_power9.S +++ /dev/null @@ -1,245 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ -#define ASSEMBLER -#include "common.h" -#include "def_vsx.h" - -#define LOAD ld - -#define STACKSIZE 512 - -#define FZERO 312+192(SP) - -#define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ - -#define M r3 -#define N r4 -#define K r5 - - -#define A r8 -#define B r9 -#define C r10 -#define LDC r6 -#define OFFSET r7 - - - -#define o0 0 -#define alpha_r vs30 -#define alpha_i vs31 - -#define VECSAVE r11 - -#define FRAMEPOINTER r12 - -#define T10 r14 - -#define L r15 -#define T8 r16 -#define T5 r17 -#define T2 r19 -#define TEMP_REG r20 -#define T6 r21 -#define I r22 -#define J r23 -#define AO r24 -#define BO r25 -#define CO r26 -#define T7 r27 -#define T3 r28 -#define T4 r29 - -#define PRE r30 -#define T1 r31 - -#ifndef NEEDPARAM - - PROLOGUE - PROFCODE - - mr FRAMEPOINTER, SP - addi SP, SP, -STACKSIZE - mflr r0 - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - xxspltd alpha_r,vs1,0 /*copy from register f1 */ - xxspltd alpha_i,vs2,0 /*copy from register f2 */ - - std r31, 144(SP) - std r30, 152(SP) - std r29, 160(SP) - std r28, 168(SP) - std r27, 176(SP) - std r26, 184(SP) - std r25, 192(SP) - std r24, 200(SP) - std r23, 208(SP) - std r22, 216(SP) - std r21, 224(SP) - std r20, 232(SP) - std r19, 240(SP) - std r18, 248(SP) - std r17, 256(SP) - std r16, 264(SP) - std r15, 272(SP) - std r14, 280(SP) - - - stxv vs52, 288(SP) - stxv vs53, 304(SP) - stxv vs54, 320(SP) - stxv vs55, 336(SP) - stxv vs56, 352(SP) - stxv vs57, 368(SP) - stxv vs58, 384(SP) - stxv vs59, 400(SP) - stxv vs60, 416(SP) - stxv vs61, 432(SP) - stxv vs62, 448(SP) - stxv vs63, 464(SP) - - std r0, FLINK_SAVE(SP) - - -#if defined(linux) || defined(__FreeBSD__) - ld LDC, FRAMESLOT(0) + 0(FRAMEPOINTER) -#endif - - -#ifdef TRMMKERNEL -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) - ld OFFSET, FRAMESLOT(1) + 0(FRAMEPOINTER) -#endif -#endif - - -#include "zgemm_macros_power9.S" - - - - slwi LDC, LDC, ZBASE_SHIFT - li PRE, 512 - li r0, 0 - - -#if defined(CC) || defined(CR) || defined(RC) || defined(RR) -/*negate for this case as we will use addition -1*(a+b) */ - xvnegdp alpha_r,alpha_r - xvnegdp alpha_i,alpha_i -#endif - .align 4 - -#include "zgemm_logic_power9.S" - -L999: - - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - - ld r31, 144(SP) - ld r30, 152(SP) - ld r29, 160(SP) - ld r28, 168(SP) - ld r27, 176(SP) - ld r26, 184(SP) - ld r25, 192(SP) - ld r24, 200(SP) - ld r23, 208(SP) - ld r22, 216(SP) - ld r21, 224(SP) - ld r20, 232(SP) - ld r19, 240(SP) - ld r18, 248(SP) - ld r17, 256(SP) - ld r16, 264(SP) - ld r15, 272(SP) - ld r14, 280(SP) - - ld r0, FLINK_SAVE(SP) - - lxv vs52, 288(SP) - lxv vs53, 304(SP) - lxv vs54, 320(SP) - lxv vs55, 336(SP) - lxv vs56, 352(SP) - lxv vs57, 368(SP) - lxv vs58, 384(SP) - lxv vs59, 400(SP) - mtlr r0 - lxv vs60, 416(SP) - lxv vs61, 432(SP) - lxv vs62, 448(SP) - lxv vs63, 464(SP) - - addi SP, SP, STACKSIZE - blr - - EPILOGUE -#endif \ No newline at end of file diff --git a/kernel/power/zgemm_kernel_ppc440.S b/kernel/power/zgemm_kernel_ppc440.S index ba99a21c5..748b69a0c 100644 --- a/kernel/power/zgemm_kernel_ppc440.S +++ b/kernel/power/zgemm_kernel_ppc440.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -182,7 +182,7 @@ stfd f2, ALPHA_I stw r0, FZERO -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -203,7 +203,7 @@ #endif #ifdef TRMMKERNEL -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/zgemm_logic_power9.S b/kernel/power/zgemm_logic_power9.S deleted file mode 100644 index fe5d8ade2..000000000 --- a/kernel/power/zgemm_logic_power9.S +++ /dev/null @@ -1,1891 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ -#define MY_ALIGN .align 3 -b ZGEMM_L2 -/* MINI SUBROUTINES */ -/* 2x8 MAIN 128x+2 LOOP */ - - -ZGEMM_L2x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x8_2 - MY_ALIGN -ZGEMM_L2x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 -ZGEMM_L2x8_K128: -/*----------------------------------------*/ - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_L2 256,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 256,64,8,0 - KERNEL2x8_L2 256,64,9,0 - KERNEL2x8_L2 256,64,10,0 - KERNEL2x8_L2 256,64,11,0 - dcbt BO, T4 - KERNEL2x8_L2 256,64,12,0 - KERNEL2x8_L2 256,64,13,0 - KERNEL2x8_L2 256,64,14,0 - KERNEL2x8_L2 256,64,15,0 - KERNEL2x8_L2 256,64,16,0 - KERNEL2x8_L2 256,64,17,0 - KERNEL2x8_L2 256,64,18,0 - KERNEL2x8_L2 256,64,19,0 - KERNEL2x8_L2 256,64,20,0 - KERNEL2x8_L2 256,64,21,0 - KERNEL2x8_L2 256,64,22,0 - KERNEL2x8_L2 256,64,23,0 - KERNEL2x8_L2 256,64,24,0 - KERNEL2x8_L2 256,64,25,0 - KERNEL2x8_L2 256,64,26,0 - KERNEL2x8_L2 256,64,27,0 - KERNEL2x8_L2 256,64,28,0 - KERNEL2x8_L2 256,64,29,0 - KERNEL2x8_L2 256,64,30,0 - KERNEL2x8_L2 256,64,31,0 - KERNEL2x8_L2 256,64,32,0 - KERNEL2x8_L2 256,64,33,0 - KERNEL2x8_L2 256,64,34,0 - KERNEL2x8_L2 256,64,35,0 - KERNEL2x8_L2 256,64,36,0 - KERNEL2x8_L2 256,64,37,0 - KERNEL2x8_L2 256,64,38,0 - KERNEL2x8_L2 256,64,39,0 - KERNEL2x8_L2 256,64,40,0 - KERNEL2x8_L2 256,64,41,0 - KERNEL2x8_L2 256,64,42,0 - KERNEL2x8_L2 256,64,43,0 - KERNEL2x8_L2 256,64,44,0 - KERNEL2x8_L2 256,64,45,0 - KERNEL2x8_L2 256,64,46,0 - KERNEL2x8_L2 256,64,47,0 - KERNEL2x8_L2 256,64,48,0 - KERNEL2x8_L2 256,64,49,0 - KERNEL2x8_L2 256,64,50,0 - KERNEL2x8_L2 256,64,51,0 - KERNEL2x8_L2 256,64,52,0 - KERNEL2x8_L2 256,64,53,0 - KERNEL2x8_L2 256,64,54,0 - KERNEL2x8_L2 256,64,55,0 - KERNEL2x8_L2 256,64,56,0 - KERNEL2x8_L2 256,64,57,0 - KERNEL2x8_L2 256,64,58,0 - KERNEL2x8_L2 256,64,59,0 - KERNEL2x8_L2 256,64,60,0 - KERNEL2x8_L2 256,64,61,0 - KERNEL2x8_L2 256,64,62,0 - KERNEL2x8_L2 256,64,63,1 - bdnz ZGEMM_L2x8_LOOP - MY_ALIGN -ZGEMM_L2x8_LOOP_END: -/*----------------------------------------*/ - END2x8_2 - blr - MY_ALIGN - - -ZGEMM_2x8_L64_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_L2 256,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 256,64,8,0 - KERNEL2x8_L2 256,64,9,0 - KERNEL2x8_L2 256,64,10,0 - KERNEL2x8_L2 256,64,11,0 - dcbt BO, T4 - KERNEL2x8_L2 256,64,12,0 - KERNEL2x8_L2 256,64,13,0 - KERNEL2x8_L2 256,64,14,0 - KERNEL2x8_L2 256,64,15,0 - KERNEL2x8_L2 256,64,16,0 - KERNEL2x8_L2 256,64,17,0 - KERNEL2x8_L2 256,64,18,0 - KERNEL2x8_L2 256,64,19,0 - KERNEL2x8_L2 256,64,20,0 - KERNEL2x8_L2 256,64,21,0 - KERNEL2x8_L2 256,64,22,0 - KERNEL2x8_L2 256,64,23,0 - KERNEL2x8_L2 256,64,24,0 - KERNEL2x8_L2 256,64,25,0 - KERNEL2x8_L2 256,64,26,0 - KERNEL2x8_L2 256,64,27,0 - KERNEL2x8_L2 256,64,28,0 - KERNEL2x8_L2 256,64,29,0 - KERNEL2x8_L2 256,64,30,0 - KERNEL2x8_E2 256,64,31,1 - blr - MY_ALIGN - - -ZGEMM_2x8_L32_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_L2 256,64,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL2x8_L2 256,64,8,0 - KERNEL2x8_L2 256,64,9,0 - KERNEL2x8_L2 256,64,10,0 - KERNEL2x8_L2 256,64,11,0 - dcbt BO, T4 - KERNEL2x8_L2 256,64,12,0 - KERNEL2x8_L2 256,64,13,0 - KERNEL2x8_L2 256,64,14,0 - KERNEL2x8_E2 256,64,15,1 - blr - MY_ALIGN - - -ZGEMM_2x8_L16_SUB: -/*----------------------------------------*/ - LOAD2x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL2x8_L2 256,64,0,0 - KERNEL2x8_L2 256,64,1,0 - dcbt AO, T2 - KERNEL2x8_L2 256,64,2,0 - KERNEL2x8_L2 256,64,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL2x8_L2 256,64,4,0 - KERNEL2x8_L2 256,64,5,0 - dcbt AO, T4 - KERNEL2x8_L2 256,64,6,0 - KERNEL2x8_E2 256,64,7,1 - blr - MY_ALIGN - - -ZGEMM_2x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x4_2 - MY_ALIGN -ZGEMM_L2x4_LOOP: -/*----------------------------------------*/ - KERNEL2x4_L2 128,64,0,0 -ZGEMM_L2x4_K32: -/*----------------------------------------*/ - KERNEL2x4_L2 128,64,1,0 - KERNEL2x4_L2 128,64,2,0 - KERNEL2x4_L2 128,64,3,0 - KERNEL2x4_L2 128,64,4,0 - KERNEL2x4_L2 128,64,5,0 - KERNEL2x4_L2 128,64,6,0 - KERNEL2x4_L2 128,64,7,0 - KERNEL2x4_L2 128,64,8,0 - KERNEL2x4_L2 128,64,9,0 - KERNEL2x4_L2 128,64,10,0 - KERNEL2x4_L2 128,64,11,0 - KERNEL2x4_L2 128,64,12,0 - KERNEL2x4_L2 128,64,13,0 - KERNEL2x4_L2 128,64,14,0 - KERNEL2x4_L2 128,64,15,1 - bdnz ZGEMM_L2x4_LOOP - MY_ALIGN -ZGEMM_L2x4_LOOP_END: -/*----------------------------------------*/ - END2x4_2 - blr - MY_ALIGN - - -ZGEMM_2x4_L16_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 128,64,0,0 - KERNEL2x4_L2 128,64,1,0 - KERNEL2x4_L2 128,64,2,0 - KERNEL2x4_L2 128,64,3,0 - KERNEL2x4_L2 128,64,4,0 - KERNEL2x4_L2 128,64,5,0 - KERNEL2x4_L2 128,64,6,0 - KERNEL2x4_E2 128,64,7,1 - blr - MY_ALIGN - - -ZGEMM_2x4_L8_SUB: -/*----------------------------------------*/ - LOAD2x4_2 - KERNEL2x4_L2 128,64,0,0 - KERNEL2x4_L2 128,64,1,0 - KERNEL2x4_L2 128,64,2,0 - KERNEL2x4_E2 128,64,3,1 - blr - - -ZGEMM_2x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x2_2 - MY_ALIGN -ZGEMM_L2x2_LOOP: -/*----------------------------------------*/ - KERNEL2x2_L2 64,64,0,0 -ZGEMM_L2x2_K32: -/*----------------------------------------*/ - KERNEL2x2_L2 64,64,1,0 - KERNEL2x2_L2 64,64,2,0 - KERNEL2x2_L2 64,64,3,0 - KERNEL2x2_L2 64,64,4,0 - KERNEL2x2_L2 64,64,5,0 - KERNEL2x2_L2 64,64,6,0 - KERNEL2x2_L2 64,64,7,0 - KERNEL2x2_L2 64,64,8,0 - KERNEL2x2_L2 64,64,9,0 - KERNEL2x2_L2 64,64,10,0 - KERNEL2x2_L2 64,64,11,0 - KERNEL2x2_L2 64,64,12,0 - KERNEL2x2_L2 64,64,13,0 - KERNEL2x2_L2 64,64,14,0 - KERNEL2x2_L2 64,64,15,1 - bdnz ZGEMM_L2x2_LOOP - MY_ALIGN - - -ZGEMM_L2x2_LOOP_END: -/*----------------------------------------*/ - END2x2_2 - blr - MY_ALIGN -ZGEMM_2x2_L16_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 64,64,0,0 - KERNEL2x2_L2 64,64,1,0 - KERNEL2x2_L2 64,64,2,0 - KERNEL2x2_L2 64,64,3,0 - KERNEL2x2_L2 64,64,4,0 - KERNEL2x2_L2 64,64,5,0 - KERNEL2x2_L2 64,64,6,0 - KERNEL2x2_E2 64,64,7,1 - blr - MY_ALIGN -ZGEMM_2x2_L8_SUB: -/*----------------------------------------*/ - LOAD2x2_2 - KERNEL2x2_L2 64,64,0,0 - KERNEL2x2_L2 64,64,1,0 - KERNEL2x2_L2 64,64,2,0 - KERNEL2x2_E2 64,64,3,1 - blr - - -ZGEMM_2x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD2x1_2 - MY_ALIGN -ZGEMM_L2x1_LOOP: -/*----------------------------------------*/ - KERNEL2x1_L2 32,64,0,0 -ZGEMM_L2x1_K32: -/*----------------------------------------*/ - KERNEL2x1_L2 32,64,1,0 - KERNEL2x1_L2 32,64,2,0 - KERNEL2x1_L2 32,64,3,0 - KERNEL2x1_L2 32,64,4,0 - KERNEL2x1_L2 32,64,5,0 - KERNEL2x1_L2 32,64,6,0 - KERNEL2x1_L2 32,64,7,0 - KERNEL2x1_L2 32,64,8,0 - KERNEL2x1_L2 32,64,9,0 - KERNEL2x1_L2 32,64,10,0 - KERNEL2x1_L2 32,64,11,0 - KERNEL2x1_L2 32,64,12,0 - KERNEL2x1_L2 32,64,13,0 - KERNEL2x1_L2 32,64,14,0 - KERNEL2x1_L2 32,64,15,1 - bdnz ZGEMM_L2x1_LOOP - MY_ALIGN -ZGEMM_L2x1_LOOP_END: -/*----------------------------------------*/ - END2x1_2 - blr - - MY_ALIGN -ZGEMM_2x1_L16_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 32,64,0,0 - KERNEL2x1_L2 32,64,1,0 - KERNEL2x1_L2 32,64,2,0 - KERNEL2x1_L2 32,64,3,0 - KERNEL2x1_L2 32,64,4,0 - KERNEL2x1_L2 32,64,5,0 - KERNEL2x1_L2 32,64,6,0 - KERNEL2x1_E2 32,64,7,1 - blr - MY_ALIGN - - -ZGEMM_2x1_L8_SUB: -/*----------------------------------------*/ - LOAD2x1_2 - KERNEL2x1_L2 32,64,0,0 - KERNEL2x1_L2 32,64,1,0 - KERNEL2x1_L2 32,64,2,0 - KERNEL2x1_E2 32,64,3,1 - blr - - - -/* MAIN LOOP BEGINS */ - MY_ALIGN - - -ZGEMM_L2: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) && !defined(LEFT) - neg TEMP_REG, OFFSET -#endif - srawi. J, N, 1 - ble ZGEMM_L2_END - - -ZGEMM_L2_BEGIN: -/*----------------------------------------*/ - mr CO, C - slwi T1, LDC , 1 - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble ZGEMM_L2x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -ZGEMM_L2x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,2 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,2 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T11-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO2x8 - ble ZGEMM_L2x8_SUB0 - bl ZGEMM_L2x8_LMAIN_SUB - andi. L, T1, 127 - ble ZGEMM_L2x8_SAVE - b ZGEMM_L2x8_SUB2 - - -ZGEMM_L2x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP2x8_128K - addi BO,BO,-32 - addi AO,AO,-128 - LOAD2x8O 128,32 - END2x8_WITHOUT_ADD - LOAD2x8_2O 256, 64 - mtctr T8 - bl ZGEMM_L2x8_K128 - b ZGEMM_L2x8_SAVE - CMP2x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne ZGEMM_L2x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-256 - LOAD2x8_2O 256,64 - bl ZGEMM_L2x8_K128 - b ZGEMM_L2x8_SAVE - MY_ALIGN - - -ZGEMM_L2x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble ZGEMM_L2x8_SUB2_32 - bl ZGEMM_2x8_L64_SUB - MY_ALIGN - - -ZGEMM_L2x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble ZGEMM_L2x8_SUB2_16 - bl ZGEMM_2x8_L32_SUB - MY_ALIGN - - -ZGEMM_L2x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L2x8_SUB2_8 - bl ZGEMM_2x8_L16_SUB - MY_ALIGN - - -ZGEMM_L2x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L2x8_SUB2_4 - LOAD2x8_2 - KERNEL2x8_L2 256,64, 0,0 - KERNEL2x8_L2 256,64, 1,0 - KERNEL2x8_L2 256,64, 2,0 - KERNEL2x8_E2 256,64, 3,1 - MY_ALIGN - - -ZGEMM_L2x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x8_SUB2_2 - LOAD2x8_2 - KERNEL2x8_L2 256,64, 0,0 - KERNEL2x8_E2 256,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x8_SUB2_1 - LOAD2x8_2 - KERNEL2x8_E2 256,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x8_SAVE - KERNEL2x8 - - -ZGEMM_L2x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - SAVE2x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,2 -#endif - bgt ZGEMM_L2x8_BEGIN - andi. T2, M, 7 - ble ZGEMM_L2x1_END - andi. T1, M, 4 - ble ZGEMM_L2x4_END - b ZGEMM_L2x4_BEGIN - MY_ALIGN - - -ZGEMM_L2x8_END: -/*----------------------------------------*/ - - -ZGEMM_L2x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble ZGEMM_L2x1_END - andi. T1, M, 4 - ble ZGEMM_L2x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x4 - ble ZGEMM_L2x4_SUB0 - bl ZGEMM_2x4_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L2x4_SAVE - b ZGEMM_L2x4_SUB2 - - -ZGEMM_L2x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x4_32K - addi BO,BO,-32 - addi AO,AO,-64 - LOAD2x4O 64,32 - END2x4_WITHOUT_ADD - LOAD2x4_2O 128, 64 - mtctr T8 - bl ZGEMM_L2x4_K32 - b ZGEMM_L2x4_SAVE - CMP2x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L2x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-128 - LOAD2x4_2O 128,64 - bl ZGEMM_L2x4_K32 - b ZGEMM_L2x4_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L2x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L2x4_SUB2_8 - bl ZGEMM_2x4_L16_SUB - MY_ALIGN - - -ZGEMM_L2x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L2x4_SUB2_4 - bl ZGEMM_2x4_L8_SUB - MY_ALIGN - - -ZGEMM_L2x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x4_SUB2_2 - LOAD2x4_2 - KERNEL2x4_L2 128,64, 0,0 - KERNEL2x4_E2 128,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x4_SUB2_1 - LOAD2x4_2 - KERNEL2x4_E2 128,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x4_SAVE - KERNEL2x4 - - -ZGEMM_L2x4_SAVE: -/*----------------------------------------*/ - SAVE2x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,2 -#endif - - -ZGEMM_L2x4_END: -/*----------------------------------------*/ - - -ZGEMM_L2x2_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 2 - ble ZGEMM_L2x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x2 - ble ZGEMM_L2x2_SUB0 - bl ZGEMM_2x2_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L2x2_SAVE - b ZGEMM_L2x2_SUB2 - - -ZGEMM_L2x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x2_32K - addi BO,BO,-32 - addi AO,AO,-32 - LOAD2x2O 32,32 - END2x2_WITHOUT_ADD - LOAD2x2_2O 64, 64 - mtctr T8 - bl ZGEMM_L2x2_K32 - b ZGEMM_L2x2_SAVE - CMP2x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L2x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-64 - LOAD2x2_2O 64,64 - bl ZGEMM_L2x2_K32 - b ZGEMM_L2x2_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L2x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L2x2_SUB2_8 - bl ZGEMM_2x2_L16_SUB - MY_ALIGN - - -ZGEMM_L2x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L2x2_SUB2_4 - bl ZGEMM_2x2_L8_SUB - MY_ALIGN - - -ZGEMM_L2x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x2_SUB2_2 - LOAD2x2_2 - KERNEL2x2_L2 64,64, 0,0 - KERNEL2x2_E2 64,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x2_SUB2_1 - LOAD2x2_2 - KERNEL2x2_E2 64,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x2_SAVE - KERNEL2x2 - - -ZGEMM_L2x2_SAVE: -/*----------------------------------------*/ - SAVE2x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,2 -#endif - - -ZGEMM_L2x2_END: -/*----------------------------------------*/ - - -ZGEMM_L2x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble ZGEMM_L2x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,2 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,2 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO2x1 - ble ZGEMM_L2x1_SUB0 - bl ZGEMM_2x1_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L2x1_SAVE - b ZGEMM_L2x1_SUB2 - - -ZGEMM_L2x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP2x1_32K - addi BO,BO,-32 - addi AO,AO,-16 - LOAD2x1O 16,32 - END2x1_WITHOUT_ADD - LOAD2x1_2O 32, 64 - mtctr T8 - bl ZGEMM_L2x1_K32 - b ZGEMM_L2x1_SAVE - CMP2x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L2x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-64 - addi AO,AO,-32 - LOAD2x1_2O 32,64 - bl ZGEMM_L2x1_K32 - b ZGEMM_L2x1_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L2x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L2x1_SUB2_8 - bl ZGEMM_2x1_L16_SUB - MY_ALIGN - - -ZGEMM_L2x1_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L2x1_SUB2_4 - bl ZGEMM_2x1_L8_SUB - MY_ALIGN - - -ZGEMM_L2x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L2x1_SUB2_2 - LOAD2x1_2 - KERNEL2x1_L2 32,64, 0,0 - KERNEL2x1_E2 32,64, 1,1 - MY_ALIGN - - -ZGEMM_L2x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L2x1_SUB2_1 - LOAD2x1_2 - KERNEL2x1_E2 32,64, 0,1 - MY_ALIGN - - -ZGEMM_L2x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L2x1_SAVE - KERNEL2x1 - - -ZGEMM_L2x1_SAVE: -/*----------------------------------------*/ - SAVE2x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,2 -#endif - - -ZGEMM_L2x1_END: -/*----------------------------------------*/ - slwi T1, K, 5 - addic. J, J, -1 - add B, B, T1 -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 2 -#endif - bgt ZGEMM_L2_BEGIN - - -ZGEMM_L2_END: - -b ZGEMM_L1 -/* MINI SUBROUTINES */ -/* 1x8 MAIN 128x+2 LOOP */ - - -ZGEMM_L1x8_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x8_2 - MY_ALIGN -ZGEMM_L1x8_LOOP: -/*----------------------------------------*/ - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 -ZGEMM_L1x8_K128: -/*----------------------------------------*/ - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - KERNEL1x8_L2 256,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 256,32,8,0 - KERNEL1x8_L2 256,32,9,0 - KERNEL1x8_L2 256,32,10,0 - KERNEL1x8_L2 256,32,11,0 - dcbt BO, T4 - KERNEL1x8_L2 256,32,12,0 - KERNEL1x8_L2 256,32,13,0 - KERNEL1x8_L2 256,32,14,0 - KERNEL1x8_L2 256,32,15,0 - KERNEL1x8_L2 256,32,16,0 - KERNEL1x8_L2 256,32,17,0 - KERNEL1x8_L2 256,32,18,0 - KERNEL1x8_L2 256,32,19,0 - KERNEL1x8_L2 256,32,20,0 - KERNEL1x8_L2 256,32,21,0 - KERNEL1x8_L2 256,32,22,0 - KERNEL1x8_L2 256,32,23,0 - KERNEL1x8_L2 256,32,24,0 - KERNEL1x8_L2 256,32,25,0 - KERNEL1x8_L2 256,32,26,0 - KERNEL1x8_L2 256,32,27,0 - KERNEL1x8_L2 256,32,28,0 - KERNEL1x8_L2 256,32,29,0 - KERNEL1x8_L2 256,32,30,0 - KERNEL1x8_L2 256,32,31,0 - KERNEL1x8_L2 256,32,32,0 - KERNEL1x8_L2 256,32,33,0 - KERNEL1x8_L2 256,32,34,0 - KERNEL1x8_L2 256,32,35,0 - KERNEL1x8_L2 256,32,36,0 - KERNEL1x8_L2 256,32,37,0 - KERNEL1x8_L2 256,32,38,0 - KERNEL1x8_L2 256,32,39,0 - KERNEL1x8_L2 256,32,40,0 - KERNEL1x8_L2 256,32,41,0 - KERNEL1x8_L2 256,32,42,0 - KERNEL1x8_L2 256,32,43,0 - KERNEL1x8_L2 256,32,44,0 - KERNEL1x8_L2 256,32,45,0 - KERNEL1x8_L2 256,32,46,0 - KERNEL1x8_L2 256,32,47,0 - KERNEL1x8_L2 256,32,48,0 - KERNEL1x8_L2 256,32,49,0 - KERNEL1x8_L2 256,32,50,0 - KERNEL1x8_L2 256,32,51,0 - KERNEL1x8_L2 256,32,52,0 - KERNEL1x8_L2 256,32,53,0 - KERNEL1x8_L2 256,32,54,0 - KERNEL1x8_L2 256,32,55,0 - KERNEL1x8_L2 256,32,56,0 - KERNEL1x8_L2 256,32,57,0 - KERNEL1x8_L2 256,32,58,0 - KERNEL1x8_L2 256,32,59,0 - KERNEL1x8_L2 256,32,60,0 - KERNEL1x8_L2 256,32,61,0 - KERNEL1x8_L2 256,32,62,0 - KERNEL1x8_L2 256,32,63,1 - bdnz ZGEMM_L1x8_LOOP - MY_ALIGN -ZGEMM_L1x8_LOOP_END: -/*----------------------------------------*/ - END1x8_2 - blr - MY_ALIGN - - -ZGEMM_1x8_L64_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - KERNEL1x8_L2 256,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 256,32,8,0 - KERNEL1x8_L2 256,32,9,0 - KERNEL1x8_L2 256,32,10,0 - KERNEL1x8_L2 256,32,11,0 - dcbt BO, T4 - KERNEL1x8_L2 256,32,12,0 - KERNEL1x8_L2 256,32,13,0 - KERNEL1x8_L2 256,32,14,0 - KERNEL1x8_L2 256,32,15,0 - KERNEL1x8_L2 256,32,16,0 - KERNEL1x8_L2 256,32,17,0 - KERNEL1x8_L2 256,32,18,0 - KERNEL1x8_L2 256,32,19,0 - KERNEL1x8_L2 256,32,20,0 - KERNEL1x8_L2 256,32,21,0 - KERNEL1x8_L2 256,32,22,0 - KERNEL1x8_L2 256,32,23,0 - KERNEL1x8_L2 256,32,24,0 - KERNEL1x8_L2 256,32,25,0 - KERNEL1x8_L2 256,32,26,0 - KERNEL1x8_L2 256,32,27,0 - KERNEL1x8_L2 256,32,28,0 - KERNEL1x8_L2 256,32,29,0 - KERNEL1x8_L2 256,32,30,0 - KERNEL1x8_E2 256,32,31,1 - blr - MY_ALIGN - - -ZGEMM_1x8_L32_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - KERNEL1x8_L2 256,32,7,0 - dcbt AO, T5 - dcbt BO, T3 - KERNEL1x8_L2 256,32,8,0 - KERNEL1x8_L2 256,32,9,0 - KERNEL1x8_L2 256,32,10,0 - KERNEL1x8_L2 256,32,11,0 - dcbt BO, T4 - KERNEL1x8_L2 256,32,12,0 - KERNEL1x8_L2 256,32,13,0 - KERNEL1x8_L2 256,32,14,0 - KERNEL1x8_E2 256,32,15,1 - blr - MY_ALIGN - - -ZGEMM_1x8_L16_SUB: -/*----------------------------------------*/ - LOAD1x8_2 - dcbt AO, PRE - dcbt BO, PRE - KERNEL1x8_L2 256,32,0,0 - KERNEL1x8_L2 256,32,1,0 - dcbt AO, T2 - KERNEL1x8_L2 256,32,2,0 - KERNEL1x8_L2 256,32,3,0 - dcbt AO, T3 - dcbt BO, T2 - KERNEL1x8_L2 256,32,4,0 - KERNEL1x8_L2 256,32,5,0 - dcbt AO, T4 - KERNEL1x8_L2 256,32,6,0 - KERNEL1x8_E2 256,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x4_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x4_2 - MY_ALIGN - - -ZGEMM_L1x4_LOOP: -/*----------------------------------------*/ - KERNEL1x4_L2 128,32,0,0 - - -ZGEMM_L1x4_K32: -/*----------------------------------------*/ - KERNEL1x4_L2 128,32,1,0 - KERNEL1x4_L2 128,32,2,0 - KERNEL1x4_L2 128,32,3,0 - KERNEL1x4_L2 128,32,4,0 - KERNEL1x4_L2 128,32,5,0 - KERNEL1x4_L2 128,32,6,0 - KERNEL1x4_L2 128,32,7,0 - KERNEL1x4_L2 128,32,8,0 - KERNEL1x4_L2 128,32,9,0 - KERNEL1x4_L2 128,32,10,0 - KERNEL1x4_L2 128,32,11,0 - KERNEL1x4_L2 128,32,12,0 - KERNEL1x4_L2 128,32,13,0 - KERNEL1x4_L2 128,32,14,0 - KERNEL1x4_L2 128,32,15,1 - bdnz ZGEMM_L1x4_LOOP - MY_ALIGN - - -ZGEMM_L1x4_LOOP_END: -/*----------------------------------------*/ - END1x4_2 - blr - MY_ALIGN - - -ZGEMM_1x4_L16_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 128,32,0,0 - KERNEL1x4_L2 128,32,1,0 - KERNEL1x4_L2 128,32,2,0 - KERNEL1x4_L2 128,32,3,0 - KERNEL1x4_L2 128,32,4,0 - KERNEL1x4_L2 128,32,5,0 - KERNEL1x4_L2 128,32,6,0 - KERNEL1x4_E2 128,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x4_L8_SUB: -/*----------------------------------------*/ - LOAD1x4_2 - KERNEL1x4_L2 128,32,0,0 - KERNEL1x4_L2 128,32,1,0 - KERNEL1x4_L2 128,32,2,0 - KERNEL1x4_E2 128,32,3,1 - blr - - -ZGEMM_1x2_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x2_2 - MY_ALIGN - - -ZGEMM_L1x2_LOOP: -/*----------------------------------------*/ - KERNEL1x2_L2 64,32,0,0 - - -ZGEMM_L1x2_K32: -/*----------------------------------------*/ - KERNEL1x2_L2 64,32,1,0 - KERNEL1x2_L2 64,32,2,0 - KERNEL1x2_L2 64,32,3,0 - KERNEL1x2_L2 64,32,4,0 - KERNEL1x2_L2 64,32,5,0 - KERNEL1x2_L2 64,32,6,0 - KERNEL1x2_L2 64,32,7,0 - KERNEL1x2_L2 64,32,8,0 - KERNEL1x2_L2 64,32,9,0 - KERNEL1x2_L2 64,32,10,0 - KERNEL1x2_L2 64,32,11,0 - KERNEL1x2_L2 64,32,12,0 - KERNEL1x2_L2 64,32,13,0 - KERNEL1x2_L2 64,32,14,0 - KERNEL1x2_L2 64,32,15,1 - bdnz ZGEMM_L1x2_LOOP - MY_ALIGN - - -ZGEMM_L1x2_LOOP_END: -/*----------------------------------------*/ - END1x2_2 - blr - MY_ALIGN - - -ZGEMM_1x2_L16_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 64,32,0,0 - KERNEL1x2_L2 64,32,1,0 - KERNEL1x2_L2 64,32,2,0 - KERNEL1x2_L2 64,32,3,0 - KERNEL1x2_L2 64,32,4,0 - KERNEL1x2_L2 64,32,5,0 - KERNEL1x2_L2 64,32,6,0 - KERNEL1x2_E2 64,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x2_L8_SUB: -/*----------------------------------------*/ - LOAD1x2_2 - KERNEL1x2_L2 64,32,0,0 - KERNEL1x2_L2 64,32,1,0 - KERNEL1x2_L2 64,32,2,0 - KERNEL1x2_E2 64,32,3,1 - blr - - -ZGEMM_1x1_LMAIN_SUB: -/*----------------------------------------*/ - mtctr T8 - LOAD1x1_2 - MY_ALIGN - - -ZGEMM_L1x1_LOOP: -/*----------------------------------------*/ - KERNEL1x1_L2 32,32,0,0 - - -ZGEMM_L1x1_K32: -/*----------------------------------------*/ - KERNEL1x1_L2 32,32,1,0 - KERNEL1x1_L2 32,32,2,0 - KERNEL1x1_L2 32,32,3,0 - KERNEL1x1_L2 32,32,4,0 - KERNEL1x1_L2 32,32,5,0 - KERNEL1x1_L2 32,32,6,0 - KERNEL1x1_L2 32,32,7,0 - KERNEL1x1_L2 32,32,8,0 - KERNEL1x1_L2 32,32,9,0 - KERNEL1x1_L2 32,32,10,0 - KERNEL1x1_L2 32,32,11,0 - KERNEL1x1_L2 32,32,12,0 - KERNEL1x1_L2 32,32,13,0 - KERNEL1x1_L2 32,32,14,0 - KERNEL1x1_L2 32,32,15,1 - bdnz ZGEMM_L1x1_LOOP - MY_ALIGN - - -ZGEMM_L1x1_LOOP_END: -/*----------------------------------------*/ - END1x1_2 - blr - MY_ALIGN - - -ZGEMM_1x1_L16_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 32,32,0,0 - KERNEL1x1_L2 32,32,1,0 - KERNEL1x1_L2 32,32,2,0 - KERNEL1x1_L2 32,32,3,0 - KERNEL1x1_L2 32,32,4,0 - KERNEL1x1_L2 32,32,5,0 - KERNEL1x1_L2 32,32,6,0 - KERNEL1x1_E2 32,32,7,1 - blr - MY_ALIGN - - -ZGEMM_1x1_L8_SUB: -/*----------------------------------------*/ - LOAD1x1_2 - KERNEL1x1_L2 32,32,0,0 - KERNEL1x1_L2 32,32,1,0 - KERNEL1x1_L2 32,32,2,0 - KERNEL1x1_E2 32,32,3,1 - blr - - -/*----------------------N1 BEGINS---------*/ -ZGEMM_L1: -/*----------------------------------------*/ - andi. T1, N, 1 - ble ZGEMM_L1_END - -ZGEMM_L1_BEGIN: -/*----------------------------------------*/ - mr CO, C - - add T2,C,LDC - mr AO, A - add C, C, T1 -#if defined(TRMMKERNEL) && defined(LEFT) - mr TEMP_REG, OFFSET /*off = offset;*/ -#endif - srawi. I, M, 3 - ble ZGEMM_L1x8_END - dcbt CO,r0 /*just prefetch*/ - dcbt T2,r0 - - -ZGEMM_L1x8_BEGIN: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,8,1 -#else - mr BO, B - dcbt B, r0 -#endif - dcbt AO, r0 -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,8,1 - mr T1, T6 -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(T11-2) % 128x */ -#else - mr T1, K -/* TEMPS FOR PREFETCH */ - li T2, 1024 - li T3, 1024+512 - addi T1,T1, -2 -/* TEMPS FOR PREFETCH */ - li T4, 2048 - li T5, 2048+512 - srawi. T8, T1, 7 /**(K-2) % 128x */ -#endif - ZERO1x8 - ble ZGEMM_L1x8_SUB0 - bl ZGEMM_L1x8_LMAIN_SUB - andi. L, T1, 127 - ble ZGEMM_L1x8_SAVE - b ZGEMM_L1x8_SUB2 - - -ZGEMM_L1x8_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 255 - cmpwi T6,129 -#else - andi. L, K, 255 - cmpwi K,129 -#endif - li T8,1 - bne CMP1x8_128K - addi BO,BO,-16 - addi AO,AO,-128 - LOAD1x8O 128,16 - END1x8_WITHOUT_ADD - LOAD1x8_2O 256, 32 - mtctr T8 - bl ZGEMM_L1x8_K128 - b ZGEMM_L1x8_SAVE - CMP1x8_128K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,128 -#else - cmpwi K,128 -#endif - bne ZGEMM_L1x8_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-256 - LOAD1x8_2O 256,32 - bl ZGEMM_L1x8_K128 - b ZGEMM_L1x8_SAVE - MY_ALIGN - - -ZGEMM_L1x8_SUB2: -/*----------------------------------------*/ - andi. T1,L, 64 - ble ZGEMM_L1x8_SUB2_32 - bl ZGEMM_1x8_L64_SUB - MY_ALIGN - - -ZGEMM_L1x8_SUB2_32: -/*----------------------------------------*/ - andi. T1,L, 32 - ble ZGEMM_L1x8_SUB2_16 - bl ZGEMM_1x8_L32_SUB - MY_ALIGN - - -ZGEMM_L1x8_SUB2_16: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x8_SUB2_8 - bl ZGEMM_1x8_L16_SUB - MY_ALIGN - - -ZGEMM_L1x8_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L1x8_SUB2_4 - LOAD1x8_2 - KERNEL1x8_L2 256,32, 0,0 - KERNEL1x8_L2 256,32, 1,0 - KERNEL1x8_L2 256,32, 2,0 - KERNEL1x8_E2 256,32, 3,1 - MY_ALIGN - - -ZGEMM_L1x8_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x8_SUB2_2 - LOAD1x8_2 - KERNEL1x8_L2 256,32, 0,0 - KERNEL1x8_E2 256,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x8_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x8_SUB2_1 - LOAD1x8_2 - KERNEL1x8_E2 256,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x8_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x8_SAVE - KERNEL1x8 - - -ZGEMM_L1x8_SAVE: -/*----------------------------------------*/ - addic. I, I, -1 - SAVE1x8 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,8,1 -#endif - bgt ZGEMM_L1x8_BEGIN - andi. T2, M, 7 - ble ZGEMM_L1x1_END - andi. T1, M, 4 - ble ZGEMM_L1x4_END - b ZGEMM_L1x4_BEGIN - MY_ALIGN - - -ZGEMM_L1x8_END: -/*----------------------------------------*/ - - -ZGEMM_L1x4_BEGIN: -/*----------------------------------------*/ - andi. T2, M, 7 - ble ZGEMM_L1x1_END - andi. T1, M, 4 - ble ZGEMM_L1x4_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,4,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,4,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO1x4 - ble ZGEMM_L1x4_SUB0 - bl ZGEMM_1x4_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L1x4_SAVE - b ZGEMM_L1x4_SUB2 - - -ZGEMM_L1x4_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x4_32K - addi BO,BO,-16 - addi AO,AO,-64 - LOAD1x4O 64,16 - END1x4_WITHOUT_ADD - LOAD1x4_2O 128, 32 - mtctr T8 - bl ZGEMM_L1x4_K32 - b ZGEMM_L1x4_SAVE - CMP1x4_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L1x4_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-128 - LOAD1x4_2O 128,32 - bl ZGEMM_L1x4_K32 - b ZGEMM_L1x4_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L1x4_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x4_SUB2_8 - bl ZGEMM_1x4_L16_SUB - MY_ALIGN - - -ZGEMM_L1x4_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L1x4_SUB2_4 - bl ZGEMM_1x4_L8_SUB - MY_ALIGN - - -ZGEMM_L1x4_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x4_SUB2_2 - LOAD1x4_2 - KERNEL1x4_L2 128,32, 0,0 - KERNEL1x4_E2 128,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x4_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x4_SUB2_1 - LOAD1x4_2 - KERNEL1x4_E2 128,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x4_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x4_SAVE - KERNEL1x4 - - -ZGEMM_L1x4_SAVE: -/*----------------------------------------*/ - SAVE1x4 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,4,1 -#endif - - -ZGEMM_L1x4_END: -/*----------------------------------------*/ - - -ZGEMM_L1x2_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 2 - ble ZGEMM_L1x2_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,2,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,2,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO1x2 - ble ZGEMM_L1x2_SUB0 - bl ZGEMM_1x2_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L1x2_SAVE - b ZGEMM_L1x2_SUB2 - - -ZGEMM_L1x2_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x2_32K - addi BO,BO,-16 - addi AO,AO,-32 - LOAD1x2O 32,16 - END1x2_WITHOUT_ADD - LOAD1x2_2O 64, 32 - mtctr T8 - bl ZGEMM_L1x2_K32 - b ZGEMM_L1x2_SAVE - CMP1x2_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L1x2_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-64 - LOAD1x2_2O 64,32 - bl ZGEMM_L1x2_K32 - b ZGEMM_L1x2_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L1x2_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x2_SUB2_8 - bl ZGEMM_1x2_L16_SUB - MY_ALIGN - - -ZGEMM_L1x2_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L1x2_SUB2_4 - bl ZGEMM_1x2_L8_SUB - MY_ALIGN - - -ZGEMM_L1x2_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x2_SUB2_2 - LOAD1x2_2 - KERNEL1x2_L2 64,32, 0,0 - KERNEL1x2_E2 64,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x2_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x2_SUB2_1 - LOAD1x2_2 - KERNEL1x2_E2 64,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x2_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x2_SAVE - KERNEL1x2 - - -ZGEMM_L1x2_SAVE: -/*----------------------------------------*/ - SAVE1x2 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,2,1 -#endif - - -ZGEMM_L1x2_END: -/*----------------------------------------*/ - - -ZGEMM_L1x1_BEGIN: -/*----------------------------------------*/ - andi. T1, M, 1 - ble ZGEMM_L1x1_END -#if defined(TRMMKERNEL) - REFRESH_POINTERS AO,BO,TEMP_REG,B,1,1 -#else - mr BO, B -#endif -#if defined(TRMMKERNEL) - REFRESH_TEMP_BK T6,K,TEMP_REG,1,1 - mr T1, T6 - addi T1,T1, -2 - srawi. T8, T1, 5 /**(T11-2) % 32x */ -#else - mr T1, K - addi T1,T1, -2 - srawi. T8, T1, 5 /**(K-2) % 32x */ -#endif - ZERO1x1 - ble ZGEMM_L1x1_SUB0 - bl ZGEMM_1x1_LMAIN_SUB - andi. L, T1, 31 - ble ZGEMM_L1x1_SAVE - b ZGEMM_L1x1_SUB2 - - -ZGEMM_L1x1_SUB0: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - andi. L, T6, 63 - cmpwi T6,33 -#else - andi. L, K, 63 - cmpwi K,33 -#endif - li T8,1 - bne CMP1x1_32K - addi BO,BO,-16 - addi AO,AO,-16 - LOAD1x1O 16,16 - END1x1_WITHOUT_ADD - LOAD1x1_2O 32, 32 - mtctr T8 - bl ZGEMM_L1x1_K32 - b ZGEMM_L1x1_SAVE - CMP1x1_32K: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) - cmpwi T6,32 -#else - cmpwi K,32 -#endif - bne ZGEMM_L1x1_SUB2 - MY_ALIGN - mtctr T8 - addi BO,BO,-32 - addi AO,AO,-32 - LOAD1x1_2O 32,32 - bl ZGEMM_L1x1_K32 - b ZGEMM_L1x1_SAVE - MY_ALIGN - MY_ALIGN - - -ZGEMM_L1x1_SUB2: -/*----------------------------------------*/ - andi. T1,L, 16 - ble ZGEMM_L1x1_SUB2_8 - bl ZGEMM_1x1_L16_SUB - MY_ALIGN - - -ZGEMM_L1x1_SUB2_8: -/*----------------------------------------*/ - andi. T1,L, 8 - ble ZGEMM_L1x1_SUB2_4 - bl ZGEMM_1x1_L8_SUB - MY_ALIGN - - -ZGEMM_L1x1_SUB2_4: -/*----------------------------------------*/ - andi. T1,L, 4 - ble ZGEMM_L1x1_SUB2_2 - LOAD1x1_2 - KERNEL1x1_L2 32,32, 0,0 - KERNEL1x1_E2 32,32, 1,1 - MY_ALIGN - - -ZGEMM_L1x1_SUB2_2: -/*----------------------------------------*/ - andi. T1,L, 2 - ble ZGEMM_L1x1_SUB2_1 - LOAD1x1_2 - KERNEL1x1_E2 32,32, 0,1 - MY_ALIGN - - -ZGEMM_L1x1_SUB2_1: -/*----------------------------------------*/ - andi. T1,L, 1 - ble ZGEMM_L1x1_SAVE - KERNEL1x1 - - -ZGEMM_L1x1_SAVE: -/*----------------------------------------*/ - SAVE1x1 -#if defined(TRMMKERNEL) - REFRESH_AFTER_SAVE T6,K,TEMP_REG,BO,AO,1,1 -#endif - - -ZGEMM_L1x1_END: -/*----------------------------------------*/ -#if defined(TRMMKERNEL) && !defined(LEFT) - addi TEMP_REG, TEMP_REG, 1 -#endif - - -ZGEMM_L1_END: -/*----------------------------------------*/ - \ No newline at end of file diff --git a/kernel/power/zgemm_macros_power9.S b/kernel/power/zgemm_macros_power9.S deleted file mode 100644 index 8670e9574..000000000 --- a/kernel/power/zgemm_macros_power9.S +++ /dev/null @@ -1,1825 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#define unit_size 16 -#define DISP32(ind,disp) (ind*unit_size*32+disp) -#define DISP16(ind,disp) (ind*unit_size*16+disp) -#define DISP8(ind,disp) (ind*unit_size*8+disp) -#define DISP4(ind,disp) (ind*unit_size*4+disp) -#define DISP2(ind,disp) (ind*unit_size*2+disp) -#define DISP1(ind,disp) (ind*unit_size+disp) -#define DISPX(disp) (disp) -/* HELPERS FOR SAVE */ -/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */ - - -.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET -#ifndef TRMMKERNEL - lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG) - lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG) - xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2 - xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2 -#endif -.endm -/*from 2 result {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi}*/ - - -.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2 - xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/ - xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/ -.endm -/*from 2 result {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br}*/ - - -.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2 - xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */ - xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/ -.endm -/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/ - - -.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI -#if defined(NN) || defined(NT) || defined(TN) || defined(TT) - xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(CN) || defined(CT) || defined(RN) || defined(RT) - xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#elif defined(NC) || defined(TC) || defined(NR) || defined(TR) - xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR - xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2 -#else // CC || CR || RC || RR - /*we will assume {-alpha_r,-alpha_i} for this case */ - /*i1i2-r1r2 so we will negate alpha real instead to fix sign*/ - xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1 - /*we will negate alpha image instead instead to fix sign*/ - xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI -#endif -.endm -/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */ - - -.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2 -#ifndef TRMMKERNEL - xvmsubadp \VSOUT1,\VSINII, alpha_i - xvmaddadp \VSOUT2,\VSINRR, alpha_i -#else - xvmuldp \VSOUT1,\VSINII, alpha_i - xvmuldp \VSOUT2,\VSINRR, alpha_i -#endif -.endm -/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */ - - -.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2 - xvmsubadp \VSOUT1,\VSINRR, alpha_r - xvmaddadp \VSOUT2,\VSINII, alpha_r -.endm -/* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */ - - -.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2 - xxmrghd \VSOUT1,\VSIN2,\VSIN1 - xxmrgld \VSOUT2,\VSIN2,\VSIN1 -.endm - - -.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2 - stxv \VSIN1, DISPX(\LOFFSET)(\REG) - stxv \VSIN2, DISPX(\LOFFSET+16)(\REG) -.endm - - -.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 - LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET - RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 - LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32) - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7 - LOAD_COUPLE_AS_RR_II vs24,vs25,vs18,vs19,\BASE_REG,(\LOFFSET +64) - RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9 - LOAD_COUPLE_AS_RR_II vs26,vs27,vs20,vs21,\BASE_REG,(\LOFFSET+96) - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs10,vs11 - AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 - RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs12,vs13 - AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2 - MULT_APLHA_PART1 vs2,vs4, vs14,vs15 - RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4 - MULT_APLHA_PART1 vs6,vs8,vs16,vs17 - MULT_APLHA_PART2 vs2,vs4,vs14,vs15 - AGGREGATE_REALS_IMAGES vs10,vs11,vs12,vs13 - MULT_APLHA_PART2 vs6,vs8,vs16,vs17 - AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4 - UNPACK_FOR_STORE vs14,vs15,vs7,vs9 - MULT_APLHA_PART1 vs10,vs12, vs24,vs25 - UNPACK_FOR_STORE vs16,vs17,vs3,vs5 - MULT_APLHA_PART1 \VSRes1,\VSRes3, vs26,vs27 - STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 - MULT_APLHA_PART2 vs10,vs12,vs24,vs25 - STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 - MULT_APLHA_PART2 \VSRes1,\VSRes3, vs26,vs27 - UNPACK_FOR_STORE vs24,vs25,vs10,vs12 - UNPACK_FOR_STORE vs26,vs27,\VSRes1,\VSRes3 - STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs10,vs12 - STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3 -.endm - - -.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 - LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET - RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 - LOAD_COUPLE_AS_RR_II vs16,vs17,vs20,vs21,\BASE_REG,(\LOFFSET+32) - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs6,vs7 - RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs8,vs9 - AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 - AGGREGATE_REALS_IMAGES vs6,vs7,vs8,vs9 - MULT_APLHA_PART1 vs2,vs4, vs14,vs15 - MULT_APLHA_PART1 vs6,vs8, vs16,vs17 - MULT_APLHA_PART2 vs2,vs4, vs14,vs15 - MULT_APLHA_PART2 vs6,vs8,vs16,vs17 - UNPACK_FOR_STORE vs14,vs15,vs7,vs9 - UNPACK_FOR_STORE vs16,vs17,vs3,vs5 - STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 - STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs3,vs5 -.endm - - - -.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs2,vs3 - LOAD_COUPLE_AS_RR_II vs14,vs15,vs18,vs19,\BASE_REG,\LOFFSET - RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs4,vs5 - AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 - MULT_APLHA_PART1 vs2,vs4, vs14,vs15 - MULT_APLHA_PART2 vs2,vs4, vs14,vs15 - UNPACK_FOR_STORE vs14,vs15,vs7,vs9 - STORE_COUPLE \BASE_REG,\LOFFSET,vs7,vs9 -.endm - - - -.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET - RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs2,vs3 -#ifndef TRMMKERNEL - lxv vs18, (\LOFFSET)(\BASE_REG) - xxmrgld vs14,vs18,vs18 - xxmrghd vs15,vs18,vs18 -#endif - RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs4,vs5 - AGGREGATE_REALS_IMAGES vs2,vs3,vs4,vs5 - MULT_APLHA_PART1 vs2,vs4, vs14,vs15 - MULT_APLHA_PART2 vs2,vs4, vs14,vs15 - UNPACK_FOR_STORE vs14,vs15,vs7,vs9 - xxmrghd vs7,vs15,vs14 - stxv vs7, (\LOFFSET)(\BASE_REG) -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=8 -**********************************************************************************************/ - -.macro Zero2x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 - xxlxor vs49, vs49, vs49 - xxlxor vs50, vs50, vs50 - xxlxor vs51, vs51, vs51 - xxlxor vs52, vs52, vs52 - xxlxor vs53, vs53, vs53 - xxlxor vs54, vs54, vs54 - xxlxor vs55, vs55, vs55 - xxlxor vs56, vs56, vs56 - xxlxor vs57, vs57, vs57 - xxlxor vs58, vs58, vs58 - xxlxor vs59, vs59, vs59 - xxlxor vs60, vs60, vs60 - xxlxor vs61, vs61, vs61 - xxlxor vs62, vs62, vs62 - xxlxor vs63, vs63, vs63 -.endm - - -.macro LOAD2x8 - LOAD2x8O 0,0 -.endm - - -.macro LOAD2x8O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END2x8_NORMAL - END2x8 AO,BO,128,32 -.endm - - -.macro END2x8_WITHOUT_ADD - END2x8 AO,BO,0,0 -.endm - - -.macro END2x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs48, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs49, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs50, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs51, vs1, vs19 - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs52, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs53, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs54, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs55, vs3, vs19 - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs56, vs4, vs18 - xvmaddadp vs41, vs4, vs17 - xvmaddadp vs57, vs4, vs19 - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs58, vs5, vs18 - xvmaddadp vs43, vs5, vs17 - xvmaddadp vs59, vs5, vs19 - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs60, vs6, vs18 - xvmaddadp vs45, vs6, vs17 - xvmaddadp vs61, vs6, vs19 - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs62, vs7, vs18 - xvmaddadp vs47, vs7, vs17 - xvmaddadp vs63, vs7, vs19 -.endm - - -.macro LOAD2x8_2 - LOAD2x8_2O 0,0 -.endm - - -.macro LOAD2x8_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A - lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A - lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A - lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A - lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A - lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A - lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A - lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x8_2 - /*for load2 offset will be 256 and 64*/ - KERNEL2x8_2 AO,BO, 256,64,0 ,1,1 -.endm - - - -.macro KERNEL2x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs48, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs49, vs0, vs19 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs50, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs51, vs1, vs19 -.if \Complete==0 - lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs52, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs53, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs54, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs55, vs3, vs19 -.if \Complete==0 - lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs56, vs4, vs18 - xvmaddadp vs41, vs4, vs17 - xvmaddadp vs57, vs4, vs19 - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs58, vs5, vs18 - xvmaddadp vs43, vs5, vs17 - xvmaddadp vs59, vs5, vs19 -.if \Complete==0 - lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs60, vs6, vs18 - xvmaddadp vs45, vs6, vs17 - xvmaddadp vs61, vs6, vs19 - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs62, vs7, vs18 - xvmaddadp vs47, vs7, vs17 - xvmaddadp vs63, vs7, vs19 -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs48, vs8, vs22 -.if \Complete==0 - lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs49, vs8, vs23 -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs50, vs9, vs22 - xvmaddadp vs35, vs9, vs21 - xvmaddadp vs51, vs9, vs23 -.if \Complete==0 - lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs52, vs10, vs22 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs53, vs10, vs23 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs54, vs11, vs22 - xvmaddadp vs39, vs11, vs21 - xvmaddadp vs55, vs11, vs23 -.if \Complete==0 - lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs12, vs20 - xvmaddadp vs56, vs12, vs22 - xvmaddadp vs41, vs12, vs21 - xvmaddadp vs57, vs12, vs23 - xvmaddadp vs42, vs13, vs20 - xvmaddadp vs58, vs13, vs22 - xvmaddadp vs43, vs13, vs21 - xvmaddadp vs59, vs13, vs23 -.if \Complete==0 - lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs14, vs20 - xvmaddadp vs60, vs14, vs22 - xvmaddadp vs45, vs14, vs21 - xvmaddadp vs61, vs14, vs23 - xvmaddadp vs46, vs15, vs20 - xvmaddadp vs62, vs15, vs22 - xvmaddadp vs47, vs15, vs21 - xvmaddadp vs63, vs15, vs23 -.if \Complete==0 - lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - - - -.macro KERNEL2x8 - LOAD2x8 - END2x8 AO, BO, 128,32 -.endm - - -.macro SAVE2x8 - add T1, CO ,LDC - SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 - SAVE8 vs48,vs49,vs50,vs51,vs52,vs53,vs54,vs55,vs56,vs57,vs58,vs59,vs60,vs61,vs62,vs63,T1,0 - addi CO, CO, 128 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=4 -**********************************************************************************************/ - - -.macro Zero2x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 -.endm - - -.macro LOAD2x4 - LOAD2x4O 0,0 -.endm - - -.macro LOAD2x4O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x4_NORMAL - END2x4 AO,BO,64,32 -.endm - - -.macro END2x4_WITHOUT_ADD - END2x4 AO,BO,0,0 -.endm - - -.macro END2x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs40, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs41, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs42, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs43, vs1, vs19 - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs44, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs45, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs46, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs47, vs3, vs19 - -.endm - - -.macro LOAD2x4_2 - LOAD2x4_2O 0,0 -.endm - - -.macro LOAD2x4_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs8, (64+\OffsetA)(AO) // load real,imag from A - lxv vs9, (80+\OffsetA)(AO) // load real,imag from A - lxv vs10, (96+\OffsetA)(AO) // load real,imag from A - lxv vs11, (112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x4_2 - /*for load2 offset will be 128 and 64*/ - KERNEL2x4_2 AO,BO, 128,64,0 ,1,1 -.endm - - - -.macro KERNEL2x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs40, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs41, vs0, vs19 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs42, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs43, vs1, vs19 -.if \Complete==0 - lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs44, vs2, vs18 - xvmaddadp vs37, vs2, vs17 - xvmaddadp vs45, vs2, vs19 - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs46, vs3, vs18 - xvmaddadp vs39, vs3, vs17 - xvmaddadp vs47, vs3, vs19 -.if \Complete==0 - lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs40, vs8, vs22 - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs41, vs8, vs23 -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs42, vs9, vs22 - xvmaddadp vs35, vs9, vs21 - xvmaddadp vs43, vs9, vs23 -.if \Complete==0 - lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs44, vs10, vs22 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs45, vs10, vs23 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs46, vs11, vs22 - xvmaddadp vs39, vs11, vs21 - xvmaddadp vs47, vs11, vs23 -.if \Complete==0 - lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - -.macro KERNEL2x4 - LOAD2x4 - END2x4 AO, BO, 64,32 -.endm - - - -.macro SAVE2x4 - add T1, CO ,LDC - SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 - SAVE4 vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,T1,0 - addi CO, CO, 64 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=2 -**********************************************************************************************/ - - -.macro Zero2x2 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - -.endm - - -.macro LOAD2x2 - LOAD2x2O 0,0 -.endm - - -.macro LOAD2x2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END2x2_NORMAL - END2x2 AO,BO,32,32 -.endm - - -.macro END2x2_WITHOUT_ADD - END2x2 AO,BO,0,0 -.endm - - -.macro END2x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs36, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs37, vs0, vs19 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs38, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs39, vs1, vs19 - -.endm - - -.macro LOAD2x2_2 - LOAD2x2_2O 0,0 -.endm - - -.macro LOAD2x2_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs8, (32+\OffsetA)(AO) // load real,imag from A - lxv vs9, (48+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END2x2_2 - /*for load2 offset will be 64 and 64*/ - KERNEL2x2_2 AO,BO, 64,64,0 ,1,1 -.endm - - - -.macro KERNEL2x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs36, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs37, vs0, vs19 - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs38, vs1, vs18 - xvmaddadp vs35, vs1, vs17 - xvmaddadp vs39, vs1, vs19 -.if \Complete==0 - lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs36, vs8, vs22 - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs37, vs8, vs23 -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs38, vs9, vs22 - xvmaddadp vs35, vs9, vs21 - xvmaddadp vs39, vs9, vs23 -.if \Complete==0 - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \Complete==0 - lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - -.macro KERNEL2x2 - LOAD2x2 - END2x2 AO, BO, 32,32 -.endm - - - -.macro SAVE2x2 - add T1, CO ,LDC - SAVE2 vs32,vs33,vs34,vs35,CO,0 - SAVE2 vs36,vs37,vs38,vs39,T1,0 - addi CO, CO, 32 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=1 -**********************************************************************************************/ - - - -.macro Zero2x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - -.endm - - -.macro LOAD2x1 - LOAD2x1O 0,0 -.endm - - -.macro LOAD2x1O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x1_NORMAL - END2x1 AO,BO,16,32 -.endm - - -.macro END2x1_WITHOUT_ADD - END2x1 AO,BO,0,0 -.endm - - -.macro END2x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs34, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs35, vs0, vs19 -.endm - - -.macro LOAD2x1_2 - LOAD2x1_2O 0,0 -.endm - - -.macro LOAD2x1_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs18, (\OffsetB+16)(BO) // load real,imag from B - lxv vs20, (\OffsetB+32)(BO) // load real,imag from B - lxv vs22, (\OffsetB+48)(BO) // load real,imag from B - xxswapd vs17, vs16 - xxswapd vs19, vs18 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs8, (16+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END2x1_2 - /*for load2 offset will be 32 and 64*/ - KERNEL2x1_2 AO,BO, 32,64,0 ,1,1 -.endm - - - -.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xxswapd vs21, vs20 - xxswapd vs23, vs22 - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs34, vs0, vs18 - xvmaddadp vs33, vs0, vs17 - xvmaddadp vs35, vs0, vs19 -.if \Complete==0 - lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B - lxv vs18, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \Complete==0 - xxswapd vs17, vs16 - xxswapd vs19, vs18 -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs34, vs8, vs22 - xvmaddadp vs33, vs8, vs21 - xvmaddadp vs35, vs8, vs23 -.if \Complete==0 - lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B - lxv vs22, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP4(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,32) - addi \BREG, \BREG, DISP4(\Index,64) -.endif -.endif -.endm - - - -.macro KERNEL2x1 - LOAD2x1 - END2x1 AO, BO, 16,32 -.endm - - - -.macro SAVE2x1 - add T1, CO ,LDC - SAVE1 vs32,vs33,CO,0 - SAVE1 vs34,vs35,T1,0 - addi CO, CO, 16 -.endm - -/********************************************************************************************** -* - -.macros for N=1 and M=8 -**********************************************************************************************/ - - -.macro Zero1x8 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 - xxlxor vs40, vs40, vs40 - xxlxor vs41, vs41, vs41 - xxlxor vs42, vs42, vs42 - xxlxor vs43, vs43, vs43 - xxlxor vs44, vs44, vs44 - xxlxor vs45, vs45, vs45 - xxlxor vs46, vs46, vs46 - xxlxor vs47, vs47, vs47 - xxlxor vs48, vs48, vs48 -.endm - - -.macro LOAD1x8 - LOAD1x8O 0,0 -.endm - - -.macro LOAD1x8O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - xxswapd vs17, vs16 - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END1x8_NORMAL - END1x8 AO,BO,128,16 -.endm - - -.macro END1x8_WITHOUT_ADD - END1x8 AO,BO,0,0 -.endm - - -.macro END1x8 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 - - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs41, vs4, vs17 - - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs43, vs5, vs17 - - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs45, vs6, vs17 - - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs47, vs7, vs17 - -.endm - - -.macro LOAD1x8_2 - LOAD1x8_2O 0,0 -.endm - - -.macro LOAD1x8_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs4, (64+\OffsetA)(AO) // load real,imag from A - lxv vs5, (80+\OffsetA)(AO) // load real,imag from A - lxv vs6, (96+\OffsetA)(AO) // load real,imag from A - lxv vs7, (112+\OffsetA)(AO) // load real,imag from A - lxv vs8, (128+0+\OffsetA)(AO) // load real,imag from A - lxv vs9, (128+16+\OffsetA)(AO) // load real,imag from A - lxv vs10, (128+32+\OffsetA)(AO) // load real,imag from A - lxv vs11, (128+48+\OffsetA)(AO) // load real,imag from A - lxv vs12, (128+64+\OffsetA)(AO) // load real,imag from A - lxv vs13, (128+80+\OffsetA)(AO) // load real,imag from A - lxv vs14, (128+96+\OffsetA)(AO) // load real,imag from A - lxv vs15, (128+112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x8_2 - /*for load2 offset will be 256 and 32*/ - KERNEL1x8_2 AO,BO, 256,32,0 ,1,1 -.endm - - - -.macro KERNEL1x8_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x8_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x8_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x8_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 -.if \Complete==0 - lxv vs0, DISP16(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP16(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 -.if \Complete==0 - lxv vs2, DISP16(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP16(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs4, vs16 - xvmaddadp vs41, vs4, vs17 - - xvmaddadp vs42, vs5, vs16 - xvmaddadp vs43, vs5, vs17 -.if \Complete==0 - lxv vs4, DISP16(\Index,64+ \OffsetA)(\AREG) // load real,imag from A - lxv vs5, DISP16(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs6, vs16 - xvmaddadp vs45, vs6, vs17 - - xvmaddadp vs46, vs7, vs16 - xvmaddadp vs47, vs7, vs17 -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - lxv vs6, DISP16(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs7, DISP16(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 -.if \Complete==0 - lxv vs8, DISP16(\Index,128+ + \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP16(\Index,128+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs39, vs11, vs21 -.if \Complete==0 - lxv vs10, DISP16(\Index,128+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP16(\Index,128+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs40, vs12, vs20 - xvmaddadp vs41, vs12, vs21 - xvmaddadp vs42, vs13, vs20 - xvmaddadp vs43, vs13, vs21 -.if \Complete==0 - lxv vs12, DISP16(\Index, 192 + \OffsetA)(\AREG) // load real,imag from A - lxv vs13, DISP16(\Index,192 +16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs44, vs14, vs20 - xvmaddadp vs45, vs14, vs21 - xvmaddadp vs46, vs15, vs20 - xvmaddadp vs47, vs15, vs21 -.if \Complete==0 - lxv vs14, DISP16(\Index,192 +32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs15, DISP16(\Index,192 +48 + \OffsetA)(\AREG) // load real,imag from A - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP16(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP16(\Index,256) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - - - -.macro KERNEL1x8 - LOAD1x8 - END1x8 AO, BO, 128,16 -.endm - - -.macro SAVE1x8 - SAVE8 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,vs40,vs41,vs42,vs43,vs44,vs45,vs46,vs47,CO,0 - addi CO, CO, 128 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=4 -**********************************************************************************************/ - - -.macro Zero1x4 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - xxlxor vs36, vs36, vs36 - xxlxor vs37, vs37, vs37 - xxlxor vs38, vs38, vs38 - xxlxor vs39, vs39, vs39 -.endm - - -.macro LOAD1x4 - LOAD1x4O 0,0 -.endm - - -.macro LOAD1x4O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END1x4_NORMAL - END1x4 AO,BO,64,16 -.endm - - -.macro END1x4_WITHOUT_ADD - END1x4 AO,BO,0,0 -.endm - - -.macro END1x4 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 - -.endm - - -.macro LOAD1x4_2 - LOAD1x4_2O 0,0 -.endm - - -.macro LOAD1x4_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs2, (32+\OffsetA)(AO) // load real,imag from A - lxv vs3, (48+\OffsetA)(AO) // load real,imag from A - lxv vs8, (64+\OffsetA)(AO) // load real,imag from A - lxv vs9, (80+\OffsetA)(AO) // load real,imag from A - lxv vs10, (96+\OffsetA)(AO) // load real,imag from A - lxv vs11, (112+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x4_2 - /*for load2 offset will be 128 and 32*/ - KERNEL1x4_2 AO,BO, 128,32,0 ,1,1 -.endm - - - -.macro KERNEL1x4_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x4_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x4_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x4_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 -.if \Complete==0 - lxv vs0, DISP8(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP8(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs2, vs16 - xvmaddadp vs37, vs2, vs17 - - xvmaddadp vs38, vs3, vs16 - xvmaddadp vs39, vs3, vs17 -.if \Complete==0 - lxv vs2, DISP8(\Index,32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs3, DISP8(\Index,48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 -.if \Complete==0 - lxv vs8, DISP8(\Index,64+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP8(\Index,64+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - xvmaddadp vs36, vs10, vs20 - xvmaddadp vs37, vs10, vs21 - xvmaddadp vs38, vs11, vs20 - xvmaddadp vs39, vs11, vs21 -.if \Complete==0 - lxv vs10, DISP8(\Index,64+32 + \OffsetA)(\AREG) // load real,imag from A - lxv vs11, DISP8(\Index,64+48 + \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP8(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP8(\Index,128) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - -.macro KERNEL1x4 - LOAD1x4 - END1x4 AO, BO, 64,16 -.endm - - - -.macro SAVE1x4 - SAVE4 vs32,vs33,vs34,vs35,vs36,vs37,vs38,vs39,CO,0 - addi CO, CO, 64 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=2 -**********************************************************************************************/ - - -.macro Zero1x2 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 - xxlxor vs34, vs34, vs34 - xxlxor vs35, vs35, vs35 - -.endm - - -.macro LOAD1x2 - LOAD1x2O 0,0 -.endm - - -.macro LOAD1x2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - -.endm - - -.macro END1x2_NORMAL - END1x2 AO,BO,32,16 -.endm - - -.macro END1x2_WITHOUT_ADD - END1x2 AO,BO,0,0 -.endm - - -.macro END1x2 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 - -.endm - - -.macro LOAD1x2_2 - LOAD1x2_2O 0,0 -.endm - - -.macro LOAD1x2_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs1, (16+\OffsetA)(AO) // load real,imag from A - lxv vs8, (32+\OffsetA)(AO) // load real,imag from A - lxv vs9, (48+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x2_2 - /*for load2 offset will be 64 and 32*/ - KERNEL1x2_2 AO,BO, 64,32,0 ,1,1 -.endm - - - -.macro KERNEL1x2_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x2_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x2_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x2_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 - xxswapd vs21, vs20 - xvmaddadp vs34, vs1, vs16 - xvmaddadp vs35, vs1, vs17 -.if \Complete==0 - lxv vs0, DISP4(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A - lxv vs1, DISP4(\Index,16 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs34, vs9, vs20 - xvmaddadp vs35, vs9, vs21 -.if \Complete==0 - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \Complete==0 - lxv vs8, DISP4(\Index,32+0+ \OffsetA)(\AREG) // load real,imag from A - lxv vs9, DISP4(\Index,32+16 + \OffsetA)(\AREG) // load real,imag from A -.endif - - - -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP4(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP4(\Index,64) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - -.macro KERNEL1x2 - LOAD1x2 - END1x2 AO, BO, 32,16 -.endm - - - -.macro SAVE1x2 - SAVE2 vs32,vs33,vs34,vs35,CO,0 - addi CO, CO, 32 -.endm -/********************************************************************************************** -* - -.macros for N=2 and M=1 -**********************************************************************************************/ - - - -.macro Zero1x1 - xxlxor vs32, vs32, vs32 - xxlxor vs33, vs33, vs33 -.endm - - -.macro LOAD1x1 - LOAD1x1O 0,0 -.endm - - -.macro LOAD1x1O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - xxswapd vs17, vs16 - -.endm - - -.macro END1x1_NORMAL - END1x1 AO,BO,16,16 -.endm - - -.macro END1x1_WITHOUT_ADD - END1x1 AO,BO,0,0 -.endm - - -.macro END1x1 AREG, BREG, OffsetA, OffsetB -.if \OffsetB != 0 - addi \BREG, \BREG, \OffsetB -.endif -.if \OffsetA != 0 - addi \AREG, \AREG, \OffsetA -.endif - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 -.endm - - -.macro LOAD1x1_2 - LOAD1x1_2O 0,0 -.endm - - -.macro LOAD1x1_2O OffsetA,OffsetB - lxv vs16,(\OffsetB+ 0)(BO) // load real imag from B - lxv vs20, (\OffsetB+16)(BO) // load real,imag from B - xxswapd vs17, vs16 - - lxv vs0, (0+\OffsetA)(AO) // load real,imag from A - lxv vs8, (16+\OffsetA)(AO) // load real,imag from A -.endm - - -.macro END1x1_2 - /*for load2 offset will be 32 and 32*/ - KERNEL1x1_2 AO,BO, 32,32,0 ,1,1 -.endm - - - -.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1 -.endm - - -.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast - KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0 -.endm - - -.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete - xxswapd vs21, vs20 - xvmaddadp vs32, vs0, vs16 - xvmaddadp vs33, vs0, vs17 -.if \Complete==0 - lxv vs0, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A -.endif -.if \Complete==0 - lxv vs16, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B -.endif -.if \Complete==0 - xxswapd vs17, vs16 -.endif - xvmaddadp vs32, vs8, vs20 - xvmaddadp vs33, vs8, vs21 -.if \Complete==0 - lxv vs8, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A -.endif - -.if \Complete==0 - lxv vs20, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B -.endif -.if \IsLast==1 -.if \Complete==1 - addi \AREG, \AREG, DISP2(\Index,\OffsetA) - addi \BREG, \BREG, DISP2(\Index,\OffsetB) -.else - addi \AREG, \AREG, DISP2(\Index,32) - addi \BREG, \BREG, DISP2(\Index,32) -.endif -.endif -.endm - - - -.macro KERNEL1x1 - LOAD1x1 - END1x1 AO, BO, 16,16 -.endm - - - -.macro SAVE1x1 - SAVE1 vs32,vs33,CO,0 - addi CO, CO, 16 -.endm - -/****************************TRMM POINTER REFRESH - -.macroSES*************************/ - - -.macro SHIFT_REG REG1,REG2,SHIFT_VAL - .if \SHIFT_VAL==16 - slwi \REG1, \REG2, 8 - .elseif \SHIFT_VAL==8 - slwi \REG1, \REG2, 7 - .elseif \SHIFT_VAL==4 - slwi \REG1, \REG2, 6 - .elseif \SHIFT_VAL==2 - slwi \REG1, \REG2, 5 - .elseif \SHIFT_VAL==1 - slwi \REG1, \REG2, 4 - .endif -.endm -/* -//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// ptrbb = bb; -// #else -// ptrba += off*16; -// ptrbb = bb + off*2; -// #endif -*/ - - -.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B - #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /* ptrbb = bb;*/ - mr \PTR_B,\B_VAL /* refresh BPOINT */ - #else - /* - // ptrba =ptrba+ off*C_A; - // ptrbb = bb + off*C_B; - */ - SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */ - SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */ - add \PTR_B, \B_VAL , T4 /* Add values to BO */ - add \PTR_A, \PTR_A, T2 /* Add values to AO */ - #endif -.endm - -/* -// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) -// temp = bk-off; -// #elif defined(LEFT) -// temp = off+16; // number of values in A -// #else -// temp = off+2; // number of values in B -// #endif -*/ - - -.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B - #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - /* temp = bk-off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #elif defined(LEFT) - /* temp = off+INCR_A; // number of values in A */ - addi \TEMP_BK, \OFF_VAL, \INCR_A - #else - /* temp = off+INCR_B // number of values in B*/ - addi \TEMP_BK,\OFF_VAL, \INCR_B - #endif -.endm -/* -// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) -// temp = bk - off; -// #ifdef LEFT -// temp -= 16; // number of values in A -// #else -// temp -= 2; // number of values in B -// #endif -// ptrba += temp*16; -// ptrbb += temp*2; -// #endif -// #ifdef LEFT -// off += 16; // number of values in A -// #endif -*/ - - - -.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B - #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - /*temp = bk - off;*/ - sub \TEMP_BK,\BK_VAL,\OFF_VAL - #ifdef LEFT - /*temp -= 8; // number of values in A*/ - addi \TEMP_BK,\TEMP_BK,-\C_A - #else - /*temp -= 4; // number of values in B*/ - addi \TEMP_BK,\TEMP_BK,-\C_B - #endif - /*ptrba += temp*C_A; - ptrbb += temp*C_B;*/ - SHIFT_REG T4,\TEMP_BK,\C_A - SHIFT_REG T2,\TEMP_BK,\C_B - add \PTR_A, \PTR_A,T4/*ptrba+temp*C_A*/ - add \PTR_B, \PTR_B,T2 - #endif - #ifdef LEFT - /*off += 8; // number of values in A*/ - addi \OFF_VAL,\OFF_VAL,\C_A - #endif -.endm \ No newline at end of file diff --git a/kernel/power/zgemv_n.S b/kernel/power/zgemv_n.S index 708f1318d..f93439986 100644 --- a/kernel/power/zgemv_n.S +++ b/kernel/power/zgemv_n.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define M r3 #define N r4 @@ -250,7 +250,7 @@ stw r22, 176(SP) #endif -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/zgemv_n_4.c b/kernel/power/zgemv_n_4.c index 167b0a158..8b250a7f1 100644 --- a/kernel/power/zgemv_n_4.c +++ b/kernel/power/zgemv_n_4.c @@ -389,14 +389,20 @@ static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { register __vector double va0_2 = vptr_a0[i + 2]; register __vector double va0_3 = vptr_a0[i + 3]; - register __vector double va0x = vec_xxpermdi(va0, va0, 2); - register __vector double va0x_1 = vec_xxpermdi(va0_1, va0_1, 2); - register __vector double va0x_2 = vec_xxpermdi(va0_2, va0_2, 2); - register __vector double va0x_3 = vec_xxpermdi(va0_3, va0_3, 2); - vy_0 += va0*vx0_r + va0x*vx0_i; - vy_1 += va0_1*vx0_r + va0x_1*vx0_i; - vy_2 += va0_2*vx0_r + va0x_2*vx0_i; - vy_3 += va0_3*vx0_r + va0x_3*vx0_i; + vy_0 += va0*vx0_r; + vy_1 += va0_1*vx0_r; + vy_2 += va0_2*vx0_r; + vy_3 += va0_3*vx0_r; + + va0 = vec_xxpermdi(va0, va0, 2); + va0_1 = vec_xxpermdi(va0_1, va0_1, 2); + va0_2 = vec_xxpermdi(va0_2, va0_2, 2); + va0_3 = vec_xxpermdi(va0_3, va0_3, 2); + + vy_0 += va0*vx0_i; + vy_1 += va0_1*vx0_i; + vy_2 += va0_2*vx0_i; + vy_3 += va0_3*vx0_i; vy[i] = vy_0; vy[i + 1] = vy_1; diff --git a/kernel/power/zgemv_n_ppc440.S b/kernel/power/zgemv_n_ppc440.S index bd1148b65..55dd2d84f 100644 --- a/kernel/power/zgemv_n_ppc440.S +++ b/kernel/power/zgemv_n_ppc440.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define M r3 #define N r4 @@ -223,7 +223,7 @@ stw r22, 176(SP) #endif -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/zgemv_t.S b/kernel/power/zgemv_t.S index d82fab16a..9c6f510c2 100644 --- a/kernel/power/zgemv_t.S +++ b/kernel/power/zgemv_t.S @@ -47,7 +47,7 @@ #define STACKSIZE 304 #endif -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define M r3 #define N r4 @@ -226,7 +226,7 @@ stw r0, 4 + FZERO #endif -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/zgemv_t_4.c b/kernel/power/zgemv_t_4.c index 20a0812dd..572206494 100644 --- a/kernel/power/zgemv_t_4.c +++ b/kernel/power/zgemv_t_4.c @@ -59,7 +59,11 @@ static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA i = 0; n = n << 1; while (i < n) { - +// __builtin_prefetch(&x[i]); +// __builtin_prefetch(&a0[i]); +// __builtin_prefetch(&a1[i]); +// __builtin_prefetch(&a2[i]); +// __builtin_prefetch(&a3[i]); register __vector double vx_0 = *(__vector double*) (&x[i]); register __vector double vx_1 = *(__vector double*) (&x[i + 2]); register __vector double vx_2 = *(__vector double*) (&x[i + 4]); diff --git a/kernel/power/zgemv_t_ppc440.S b/kernel/power/zgemv_t_ppc440.S index d7f3ee027..bfc039a0c 100644 --- a/kernel/power/zgemv_t_ppc440.S +++ b/kernel/power/zgemv_t_ppc440.S @@ -47,7 +47,7 @@ #define STACKSIZE 304 #endif -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define M r3 #define N r4 @@ -179,7 +179,7 @@ stw r0, 4 + FZERO #endif -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ lwz INCY, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/zger.S b/kernel/power/zger.S index 73757d448..a9a607815 100644 --- a/kernel/power/zger.S +++ b/kernel/power/zger.S @@ -47,7 +47,7 @@ #endif #endif -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define M r3 #define N r4 @@ -235,7 +235,7 @@ stw r27, 196(SP) #endif -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ lwz LDA, FRAMESLOT(0) + STACKSIZE(SP) lwz BUFFER, FRAMESLOT(1) + STACKSIZE(SP) diff --git a/kernel/power/zscal.S b/kernel/power/zscal.S index ae68ee672..2eb7b0df3 100644 --- a/kernel/power/zscal.S +++ b/kernel/power/zscal.S @@ -43,7 +43,7 @@ #define XX r4 #define PREA r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define X r6 #define INCX r7 diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c index a1b441d2c..14d677f24 100644 --- a/kernel/power/zscal.c +++ b/kernel/power/zscal.c @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #pragma GCC optimize "O1" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) #if defined(DOUBLE) #include "zscal_microk_power8.c" #endif diff --git a/kernel/power/zscal_ppc440.S b/kernel/power/zscal_ppc440.S index 55dd1b87b..d0e4c9bcf 100644 --- a/kernel/power/zscal_ppc440.S +++ b/kernel/power/zscal_ppc440.S @@ -43,7 +43,7 @@ #define XX r4 #define PRE r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define X r6 #define INCX r7 diff --git a/kernel/power/zsum.S b/kernel/power/zsum.S deleted file mode 100644 index 8396012e8..000000000 --- a/kernel/power/zsum.S +++ /dev/null @@ -1,452 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define N r3 -#define X r4 -#define INCX r5 - -#define INCXM1 r9 -#define PREA r8 - -#define FZERO f0 - -#define STACKSIZE 160 - - PROLOGUE - PROFCODE - - addi SP, SP, -STACKSIZE - li r0, 0 - - stfd f14, 0(SP) - stfd f15, 8(SP) - stfd f16, 16(SP) - stfd f17, 24(SP) - - stfd f18, 32(SP) - stfd f19, 40(SP) - stfd f20, 48(SP) - stfd f21, 56(SP) - - stfd f22, 64(SP) - stfd f23, 72(SP) - stfd f24, 80(SP) - stfd f25, 88(SP) - - stfd f26, 96(SP) - stfd f27, 104(SP) - stfd f28, 112(SP) - stfd f29, 120(SP) - - stfd f30, 128(SP) - stfd f31, 136(SP) - - stw r0, 144(SP) - lfs FZERO,144(SP) - -#ifdef F_INTERFACE - LDINT N, 0(N) - LDINT INCX, 0(INCX) -#endif - - slwi INCX, INCX, ZBASE_SHIFT - subi INCXM1, INCX, SIZE - - fmr f1, FZERO - fmr f2, FZERO - fmr f3, FZERO - fmr f4, FZERO - fmr f5, FZERO - fmr f6, FZERO - fmr f7, FZERO - - li PREA, L1_PREFETCHSIZE - - cmpwi cr0, N, 0 - ble- LL(999) - - cmpwi cr0, INCX, 0 - ble- LL(999) - - cmpwi cr0, INCX, 2 * SIZE - bne- cr0, LL(100) - - srawi. r0, N, 3 - mtspr CTR, r0 - beq- cr0, LL(50) - .align 4 - - LFD f8, 0 * SIZE(X) - LFD f9, 1 * SIZE(X) - LFD f10, 2 * SIZE(X) - LFD f11, 3 * SIZE(X) - LFD f12, 4 * SIZE(X) - LFD f13, 5 * SIZE(X) - LFD f14, 6 * SIZE(X) - LFD f15, 7 * SIZE(X) - - LFD f24, 8 * SIZE(X) - LFD f25, 9 * SIZE(X) - LFD f26, 10 * SIZE(X) - LFD f27, 11 * SIZE(X) - LFD f28, 12 * SIZE(X) - LFD f29, 13 * SIZE(X) - LFD f30, 14 * SIZE(X) - LFD f31, 15 * SIZE(X) - - fmr f16, f8 - fmr f17, f9 - fmr f18, f10 - fmr f19, f11 - - fmr f20, f12 - fmr f21, f13 - fmr f22, f14 - fmr f23, f15 - bdz LL(20) - .align 4 - -LL(10): - FADD f0, f0, f16 - fmr f16, f24 - FADD f1, f1, f17 - fmr f17, f25 - - FADD f2, f2, f18 - fmr f18, f26 - FADD f3, f3, f19 - fmr f19, f27 - - LFD f8, 16 * SIZE(X) - LFD f9, 17 * SIZE(X) - LFD f10, 18 * SIZE(X) - LFD f11, 19 * SIZE(X) - - FADD f4, f4, f20 - fmr f20, f28 - FADD f5, f5, f21 - fmr f21, f29 - - FADD f6, f6, f22 - fmr f22, f30 - FADD f7, f7, f23 - fmr f23, f31 - - LFD f12, 20 * SIZE(X) - LFD f13, 21 * SIZE(X) - LFD f14, 22 * SIZE(X) - LFD f15, 23 * SIZE(X) - - FADD f0, f0, f16 - fmr f16, f8 - FADD f1, f1, f17 - fmr f17, f9 - - FADD f2, f2, f18 - fmr f18, f10 - FADD f3, f3, f19 - fmr f19, f11 - - LFD f24, 24 * SIZE(X) - LFD f25, 25 * SIZE(X) - LFD f26, 26 * SIZE(X) - LFD f27, 27 * SIZE(X) - - FADD f4, f4, f20 - fmr f20, f12 - FADD f5, f5, f21 - fmr f21, f13 - - FADD f6, f6, f22 - fmr f22, f14 - FADD f7, f7, f23 - fmr f23, f15 - - LFD f28, 28 * SIZE(X) - LFD f29, 29 * SIZE(X) - LFD f30, 30 * SIZE(X) - LFD f31, 31 * SIZE(X) - -#ifndef POWER6 - L1_PREFETCH X, PREA -#endif - addi X, X, 16 * SIZE -#ifdef POWER6 - L1_PREFETCH X, PREA -#endif - - bdnz LL(10) - .align 4 - -LL(20): - FADD f0, f0, f16 - fmr f16, f24 - FADD f1, f1, f17 - fmr f17, f25 - - FADD f2, f2, f18 - fmr f18, f26 - FADD f3, f3, f19 - fmr f19, f27 - - FADD f4, f4, f20 - fmr f20, f28 - FADD f5, f5, f21 - fmr f21, f29 - - FADD f6, f6, f22 - fmr f22, f30 - FADD f7, f7, f23 - fmr f23, f31 - - FADD f0, f0, f16 - FADD f1, f1, f17 - FADD f2, f2, f18 - FADD f3, f3, f19 - - FADD f4, f4, f20 - FADD f5, f5, f21 - FADD f6, f6, f22 - FADD f7, f7, f23 - addi X, X, 16 * SIZE - .align 4 - -LL(50): - andi. r0, N, 7 - mtspr CTR, r0 - beq LL(999) - .align 4 - -LL(60): - LFD f8, 0 * SIZE(X) - LFD f9, 1 * SIZE(X) - addi X, X, 2 * SIZE - - FADD f0, f0, f8 - FADD f1, f1, f9 - - bdnz LL(60) - b LL(999) - .align 4 - -LL(100): - sub X, X, INCXM1 - - srawi. r0, N, 3 - mtspr CTR, r0 - beq- LL(150) - - LFDX f8, X, INCXM1 - LFDUX f9, X, INCX - LFDX f10, X, INCXM1 - LFDUX f11, X, INCX - LFDX f12, X, INCXM1 - LFDUX f13, X, INCX - LFDX f14, X, INCXM1 - LFDUX f15, X, INCX - - LFDX f24, X, INCXM1 - LFDUX f25, X, INCX - LFDX f26, X, INCXM1 - LFDUX f27, X, INCX - LFDX f28, X, INCXM1 - LFDUX f29, X, INCX - LFDX f30, X, INCXM1 - LFDUX f31, X, INCX - - fmr f16, f8 - fmr f17, f9 - fmr f18, f10 - fmr f19, f11 - - fmr f20, f12 - fmr f21, f13 - fmr f22, f14 - fmr f23, f15 - bdz LL(120) - .align 4 - -LL(110): - FADD f0, f0, f16 - fmr f16, f24 - FADD f1, f1, f17 - fmr f17, f25 - - FADD f2, f2, f18 - fmr f18, f26 - FADD f3, f3, f19 - fmr f19, f27 - - LFDX f8, X, INCXM1 - LFDUX f9, X, INCX - LFDX f10, X, INCXM1 - LFDUX f11, X, INCX - - FADD f4, f4, f20 - fmr f20, f28 - FADD f5, f5, f21 - fmr f21, f29 - - FADD f6, f6, f22 - fmr f22, f30 - FADD f7, f7, f23 - fmr f23, f31 - - LFDX f12, X, INCXM1 - LFDUX f13, X, INCX - LFDX f14, X, INCXM1 - LFDUX f15, X, INCX - - FADD f0, f0, f16 - fmr f16, f8 - FADD f1, f1, f17 - fmr f17, f9 - - FADD f2, f2, f18 - fmr f18, f10 - FADD f3, f3, f19 - fmr f19, f11 - - LFDX f24, X, INCXM1 - LFDUX f25, X, INCX - LFDX f26, X, INCXM1 - LFDUX f27, X, INCX - - FADD f4, f4, f20 - fmr f20, f12 - FADD f5, f5, f21 - fmr f21, f13 - - FADD f6, f6, f22 - fmr f22, f14 - FADD f7, f7, f23 - fmr f23, f15 - - LFDX f28, X, INCXM1 - LFDUX f29, X, INCX - LFDX f30, X, INCXM1 - LFDUX f31, X, INCX - bdnz LL(110) - .align 4 - -LL(120): - FADD f0, f0, f16 - fmr f16, f24 - FADD f1, f1, f17 - fmr f17, f25 - - FADD f2, f2, f18 - fmr f18, f26 - FADD f3, f3, f19 - fmr f19, f27 - - FADD f4, f4, f20 - fmr f20, f28 - FADD f5, f5, f21 - fmr f21, f29 - - FADD f6, f6, f22 - fmr f22, f30 - FADD f7, f7, f23 - fmr f23, f31 - - FADD f0, f0, f16 - FADD f1, f1, f17 - FADD f2, f2, f18 - FADD f3, f3, f19 - - FADD f4, f4, f20 - FADD f5, f5, f21 - FADD f6, f6, f22 - FADD f7, f7, f23 - .align 4 - -LL(150): - andi. r0, N, 7 - mtspr CTR, r0 - beq LL(999) - .align 4 - -LL(160): - LFDX f8, X, INCXM1 - LFDUX f9, X, INCX - FADD f0, f0, f8 - FADD f1, f1, f9 - bdnz LL(160) - .align 4 - -LL(999): - FADD f0, f0, f1 - FADD f2, f2, f3 - FADD f4, f4, f5 - FADD f6, f6, f7 - - FADD f0, f0, f2 - FADD f4, f4, f6 - FADD f1, f0, f4 - - lfd f14, 0(SP) - lfd f15, 8(SP) - lfd f16, 16(SP) - lfd f17, 24(SP) - - lfd f18, 32(SP) - lfd f19, 40(SP) - lfd f20, 48(SP) - lfd f21, 56(SP) - - lfd f22, 64(SP) - lfd f23, 72(SP) - lfd f24, 80(SP) - lfd f25, 88(SP) - - lfd f26, 96(SP) - lfd f27, 104(SP) - lfd f28, 112(SP) - lfd f29, 120(SP) - - lfd f30, 128(SP) - lfd f31, 136(SP) - - addi SP, SP, STACKSIZE - blr - - EPILOGUE diff --git a/kernel/power/zswap.S b/kernel/power/zswap.S index 415164a2b..8befadca2 100644 --- a/kernel/power/zswap.S +++ b/kernel/power/zswap.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define N r3 #define X r6 @@ -117,7 +117,7 @@ stfd f30, 128(SP) stfd f31, 136(SP) -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld INCY, FRAMESLOT(0) + STACKSIZE(SP) #endif diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c index 1d8826f41..5ec1eee2e 100644 --- a/kernel/power/zswap.c +++ b/kernel/power/zswap.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(POWER8) || defined(POWER9) +#if defined(POWER8) #include "zswap_microk_power8.c" #endif diff --git a/kernel/power/zsymv_L.S b/kernel/power/zsymv_L.S index 9f00df072..b348e328f 100644 --- a/kernel/power/zsymv_L.S +++ b/kernel/power/zsymv_L.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define M r3 #define N r4 @@ -259,7 +259,7 @@ stw r27, 196(SP) #endif -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/zsymv_U.S b/kernel/power/zsymv_U.S index fe97fde8b..b631cbe35 100644 --- a/kernel/power/zsymv_U.S +++ b/kernel/power/zsymv_U.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define M r3 #define IS r4 @@ -256,7 +256,7 @@ stw r27, 196(SP) #endif -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ lwz BUFFER, FRAMESLOT(0) + STACKSIZE(SP) #else diff --git a/kernel/power/ztrmm_kernel_8x2_power8.S b/kernel/power/ztrmm_kernel_8x2_power8.S index 684cbd6eb..c1415138c 100644 --- a/kernel/power/ztrmm_kernel_8x2_power8.S +++ b/kernel/power/ztrmm_kernel_8x2_power8.S @@ -98,7 +98,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -259,7 +259,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. stfd f2, ALPHA_I_SP stw r0, FZERO -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -280,7 +280,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #ifdef TRMMKERNEL -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_LN.S b/kernel/power/ztrsm_kernel_LN.S index 3acd9562d..87473b45d 100644 --- a/kernel/power/ztrsm_kernel_LN.S +++ b/kernel/power/ztrsm_kernel_LN.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -166,7 +166,7 @@ stw r0, FZERO -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -186,7 +186,7 @@ #endif #endif -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -244,7 +244,7 @@ #endif #else -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/ztrsm_kernel_LT.S b/kernel/power/ztrsm_kernel_LT.S index 2d4f31189..db0860124 100644 --- a/kernel/power/ztrsm_kernel_LT.S +++ b/kernel/power/ztrsm_kernel_LT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -166,7 +166,7 @@ stw r0, FZERO -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -186,7 +186,7 @@ #endif #endif -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -247,7 +247,7 @@ #endif #else -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/ztrsm_kernel_RT.S b/kernel/power/ztrsm_kernel_RT.S index 605363119..c50ab86df 100644 --- a/kernel/power/ztrsm_kernel_RT.S +++ b/kernel/power/ztrsm_kernel_RT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -166,7 +166,7 @@ stw r0, FZERO -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -186,7 +186,7 @@ #endif #endif -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -247,7 +247,7 @@ #endif #else -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/ztrsm_kernel_cell_LN.S b/kernel/power/ztrsm_kernel_cell_LN.S index 4798b5958..884a3e864 100644 --- a/kernel/power/ztrsm_kernel_cell_LN.S +++ b/kernel/power/ztrsm_kernel_cell_LN.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -172,7 +172,7 @@ stw r0, FZERO -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -192,7 +192,7 @@ #endif #endif -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_cell_LT.S b/kernel/power/ztrsm_kernel_cell_LT.S index 654938a4d..388dfe3c2 100644 --- a/kernel/power/ztrsm_kernel_cell_LT.S +++ b/kernel/power/ztrsm_kernel_cell_LT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -172,7 +172,7 @@ stw r0, FZERO -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -192,7 +192,7 @@ #endif #endif -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif @@ -246,7 +246,7 @@ li PREA, 16 * 12 * SIZE #else -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ lwz PREA, FRAMESLOT(2) + STACKSIZE(SP) lwz PREC, FRAMESLOT(3) + STACKSIZE(SP) diff --git a/kernel/power/ztrsm_kernel_cell_RT.S b/kernel/power/ztrsm_kernel_cell_RT.S index e3fe84d00..00b50fe04 100644 --- a/kernel/power/ztrsm_kernel_cell_RT.S +++ b/kernel/power/ztrsm_kernel_cell_RT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -172,7 +172,7 @@ stw r0, FZERO -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -192,7 +192,7 @@ #endif #endif -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_hummer_LN.S b/kernel/power/ztrsm_kernel_hummer_LN.S index 042f4d476..bf3eafa45 100644 --- a/kernel/power/ztrsm_kernel_hummer_LN.S +++ b/kernel/power/ztrsm_kernel_hummer_LN.S @@ -48,7 +48,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/ztrsm_kernel_hummer_LT.S b/kernel/power/ztrsm_kernel_hummer_LT.S index fc8a0bef8..865c85f78 100644 --- a/kernel/power/ztrsm_kernel_hummer_LT.S +++ b/kernel/power/ztrsm_kernel_hummer_LT.S @@ -48,7 +48,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/ztrsm_kernel_hummer_RT.S b/kernel/power/ztrsm_kernel_hummer_RT.S index 17e31ffa8..99868f948 100644 --- a/kernel/power/ztrsm_kernel_hummer_RT.S +++ b/kernel/power/ztrsm_kernel_hummer_RT.S @@ -48,7 +48,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #define A r6 #define B r7 #define C r8 diff --git a/kernel/power/ztrsm_kernel_power6_LN.S b/kernel/power/ztrsm_kernel_power6_LN.S index 3c40f605a..65b8077db 100644 --- a/kernel/power/ztrsm_kernel_power6_LN.S +++ b/kernel/power/ztrsm_kernel_power6_LN.S @@ -57,7 +57,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -184,7 +184,7 @@ stw r0, FZERO -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -204,7 +204,7 @@ #endif #endif -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_power6_LT.S b/kernel/power/ztrsm_kernel_power6_LT.S index b2a92301d..c27170604 100644 --- a/kernel/power/ztrsm_kernel_power6_LT.S +++ b/kernel/power/ztrsm_kernel_power6_LT.S @@ -57,7 +57,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -184,7 +184,7 @@ stw r0, FZERO -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -204,7 +204,7 @@ #endif #endif -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_power6_RT.S b/kernel/power/ztrsm_kernel_power6_RT.S index cf37b5ca0..ff0338cdc 100644 --- a/kernel/power/ztrsm_kernel_power6_RT.S +++ b/kernel/power/ztrsm_kernel_power6_RT.S @@ -57,7 +57,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -184,7 +184,7 @@ stw r0, FZERO -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -204,7 +204,7 @@ #endif #endif -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_ppc440_LN.S b/kernel/power/ztrsm_kernel_ppc440_LN.S index f0be64d81..d33522456 100644 --- a/kernel/power/ztrsm_kernel_ppc440_LN.S +++ b/kernel/power/ztrsm_kernel_ppc440_LN.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -177,7 +177,7 @@ stw r0, FZERO -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -197,7 +197,7 @@ #endif #endif -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_ppc440_LT.S b/kernel/power/ztrsm_kernel_ppc440_LT.S index d5ff1b57f..a9e7b891f 100644 --- a/kernel/power/ztrsm_kernel_ppc440_LT.S +++ b/kernel/power/ztrsm_kernel_ppc440_LT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -177,7 +177,7 @@ stw r0, FZERO -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -197,7 +197,7 @@ #endif #endif -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/power/ztrsm_kernel_ppc440_RT.S b/kernel/power/ztrsm_kernel_ppc440_RT.S index b77dd76d1..43f4b07cb 100644 --- a/kernel/power/ztrsm_kernel_ppc440_RT.S +++ b/kernel/power/ztrsm_kernel_ppc440_RT.S @@ -61,7 +61,7 @@ #define N r4 #define K r5 -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifndef __64BIT__ #define A r6 #define B r7 @@ -177,7 +177,7 @@ stw r0, FZERO -#if defined(linux) || defined(__FreeBSD__) +#ifdef linux #ifdef __64BIT__ ld LDC, FRAMESLOT(0) + STACKSIZE(SP) #endif @@ -197,7 +197,7 @@ #endif #endif -#if (defined(linux) || defined(__FreeBSD__)) && defined(__64BIT__) +#if defined(linux) && defined(__64BIT__) ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) #endif diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 8e8214e70..6d4028b0b 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -70,7 +70,7 @@ gotoblas_t TABLE_NAME = { samax_kTS, samin_kTS, smax_kTS, smin_kTS, isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, - snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sdot_kTS, + snrm2_kTS, sasum_kTS, scopy_kTS, sdot_kTS, dsdot_kTS, srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, sgemv_nTS, sgemv_tTS, sger_kTS, @@ -126,7 +126,7 @@ gotoblas_t TABLE_NAME = { damax_kTS, damin_kTS, dmax_kTS, dmin_kTS, idamax_kTS, idamin_kTS, idmax_kTS, idmin_kTS, - dnrm2_kTS, dasum_kTS, dsum_kTS, dcopy_kTS, ddot_kTS, + dnrm2_kTS, dasum_kTS, dcopy_kTS, ddot_kTS, drot_kTS, daxpy_kTS, dscal_kTS, dswap_kTS, dgemv_nTS, dgemv_tTS, dger_kTS, dsymv_LTS, dsymv_UTS, @@ -178,7 +178,7 @@ gotoblas_t TABLE_NAME = { qamax_kTS, qamin_kTS, qmax_kTS, qmin_kTS, iqamax_kTS, iqamin_kTS, iqmax_kTS, iqmin_kTS, - qnrm2_kTS, qasum_kTS, qsum_kTS, qcopy_kTS, qdot_kTS, + qnrm2_kTS, qasum_kTS, qcopy_kTS, qdot_kTS, qrot_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS, qgemv_nTS, qgemv_tTS, qger_kTS, qsymv_LTS, qsymv_UTS, @@ -234,7 +234,7 @@ gotoblas_t TABLE_NAME = { #endif camax_kTS, camin_kTS, icamax_kTS, icamin_kTS, - cnrm2_kTS, casum_kTS, csum_kTS, ccopy_kTS, + cnrm2_kTS, casum_kTS, ccopy_kTS, cdotu_kTS, cdotc_kTS, csrot_kTS, caxpy_kTS, caxpyc_kTS, cscal_kTS, cswap_kTS, @@ -369,7 +369,7 @@ gotoblas_t TABLE_NAME = { #endif zamax_kTS, zamin_kTS, izamax_kTS, izamin_kTS, - znrm2_kTS, zasum_kTS, zsum_kTS, zcopy_kTS, + znrm2_kTS, zasum_kTS, zcopy_kTS, zdotu_kTS, zdotc_kTS, zdrot_kTS, zaxpy_kTS, zaxpyc_kTS, zscal_kTS, zswap_kTS, @@ -500,7 +500,7 @@ gotoblas_t TABLE_NAME = { XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N, MAX(XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N), xamax_kTS, xamin_kTS, ixamax_kTS, ixamin_kTS, - xnrm2_kTS, xasum_kTS, xsum_kTS, xcopy_kTS, + xnrm2_kTS, xasum_kTS, xcopy_kTS, xdotu_kTS, xdotc_kTS, xqrot_kTS, xaxpy_kTS, xaxpyc_kTS, xscal_kTS, xswap_kTS, @@ -718,27 +718,6 @@ static void init_parameter(void) { } #else // defined(ARCH_ARM64) -#if defined(ARCH_POWER) -static void init_parameter(void) { - - TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; - TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; - TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; - TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; - - TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; - TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; - TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; - TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; - - - TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; - TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; - TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; - TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; -} -#else //POWER - #ifdef ARCH_X86 static int get_l2_size_old(void){ int i, eax, ebx, ecx, edx, cpuid_level; @@ -1324,5 +1303,4 @@ static void init_parameter(void) { } -#endif //POWER #endif //defined(ARCH_ARM64) diff --git a/kernel/sparc/sum.S b/kernel/sparc/sum.S deleted file mode 100644 index f26abb85f..000000000 --- a/kernel/sparc/sum.S +++ /dev/null @@ -1,325 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define N %i0 -#define X %i1 -#define INCX %i2 -#define I %i3 - -#ifdef DOUBLE -#define c1 %f0 -#define c2 %f2 -#define t1 %f8 -#define t2 %f10 -#define t3 %f12 -#define t4 %f14 - -#define a1 %f16 -#define a2 %f18 -#define a3 %f20 -#define a4 %f22 -#define a5 %f24 -#define a6 %f26 -#define a7 %f28 -#define a8 %f30 -#else -#define c1 %f0 -#define c2 %f1 -#define t1 %f4 -#define t2 %f5 -#define t3 %f6 -#define t4 %f7 - -#define a1 %f8 -#define a2 %f9 -#define a3 %f10 -#define a4 %f11 -#define a5 %f12 -#define a6 %f13 -#define a7 %f14 -#define a8 %f15 -#endif - - PROLOGUE - SAVESP - - FCLR(0) - - sll INCX, BASE_SHIFT, INCX - - FMOV c1, c2 - FMOV c1, t1 - FMOV c1, t2 - FMOV c1, t3 - FMOV c1, t4 - - cmp INCX, 0 - ble .LL19 - cmp INCX, SIZE - bne .LL50 - - sra N, 3, I - cmp I, 0 - ble,pn %icc, .LL15 - nop - - LDF [X + 0 * SIZE], a1 - add I, -1, I - LDF [X + 1 * SIZE], a2 - cmp I, 0 - LDF [X + 2 * SIZE], a3 - LDF [X + 3 * SIZE], a4 - LDF [X + 4 * SIZE], a5 - LDF [X + 5 * SIZE], a6 - LDF [X + 6 * SIZE], a7 - LDF [X + 7 * SIZE], a8 - - ble,pt %icc, .LL12 - add X, 8 * SIZE, X - -#define PREFETCHSIZE 128 - -.LL11: - FADD c1, t1, c1 - prefetch [X + PREFETCHSIZE * SIZE], 0 - FMOV a1, t1 - LDF [X + 0 * SIZE], a1 - - FADD c2, t2, c2 - add I, -1, I - FMOV a2, t2 - LDF [X + 1 * SIZE], a2 - - FADD c1, t3, c1 - cmp I, 0 - FMOV a3, t3 - LDF [X + 2 * SIZE], a3 - - FADD c2, t4, c2 - nop - FMOV a4, t4 - LDF [X + 3 * SIZE], a4 - - FADD c1, t1, c1 - nop - FMOV a5, t1 - LDF [X + 4 * SIZE], a5 - - FADD c2, t2, c2 - nop - FMOV a6, t2 - LDF [X + 5 * SIZE], a6 - - FADD c1, t3, c1 - FMOV a7, t3 - LDF [X + 6 * SIZE], a7 - add X, 8 * SIZE, X - - FADD c2, t4, c2 - FMOV a8, t4 - bg,pt %icc, .LL11 - LDF [X - 1 * SIZE], a8 - -.LL12: - FADD c1, t1, c1 - FMOV a1, t1 - FADD c2, t2, c2 - FMOV a2, t2 - - FADD c1, t3, c1 - FMOV a3, t3 - FADD c2, t4, c2 - FMOV a4, t4 - - FADD c1, t1, c1 - FMOV a5, t1 - FADD c2, t2, c2 - FMOV a6, t2 - - FADD c1, t3, c1 - FMOV a7, t3 - FADD c2, t4, c2 - FMOV a8, t4 - -.LL15: - and N, 7, I - cmp I, 0 - ble,a,pn %icc, .LL19 - nop - -.LL16: - LDF [X + 0 * SIZE], a1 - add I, -1, I - cmp I, 0 - FADD c1, t1, c1 - FMOV a1, t1 - bg,pt %icc, .LL16 - add X, 1 * SIZE, X - -.LL19: - FADD c1, t1, c1 - FADD c2, t2, c2 - FADD c1, t3, c1 - FADD c2, t4, c2 - - FADD c1, c2, c1 - return %i7 + 8 - clr %g0 - -.LL50: - sra N, 3, I - cmp I, 0 - ble,pn %icc, .LL55 - nop - - LDF [X + 0 * SIZE], a1 - add X, INCX, X - LDF [X + 0 * SIZE], a2 - add X, INCX, X - LDF [X + 0 * SIZE], a3 - add X, INCX, X - LDF [X + 0 * SIZE], a4 - add X, INCX, X - LDF [X + 0 * SIZE], a5 - add X, INCX, X - LDF [X + 0 * SIZE], a6 - add X, INCX, X - add I, -1, I - LDF [X + 0 * SIZE], a7 - cmp I, 0 - add X, INCX, X - LDF [X + 0 * SIZE], a8 - - ble,pt %icc, .LL52 - add X, INCX, X - -.LL51: - FADD c1, t1, c1 - add I, -1, I - FMOV a1, t1 - LDF [X + 0 * SIZE], a1 - add X, INCX, X - - FADD c2, t2, c2 - cmp I, 0 - FMOV a2, t2 - LDF [X + 0 * SIZE], a2 - add X, INCX, X - - FADD c1, t3, c1 - FMOV a3, t3 - LDF [X + 0 * SIZE], a3 - add X, INCX, X - - FADD c2, t4, c2 - FMOV a4, t4 - LDF [X + 0 * SIZE], a4 - add X, INCX, X - - FADD c1, t1, c1 - FMOV a5, t1 - LDF [X + 0 * SIZE], a5 - add X, INCX, X - - FADD c2, t2, c2 - FMOV a6, t2 - LDF [X + 0 * SIZE], a6 - add X, INCX, X - - FADD c1, t3, c1 - FMOV a7, t3 - LDF [X + 0 * SIZE], a7 - add X, INCX, X - - FADD c2, t4, c2 - FMOV a8, t4 - LDF [X + 0 * SIZE], a8 - - bg,pt %icc, .LL51 - add X, INCX, X - -.LL52: - FADD c1, t1, c1 - FMOV a1, t1 - FADD c2, t2, c2 - FMOV a2, t2 - - FADD c1, t3, c1 - FMOV a3, t3 - FADD c2, t4, c2 - FMOV a4, t4 - - FADD c1, t1, c1 - FMOV a5, t1 - FADD c2, t2, c2 - FMOV a6, t2 - - FADD c1, t3, c1 - FMOV a7, t3 - FADD c2, t4, c2 - FMOV a8, t4 - -.LL55: - and N, 7, I - cmp I, 0 - ble,a,pn %icc, .LL59 - nop - -.LL56: - LDF [X + 0 * SIZE], a1 - FADD c1, t1, c1 - add I, -1, I - FMOV a1, t1 - cmp I, 0 - bg,pt %icc, .LL56 - add X, INCX, X - -.LL59: - FADD c1, t1, c1 - FADD c2, t2, c2 - FADD c1, t3, c1 - FADD c2, t4, c2 - - FADD c1, c2, c1 - return %i7 + 8 - clr %o0 - - EPILOGUE diff --git a/kernel/sparc/zsum.S b/kernel/sparc/zsum.S deleted file mode 100644 index bc167dc72..000000000 --- a/kernel/sparc/zsum.S +++ /dev/null @@ -1,327 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define N %i0 -#define X %i1 -#define INCX %i2 -#define I %i3 - -#ifdef DOUBLE -#define c1 %f0 -#define c2 %f2 -#define t1 %f8 -#define t2 %f10 -#define t3 %f12 -#define t4 %f14 - -#define a1 %f16 -#define a2 %f18 -#define a3 %f20 -#define a4 %f22 -#define a5 %f24 -#define a6 %f26 -#define a7 %f28 -#define a8 %f30 -#else -#define c1 %f0 -#define c2 %f1 -#define t1 %f4 -#define t2 %f5 -#define t3 %f6 -#define t4 %f7 - -#define a1 %f8 -#define a2 %f9 -#define a3 %f10 -#define a4 %f11 -#define a5 %f12 -#define a6 %f13 -#define a7 %f14 -#define a8 %f15 -#endif - - PROLOGUE - SAVESP - - FCLR(0) - - sll INCX, ZBASE_SHIFT, INCX - - FMOV c1, c2 - FMOV c1, t1 - FMOV c1, t2 - FMOV c1, t3 - FMOV c1, t4 - - cmp INCX, 0 - ble .LL19 - nop - - cmp INCX, 2 * SIZE - bne .LL50 - nop - - sra N, 2, I - cmp I, 0 - ble,pn %icc, .LL15 - nop - - LDF [X + 0 * SIZE], a1 - add I, -1, I - LDF [X + 1 * SIZE], a2 - cmp I, 0 - LDF [X + 2 * SIZE], a3 - LDF [X + 3 * SIZE], a4 - LDF [X + 4 * SIZE], a5 - LDF [X + 5 * SIZE], a6 - LDF [X + 6 * SIZE], a7 - LDF [X + 7 * SIZE], a8 - - ble,pt %icc, .LL12 - add X, 8 * SIZE, X - -#define PREFETCHSIZE 32 - -.LL11: - FADD c1, t1, c1 - prefetch [X + PREFETCHSIZE * SIZE], 0 - FMOV a1, t1 - LDF [X + 0 * SIZE], a1 - - FADD c2, t2, c2 - add I, -1, I - FMOV a2, t2 - LDF [X + 1 * SIZE], a2 - - FADD c1, t3, c1 - cmp I, 0 - FMOV a3, t3 - LDF [X + 2 * SIZE], a3 - - FADD c2, t4, c2 - nop - FMOV a4, t4 - LDF [X + 3 * SIZE], a4 - - FADD c1, t1, c1 - nop - FMOV a5, t1 - LDF [X + 4 * SIZE], a5 - - FADD c2, t2, c2 - nop - FMOV a6, t2 - LDF [X + 5 * SIZE], a6 - - FADD c1, t3, c1 - FMOV a7, t3 - LDF [X + 6 * SIZE], a7 - add X, 8 * SIZE, X - - FADD c2, t4, c2 - FMOV a8, t4 - bg,pt %icc, .LL11 - LDF [X - 1 * SIZE], a8 - -.LL12: - FADD c1, t1, c1 - FMOV a1, t1 - FADD c2, t2, c2 - FMOV a2, t2 - - FADD c1, t3, c1 - FMOV a3, t3 - FADD c2, t4, c2 - FMOV a4, t4 - - FADD c1, t1, c1 - FMOV a5, t1 - FADD c2, t2, c2 - FMOV a6, t2 - - FADD c1, t3, c1 - FMOV a7, t3 - FADD c2, t4, c2 - FMOV a8, t4 - -.LL15: - and N, 3, I - cmp I, 0 - ble,a,pn %icc, .LL19 - nop - -.LL16: - LDF [X + 0 * SIZE], a1 - LDF [X + 1 * SIZE], a2 - add I, -1, I - cmp I, 0 - FADD c1, t1, c1 - FADD c2, t2, c2 - FMOV a1, t1 - FMOV a2, t2 - bg,pt %icc, .LL16 - add X, 2 * SIZE, X - -.LL19: - FADD c1, t1, c1 - FADD c2, t2, c2 - FADD c1, t3, c1 - FADD c2, t4, c2 - - FADD c1, c2, c1 - return %i7 + 8 - clr %g0 - -.LL50: - sra N, 2, I - cmp I, 0 - ble,pn %icc, .LL55 - nop - - LDF [X + 0 * SIZE], a1 - LDF [X + 1 * SIZE], a2 - add X, INCX, X - LDF [X + 0 * SIZE], a3 - LDF [X + 1 * SIZE], a4 - add X, INCX, X - LDF [X + 0 * SIZE], a5 - LDF [X + 1 * SIZE], a6 - add X, INCX, X - add I, -1, I - LDF [X + 0 * SIZE], a7 - cmp I, 0 - LDF [X + 1 * SIZE], a8 - - ble,pt %icc, .LL52 - add X, INCX, X - -.LL51: - FADD c1, t1, c1 - add I, -1, I - FMOV a1, t1 - LDF [X + 0 * SIZE], a1 - - FADD c2, t2, c2 - cmp I, 0 - FMOV a2, t2 - LDF [X + 1 * SIZE], a2 - add X, INCX, X - - FADD c1, t3, c1 - FMOV a3, t3 - LDF [X + 0 * SIZE], a3 - - FADD c2, t4, c2 - FMOV a4, t4 - LDF [X + 1 * SIZE], a4 - add X, INCX, X - - FADD c1, t1, c1 - FMOV a5, t1 - LDF [X + 0 * SIZE], a5 - - FADD c2, t2, c2 - FMOV a6, t2 - LDF [X + 1 * SIZE], a6 - add X, INCX, X - - FADD c1, t3, c1 - FMOV a7, t3 - LDF [X + 0 * SIZE], a7 - - FADD c2, t4, c2 - FMOV a8, t4 - LDF [X + 1 * SIZE], a8 - - bg,pt %icc, .LL51 - add X, INCX, X - -.LL52: - FADD c1, t1, c1 - FMOV a1, t1 - FADD c2, t2, c2 - FMOV a2, t2 - - FADD c1, t3, c1 - FMOV a3, t3 - FADD c2, t4, c2 - FMOV a4, t4 - - FADD c1, t1, c1 - FMOV a5, t1 - FADD c2, t2, c2 - FMOV a6, t2 - - FADD c1, t3, c1 - FMOV a7, t3 - FADD c2, t4, c2 - FMOV a8, t4 - -.LL55: - and N, 3, I - cmp I, 0 - ble,a,pn %icc, .LL59 - nop - -.LL56: - LDF [X + 0 * SIZE], a1 - LDF [X + 1 * SIZE], a2 - FADD c1, t1, c1 - FADD c2, t2, c2 - add I, -1, I - FMOV a1, t1 - FMOV a2, t2 - cmp I, 0 - bg,pt %icc, .LL56 - add X, INCX, X - -.LL59: - FADD c1, t1, c1 - FADD c2, t2, c2 - FADD c1, t3, c1 - FADD c2, t4, c2 - - FADD c1, c2, c1 - - return %i7 + 8 - clr %o0 - - EPILOGUE diff --git a/kernel/x86/KERNEL.generic b/kernel/x86/KERNEL.generic index 0aac0ce99..672edb069 100644 --- a/kernel/x86/KERNEL.generic +++ b/kernel/x86/KERNEL.generic @@ -94,11 +94,6 @@ DASUMKERNEL = ../arm/asum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = ../arm/zasum.c -SSUMKERNEL = ../arm/sum.c -DSUMKERNEL = ../arm/sum.c -CSUMKERNEL = ../arm/zsum.c -ZSUMKERNEL = ../arm/zsum.c - SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = ../arm/axpy.c CAXPYKERNEL = ../arm/zaxpy.c diff --git a/kernel/x86/sum.S b/kernel/x86/sum.S deleted file mode 100644 index b24f34c8b..000000000 --- a/kernel/x86/sum.S +++ /dev/null @@ -1,207 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define STACK 8 -#define ARGS 0 - -#define STACK_M 4 + STACK + ARGS(%esp) -#define STACK_X 8 + STACK + ARGS(%esp) -#define STACK_INCX 12 + STACK + ARGS(%esp) - -#define M %edx -#define X %ecx -#define INCX %esi - -#define I %eax - -#include "l1param.h" - - PROLOGUE - - pushl %esi - pushl %ebx - - PROFCODE - -#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) - EMMS -#endif - - movl STACK_M, M - movl STACK_X, X - movl STACK_INCX, INCX - -#ifdef F_INTERFACE - movl (M), M - movl (INCX), INCX -#endif - - fldz - testl M, M - jle .L999 - testl INCX, INCX - jle .L999 - - sall $BASE_SHIFT, INCX - fldz - fldz - fldz - cmpl $SIZE, INCX - jne .L40 - - movl M, I - sarl $3, I - jle .L20 - ALIGN_4 - -.L10: -#ifdef PREFETCH - PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) -#endif - - FLD 0 * SIZE(X) - FLD 1 * SIZE(X) - FLD 2 * SIZE(X) - FLD 3 * SIZE(X) - - faddp %st, %st(7) - faddp %st, %st(5) - faddp %st, %st(3) - faddp %st, %st(1) - - FLD 4 * SIZE(X) - FLD 5 * SIZE(X) - FLD 6 * SIZE(X) - FLD 7 * SIZE(X) - - addl $8 * SIZE, X - - faddp %st, %st(7) - faddp %st, %st(5) - faddp %st, %st(3) - faddp %st, %st(1) - - decl I - jg .L10 - ALIGN_4 - -.L20: - movl M, I - andl $7, I - jle .L998 - ALIGN_4 - - -.L21: - FLD (X) - faddp %st,%st(1) - addl $1 * SIZE, X - decl I - jg .L21 - jmp .L998 - ALIGN_4 - -.L40: - movl M, I - sarl $3, I - jle .L60 - ALIGN_4 - -.L50: - FLD (X) - addl INCX, X - FLD (X) - addl INCX, X - FLD (X) - addl INCX, X - FLD (X) - addl INCX, X - - faddp %st, %st(7) - faddp %st, %st(5) - faddp %st, %st(3) - faddp %st, %st(1) - - FLD (X) - addl INCX, X - FLD (X) - addl INCX, X - FLD (X) - addl INCX, X - FLD (X) - addl INCX, X - - faddp %st, %st(7) - faddp %st, %st(5) - faddp %st, %st(3) - faddp %st, %st(1) - - decl I - jg .L50 - ALIGN_4 - -.L60: - movl M, I - andl $7, I - jle .L998 - ALIGN_4 - - -.L61: - FLD (X) - addl INCX, X - faddp %st,%st(1) - decl I - jg .L61 - ALIGN_4 - -.L998: - faddp %st,%st(2) - faddp %st,%st(1) - faddp %st,%st(1) - ALIGN_4 - -.L999: - popl %ebx - popl %esi - ret - - EPILOGUE diff --git a/kernel/x86/zsum.S b/kernel/x86/zsum.S deleted file mode 100644 index cd2ce61db..000000000 --- a/kernel/x86/zsum.S +++ /dev/null @@ -1,208 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define STACK 8 -#define ARGS 0 - -#define STACK_M 4 + STACK + ARGS(%esp) -#define STACK_X 8 + STACK + ARGS(%esp) -#define STACK_INCX 12 + STACK + ARGS(%esp) - -#define M %edx -#define X %ecx -#define INCX %esi - -#define I %eax - -#include "l1param.h" - - PROLOGUE - - pushl %esi - pushl %ebx - - PROFCODE - -#if defined(F_INTERFACE_GFORT) || defined(F_INTERFACE_G95) - EMMS -#endif - - movl STACK_M, M - movl STACK_X, X - movl STACK_INCX, INCX - -#ifdef F_INTERFACE - movl (M), M - movl (INCX), INCX -#endif - - fldz - testl M, M - jle .L999 - testl INCX, INCX - jle .L999 - - sall $ZBASE_SHIFT, INCX - - fldz - fldz - fldz - cmpl $SIZE * 2, INCX - jne .L40 - - movl M, I - sarl $2, I - jle .L20 - ALIGN_4 - -.L10: -#ifdef PREFETCH - PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) -#endif - - FLD 0 * SIZE(X) - FLD 1 * SIZE(X) - FLD 2 * SIZE(X) - FLD 3 * SIZE(X) - - faddp %st, %st(7) - faddp %st, %st(5) - faddp %st, %st(3) - faddp %st, %st(1) - - FLD 4 * SIZE(X) - FLD 5 * SIZE(X) - FLD 6 * SIZE(X) - FLD 7 * SIZE(X) - - addl $8 * SIZE, X - - faddp %st, %st(7) - faddp %st, %st(5) - faddp %st, %st(3) - faddp %st, %st(1) - - decl I - jg .L10 - ALIGN_4 - -.L20: - movl M, I - andl $3, I - jle .L998 - ALIGN_4 - - -.L21: - FLD 0 * SIZE(X) - FLD 1 * SIZE(X) - faddp %st,%st(3) - faddp %st,%st(1) - addl $2 * SIZE, X - decl I - jg .L21 - jmp .L998 - ALIGN_4 - -.L40: - movl M, I - sarl $2, I - jle .L60 - ALIGN_4 - -.L50: - FLD 0 * SIZE(X) - FLD 1 * SIZE(X) - addl INCX, X - FLD 0 * SIZE(X) - FLD 1 * SIZE(X) - addl INCX, X - - faddp %st, %st(7) - faddp %st, %st(5) - faddp %st, %st(3) - faddp %st, %st(1) - - FLD 0 * SIZE(X) - FLD 1 * SIZE(X) - addl INCX, X - FLD 0 * SIZE(X) - FLD 1 * SIZE(X) - addl INCX, X - - faddp %st, %st(7) - faddp %st, %st(5) - faddp %st, %st(3) - faddp %st, %st(1) - - decl I - jg .L50 - ALIGN_4 - -.L60: - movl M, I - andl $3, I - jle .L998 - ALIGN_4 - - -.L61: - FLD 0 * SIZE(X) - FLD 1 * SIZE(X) - addl INCX, X - faddp %st,%st(3) - faddp %st,%st(1) - decl I - jg .L61 - ALIGN_4 - -.L998: - faddp %st,%st(2) - faddp %st,%st(1) - faddp %st,%st(1) - ALIGN_4 - -.L999: - popl %ebx - popl %esi - ret - - EPILOGUE diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 92d121ab2..4874711bb 100644 --- a/kernel/x86_64/KERNEL +++ b/kernel/x86_64/KERNEL @@ -171,7 +171,7 @@ IXAMAXKERNEL = izamax.S endif ifndef ISAMINKERNEL -ISAMINKERNEL = iamax.S +ISAMINKERNEL = iamax_sse.S endif ifndef IDAMINKERNEL @@ -207,7 +207,7 @@ IQMAXKERNEL = iamax.S endif ifndef ISMINKERNEL -ISMINKERNEL = iamax.S +ISMINKERNEL = iamax_sse.S endif ifndef IDMINKERNEL diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX index d61c51628..acc6356d6 100644 --- a/kernel/x86_64/KERNEL.SKYLAKEX +++ b/kernel/x86_64/KERNEL.SKYLAKEX @@ -7,10 +7,10 @@ SGEMMITCOPY = sgemm_tcopy_16_skylakex.c SGEMMONCOPY = sgemm_ncopy_4_skylakex.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -#DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c +DGEMMKERNEL = dgemm_kernel_4x8_skylakex.c -#DGEMMINCOPY = dgemm_ncopy_8_skylakex.c -#DGEMMITCOPY = dgemm_tcopy_8_skylakex.c +DGEMMINCOPY = dgemm_ncopy_8_skylakex.c +DGEMMITCOPY = dgemm_tcopy_8_skylakex.c DGEMMONCOPY = dgemm_ncopy_8_skylakex.c DGEMMOTCOPY = dgemm_tcopy_8_skylakex.c diff --git a/kernel/x86_64/KERNEL.generic b/kernel/x86_64/KERNEL.generic index 7cb0cb836..a23e59f3f 100644 --- a/kernel/x86_64/KERNEL.generic +++ b/kernel/x86_64/KERNEL.generic @@ -94,11 +94,6 @@ DASUMKERNEL = ../arm/asum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = ../arm/zasum.c -SSUMKERNEL = ../arm/sum.c -DSUMKERNEL = ../arm/sum.c -CSUMKERNEL = ../arm/zsum.c -ZSUMKERNEL = ../arm/zsum.c - SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = ../arm/axpy.c CAXPYKERNEL = ../arm/zaxpy.c diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c index 6d33641e9..6d2530e81 100644 --- a/kernel/x86_64/dgemv_n_4.c +++ b/kernel/x86_64/dgemv_n_4.c @@ -111,9 +111,9 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "jnz 1b \n\t" : - "+r" (i), // 0 - "+r" (n) // 1 - : + : + "r" (i), // 0 + "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 @@ -166,9 +166,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a "jnz 1b \n\t" : - "+r" (i), // 0 - "+r" (n) // 1 : + "r" (i), // 0 + "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap), // 4 diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c index da0fa2fff..584a6c6b5 100644 --- a/kernel/x86_64/dgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c @@ -104,7 +104,6 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "r" (ap[3]), // 7 "r" (alpha) // 8 : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", "%xmm8", "%xmm9", diff --git a/kernel/x86_64/dgemv_n_microk_piledriver-4.c b/kernel/x86_64/dgemv_n_microk_piledriver-4.c index 466931b82..530780bab 100644 --- a/kernel/x86_64/dgemv_n_microk_piledriver-4.c +++ b/kernel/x86_64/dgemv_n_microk_piledriver-4.c @@ -38,42 +38,42 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO __asm__ __volatile__ ( "vzeroupper \n\t" - "vbroadcastsd (%3), %%ymm12 \n\t" // x0 - "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1 - "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2 - "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3 - "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4 - "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5 - "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6 - "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7 + "vbroadcastsd (%2), %%ymm12 \n\t" // x0 + "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 + "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 + "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 + "vbroadcastsd 32(%2), %%ymm0 \n\t" // x4 + "vbroadcastsd 40(%2), %%ymm1 \n\t" // x5 + "vbroadcastsd 48(%2), %%ymm2 \n\t" // x6 + "vbroadcastsd 56(%2), %%ymm3 \n\t" // x7 "vbroadcastsd (%9), %%ymm6 \n\t" // alpha "testq $0x04, %1 \n\t" "jz 2f \n\t" - "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y + "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t" + "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" - "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" - "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t" - "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" - "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t" + "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" + "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t" + "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" + "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t" "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" - "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y + "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y - "addq $4 , %2 \n\t" + "addq $4 , %8 \n\t" "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" @@ -88,35 +88,35 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y - "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y - - "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" - "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t" - "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t" - "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t" - "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" - "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t" - "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t" - "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t" - - "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" + "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y + "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y + + "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" + "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" + "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" + "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" + "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" + "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" + "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" + "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" + + "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" "addq $8 , %0 \n\t" - "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t" - "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t" - "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t" - "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" - "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t" - "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t" - "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t" + "vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t" + "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t" + "vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t" + "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" + "vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t" + "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t" + "vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t" "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" - "addq $8 , %2 \n\t" + "addq $8 , %8 \n\t" "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y "subq $8 , %1 \n\t" - "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y + "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y "jnz 1b \n\t" @@ -125,15 +125,15 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO : "+r" (i), // 0 - "+r" (n), // 1 - "+r" (lda4) // 2 + "+r" (n) // 1 : - "r" (x), // 3 - "r" (y), // 4 - "r" (ap[0]), // 5 - "r" (ap[1]), // 6 - "r" (ap[2]), // 7 - "r" (ap[3]), // 8 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c index ed672a757..a7478e3a8 100644 --- a/kernel/x86_64/dgemv_t_4.c +++ b/kernel/x86_64/dgemv_t_4.c @@ -127,9 +127,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "movsd %%xmm11,8(%2) \n\t" : - "+r" (i), // 0 - "+r" (n) // 1 - : + : + "r" (i), // 0 + "r" (n), // 1 "r" (y), // 2 "r" (ap0), // 3 "r" (ap1), // 4 @@ -195,9 +195,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "movsd %%xmm10, (%2) \n\t" : - "+r" (i), // 0 - "+r" (n) // 1 - : + : + "r" (i), // 0 + "r" (n), // 1 "r" (y), // 2 "r" (ap), // 3 "r" (x) // 4 @@ -259,9 +259,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d "jnz 1b \n\t" : - "+r" (i), // 0 - "+r" (n) // 1 - : + : + "r" (i), // 0 + "r" (n), // 1 "r" (&da), // 2 "r" (src), // 3 "r" (dest) // 4 diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c index d0d7801fd..ef9a0a6ba 100644 --- a/kernel/x86_64/dscal.c +++ b/kernel/x86_64/dscal.c @@ -136,8 +136,8 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_ "jnz 1b \n\t" : - "+r" (n) // 0 : + "r" (n), // 0 "r" (x), // 1 "r" (x1), // 2 "r" (alpha), // 3 diff --git a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c index bfa07b6d0..d84470cc4 100644 --- a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c +++ b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c @@ -113,8 +113,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 : - "+r" (from) // 0 - : + : + "r" (from), // 0 "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/dsymv_L_microk_haswell-2.c b/kernel/x86_64/dsymv_L_microk_haswell-2.c index 6241879d5..866782ee6 100644 --- a/kernel/x86_64/dsymv_L_microk_haswell-2.c +++ b/kernel/x86_64/dsymv_L_microk_haswell-2.c @@ -105,8 +105,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vzeroupper \n\t" : - "+r" (from) // 0 - : + : + "r" (from), // 0 "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/dsymv_L_microk_nehalem-2.c b/kernel/x86_64/dsymv_L_microk_nehalem-2.c index a161dcd8b..38479f77a 100644 --- a/kernel/x86_64/dsymv_L_microk_nehalem-2.c +++ b/kernel/x86_64/dsymv_L_microk_nehalem-2.c @@ -108,8 +108,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "movsd %%xmm3 , 24(%9) \n\t" // save temp2 : - "+r" (from) // 0 - : + : + "r" (from), // 0 "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/dsymv_L_microk_sandy-2.c b/kernel/x86_64/dsymv_L_microk_sandy-2.c index b205b1019..b4e6ab369 100644 --- a/kernel/x86_64/dsymv_L_microk_sandy-2.c +++ b/kernel/x86_64/dsymv_L_microk_sandy-2.c @@ -114,8 +114,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vzeroupper \n\t" : - "+r" (from) // 0 - : + : + "r" (from), // 0 "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c index ae287b6d8..d7166fe4b 100644 --- a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c +++ b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c @@ -106,9 +106,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 : - "+r" (i), // 0 - "+r" (n) // 1 - : + : + "r" (i), // 0 + "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/dsymv_U_microk_haswell-2.c b/kernel/x86_64/dsymv_U_microk_haswell-2.c index 4778f644a..d83d20f8e 100644 --- a/kernel/x86_64/dsymv_U_microk_haswell-2.c +++ b/kernel/x86_64/dsymv_U_microk_haswell-2.c @@ -107,9 +107,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vzeroupper \n\t" : - "+r" (i), // 0 - "+r" (n) // 1 - : + : + "r" (i), // 0 + "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/dsymv_U_microk_nehalem-2.c b/kernel/x86_64/dsymv_U_microk_nehalem-2.c index 065182286..1344c75f7 100644 --- a/kernel/x86_64/dsymv_U_microk_nehalem-2.c +++ b/kernel/x86_64/dsymv_U_microk_nehalem-2.c @@ -101,9 +101,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "movsd %%xmm3 , 24(%9) \n\t" // save temp2 : - "+r" (i), // 0 - "+r" (n) // 1 - : + : + "r" (i), // 0 + "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/dsymv_U_microk_sandy-2.c b/kernel/x86_64/dsymv_U_microk_sandy-2.c index d84e703bd..1ef6fbafd 100644 --- a/kernel/x86_64/dsymv_U_microk_sandy-2.c +++ b/kernel/x86_64/dsymv_U_microk_sandy-2.c @@ -116,9 +116,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vzeroupper \n\t" : - "+r" (i), // 0 - "+r" (n) // 1 - : + : + "r" (i), // 0 + "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/dtrsm_kernel_RN_haswell.c b/kernel/x86_64/dtrsm_kernel_RN_haswell.c index 9ab78fc8e..fcab8e2c7 100644 --- a/kernel/x86_64/dtrsm_kernel_RN_haswell.c +++ b/kernel/x86_64/dtrsm_kernel_RN_haswell.c @@ -119,9 +119,9 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " cmpq $0, %0 \n\t" " je 4f \n\t" - " vmovups (%8,%1,4), %%ymm0 \n\t" // read a - " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 - " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1 + " vmovups (%2,%1,4), %%ymm0 \n\t" // read a + " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 + " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 " addq $8, %1 \n\t" @@ -131,18 +131,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .p2align 4 \n\t" "1: \n\t" - " vmovups (%8,%1,4), %%ymm4 \n\t" // read a + " vmovups (%2,%1,4), %%ymm4 \n\t" // read a " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" - " vmovups (%9,%1,8), %%ymm5 \n\t" // read b0 + " vmovups (%3,%1,8), %%ymm5 \n\t" // read b0 " vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t" " vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t" " vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t" - " vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1 + " vmovups 32(%3,%1,8), %%ymm6 \n\t" // read b1 " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" @@ -155,18 +155,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " jz 22f \n\t" - " vmovups (%8,%1,4), %%ymm0 \n\t" // read a + " vmovups (%2,%1,4), %%ymm0 \n\t" // read a " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" - " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 + " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" " vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t" - " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1 + " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" @@ -268,7 +268,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7 " vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t" - " vmovups (%3), %%ymm0 \n\t" + " vmovups (%9), %%ymm0 \n\t" " vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t" " vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t" " vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t" @@ -278,7 +278,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t" " vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t" - " vmovups 32(%3), %%ymm4 \n\t" + " vmovups 32(%9), %%ymm4 \n\t" " vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t" " vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t" " vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t" @@ -290,15 +290,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "5: \n\t" // i = 0 - " addq $64, %3 \n\t" // b=b+8 + " addq $64, %9 \n\t" // b=b+8 " vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb - " vmovups (%3), %%ymm0 \n\t" - " vmovups %%ymm8 , (%2) \n\t" // write a + " vmovups (%9), %%ymm0 \n\t" + " vmovups %%ymm8 , (%8) \n\t" // write a " vmovups %%ymm8 , (%4) \n\t" // write c " vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t" - " vmovups 32(%3), %%ymm1 \n\t" + " vmovups 32(%9), %%ymm1 \n\t" " vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t" " vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t" " vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t" @@ -313,15 +313,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" - " addq $64, %3 \n\t" // b=b+8 - " addq $32, %2 \n\t" // a=a+8 + " addq $64, %9 \n\t" // b=b+8 + " addq $32, %8 \n\t" // a=a+8 " vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb - " vmovups (%3), %%ymm0 \n\t" - " vmovups 32(%3), %%ymm1 \n\t" - " vmovups %%ymm9 , (%2) \n\t" // write a + " vmovups (%9), %%ymm0 \n\t" + " vmovups 32(%9), %%ymm1 \n\t" + " vmovups %%ymm9 , (%8) \n\t" // write a " vmovups %%ymm9 , (%4,%7,1) \n\t" // write c " vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t" @@ -337,13 +337,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" - " addq $64, %3 \n\t" // b=b+8 - " addq $32, %2 \n\t" // a=a+8 + " addq $64, %9 \n\t" // b=b+8 + " addq $32, %8 \n\t" // a=a+8 " vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb - " vmovups (%3), %%ymm0 \n\t" - " vmovups 32(%3), %%ymm1 \n\t" - " vmovups %%ymm10, (%2) \n\t" // write a + " vmovups (%9), %%ymm0 \n\t" + " vmovups 32(%9), %%ymm1 \n\t" + " vmovups %%ymm10, (%8) \n\t" // write a " vmovups %%ymm10, (%4,%7,2) \n\t" // write c " vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t" @@ -358,14 +358,14 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" - " addq $64, %3 \n\t" // b=b+8 - " addq $32, %2 \n\t" // a=a+8 + " addq $64, %9 \n\t" // b=b+8 + " addq $32, %8 \n\t" // a=a+8 " vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb - " vmovups 32(%3), %%ymm1 \n\t" - " vmovups %%ymm11, (%2) \n\t" // write a + " vmovups 32(%9), %%ymm1 \n\t" + " vmovups %%ymm11, (%8) \n\t" // write a " vmovups %%ymm11, (%5) \n\t" // write c " vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t" @@ -378,13 +378,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t" - " addq $64, %3 \n\t" // b=b+8 - " addq $32, %2 \n\t" // a=a+8 + " addq $64, %9 \n\t" // b=b+8 + " addq $32, %8 \n\t" // a=a+8 " vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb - " vmovups 32(%3), %%ymm1 \n\t" - " vmovups %%ymm12, (%2) \n\t" // write a + " vmovups 32(%9), %%ymm1 \n\t" + " vmovups %%ymm12, (%8) \n\t" // write a " vmovups %%ymm12, (%5,%7,1) \n\t" // write c " vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t" @@ -394,12 +394,12 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" " vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t" - " addq $64, %3 \n\t" // b=b+8 - " addq $32, %2 \n\t" // a=a+8 + " addq $64, %9 \n\t" // b=b+8 + " addq $32, %8 \n\t" // a=a+8 " vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb - " vmovups 32(%3), %%ymm1 \n\t" - " vmovups %%ymm13, (%2) \n\t" // write a + " vmovups 32(%9), %%ymm1 \n\t" + " vmovups %%ymm13, (%8) \n\t" // write a " vmovups %%ymm13, (%5,%7,2) \n\t" // write c " vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t" @@ -408,39 +408,39 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t" - " addq $64, %3 \n\t" // b=b+8 - " addq $32, %2 \n\t" // a=a+8 + " addq $64, %9 \n\t" // b=b+8 + " addq $32, %8 \n\t" // a=a+8 " vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb - " vmovups 32(%3), %%ymm1 \n\t" - " vmovups %%ymm14, (%2) \n\t" // write a + " vmovups 32(%9), %%ymm1 \n\t" + " vmovups %%ymm14, (%8) \n\t" // write a " vmovups %%ymm14, (%6) \n\t" // write c " vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t" " vpermpd $0xff , %%ymm1 , %%ymm0 \n\t" - " addq $32, %2 \n\t" // a=a+8 + " addq $32, %8 \n\t" // a=a+8 " vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb - " vmovups %%ymm15, (%2) \n\t" // write a + " vmovups %%ymm15, (%8) \n\t" // write a " vmovups %%ymm15, (%6,%7,1) \n\t" // write c " vzeroupper \n\t" : - "+r" (n1), // 0 - "+a" (i), // 1 - "+r" (as), // 2 - "+r" (bs) // 3 : + "r" (n1), // 0 + "a" (i), // 1 + "r" (a), // 2 + "r" (b), // 3 "r" (c), // 4 "r" (c3), // 5 "r" (c6), // 6 "r" (ldc), // 7 - "r" (a), // 8 - "r" (b) // 9 + "r" (as), // 8 + "r" (bs) // 9 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c b/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c index 35ed4cc01..54df5b359 100644 --- a/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c +++ b/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c @@ -125,14 +125,14 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .align 16 \n\t" "1: \n\t" - " prefetcht0 384(%6,%1,8) \n\t" - " prefetcht0 384(%7,%1,8) \n\t" - " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b - " vmovups (%6,%1,8), %%xmm4 \n\t" - " vmovddup 8(%7,%1,2), %%xmm1 \n\t" - " vmovups 16(%6,%1,8), %%xmm5 \n\t" - " vmovups 32(%6,%1,8), %%xmm6 \n\t" - " vmovups 48(%6,%1,8), %%xmm7 \n\t" + " prefetcht0 384(%2,%1,8) \n\t" + " prefetcht0 384(%3,%1,8) \n\t" + " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b + " vmovups (%2,%1,8), %%xmm4 \n\t" + " vmovddup 8(%3,%1,2), %%xmm1 \n\t" + " vmovups 16(%2,%1,8), %%xmm5 \n\t" + " vmovups 32(%2,%1,8), %%xmm6 \n\t" + " vmovups 48(%2,%1,8), %%xmm7 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -147,13 +147,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " jz 2f \n\t" - " prefetcht0 384(%6,%1,8) \n\t" - " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b - " vmovups (%6,%1,8), %%xmm4 \n\t" - " vmovddup 8(%7,%1,2), %%xmm1 \n\t" - " vmovups 16(%6,%1,8), %%xmm5 \n\t" - " vmovups 32(%6,%1,8), %%xmm6 \n\t" - " vmovups 48(%6,%1,8), %%xmm7 \n\t" + " prefetcht0 384(%2,%1,8) \n\t" + " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b + " vmovups (%2,%1,8), %%xmm4 \n\t" + " vmovddup 8(%3,%1,2), %%xmm1 \n\t" + " vmovups 16(%2,%1,8), %%xmm5 \n\t" + " vmovups 32(%2,%1,8), %%xmm6 \n\t" + " vmovups 48(%2,%1,8), %%xmm7 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -168,13 +168,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " jz 2f \n\t" - " prefetcht0 384(%6,%1,8) \n\t" - " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b - " vmovups (%6,%1,8), %%xmm4 \n\t" - " vmovddup 8(%7,%1,2), %%xmm1 \n\t" - " vmovups 16(%6,%1,8), %%xmm5 \n\t" - " vmovups 32(%6,%1,8), %%xmm6 \n\t" - " vmovups 48(%6,%1,8), %%xmm7 \n\t" + " prefetcht0 384(%2,%1,8) \n\t" + " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b + " vmovups (%2,%1,8), %%xmm4 \n\t" + " vmovddup 8(%3,%1,2), %%xmm1 \n\t" + " vmovups 16(%2,%1,8), %%xmm5 \n\t" + " vmovups 32(%2,%1,8), %%xmm6 \n\t" + " vmovups 48(%2,%1,8), %%xmm7 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -189,13 +189,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " jz 2f \n\t" - " prefetcht0 384(%6,%1,8) \n\t" - " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b - " vmovddup 8(%7,%1,2), %%xmm1 \n\t" - " vmovups (%6,%1,8), %%xmm4 \n\t" - " vmovups 16(%6,%1,8), %%xmm5 \n\t" - " vmovups 32(%6,%1,8), %%xmm6 \n\t" - " vmovups 48(%6,%1,8), %%xmm7 \n\t" + " prefetcht0 384(%2,%1,8) \n\t" + " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b + " vmovddup 8(%3,%1,2), %%xmm1 \n\t" + " vmovups (%2,%1,8), %%xmm4 \n\t" + " vmovups 16(%2,%1,8), %%xmm5 \n\t" + " vmovups 32(%2,%1,8), %%xmm6 \n\t" + " vmovups 48(%2,%1,8), %%xmm7 \n\t" " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -235,18 +235,18 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "3: \n\t" // i = 1 - " vmovddup (%3), %%xmm1 \n\t" // read b - " vmovddup 8(%3), %%xmm0 \n\t" // read bb + " vmovddup (%7), %%xmm1 \n\t" // read b + " vmovddup 8(%7), %%xmm0 \n\t" // read bb " vmulpd %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb " vmulpd %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb " vmulpd %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb " vmulpd %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb - " vmovups %%xmm12 , (%2) \n\t" // write a - " vmovups %%xmm13 , 16(%2) \n\t" // write a - " vmovups %%xmm14 , 32(%2) \n\t" // write a - " vmovups %%xmm15 , 48(%2) \n\t" // write a + " vmovups %%xmm12 , (%6) \n\t" // write a + " vmovups %%xmm13 , 16(%6) \n\t" // write a + " vmovups %%xmm14 , 32(%6) \n\t" // write a + " vmovups %%xmm15 , 48(%6) \n\t" // write a " vmovups %%xmm12 , (%5) \n\t" // write c1 " vmovups %%xmm13 , 16(%5) \n\t" @@ -259,20 +259,20 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddpd %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t" " \n\t" // i = 0 - " subq $16 , %3 \n\t" // b = b - 2 - " subq $64 , %2 \n\t" // a = a - 8 + " subq $16 , %7 \n\t" // b = b - 2 + " subq $64 , %6 \n\t" // a = a - 8 - " vmovddup (%3), %%xmm0 \n\t" // read bb + " vmovddup (%7), %%xmm0 \n\t" // read bb " vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb " vmulpd %%xmm9 , %%xmm0 , %%xmm9 \n\t" " vmulpd %%xmm10 , %%xmm0 , %%xmm10 \n\t" " vmulpd %%xmm11 , %%xmm0 , %%xmm11 \n\t" - " vmovups %%xmm8 , (%2) \n\t" // write a - " vmovups %%xmm9 , 16(%2) \n\t" - " vmovups %%xmm10 , 32(%2) \n\t" - " vmovups %%xmm11 , 48(%2) \n\t" + " vmovups %%xmm8 , (%6) \n\t" // write a + " vmovups %%xmm9 , 16(%6) \n\t" + " vmovups %%xmm10 , 32(%6) \n\t" + " vmovups %%xmm11 , 48(%6) \n\t" " vmovups %%xmm8 , (%4) \n\t" // write c0 " vmovups %%xmm9 , 16(%4) \n\t" @@ -282,15 +282,15 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vzeroupper \n\t" : - "+r" (n1), // 0 - "+a" (i), // 1 - "+r" (as), // 2 - "+r" (bs) // 3 : + "r" (n1), // 0 + "a" (i), // 1 + "r" (a), // 2 + "r" (b), // 3 "r" (c), // 4 "r" (c1), // 5 - "r" (a), // 6 - "r" (b) // 7 + "r" (as), // 6 + "r" (bs) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/kernel/x86_64/gemm_kernel_4x8_nano.S b/kernel/x86_64/gemm_kernel_4x8_nano.S index e29520fa1..074562804 100644 --- a/kernel/x86_64/gemm_kernel_4x8_nano.S +++ b/kernel/x86_64/gemm_kernel_4x8_nano.S @@ -135,7 +135,7 @@ #endif movq %rsp, %rbx # save old stack - subq $256 + LOCAL_BUFFER_SIZE, %rsp + subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING diff --git a/kernel/x86_64/gemm_kernel_8x4_sse.S b/kernel/x86_64/gemm_kernel_8x4_sse.S index 1602c13c5..c4ef1f809 100644 --- a/kernel/x86_64/gemm_kernel_8x4_sse.S +++ b/kernel/x86_64/gemm_kernel_8x4_sse.S @@ -383,7 +383,7 @@ EMMS movq %rsp, %rbx # save old stack - subq $256 + LOCAL_BUFFER_SIZE, %rsp + subq $128 + LOCAL_BUFFER_SIZE, %rsp andq $-4096, %rsp # align stack STACK_TOUCHING diff --git a/kernel/x86_64/iamax_sse.S b/kernel/x86_64/iamax_sse.S index d50c1699c..f22e34a1d 100644 --- a/kernel/x86_64/iamax_sse.S +++ b/kernel/x86_64/iamax_sse.S @@ -36,10 +36,6 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -/* This kernel was found to give wrong results when used for ISMIN/ISAMIN - with increment != 1, although it appears to be correct for corresponding - MAX operations. See issue 2116 */ - #define ASSEMBLER #include "common.h" @@ -52,11 +48,9 @@ #define XX %r10 #define MM %r11 -#define MAXPS maxps -#define MAXSS maxss #ifdef USE_MIN -#define MAXPS minps -#define MAXSS minss +#define maxps minps +#define maxss minss #endif #include "l1param.h" @@ -109,7 +103,7 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - MAXSS %xmm4, %xmm0 + maxss %xmm4, %xmm0 decq M addq $SIZE, X ALIGN_3 @@ -123,7 +117,7 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - MAXPS %xmm4, %xmm1 + maxps %xmm4, %xmm1 subq $2, M addq $2 * SIZE, X ALIGN_3 @@ -143,25 +137,25 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - MAXPS %xmm4, %xmm0 + maxps %xmm4, %xmm0 movaps 4 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm5 #endif - MAXPS %xmm5, %xmm1 + maxps %xmm5, %xmm1 movaps 8 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm15, %xmm6 #endif - MAXPS %xmm6, %xmm2 + maxps %xmm6, %xmm2 movaps 12 * SIZE(X), %xmm7 #ifdef USE_ABS andps %xmm15, %xmm7 #endif - MAXPS %xmm7, %xmm3 + maxps %xmm7, %xmm3 addq $16 * SIZE, X decq I @@ -179,13 +173,13 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - MAXPS %xmm4, %xmm0 + maxps %xmm4, %xmm0 movaps 4 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm5 #endif - MAXPS %xmm5, %xmm1 + maxps %xmm5, %xmm1 addq $8 * SIZE, X ALIGN_3 @@ -197,7 +191,7 @@ #ifdef USE_ABS andps %xmm15, %xmm6 #endif - MAXPS %xmm6, %xmm2 + maxps %xmm6, %xmm2 addq $4 * SIZE, X ALIGN_3 @@ -210,7 +204,7 @@ #ifdef USE_ABS andps %xmm15, %xmm7 #endif - MAXPS %xmm7, %xmm3 + maxps %xmm7, %xmm3 addq $2 * SIZE, X .L18: @@ -221,22 +215,22 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - MAXSS %xmm4, %xmm0 + maxss %xmm4, %xmm0 ALIGN_3 .L20: movq XX, X movq MM, M - MAXPS %xmm1, %xmm0 - MAXPS %xmm3, %xmm2 - MAXPS %xmm2, %xmm0 + maxps %xmm1, %xmm0 + maxps %xmm3, %xmm2 + maxps %xmm2, %xmm0 movaps %xmm0, %xmm1 movhlps %xmm0, %xmm0 - MAXPS %xmm1, %xmm0 + maxps %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 - MAXSS %xmm1, %xmm0 + maxss %xmm1, %xmm0 shufps $0, %xmm0, %xmm0 testq $4, X @@ -433,28 +427,28 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - MAXPS %xmm4, %xmm0 + maxps %xmm4, %xmm0 movsd 4 * SIZE(X), %xmm5 movhps 6 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm5 #endif - MAXPS %xmm5, %xmm1 + maxps %xmm5, %xmm1 movsd 8 * SIZE(X), %xmm6 movhps 10 * SIZE(X), %xmm6 #ifdef USE_ABS andps %xmm15, %xmm6 #endif - MAXPS %xmm6, %xmm2 + maxps %xmm6, %xmm2 movsd 12 * SIZE(X), %xmm7 movhps 14 * SIZE(X), %xmm7 #ifdef USE_ABS andps %xmm15, %xmm7 #endif - MAXPS %xmm7, %xmm3 + maxps %xmm7, %xmm3 addq $16 * SIZE, X decq I @@ -473,14 +467,14 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - MAXPS %xmm4, %xmm0 + maxps %xmm4, %xmm0 movsd 4 * SIZE(X), %xmm5 movhps 6 * SIZE(X), %xmm5 #ifdef USE_ABS andps %xmm15, %xmm5 #endif - MAXPS %xmm5, %xmm1 + maxps %xmm5, %xmm1 addq $8 * SIZE, X ALIGN_3 @@ -494,7 +488,7 @@ #ifdef USE_ABS andps %xmm15, %xmm6 #endif - MAXPS %xmm6, %xmm2 + maxps %xmm6, %xmm2 addq $4 * SIZE, X ALIGN_3 @@ -507,7 +501,7 @@ #ifdef USE_ABS andps %xmm15, %xmm7 #endif - MAXPS %xmm7, %xmm3 + maxps %xmm7, %xmm3 addq $2 * SIZE, X .L38: @@ -518,7 +512,7 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - MAXSS %xmm4, %xmm0 + maxss %xmm4, %xmm0 jmp .L40 ALIGN_4 @@ -526,15 +520,15 @@ movq XX, X movq MM, M - MAXPS %xmm1, %xmm0 - MAXPS %xmm3, %xmm2 - MAXPS %xmm2, %xmm0 + maxps %xmm1, %xmm0 + maxps %xmm3, %xmm2 + maxps %xmm2, %xmm0 movaps %xmm0, %xmm1 movhlps %xmm0, %xmm0 - MAXPS %xmm1, %xmm0 + maxps %xmm1, %xmm0 movaps %xmm0, %xmm1 shufps $1, %xmm0, %xmm0 - MAXSS %xmm1, %xmm0 + maxss %xmm1, %xmm0 shufps $0, %xmm0, %xmm0 movq M, I @@ -693,56 +687,56 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - MAXSS %xmm4, %xmm0 + maxss %xmm4, %xmm0 movss 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif - MAXSS %xmm5, %xmm1 + maxss %xmm5, %xmm1 movss 0 * SIZE(X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif - MAXSS %xmm6, %xmm2 + maxss %xmm6, %xmm2 movss 0 * SIZE(X), %xmm7 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm7 #endif - MAXSS %xmm7, %xmm3 + maxss %xmm7, %xmm3 movss 0 * SIZE(X), %xmm4 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm4 #endif - MAXSS %xmm4, %xmm0 + maxss %xmm4, %xmm0 movss 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif - MAXSS %xmm5, %xmm1 + maxss %xmm5, %xmm1 movss 0 * SIZE(X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif - MAXSS %xmm6, %xmm2 + maxss %xmm6, %xmm2 movss 0 * SIZE(X), %xmm7 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm7 #endif - MAXSS %xmm7, %xmm3 + maxss %xmm7, %xmm3 decq I jg .L81 @@ -760,28 +754,28 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - MAXSS %xmm4, %xmm0 + maxss %xmm4, %xmm0 movss 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif - MAXSS %xmm5, %xmm1 + maxss %xmm5, %xmm1 movss 0 * SIZE(X), %xmm6 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm6 #endif - MAXSS %xmm6, %xmm2 + maxss %xmm6, %xmm2 movss 0 * SIZE(X), %xmm7 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm7 #endif - MAXSS %xmm7, %xmm3 + maxss %xmm7, %xmm3 ALIGN_3 .L86: @@ -793,14 +787,14 @@ #ifdef USE_ABS andps %xmm15, %xmm4 #endif - MAXSS %xmm4, %xmm0 + maxss %xmm4, %xmm0 movss 0 * SIZE(X), %xmm5 addq INCX, X #ifdef USE_ABS andps %xmm15, %xmm5 #endif - MAXSS %xmm5, %xmm1 + maxss %xmm5, %xmm1 ALIGN_3 .L87: @@ -812,16 +806,16 @@ #ifdef USE_ABS andps %xmm15, %xmm6 #endif - MAXSS %xmm6, %xmm2 + maxss %xmm6, %xmm2 ALIGN_4 .L90: movq XX, X movq MM, M - MAXSS %xmm1, %xmm0 - MAXSS %xmm3, %xmm2 - MAXSS %xmm2, %xmm0 + maxss %xmm1, %xmm0 + maxss %xmm3, %xmm2 + maxss %xmm2, %xmm0 shufps $0, %xmm0, %xmm0 movq M, I diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index 63697970f..65305ac59 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -149,9 +149,9 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "jnz 1b \n\t" : - "+r" (i), // 0 - "+r" (n) // 1 - : + : + "r" (i), // 0 + "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap[0]), // 4 @@ -223,9 +223,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a "3: \n\t" : - "+r" (i), // 0 - "+r" (n1) // 1 : + "r" (i), // 0 + "r" (n1), // 1 "r" (x), // 2 "r" (y), // 3 "r" (ap), // 4 @@ -277,9 +277,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) "jnz 1b \n\t" : - "+r" (i), // 0 - "+r" (n) // 1 : + "r" (i), // 0 + "r" (n), // 1 "r" (src), // 2 "r" (dest) // 3 : "cc", diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c index bbf06c84b..31001c7f3 100644 --- a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c +++ b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c @@ -37,14 +37,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO __asm__ __volatile__ ( - "vbroadcastss (%3), %%xmm12 \n\t" // x0 - "vbroadcastss 4(%3), %%xmm13 \n\t" // x1 - "vbroadcastss 8(%3), %%xmm14 \n\t" // x2 - "vbroadcastss 12(%3), %%xmm15 \n\t" // x3 - "vbroadcastss 16(%3), %%xmm0 \n\t" // x4 - "vbroadcastss 20(%3), %%xmm1 \n\t" // x5 - "vbroadcastss 24(%3), %%xmm2 \n\t" // x6 - "vbroadcastss 28(%3), %%xmm3 \n\t" // x7 + "vbroadcastss (%2), %%xmm12 \n\t" // x0 + "vbroadcastss 4(%2), %%xmm13 \n\t" // x1 + "vbroadcastss 8(%2), %%xmm14 \n\t" // x2 + "vbroadcastss 12(%2), %%xmm15 \n\t" // x3 + "vbroadcastss 16(%2), %%xmm0 \n\t" // x4 + "vbroadcastss 20(%2), %%xmm1 \n\t" // x5 + "vbroadcastss 24(%2), %%xmm2 \n\t" // x6 + "vbroadcastss 28(%2), %%xmm3 \n\t" // x7 "vbroadcastss (%9), %%xmm8 \n\t" // alpha @@ -54,22 +54,22 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" - "vfmaddps %%xmm5, (%6,%0,4), %%xmm13, %%xmm5 \n\t" - "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" - "vfmaddps %%xmm5, (%8,%0,4), %%xmm15, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t" "addq $4 , %0 \n\t" - "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" - "vfmaddps %%xmm5, (%6,%2,4), %%xmm1 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" - "vfmaddps %%xmm5, (%8,%2,4), %%xmm3 , %%xmm5 \n\t" - "addq $4 , %2 \n\t" + "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" + "vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" + "vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t" + "addq $4 , %8 \n\t" "vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t" - "vfmaddps -16(%4,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t" + "vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t" "subq $4 , %1 \n\t" - "vmovups %%xmm6, -16(%4,%0,4) \n\t" // 4 * y + "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y "2: \n\t" @@ -79,31 +79,31 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t" - "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t" - "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t" - "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t" - - "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t" - "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" + + "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" + "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" - "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" - "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" - "vmovups %%xmm4, (%4,%0,4) \n\t" // 4 * y - "vmovups %%xmm5, 16(%4,%0,4) \n\t" // 4 * y + "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" + "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" + "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y "addq $8 , %0 \n\t" - "addq $8 , %2 \n\t" + "addq $8 , %8 \n\t" "subq $8 , %1 \n\t" @@ -120,62 +120,62 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t" "vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t" + "prefetcht0 192(%4,%0,4) \n\t" + "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" "prefetcht0 192(%5,%0,4) \n\t" - "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" "prefetcht0 192(%6,%0,4) \n\t" - "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" "prefetcht0 192(%7,%0,4) \n\t" - "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t" - "prefetcht0 192(%8,%0,4) \n\t" - "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t" + "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" ".align 2 \n\t" - "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t" - - "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm12, %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm12, %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm13, %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm13, %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm14, %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm14, %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%8,%0,4), %%xmm15, %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%8,%0,4), %%xmm15, %%xmm7 \n\t" - - "prefetcht0 192(%5,%2,4) \n\t" - "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t" - "prefetcht0 192(%6,%2,4) \n\t" - "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t" - "prefetcht0 192(%7,%2,4) \n\t" - "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t" - "prefetcht0 192(%8,%2,4) \n\t" - "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t" - "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t" + "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" + + "vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t" + + "prefetcht0 192(%4,%8,4) \n\t" + "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" + "prefetcht0 192(%5,%8,4) \n\t" + "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" + "prefetcht0 192(%6,%8,4) \n\t" + "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" + "prefetcht0 192(%7,%8,4) \n\t" + "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" + "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" - "vfmaddps %%xmm6, 32(%5,%2,4), %%xmm0 , %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%5,%2,4), %%xmm0 , %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%6,%2,4), %%xmm1 , %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%6,%2,4), %%xmm1 , %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%7,%2,4), %%xmm2 , %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%7,%2,4), %%xmm2 , %%xmm7 \n\t" - "vfmaddps %%xmm6, 32(%8,%2,4), %%xmm3 , %%xmm6 \n\t" - "vfmaddps %%xmm7, 48(%8,%2,4), %%xmm3 , %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t" + "vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t" + "vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t" - "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" - "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" - "vfmaddps 32(%4,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t" - "vfmaddps 48(%4,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t" + "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" + "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" + "vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t" + "vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t" "addq $16, %0 \n\t" - "vmovups %%xmm4,-64(%4,%0,4) \n\t" // 4 * y - "vmovups %%xmm5,-48(%4,%0,4) \n\t" // 4 * y - "addq $16, %2 \n\t" - "vmovups %%xmm6,-32(%4,%0,4) \n\t" // 4 * y - "vmovups %%xmm7,-16(%4,%0,4) \n\t" // 4 * y + "vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y + "addq $16, %8 \n\t" + "vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y + "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y "subq $16, %1 \n\t" "jnz 1b \n\t" @@ -184,15 +184,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO : "+r" (i), // 0 - "+r" (n), // 1 - "+r" (lda4) // 2 + "+r" (n) // 1 : - "r" (x), // 3 - "r" (y), // 4 - "r" (ap[0]), // 5 - "r" (ap[1]), // 6 - "r" (ap[2]), // 7 - "r" (ap[3]), // 8 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c index 93e1e26e8..2c90f8aa9 100644 --- a/kernel/x86_64/sgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c @@ -26,6 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ + #define HAVE_KERNEL_4x8 1 static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); @@ -37,41 +38,41 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO __asm__ __volatile__ ( "vzeroupper \n\t" - "vbroadcastss (%3), %%ymm12 \n\t" // x0 - "vbroadcastss 4(%3), %%ymm13 \n\t" // x1 - "vbroadcastss 8(%3), %%ymm14 \n\t" // x2 - "vbroadcastss 12(%3), %%ymm15 \n\t" // x3 - "vbroadcastss 16(%3), %%ymm0 \n\t" // x4 - "vbroadcastss 20(%3), %%ymm1 \n\t" // x5 - "vbroadcastss 24(%3), %%ymm2 \n\t" // x6 - "vbroadcastss 28(%3), %%ymm3 \n\t" // x7 + "vbroadcastss (%2), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 + "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 + "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 + "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 + "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 "vbroadcastss (%9), %%ymm6 \n\t" // alpha "testq $0x04, %1 \n\t" "jz 2f \n\t" - "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y + "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y "vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t" "vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t" - "vfmadd231ps (%5,%0,4), %%xmm12, %%xmm4 \n\t" - "vfmadd231ps (%6,%0,4), %%xmm13, %%xmm5 \n\t" - "vfmadd231ps (%7,%0,4), %%xmm14, %%xmm4 \n\t" - "vfmadd231ps (%8,%0,4), %%xmm15, %%xmm5 \n\t" + "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" + "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t" + "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" + "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t" - "vfmadd231ps (%5,%2,4), %%xmm0 , %%xmm4 \n\t" - "vfmadd231ps (%6,%2,4), %%xmm1 , %%xmm5 \n\t" - "vfmadd231ps (%7,%2,4), %%xmm2 , %%xmm4 \n\t" - "vfmadd231ps (%8,%2,4), %%xmm3 , %%xmm5 \n\t" + "vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t" + "vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t" + "vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t" + "vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t" "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t" "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t" - "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y + "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y - "addq $4 , %2 \n\t" + "addq $4 , %8 \n\t" "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" @@ -80,28 +81,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "testq $0x08, %1 \n\t" "jz 3f \n\t" - "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y + "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t" - "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm5 \n\t" - "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t" - "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm5 \n\t" + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t" - "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t" - "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm5 \n\t" - "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t" - "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm5 \n\t" + "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" + "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t" + "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" + "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t" "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t" "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t" - "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y - "addq $8 , %2 \n\t" + "addq $8 , %8 \n\t" "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" @@ -116,35 +117,35 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" - "vmovups (%4,%0,4), %%ymm8 \n\t" // 8 * y - "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 8 * y - - "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t" - "vfmadd231ps 32(%5,%0,4), %%ymm12, %%ymm5 \n\t" - "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm4 \n\t" - "vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm5 \n\t" - "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t" - "vfmadd231ps 32(%7,%0,4), %%ymm14, %%ymm5 \n\t" - "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm4 \n\t" - "vfmadd231ps 32(%8,%0,4), %%ymm15, %%ymm5 \n\t" - - "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t" + "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y + "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y + + "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" + "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" + "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" + "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" + "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" + "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" + "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" + "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" + + "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" "addq $16, %0 \n\t" - "vfmadd231ps 32(%5,%2,4), %%ymm0 , %%ymm5 \n\t" - "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm4 \n\t" - "vfmadd231ps 32(%6,%2,4), %%ymm1 , %%ymm5 \n\t" - "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t" - "vfmadd231ps 32(%7,%2,4), %%ymm2 , %%ymm5 \n\t" - "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm4 \n\t" - "vfmadd231ps 32(%8,%2,4), %%ymm3 , %%ymm5 \n\t" + "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t" + "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t" + "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t" + "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" + "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t" + "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t" + "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t" "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t" "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t" - "addq $16, %2 \n\t" - "vmovups %%ymm8,-64(%4,%0,4) \n\t" // 8 * y + "addq $16, %8 \n\t" + "vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y "subq $16, %1 \n\t" - "vmovups %%ymm9,-32(%4,%0,4) \n\t" // 8 * y + "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y "jnz 1b \n\t" @@ -153,15 +154,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO : "+r" (i), // 0 - "+r" (n), // 1 - "+r" (lda4) // 2 + "+r" (n) // 1 : - "r" (x), // 3 - "r" (y), // 4 - "r" (ap[0]), // 5 - "r" (ap[1]), // 6 - "r" (ap[2]), // 7 - "r" (ap[3]), // 8 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", @@ -176,6 +177,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO } + #define HAVE_KERNEL_4x4 1 static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); @@ -194,7 +196,6 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vbroadcastss (%8), %%ymm6 \n\t" // alpha - "testq $0x04, %1 \n\t" "jz 2f \n\t" diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-4.c b/kernel/x86_64/sgemv_n_microk_nehalem-4.c index d21232bfa..11a3e943b 100644 --- a/kernel/x86_64/sgemv_n_microk_nehalem-4.c +++ b/kernel/x86_64/sgemv_n_microk_nehalem-4.c @@ -37,19 +37,19 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO __asm__ __volatile__ ( - "movss (%3), %%xmm12 \n\t" // x0 - "movss 4(%3), %%xmm13 \n\t" // x1 - "movss 8(%3), %%xmm14 \n\t" // x2 - "movss 12(%3), %%xmm15 \n\t" // x3 + "movss (%2), %%xmm12 \n\t" // x0 + "movss 4(%2), %%xmm13 \n\t" // x1 + "movss 8(%2), %%xmm14 \n\t" // x2 + "movss 12(%2), %%xmm15 \n\t" // x3 "shufps $0, %%xmm12, %%xmm12\n\t" "shufps $0, %%xmm13, %%xmm13\n\t" "shufps $0, %%xmm14, %%xmm14\n\t" "shufps $0, %%xmm15, %%xmm15\n\t" - "movss 16(%3), %%xmm0 \n\t" // x4 - "movss 20(%3), %%xmm1 \n\t" // x5 - "movss 24(%3), %%xmm2 \n\t" // x6 - "movss 28(%3), %%xmm3 \n\t" // x7 + "movss 16(%2), %%xmm0 \n\t" // x4 + "movss 20(%2), %%xmm1 \n\t" // x5 + "movss 24(%2), %%xmm2 \n\t" // x6 + "movss 28(%2), %%xmm3 \n\t" // x7 "shufps $0, %%xmm0 , %%xmm0 \n\t" "shufps $0, %%xmm1 , %%xmm1 \n\t" "shufps $0, %%xmm2 , %%xmm2 \n\t" @@ -63,13 +63,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "1: \n\t" "xorps %%xmm4 , %%xmm4 \n\t" "xorps %%xmm5 , %%xmm5 \n\t" - "movups (%4,%0,4), %%xmm7 \n\t" // 4 * y + "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y ".p2align 1 \n\t" - "movups (%5,%0,4), %%xmm8 \n\t" - "movups (%6,%0,4), %%xmm9 \n\t" - "movups (%7,%0,4), %%xmm10 \n\t" - "movups (%8,%0,4), %%xmm11 \n\t" + "movups (%4,%0,4), %%xmm8 \n\t" + "movups (%5,%0,4), %%xmm9 \n\t" + "movups (%6,%0,4), %%xmm10 \n\t" + "movups (%7,%0,4), %%xmm11 \n\t" ".p2align 1 \n\t" "mulps %%xmm12, %%xmm8 \n\t" "mulps %%xmm13, %%xmm9 \n\t" @@ -80,10 +80,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "addps %%xmm10, %%xmm4 \n\t" "addps %%xmm11, %%xmm5 \n\t" - "movups (%5,%2,4), %%xmm8 \n\t" - "movups (%6,%2,4), %%xmm9 \n\t" - "movups (%7,%2,4), %%xmm10 \n\t" - "movups (%8,%2,4), %%xmm11 \n\t" + "movups (%4,%8,4), %%xmm8 \n\t" + "movups (%5,%8,4), %%xmm9 \n\t" + "movups (%6,%8,4), %%xmm10 \n\t" + "movups (%7,%8,4), %%xmm11 \n\t" ".p2align 1 \n\t" "mulps %%xmm0 , %%xmm8 \n\t" "mulps %%xmm1 , %%xmm9 \n\t" @@ -94,28 +94,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "addps %%xmm10, %%xmm4 \n\t" "addps %%xmm11, %%xmm5 \n\t" - "addq $4 , %2 \n\t" + "addq $4 , %8 \n\t" "addps %%xmm5 , %%xmm4 \n\t" "addq $4 , %0 \n\t" "mulps %%xmm6 , %%xmm4 \n\t" "subq $4 , %1 \n\t" "addps %%xmm4 , %%xmm7 \n\t" - "movups %%xmm7 , -16(%4,%0,4) \n\t" // 4 * y + "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y "jnz 1b \n\t" : "+r" (i), // 0 - "+r" (n), // 1 - "+r" (lda4) // 2 + "+r" (n) // 1 : - "r" (x), // 3 - "r" (y), // 4 - "r" (ap[0]), // 5 - "r" (ap[1]), // 6 - "r" (ap[2]), // 7 - "r" (ap[3]), // 8 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", diff --git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c index 3fc46542b..b35daa35b 100644 --- a/kernel/x86_64/sgemv_n_microk_sandy-4.c +++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c @@ -39,14 +39,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO __asm__ __volatile__ ( "vzeroupper \n\t" - "vbroadcastss (%3), %%ymm12 \n\t" // x0 - "vbroadcastss 4(%3), %%ymm13 \n\t" // x1 - "vbroadcastss 8(%3), %%ymm14 \n\t" // x2 - "vbroadcastss 12(%3), %%ymm15 \n\t" // x3 - "vbroadcastss 16(%3), %%ymm0 \n\t" // x4 - "vbroadcastss 20(%3), %%ymm1 \n\t" // x5 - "vbroadcastss 24(%3), %%ymm2 \n\t" // x6 - "vbroadcastss 28(%3), %%ymm3 \n\t" // x7 + "vbroadcastss (%2), %%ymm12 \n\t" // x0 + "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 + "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 + "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 + "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 + "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 + "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 + "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 "vbroadcastss (%9), %%ymm6 \n\t" // alpha @@ -55,21 +55,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t" "vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t" - "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y + "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y - "vmulps (%5,%0,4), %%xmm12, %%xmm8 \n\t" - "vmulps (%6,%0,4), %%xmm13, %%xmm10 \n\t" - "vmulps (%7,%0,4), %%xmm14, %%xmm9 \n\t" - "vmulps (%8,%0,4), %%xmm15, %%xmm11 \n\t" + "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" + "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" + "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" + "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" - "vmulps (%5,%2,4), %%xmm0 , %%xmm8 \n\t" - "vmulps (%6,%2,4), %%xmm1 , %%xmm10 \n\t" - "vmulps (%7,%2,4), %%xmm2 , %%xmm9 \n\t" - "vmulps (%8,%2,4), %%xmm3 , %%xmm11 \n\t" + "vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t" + "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t" + "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t" + "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t" "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" @@ -79,9 +79,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t" "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t" - "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y + "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y - "addq $4, %2 \n\t" + "addq $4, %8 \n\t" "addq $4, %0 \n\t" "subq $4, %1 \n\t" @@ -92,21 +92,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" - "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y + "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y - "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t" - "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t" - "vmulps (%7,%0,4), %%ymm14, %%ymm9 \n\t" - "vmulps (%8,%0,4), %%ymm15, %%ymm11 \n\t" + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" + "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" + "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t" - "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t" - "vmulps (%7,%2,4), %%ymm2 , %%ymm9 \n\t" - "vmulps (%8,%2,4), %%ymm3 , %%ymm11 \n\t" + "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" + "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" + "vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t" + "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" @@ -116,9 +116,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t" "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t" - "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y - "addq $8, %2 \n\t" + "addq $8, %8 \n\t" "addq $8, %0 \n\t" "subq $8, %1 \n\t" @@ -134,45 +134,45 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" + "prefetcht0 192(%4,%0,4) \n\t" + "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" + "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t" "prefetcht0 192(%5,%0,4) \n\t" - "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t" - "vmulps 32(%5,%0,4), %%ymm12, %%ymm9 \n\t" - "prefetcht0 192(%6,%0,4) \n\t" - "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t" - "vmulps 32(%6,%0,4), %%ymm13, %%ymm11 \n\t" + "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" + "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + "prefetcht0 192(%6,%0,4) \n\t" + "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" + "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" "prefetcht0 192(%7,%0,4) \n\t" - "vmulps (%7,%0,4), %%ymm14, %%ymm8 \n\t" - "vmulps 32(%7,%0,4), %%ymm14, %%ymm9 \n\t" - "prefetcht0 192(%8,%0,4) \n\t" - "vmulps (%8,%0,4), %%ymm15, %%ymm10 \n\t" - "vmulps 32(%8,%0,4), %%ymm15, %%ymm11 \n\t" + "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" + "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - "prefetcht0 192(%5,%2,4) \n\t" - "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t" - "vmulps 32(%5,%2,4), %%ymm0 , %%ymm9 \n\t" - "prefetcht0 192(%6,%2,4) \n\t" - "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t" - "vmulps 32(%6,%2,4), %%ymm1 , %%ymm11 \n\t" + "prefetcht0 192(%4,%8,4) \n\t" + "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" + "vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t" + "prefetcht0 192(%5,%8,4) \n\t" + "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" + "vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - "prefetcht0 192(%7,%2,4) \n\t" - "vmulps (%7,%2,4), %%ymm2 , %%ymm8 \n\t" - "vmulps 32(%7,%2,4), %%ymm2 , %%ymm9 \n\t" - "prefetcht0 192(%8,%2,4) \n\t" - "vmulps (%8,%2,4), %%ymm3 , %%ymm10 \n\t" - "vmulps 32(%8,%2,4), %%ymm3 , %%ymm11 \n\t" + "prefetcht0 192(%6,%8,4) \n\t" + "vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t" + "vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t" + "prefetcht0 192(%7,%8,4) \n\t" + "vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t" + "vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t" "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" @@ -181,13 +181,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO "vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t" "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t" - "vaddps (%4,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y - "vaddps 32(%4,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y + "vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y + "vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y - "vmovups %%ymm4, (%4,%0,4) \n\t" // 8 * y - "vmovups %%ymm5, 32(%4,%0,4) \n\t" // 8 * y + "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y + "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y - "addq $16, %2 \n\t" + "addq $16, %8 \n\t" "addq $16, %0 \n\t" "subq $16, %1 \n\t" "jnz 1b \n\t" @@ -197,15 +197,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO : "+r" (i), // 0 - "+r" (n), // 1 - "+r" (lda4) // 2 + "+r" (n) // 1 : - "r" (x), // 3 - "r" (y), // 4 - "r" (ap[0]), // 5 - "r" (ap[1]), // 6 - "r" (ap[2]), // 7 - "r" (ap[3]), // 8 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 + "r" (ap[1]), // 5 + "r" (ap[2]), // 6 + "r" (ap[3]), // 7 + "r" (lda4), // 8 "r" (alpha) // 9 : "cc", "%xmm0", "%xmm1", diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index 86ecaf516..065e5b385 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -139,9 +139,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT "movss %%xmm11,4(%2) \n\t" : - "+r" (i), // 0 - "+r" (n) // 1 - : + : + "r" (i), // 0 + "r" (n), // 1 "r" (y), // 2 "r" (ap0), // 3 "r" (ap1), // 4 @@ -208,9 +208,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "movss %%xmm10, (%2) \n\t" : - "+r" (i), // 0 - "+r" (n) // 1 - : + : + "r" (i), // 0 + "r" (n), // 1 "r" (y), // 2 "r" (ap), // 3 "r" (x) // 4 @@ -272,9 +272,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d "jnz 1b \n\t" : - "+r" (i), // 0 - "+r" (n) // 1 - : + : + "r" (i), // 0 + "r" (n), // 1 "r" (&da), // 2 "r" (src), // 3 "r" (dest) // 4 diff --git a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c index 602c3edf2..9002228f3 100644 --- a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c +++ b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c @@ -98,8 +98,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 : - "+r" (from) // 0 - : + : + "r" (from), // 0 "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/ssymv_L_microk_haswell-2.c b/kernel/x86_64/ssymv_L_microk_haswell-2.c index fdfe4349a..69db008b6 100644 --- a/kernel/x86_64/ssymv_L_microk_haswell-2.c +++ b/kernel/x86_64/ssymv_L_microk_haswell-2.c @@ -99,8 +99,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vzeroupper \n\t" : - "+r" (from) // 0 - : + : + "r" (from), // 0 "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/ssymv_L_microk_nehalem-2.c b/kernel/x86_64/ssymv_L_microk_nehalem-2.c index 6bb9c02f6..c0fe5d640 100644 --- a/kernel/x86_64/ssymv_L_microk_nehalem-2.c +++ b/kernel/x86_64/ssymv_L_microk_nehalem-2.c @@ -113,8 +113,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F "movss %%xmm3 , 12(%9) \n\t" // save temp2 : - "+r" (from) // 0 - : + : + "r" (from), // 0 "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/ssymv_L_microk_sandy-2.c b/kernel/x86_64/ssymv_L_microk_sandy-2.c index 0c78212e7..093ca8073 100644 --- a/kernel/x86_64/ssymv_L_microk_sandy-2.c +++ b/kernel/x86_64/ssymv_L_microk_sandy-2.c @@ -109,8 +109,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vzeroupper \n\t" : - "+r" (from) // 0 - : + : + "r" (from), // 0 "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 @@ -217,8 +217,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vzeroupper \n\t" : - "+r" (from) // 0 - : + : + "r" (from), // 0 "r" (to), // 1 "r" (x), // 2 "r" (y), // 3 diff --git a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c index 4a4f4d68d..8c01ab806 100644 --- a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c +++ b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c @@ -90,9 +90,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 : - "+r" (i), // 0 - "+r" (n) // 1 - : + : + "r" (i), // 0 + "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/ssymv_U_microk_haswell-2.c b/kernel/x86_64/ssymv_U_microk_haswell-2.c index e6a09ccf8..a32e59b44 100644 --- a/kernel/x86_64/ssymv_U_microk_haswell-2.c +++ b/kernel/x86_64/ssymv_U_microk_haswell-2.c @@ -112,9 +112,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vzeroupper \n\t" : - "+r" (i), // 0 - "+r" (n) // 1 - : + : + "r" (i), // 0 + "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/ssymv_U_microk_nehalem-2.c b/kernel/x86_64/ssymv_U_microk_nehalem-2.c index c56ff3b15..b8e6ee732 100644 --- a/kernel/x86_64/ssymv_U_microk_nehalem-2.c +++ b/kernel/x86_64/ssymv_U_microk_nehalem-2.c @@ -106,9 +106,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "movss %%xmm3 , 12(%9) \n\t" // save temp2 : - "+r" (i), // 0 - "+r" (n) // 1 - : + : + "r" (i), // 0 + "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/ssymv_U_microk_sandy-2.c b/kernel/x86_64/ssymv_U_microk_sandy-2.c index c4919a39a..e8650650c 100644 --- a/kernel/x86_64/ssymv_U_microk_sandy-2.c +++ b/kernel/x86_64/ssymv_U_microk_sandy-2.c @@ -120,9 +120,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "vzeroupper \n\t" : - "+r" (i), // 0 - "+r" (n) // 1 - : + : + "r" (i), // 0 + "r" (n), // 1 "r" (x), // 2 "r" (y), // 3 "r" (a0), // 4 diff --git a/kernel/x86_64/strsm_kernel_LN_bulldozer.c b/kernel/x86_64/strsm_kernel_LN_bulldozer.c index 3cd215000..1b8991c6c 100644 --- a/kernel/x86_64/strsm_kernel_LN_bulldozer.c +++ b/kernel/x86_64/strsm_kernel_LN_bulldozer.c @@ -126,12 +126,12 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .align 16 \n\t" "1: \n\t" - " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b - " vmovups (%6,%1,8), %%xmm4 \n\t" - " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" - " vmovups 16(%6,%1,8), %%xmm5 \n\t" - " vmovups 32(%6,%1,8), %%xmm6 \n\t" - " vmovups 48(%6,%1,8), %%xmm7 \n\t" + " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b + " vmovups (%2,%1,8), %%xmm4 \n\t" + " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" + " vmovups 16(%2,%1,8), %%xmm5 \n\t" + " vmovups 32(%2,%1,8), %%xmm6 \n\t" + " vmovups 48(%2,%1,8), %%xmm7 \n\t" " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -171,20 +171,20 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "3: \n\t" - " vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i] + " vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i] " vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -194,23 +194,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " subq $64 , %2 \n\t" // a -= m - " subq $8 , %3 \n\t" // b -= n + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n - " vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i] + " vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i] " vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -220,23 +220,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " subq $64 , %2 \n\t" // a -= m - " subq $8 , %3 \n\t" // b -= n + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n - " vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i] + " vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i] " vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -246,22 +246,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " subq $64 , %2 \n\t" // a -= m - " subq $8 , %3 \n\t" // b -= n + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n - " vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i] + " vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i] " vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -269,22 +269,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " subq $64 , %2 \n\t" // a -= m - " subq $8 , %3 \n\t" // b -= n + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n - " vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i] + " vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i] " vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -292,22 +292,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " subq $64 , %2 \n\t" // a -= m - " subq $8 , %3 \n\t" // b -= n + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n - " vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i] + " vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i] " vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -315,22 +315,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " subq $64 , %2 \n\t" // a -= m - " subq $8 , %3 \n\t" // b -= n + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n - " vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9 , read aa[i] + " vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9 , read aa[i] " vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -338,179 +338,179 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " subq $64 , %2 \n\t" // a -= m - " subq $8 , %3 \n\t" // b -= n + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n - " vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8 , read aa[i] + " vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8 , read aa[i] " vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " subq $64 , %2 \n\t" // a -= m - " subq $8 , %3 \n\t" // b -= n + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n - " vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7 , read aa[i] + " vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7 , read aa[i] " vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " subq $64 , %2 \n\t" // a -= m - " subq $8 , %3 \n\t" // b -= n + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n - " vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6 , read aa[i] + " vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6 , read aa[i] " vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " subq $64 , %2 \n\t" // a -= m - " subq $8 , %3 \n\t" // b -= n + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n - " vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5 , read aa[i] + " vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5 , read aa[i] " vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " subq $64 , %2 \n\t" // a -= m - " subq $8 , %3 \n\t" // b -= n + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n - " vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4 , read aa[i] + " vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4 , read aa[i] " vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " subq $64 , %2 \n\t" // a -= m - " subq $8 , %3 \n\t" // b -= n + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n - " vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3 , read aa[i] + " vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3 , read aa[i] " vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " subq $64 , %2 \n\t" // a -= m - " subq $8 , %3 \n\t" // b -= n + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n - " vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2 , read aa[i] + " vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2 , read aa[i] " vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " subq $64 , %2 \n\t" // a -= m - " subq $8 , %3 \n\t" // b -= n + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n - " vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1 , read aa[i] + " vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1 , read aa[i] " vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " subq $64 , %2 \n\t" // a -= m - " subq $8 , %3 \n\t" // b -= n + " subq $64 , %6 \n\t" // a -= m + " subq $8 , %7 \n\t" // b -= n - " vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0 , read aa[i] + " vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0 , read aa[i] " vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa " vzeroupper \n\t" : - "+r" (n1), // 0 - "+a" (i), // 1 - "+r" (as), // 2 - "+r" (bs) // 3 : + "r" (n1), // 0 + "a" (i), // 1 + "r" (a), // 2 + "r" (b), // 3 "r" (c), // 4 "r" (c1), // 5 - "r" (a), // 6 - "r" (b) // 7 + "r" (as), // 6 + "r" (bs) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/kernel/x86_64/strsm_kernel_LT_bulldozer.c b/kernel/x86_64/strsm_kernel_LT_bulldozer.c index a4a62491c..0623dddb0 100644 --- a/kernel/x86_64/strsm_kernel_LT_bulldozer.c +++ b/kernel/x86_64/strsm_kernel_LT_bulldozer.c @@ -121,12 +121,12 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .align 16 \n\t" "1: \n\t" - " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b - " vmovups (%6,%1,8), %%xmm4 \n\t" - " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" - " vmovups 16(%6,%1,8), %%xmm5 \n\t" - " vmovups 32(%6,%1,8), %%xmm6 \n\t" - " vmovups 48(%6,%1,8), %%xmm7 \n\t" + " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b + " vmovups (%2,%1,8), %%xmm4 \n\t" + " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" + " vmovups 16(%2,%1,8), %%xmm5 \n\t" + " vmovups 32(%2,%1,8), %%xmm6 \n\t" + " vmovups 48(%2,%1,8), %%xmm7 \n\t" " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -166,20 +166,20 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "3: \n\t" - " vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0, read aa[i] + " vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0, read aa[i] " vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -189,23 +189,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %2 \n\t" // a -= m - " addq $8 , %3 \n\t" // b -= n + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n - " vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1, read aa[i] + " vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1, read aa[i] " vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -215,23 +215,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %2 \n\t" // a -= m - " addq $8 , %3 \n\t" // b -= n + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n - " vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2, read aa[i] + " vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2, read aa[i] " vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] - " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" @@ -241,22 +241,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %2 \n\t" // a -= m - " addq $8 , %3 \n\t" // b -= n + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n - " vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3, read aa[i] + " vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3, read aa[i] " vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" @@ -264,22 +264,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %2 \n\t" // a -= m - " addq $8 , %3 \n\t" // b -= n + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n - " vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4, read aa[i] + " vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4, read aa[i] " vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" @@ -287,22 +287,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %2 \n\t" // a -= m - " addq $8 , %3 \n\t" // b -= n + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n - " vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5, read aa[i] + " vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5, read aa[i] " vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" @@ -310,22 +310,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %2 \n\t" // a -= m - " addq $8 , %3 \n\t" // b -= n + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n - " vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6, read aa[i] + " vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6, read aa[i] " vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] - " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" @@ -333,179 +333,179 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %2 \n\t" // a -= m - " addq $8 , %3 \n\t" // b -= n + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n - " vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7, read aa[i] + " vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7, read aa[i] " vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %2 \n\t" // a -= m - " addq $8 , %3 \n\t" // b -= n + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n - " vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8, read aa[i] + " vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8, read aa[i] " vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %2 \n\t" // a -= m - " addq $8 , %3 \n\t" // b -= n + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n - " vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9, read aa[i] + " vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9, read aa[i] " vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %2 \n\t" // a -= m - " addq $8 , %3 \n\t" // b -= n + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n - " vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i] + " vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i] " vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] - " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %2 \n\t" // a -= m - " addq $8 , %3 \n\t" // b -= n + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n - " vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i] + " vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i] " vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %2 \n\t" // a -= m - " addq $8 , %3 \n\t" // b -= n + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n - " vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i] + " vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i] " vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %2 \n\t" // a -= m - " addq $8 , %3 \n\t" // b -= n + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n - " vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i] + " vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i] " vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %2 \n\t" // a -= m - " addq $8 , %3 \n\t" // b -= n + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n - " vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i] + " vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i] " vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa - " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - " addq $64 , %2 \n\t" // a -= m - " addq $8 , %3 \n\t" // b -= n + " addq $64 , %6 \n\t" // a -= m + " addq $8 , %7 \n\t" // b -= n - " vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i] + " vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i] " vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 " vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa " vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa " vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa - " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa - " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa + " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa " vzeroupper \n\t" : - "+r" (n1), // 0 - "+a" (i), // 1 - "+r" (as), // 2 - "+r" (bs) // 3 : - "r" (c), // 4 - "r" (c1), // 5 - "r" (a), // 6 - "r" (b) // 7 + "r" (n1), // 0 + "a" (i), // 1 + "r" (a), // 2 + "r" (b), // 3 + "r" (c), // 4 + "r" (c1), // 5 + "r" (as), // 6 + "r" (bs) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/kernel/x86_64/strsm_kernel_RN_bulldozer.c b/kernel/x86_64/strsm_kernel_RN_bulldozer.c index c11c84cec..4cc557d55 100644 --- a/kernel/x86_64/strsm_kernel_RN_bulldozer.c +++ b/kernel/x86_64/strsm_kernel_RN_bulldozer.c @@ -121,12 +121,12 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .align 16 \n\t" "1: \n\t" - " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b - " vmovups (%6,%1,8), %%xmm4 \n\t" - " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" - " vmovups 16(%6,%1,8), %%xmm5 \n\t" - " vmovups 32(%6,%1,8), %%xmm6 \n\t" - " vmovups 48(%6,%1,8), %%xmm7 \n\t" + " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b + " vmovups (%2,%1,8), %%xmm4 \n\t" + " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" + " vmovups 16(%2,%1,8), %%xmm5 \n\t" + " vmovups 32(%2,%1,8), %%xmm6 \n\t" + " vmovups 48(%2,%1,8), %%xmm7 \n\t" " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -166,18 +166,18 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "3: \n\t" // i = 0 - " vbroadcastss (%3), %%xmm0 \n\t" // read bb - " vbroadcastss 4(%3), %%xmm1 \n\t" // read b + " vbroadcastss (%7), %%xmm0 \n\t" // read bb + " vbroadcastss 4(%7), %%xmm1 \n\t" // read b " vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb " vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t" " vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t" " vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t" - " vmovups %%xmm8 , (%2) \n\t" // write a - " vmovups %%xmm9 , 16(%2) \n\t" - " vmovups %%xmm10 , 32(%2) \n\t" - " vmovups %%xmm11 , 48(%2) \n\t" + " vmovups %%xmm8 , (%6) \n\t" // write a + " vmovups %%xmm9 , 16(%6) \n\t" + " vmovups %%xmm10 , 32(%6) \n\t" + " vmovups %%xmm11 , 48(%6) \n\t" " vmovups %%xmm8 , (%4) \n\t" // write c0 " vmovups %%xmm9 , 16(%4) \n\t" @@ -190,20 +190,20 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm15 , %%xmm11 , %%xmm1 , %%xmm15 \n\t" " \n\t" // i = 1 - " addq $8 , %3 \n\t" // b = b + 2 - " addq $64 , %2 \n\t" // a = a + 16 + " addq $8 , %7 \n\t" // b = b + 2 + " addq $64 , %6 \n\t" // a = a + 16 - " vbroadcastss 4(%3), %%xmm0 \n\t" // read bb + " vbroadcastss 4(%7), %%xmm0 \n\t" // read bb " vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb " vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb " vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb " vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb - " vmovups %%xmm12 , (%2) \n\t" // write a - " vmovups %%xmm13 , 16(%2) \n\t" // write a - " vmovups %%xmm14 , 32(%2) \n\t" // write a - " vmovups %%xmm15 , 48(%2) \n\t" // write a + " vmovups %%xmm12 , (%6) \n\t" // write a + " vmovups %%xmm13 , 16(%6) \n\t" // write a + " vmovups %%xmm14 , 32(%6) \n\t" // write a + " vmovups %%xmm15 , 48(%6) \n\t" // write a " vmovups %%xmm12 , (%5) \n\t" // write c1 " vmovups %%xmm13 , 16(%5) \n\t" @@ -213,15 +213,15 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vzeroupper \n\t" : - "+r" (n1), // 0 - "+a" (i), // 1 - "+r" (as), // 2 - "+r" (bs) // 3 : - "r" (c), // 4 - "r" (c1), // 5 - "r" (a), // 6 - "r" (b) // 7 + "r" (n1), // 0 + "a" (i), // 1 + "r" (a), // 2 + "r" (b), // 3 + "r" (c), // 4 + "r" (c1), // 5 + "r" (as), // 6 + "r" (bs) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/kernel/x86_64/strsm_kernel_RT_bulldozer.c b/kernel/x86_64/strsm_kernel_RT_bulldozer.c index 326ca2976..73f6e8a95 100644 --- a/kernel/x86_64/strsm_kernel_RT_bulldozer.c +++ b/kernel/x86_64/strsm_kernel_RT_bulldozer.c @@ -125,12 +125,12 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " .align 16 \n\t" "1: \n\t" - " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b - " vmovups (%6,%1,8), %%xmm4 \n\t" - " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" - " vmovups 16(%6,%1,8), %%xmm5 \n\t" - " vmovups 32(%6,%1,8), %%xmm6 \n\t" - " vmovups 48(%6,%1,8), %%xmm7 \n\t" + " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b + " vmovups (%2,%1,8), %%xmm4 \n\t" + " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" + " vmovups 16(%2,%1,8), %%xmm5 \n\t" + " vmovups 32(%2,%1,8), %%xmm6 \n\t" + " vmovups 48(%2,%1,8), %%xmm7 \n\t" " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" @@ -170,18 +170,18 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON "3: \n\t" // i = 1 - " vbroadcastss (%3), %%xmm1 \n\t" // read b - " vbroadcastss 4(%3), %%xmm0 \n\t" // read bb + " vbroadcastss (%7), %%xmm1 \n\t" // read b + " vbroadcastss 4(%7), %%xmm0 \n\t" // read bb " vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb " vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb " vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb " vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb - " vmovups %%xmm12 , (%2) \n\t" // write a - " vmovups %%xmm13 , 16(%2) \n\t" // write a - " vmovups %%xmm14 , 32(%2) \n\t" // write a - " vmovups %%xmm15 , 48(%2) \n\t" // write a + " vmovups %%xmm12 , (%6) \n\t" // write a + " vmovups %%xmm13 , 16(%6) \n\t" // write a + " vmovups %%xmm14 , 32(%6) \n\t" // write a + " vmovups %%xmm15 , 48(%6) \n\t" // write a " vmovups %%xmm12 , (%5) \n\t" // write c1 " vmovups %%xmm13 , 16(%5) \n\t" @@ -194,20 +194,20 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vfnmaddps %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t" " \n\t" // i = 0 - " subq $8 , %3 \n\t" // b = b - 2 - " subq $64 , %2 \n\t" // a = a - 16 + " subq $8 , %7 \n\t" // b = b - 2 + " subq $64 , %6 \n\t" // a = a - 16 - " vbroadcastss (%3), %%xmm0 \n\t" // read bb + " vbroadcastss (%7), %%xmm0 \n\t" // read bb " vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb " vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t" " vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t" " vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t" - " vmovups %%xmm8 , (%2) \n\t" // write a - " vmovups %%xmm9 , 16(%2) \n\t" - " vmovups %%xmm10 , 32(%2) \n\t" - " vmovups %%xmm11 , 48(%2) \n\t" + " vmovups %%xmm8 , (%6) \n\t" // write a + " vmovups %%xmm9 , 16(%6) \n\t" + " vmovups %%xmm10 , 32(%6) \n\t" + " vmovups %%xmm11 , 48(%6) \n\t" " vmovups %%xmm8 , (%4) \n\t" // write c0 " vmovups %%xmm9 , 16(%4) \n\t" @@ -217,15 +217,15 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON " vzeroupper \n\t" : - "+r" (n1), // 0 - "+a" (i), // 1 - "+r" (as), // 2 - "+r" (bs) // 3 : - "r" (c), // 4 - "r" (c1), // 5 - "r" (a), // 6 - "r" (b) // 7 + "r" (n1), // 0 + "a" (i), // 1 + "r" (a), // 2 + "r" (b), // 3 + "r" (c), // 4 + "r" (c1), // 5 + "r" (as), // 6 + "r" (bs) // 7 : "cc", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/kernel/x86_64/sum.S b/kernel/x86_64/sum.S deleted file mode 100644 index d075eaa04..000000000 --- a/kernel/x86_64/sum.S +++ /dev/null @@ -1,179 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define M ARG1 -#define X ARG2 -#define INCX ARG3 - -#define I %rax - -#include "l1param.h" - - PROLOGUE - PROFCODE - - fldz - testq M, M - jle .L999 - testq INCX, INCX - jle .L999 - - salq $BASE_SHIFT, INCX - - fldz - fldz - fldz - cmpq $SIZE, INCX - jne .L40 - - movq M, I - sarq $3, I - jle .L20 - ALIGN_4 - -.L10: -#ifdef PREFETCH - PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) -#endif - - FLD 0 * SIZE(X) - FLD 1 * SIZE(X) - FLD 2 * SIZE(X) - FLD 3 * SIZE(X) - - faddp %st, %st(7) - faddp %st, %st(5) - faddp %st, %st(3) - faddp %st, %st(1) - - FLD 4 * SIZE(X) - FLD 5 * SIZE(X) - FLD 6 * SIZE(X) - FLD 7 * SIZE(X) - - addq $8 * SIZE, X - - faddp %st, %st(7) - faddp %st, %st(5) - faddp %st, %st(3) - faddp %st, %st(1) - - decq I - jg .L10 - ALIGN_4 - -.L20: - andq $7, M - jle .L998 - ALIGN_4 - -.L21: - FLD (X) - faddp %st,%st(1) - addq $1 * SIZE, X - decq M - jg .L21 - jmp .L998 - ALIGN_4 - -.L40: - movq M, I - sarq $3, I - jle .L60 - ALIGN_4 - -.L50: - FLD (X) - addq INCX, X - FLD (X) - addq INCX, X - FLD (X) - addq INCX, X - FLD (X) - addq INCX, X - - faddp %st, %st(7) - faddp %st, %st(5) - faddp %st, %st(3) - faddp %st, %st(1) - - FLD (X) - addq INCX, X - FLD (X) - addq INCX, X - FLD (X) - addq INCX, X - FLD (X) - addq INCX, X - - faddp %st, %st(7) - faddp %st, %st(5) - faddp %st, %st(3) - faddp %st, %st(1) - - decq I - jg .L50 - ALIGN_4 - -.L60: - andq $7, M - jle .L998 - ALIGN_4 - - -.L61: - FLD (X) - addq INCX, X - faddp %st,%st(1) - decq M - jg .L61 - ALIGN_4 - -.L998: - faddp %st,%st(2) - faddp %st,%st(1) - faddp %st,%st(1) - ALIGN_4 - -.L999: - ret - - EPILOGUE diff --git a/kernel/x86_64/zsum.S b/kernel/x86_64/zsum.S deleted file mode 100644 index 45e0ddff5..000000000 --- a/kernel/x86_64/zsum.S +++ /dev/null @@ -1,180 +0,0 @@ -/*********************************************************************/ -/* Copyright 2009, 2010 The University of Texas at Austin. */ -/* All rights reserved. */ -/* */ -/* Redistribution and use in source and binary forms, with or */ -/* without modification, are permitted provided that the following */ -/* conditions are met: */ -/* */ -/* 1. Redistributions of source code must retain the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer. */ -/* */ -/* 2. Redistributions in binary form must reproduce the above */ -/* copyright notice, this list of conditions and the following */ -/* disclaimer in the documentation and/or other materials */ -/* provided with the distribution. */ -/* */ -/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ -/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ -/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ -/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ -/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ -/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ -/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ -/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ -/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ -/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ -/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ -/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ -/* POSSIBILITY OF SUCH DAMAGE. */ -/* */ -/* The views and conclusions contained in the software and */ -/* documentation are those of the authors and should not be */ -/* interpreted as representing official policies, either expressed */ -/* or implied, of The University of Texas at Austin. */ -/*********************************************************************/ - -#define ASSEMBLER -#include "common.h" - -#define M ARG1 -#define X ARG2 -#define INCX ARG3 - -#define I %rax - -#include "l1param.h" - - PROLOGUE - PROFCODE - - fldz - testq M, M - jle .L999 - testq INCX, INCX - jle .L999 - - salq $ZBASE_SHIFT, INCX - - fldz - fldz - fldz - cmpq $SIZE * 2, INCX - jne .L40 - - movq M, I - sarq $2, I - jle .L20 - ALIGN_4 - -.L10: -#ifdef PREFETCH - PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(X) -#endif - - FLD 0 * SIZE(X) - FLD 1 * SIZE(X) - FLD 2 * SIZE(X) - FLD 3 * SIZE(X) - - faddp %st, %st(7) - faddp %st, %st(5) - faddp %st, %st(3) - faddp %st, %st(1) - - FLD 4 * SIZE(X) - FLD 5 * SIZE(X) - FLD 6 * SIZE(X) - FLD 7 * SIZE(X) - - addq $8 * SIZE, X - - faddp %st, %st(7) - faddp %st, %st(5) - faddp %st, %st(3) - faddp %st, %st(1) - - decq I - jg .L10 - ALIGN_4 - -.L20: - andq $3, M - jle .L998 - ALIGN_4 - - -.L21: - FLD 0 * SIZE(X) - FLD 1 * SIZE(X) - faddp %st,%st(3) - faddp %st,%st(1) - addq $2 * SIZE, X - decq M - jg .L21 - jmp .L998 - ALIGN_4 - -.L40: - movq M, I - sarq $2, I - jle .L60 - ALIGN_4 - -.L50: - FLD 0 * SIZE(X) - FLD 1 * SIZE(X) - addq INCX, X - FLD 0 * SIZE(X) - FLD 1 * SIZE(X) - addq INCX, X - - faddp %st, %st(7) - faddp %st, %st(5) - faddp %st, %st(3) - faddp %st, %st(1) - - FLD 0 * SIZE(X) - FLD 1 * SIZE(X) - addq INCX, X - FLD 0 * SIZE(X) - FLD 1 * SIZE(X) - addq INCX, X - - faddp %st, %st(7) - faddp %st, %st(5) - faddp %st, %st(3) - faddp %st, %st(1) - - decq I - jg .L50 - ALIGN_4 - -.L60: - andq $3, M - jle .L998 - ALIGN_4 - - -.L61: - FLD 0 * SIZE(X) - FLD 1 * SIZE(X) - addq INCX, X - faddp %st,%st(3) - faddp %st,%st(1) - decq M - jg .L61 - ALIGN_4 - -.L998: - faddp %st,%st(2) - faddp %st,%st(1) - faddp %st,%st(1) - ALIGN_4 - -.L999: - ret - - EPILOGUE diff --git a/kernel/zarch/KERNEL.Z13 b/kernel/zarch/KERNEL.Z13 index b1ffd3c54..add628bfe 100644 --- a/kernel/zarch/KERNEL.Z13 +++ b/kernel/zarch/KERNEL.Z13 @@ -1,18 +1,18 @@ SAMAXKERNEL = ../arm/amax.c -DAMAXKERNEL = damax_z13.c +DAMAXKERNEL = ../arm/amax.c CAMAXKERNEL = ../arm/zamax.c -ZAMAXKERNEL = zamax_z13.c +ZAMAXKERNEL = ../arm/zamax.c SAMINKERNEL = ../arm/amin.c -DAMINKERNEL = damin_z13.c +DAMINKERNEL = ../arm/amin.c CAMINKERNEL = ../arm/zamin.c -ZAMINKERNEL = zamin_z13.c +ZAMINKERNEL = ../arm/zamin.c SMAXKERNEL = ../arm/max.c -DMAXKERNEL = dmax_z13.c +DMAXKERNEL = ../arm/max.c SMINKERNEL = ../arm/min.c -DMINKERNEL = dmin_z13.c +DMINKERNEL = ../arm/min.c ISAMAXKERNEL = ../arm/iamax.c IDAMAXKERNEL = idamax.c @@ -25,21 +25,16 @@ ICAMINKERNEL = ../arm/izamin.c IZAMINKERNEL = izamin.c ISMAXKERNEL = ../arm/imax.c -IDMAXKERNEL = idmax.c +IDMAXKERNEL = ../arm/imax.c ISMINKERNEL = ../arm/imin.c -IDMINKERNEL = idmin.c +IDMINKERNEL = ../arm/imin.c SASUMKERNEL = ../arm/asum.c DASUMKERNEL = dasum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = zasum.c -SSUMKERNEL = ../arm/asum.c -DSUMKERNEL = dasum.c -CSUMKERNEL = ../arm/zasum.c -ZSUMKERNEL = zasum.c - SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = daxpy.c CAXPYKERNEL = ../arm/zaxpy.c diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14 deleted file mode 100644 index 971896c2d..000000000 --- a/kernel/zarch/KERNEL.Z14 +++ /dev/null @@ -1,151 +0,0 @@ -SAMAXKERNEL = samax.c -DAMAXKERNEL = damax.c -CAMAXKERNEL = camax.c -ZAMAXKERNEL = zamax.c - -SAMINKERNEL = samin.c -DAMINKERNEL = damin.c -CAMINKERNEL = camin.c -ZAMINKERNEL = zamin.c - -SMAXKERNEL = smax.c -DMAXKERNEL = dmax.c - -SMINKERNEL = smin.c -DMINKERNEL = dmin.c - -ISAMAXKERNEL = isamax.c -IDAMAXKERNEL = idamax.c -ICAMAXKERNEL = icamax.c -IZAMAXKERNEL = izamax.c - -ISAMINKERNEL = isamin.c -IDAMINKERNEL = idamin.c -ICAMINKERNEL = icamin.c -IZAMINKERNEL = izamin.c - -ISMAXKERNEL = ismax.c -IDMAXKERNEL = idmax.c - -ISMINKERNEL = ismin.c -IDMINKERNEL = idmin.c - -SASUMKERNEL = sasum.c -DASUMKERNEL = dasum.c -CASUMKERNEL = casum.c -ZASUMKERNEL = zasum.c - -SSUMKERNEL = ssum.c -DSUMKERNEL = dsum.c -CSUMKERNEL = csum.c -ZSUMKERNEL = zsum.c - -SAXPYKERNEL = saxpy.c -DAXPYKERNEL = daxpy.c -CAXPYKERNEL = caxpy.c -ZAXPYKERNEL = zaxpy.c - -SCOPYKERNEL = scopy.c -DCOPYKERNEL = dcopy.c -CCOPYKERNEL = ccopy.c -ZCOPYKERNEL = zcopy.c - -SDOTKERNEL = sdot.c -DDOTKERNEL = ddot.c -CDOTKERNEL = cdot.c -ZDOTKERNEL = zdot.c -DSDOTKERNEL = dsdot.c - -SNRM2KERNEL = ../arm/nrm2.c -DNRM2KERNEL = ../arm/nrm2.c -CNRM2KERNEL = ../arm/znrm2.c -ZNRM2KERNEL = ../arm/znrm2.c - -SROTKERNEL = srot.c -DROTKERNEL = drot.c -CROTKERNEL = crot.c -ZROTKERNEL = zrot.c - -SSCALKERNEL = sscal.c -DSCALKERNEL = dscal.c -CSCALKERNEL = cscal.c -ZSCALKERNEL = zscal.c - -SSWAPKERNEL = sswap.c -DSWAPKERNEL = dswap.c -CSWAPKERNEL = cswap.c -ZSWAPKERNEL = zswap.c - -SGEMVNKERNEL = sgemv_n_4.c -DGEMVNKERNEL = dgemv_n_4.c -CGEMVNKERNEL = cgemv_n_4.c -ZGEMVNKERNEL = zgemv_n_4.c - -SGEMVTKERNEL = sgemv_t_4.c -DGEMVTKERNEL = dgemv_t_4.c -CGEMVTKERNEL = cgemv_t_4.c -ZGEMVTKERNEL = zgemv_t_4.c - -STRMMKERNEL = strmm8x4V.S -DTRMMKERNEL = trmm8x4V.S -CTRMMKERNEL = ctrmm4x4V.S -ZTRMMKERNEL = ztrmm4x4V.S - -SGEMMKERNEL = strmm8x4V.S -SGEMMINCOPY = ../generic/gemm_ncopy_8.c -SGEMMITCOPY = ../generic/gemm_tcopy_8.c -SGEMMONCOPY = ../generic/gemm_ncopy_4.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - - - -DGEMMKERNEL = gemm8x4V.S -DGEMMINCOPY = ../generic/gemm_ncopy_8.c -DGEMMITCOPY = ../generic/gemm_tcopy_8.c -DGEMMONCOPY = ../generic/gemm_ncopy_4.c -DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMINCOPYOBJ = dgemm_incopy.o -DGEMMITCOPYOBJ = dgemm_itcopy.o -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -CGEMMKERNEL = ctrmm4x4V.S -CGEMMONCOPY = ../generic/zgemm_ncopy_4.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o - -ZGEMMKERNEL = ztrmm4x4V.S -ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c -ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) -ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - - - - - diff --git a/kernel/zarch/KERNEL.ZARCH_GENERIC b/kernel/zarch/KERNEL.ZARCH_GENERIC index 3bbeb9155..848ee9b54 100644 --- a/kernel/zarch/KERNEL.ZARCH_GENERIC +++ b/kernel/zarch/KERNEL.ZARCH_GENERIC @@ -35,11 +35,6 @@ DASUMKERNEL = ../arm/asum.c CASUMKERNEL = ../arm/zasum.c ZASUMKERNEL = ../arm/zasum.c -SSUMKERNEL = ../arm/sum.c -DSUMKERNEL = ../arm/sum.c -CSUMKERNEL = ../arm/zsum.c -ZSUMKERNEL = ../arm/zsum.c - SAXPYKERNEL = ../arm/axpy.c DAXPYKERNEL = ../arm/axpy.c CAXPYKERNEL = ../arm/zaxpy.c diff --git a/kernel/zarch/camax.c b/kernel/zarch/camax.c deleted file mode 100644 index b10ca4752..000000000 --- a/kernel/zarch/camax.c +++ /dev/null @@ -1,215 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1])) - -static FLOAT camax_kernel_32(BLASLONG n, FLOAT *x) { - FLOAT amax; - - __asm__("vlef %%v0,0(%[x]),0\n\t" - "vlef %%v16,4(%[x]),0\n\t" - "vlef %%v0,8(%[x]),1\n\t" - "vlef %%v16,12(%[x]),1\n\t" - "vlef %%v0,16(%[x]),2\n\t" - "vlef %%v16,20(%[x]),2\n\t" - "vlef %%v0,24(%[x]),3\n\t" - "vlef %%v16,28(%[x]),3\n\t" - "vflpsb %%v0,%%v0\n\t" - "vflpsb %%v16,%%v16\n\t" - "vfasb %%v0,%%v0,%%v16\n\t" - "vleib %%v1,0,0\n\t" - "vleib %%v1,1,1\n\t" - "vleib %%v1,2,2\n\t" - "vleib %%v1,3,3\n\t" - "vleib %%v1,8,4\n\t" - "vleib %%v1,9,5\n\t" - "vleib %%v1,10,6\n\t" - "vleib %%v1,11,7\n\t" - "vleib %%v1,16,8\n\t" - "vleib %%v1,17,9\n\t" - "vleib %%v1,18,10\n\t" - "vleib %%v1,19,11\n\t" - "vleib %%v1,24,12\n\t" - "vleib %%v1,25,13\n\t" - "vleib %%v1,26,14\n\t" - "vleib %%v1,27,15\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v2,16(%%r1,%[x])\n\t" - "vpkg %%v17,%%v16,%%v2\n\t" - "vperm %%v16,%%v16,%%v2,%%v1\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v2,48(%%r1,%[x])\n\t" - "vpkg %%v19,%%v18,%%v2\n\t" - "vperm %%v18,%%v18,%%v2,%%v1\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v2,80(%%r1,%[x])\n\t" - "vpkg %%v21,%%v20,%%v2\n\t" - "vperm %%v20,%%v20,%%v2,%%v1\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v2,112(%%r1,%[x])\n\t" - "vpkg %%v23,%%v22,%%v2\n\t" - "vperm %%v22,%%v22,%%v2,%%v1\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v2,144(%%r1,%[x])\n\t" - "vpkg %%v25,%%v24,%%v2\n\t" - "vperm %%v24,%%v24,%%v2,%%v1\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v2,176(%%r1,%[x])\n\t" - "vpkg %%v27,%%v26,%%v2\n\t" - "vperm %%v26,%%v26,%%v2,%%v1\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v2,208(%%r1,%[x])\n\t" - "vpkg %%v29,%%v28,%%v2\n\t" - "vperm %%v28,%%v28,%%v2,%%v1\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v2,240(%%r1,%[x])\n\t" - "vpkg %%v31,%%v30,%%v2\n\t" - "vperm %%v30,%%v30,%%v2,%%v1\n\t" - "vflpsb %%v16,%%v16\n\t" - "vflpsb %%v17,%%v17\n\t" - "vflpsb %%v18,%%v18\n\t" - "vflpsb %%v19,%%v19\n\t" - "vflpsb %%v20,%%v20\n\t" - "vflpsb %%v21,%%v21\n\t" - "vflpsb %%v22,%%v22\n\t" - "vflpsb %%v23,%%v23\n\t" - "vflpsb %%v24,%%v24\n\t" - "vflpsb %%v25,%%v25\n\t" - "vflpsb %%v26,%%v26\n\t" - "vflpsb %%v27,%%v27\n\t" - "vflpsb %%v28,%%v28\n\t" - "vflpsb %%v29,%%v29\n\t" - "vflpsb %%v30,%%v30\n\t" - "vflpsb %%v31,%%v31\n\t" - "vfasb %%v16,%%v16,%%v17\n\t" - "vfasb %%v18,%%v18,%%v19\n\t" - "vfasb %%v20,%%v20,%%v21\n\t" - "vfasb %%v22,%%v22,%%v23\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vfasb %%v26,%%v26,%%v27\n\t" - "vfasb %%v28,%%v28,%%v29\n\t" - "vfasb %%v30,%%v30,%%v31\n\t" - "vfmaxsb %%v16,%%v16,%%v24,0\n\t" - "vfmaxsb %%v18,%%v18,%%v26,0\n\t" - "vfmaxsb %%v20,%%v20,%%v28,0\n\t" - "vfmaxsb %%v22,%%v22,%%v30,0\n\t" - "vfmaxsb %%v16,%%v16,%%v20,0\n\t" - "vfmaxsb %%v18,%%v18,%%v22,0\n\t" - "vfmaxsb %%v16,%%v16,%%v18,0\n\t" - "vfmaxsb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v16,%%v0,32\n\t" - "vfmaxsb %%v0,%%v0,%%v16,0\n\t" - "vrepf %%v16,%%v0,2\n\t" - "wfmaxsb %%v0,%%v0,%%v16,0\n\t" - "ler %[amax],%%f0" - : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); - - return amax; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) - return (maxf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - - maxf = camax_kernel_32(n1, x); - ix = n1 * 2; - i = n1; - } else { - maxf = CABS1(x, 0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x, ix) > maxf) { - maxf = CABS1(x, ix); - } - ix += 2; - i++; - } - return (maxf); - - } else { - - maxf = CABS1(x, 0); - inc_x2 = 2 * inc_x; - - BLASLONG n1 = n & -4; - while (i < n1) { - - if (CABS1(x, ix) > maxf) { - maxf = CABS1(x, ix); - } - if (CABS1(x, ix + inc_x2) > maxf) { - maxf = CABS1(x, ix + inc_x2); - } - if (CABS1(x, ix + inc_x2 * 2) > maxf) { - maxf = CABS1(x, ix + inc_x2 * 2); - } - if (CABS1(x, ix + inc_x2 * 3) > maxf) { - maxf = CABS1(x, ix + inc_x2 * 3); - } - - ix += inc_x2 * 4; - - i += 4; - - } - - while (i < n) { - if (CABS1(x, ix) > maxf) { - maxf = CABS1(x, ix); - } - ix += inc_x2; - i++; - } - return (maxf); - } -} diff --git a/kernel/zarch/camin.c b/kernel/zarch/camin.c deleted file mode 100644 index 40945fae8..000000000 --- a/kernel/zarch/camin.c +++ /dev/null @@ -1,215 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1])) - -static FLOAT camin_kernel_32(BLASLONG n, FLOAT *x) { - FLOAT amin; - - __asm__("vlef %%v0,0(%[x]),0\n\t" - "vlef %%v16,4(%[x]),0\n\t" - "vlef %%v0,8(%[x]),1\n\t" - "vlef %%v16,12(%[x]),1\n\t" - "vlef %%v0,16(%[x]),2\n\t" - "vlef %%v16,20(%[x]),2\n\t" - "vlef %%v0,24(%[x]),3\n\t" - "vlef %%v16,28(%[x]),3\n\t" - "vflpsb %%v0,%%v0\n\t" - "vflpsb %%v16,%%v16\n\t" - "vfasb %%v0,%%v0,%%v16\n\t" - "vleib %%v1,0,0\n\t" - "vleib %%v1,1,1\n\t" - "vleib %%v1,2,2\n\t" - "vleib %%v1,3,3\n\t" - "vleib %%v1,8,4\n\t" - "vleib %%v1,9,5\n\t" - "vleib %%v1,10,6\n\t" - "vleib %%v1,11,7\n\t" - "vleib %%v1,16,8\n\t" - "vleib %%v1,17,9\n\t" - "vleib %%v1,18,10\n\t" - "vleib %%v1,19,11\n\t" - "vleib %%v1,24,12\n\t" - "vleib %%v1,25,13\n\t" - "vleib %%v1,26,14\n\t" - "vleib %%v1,27,15\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v2,16(%%r1,%[x])\n\t" - "vpkg %%v17,%%v16,%%v2\n\t" - "vperm %%v16,%%v16,%%v2,%%v1\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v2,48(%%r1,%[x])\n\t" - "vpkg %%v19,%%v18,%%v2\n\t" - "vperm %%v18,%%v18,%%v2,%%v1\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v2,80(%%r1,%[x])\n\t" - "vpkg %%v21,%%v20,%%v2\n\t" - "vperm %%v20,%%v20,%%v2,%%v1\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v2,112(%%r1,%[x])\n\t" - "vpkg %%v23,%%v22,%%v2\n\t" - "vperm %%v22,%%v22,%%v2,%%v1\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v2,144(%%r1,%[x])\n\t" - "vpkg %%v25,%%v24,%%v2\n\t" - "vperm %%v24,%%v24,%%v2,%%v1\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v2,176(%%r1,%[x])\n\t" - "vpkg %%v27,%%v26,%%v2\n\t" - "vperm %%v26,%%v26,%%v2,%%v1\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v2,208(%%r1,%[x])\n\t" - "vpkg %%v29,%%v28,%%v2\n\t" - "vperm %%v28,%%v28,%%v2,%%v1\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v2,240(%%r1,%[x])\n\t" - "vpkg %%v31,%%v30,%%v2\n\t" - "vperm %%v30,%%v30,%%v2,%%v1\n\t" - "vflpsb %%v16,%%v16\n\t" - "vflpsb %%v17,%%v17\n\t" - "vflpsb %%v18,%%v18\n\t" - "vflpsb %%v19,%%v19\n\t" - "vflpsb %%v20,%%v20\n\t" - "vflpsb %%v21,%%v21\n\t" - "vflpsb %%v22,%%v22\n\t" - "vflpsb %%v23,%%v23\n\t" - "vflpsb %%v24,%%v24\n\t" - "vflpsb %%v25,%%v25\n\t" - "vflpsb %%v26,%%v26\n\t" - "vflpsb %%v27,%%v27\n\t" - "vflpsb %%v28,%%v28\n\t" - "vflpsb %%v29,%%v29\n\t" - "vflpsb %%v30,%%v30\n\t" - "vflpsb %%v31,%%v31\n\t" - "vfasb %%v16,%%v16,%%v17\n\t" - "vfasb %%v18,%%v18,%%v19\n\t" - "vfasb %%v20,%%v20,%%v21\n\t" - "vfasb %%v22,%%v22,%%v23\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vfasb %%v26,%%v26,%%v27\n\t" - "vfasb %%v28,%%v28,%%v29\n\t" - "vfasb %%v30,%%v30,%%v31\n\t" - "vfminsb %%v16,%%v16,%%v24,0\n\t" - "vfminsb %%v18,%%v18,%%v26,0\n\t" - "vfminsb %%v20,%%v20,%%v28,0\n\t" - "vfminsb %%v22,%%v22,%%v30,0\n\t" - "vfminsb %%v16,%%v16,%%v20,0\n\t" - "vfminsb %%v18,%%v18,%%v22,0\n\t" - "vfminsb %%v16,%%v16,%%v18,0\n\t" - "vfminsb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v16,%%v0,32\n\t" - "vfminsb %%v0,%%v0,%%v16,0\n\t" - "vrepf %%v16,%%v0,2\n\t" - "wfminsb %%v0,%%v0,%%v16,0\n\t" - "ler %[amin],%%f0" - : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); - - return amin; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT minf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) - return (minf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - - minf = camin_kernel_32(n1, x); - ix = n1 * 2; - i = n1; - } else { - minf = CABS1(x, 0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x, ix) < minf) { - minf = CABS1(x, ix); - } - ix += 2; - i++; - } - return (minf); - - } else { - - minf = CABS1(x, 0); - inc_x2 = 2 * inc_x; - - BLASLONG n1 = n & -4; - while (i < n1) { - - if (CABS1(x, ix) < minf) { - minf = CABS1(x, ix); - } - if (CABS1(x, ix + inc_x2) < minf) { - minf = CABS1(x, ix + inc_x2); - } - if (CABS1(x, ix + inc_x2 * 2) < minf) { - minf = CABS1(x, ix + inc_x2 * 2); - } - if (CABS1(x, ix + inc_x2 * 3) < minf) { - minf = CABS1(x, ix + inc_x2 * 3); - } - - ix += inc_x2 * 4; - - i += 4; - - } - - while (i < n) { - if (CABS1(x, ix) < minf) { - minf = CABS1(x, ix); - } - ix += inc_x2; - i++; - } - return (minf); - } -} diff --git a/kernel/zarch/casum.c b/kernel/zarch/casum.c deleted file mode 100644 index e28f2018c..000000000 --- a/kernel/zarch/casum.c +++ /dev/null @@ -1,155 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -#define ABS fabsf - -static FLOAT casum_kernel_32(BLASLONG n, FLOAT *x) { - FLOAT asum; - - __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v24,%%v24,%%v16\n\t" - "vfasb %%v25,%%v25,%%v17\n\t" - "vfasb %%v26,%%v26,%%v18\n\t" - "vfasb %%v27,%%v27,%%v19\n\t" - "vfasb %%v28,%%v28,%%v20\n\t" - "vfasb %%v29,%%v29,%%v21\n\t" - "vfasb %%v30,%%v30,%%v22\n\t" - "vfasb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x])\n\t" - "vl %%v17, 144(%%r1,%[x])\n\t" - "vl %%v18, 160(%%r1,%[x])\n\t" - "vl %%v19, 176(%%r1,%[x])\n\t" - "vl %%v20, 192(%%r1,%[x])\n\t" - "vl %%v21, 208(%%r1,%[x])\n\t" - "vl %%v22, 224(%%r1,%[x])\n\t" - "vl %%v23, 240(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v24,%%v24,%%v16\n\t" - "vfasb %%v25,%%v25,%%v17\n\t" - "vfasb %%v26,%%v26,%%v18\n\t" - "vfasb %%v27,%%v27,%%v19\n\t" - "vfasb %%v28,%%v28,%%v20\n\t" - "vfasb %%v29,%%v29,%%v21\n\t" - "vfasb %%v30,%%v30,%%v22\n\t" - "vfasb %%v31,%%v31,%%v23\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vfasb %%v24,%%v24,%%v26\n\t" - "vfasb %%v24,%%v24,%%v27\n\t" - "vfasb %%v24,%%v24,%%v28\n\t" - "vfasb %%v24,%%v24,%%v29\n\t" - "vfasb %%v24,%%v24,%%v30\n\t" - "vfasb %%v24,%%v24,%%v31\n\t" - "veslg %%v25,%%v24,32\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vrepf %%v25,%%v24,2\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vstef %%v24,%[asum],0" - : [asum] "=Q"(asum),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) - : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - - return asum; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ip = 0; - FLOAT sumf = 0.0; - BLASLONG n1; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) - return (sumf); - - if (inc_x == 1) { - - n1 = n & -32; - if (n1 > 0) { - - sumf = casum_kernel_32(n1, x); - i = n1; - ip = 2 * n1; - } - - while (i < n) { - sumf += ABS(x[ip]) + ABS(x[ip + 1]); - i++; - ip += 2; - } - - } else { - inc_x2 = 2 * inc_x; - - while (i < n) { - sumf += ABS(x[ip]) + ABS(x[ip + 1]); - ip += inc_x2; - i++; - } - - } - return (sumf); -} diff --git a/kernel/zarch/caxpy.c b/kernel/zarch/caxpy.c deleted file mode 100644 index 14a124ae2..000000000 --- a/kernel/zarch/caxpy.c +++ /dev/null @@ -1,166 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { - __asm__( -#if !defined(CONJ) - "vlrepf %%v0,0(%[alpha])\n\t" - "vlef %%v1,4(%[alpha]),0\n\t" - "vlef %%v1,4(%[alpha]),2\n\t" - "vflcsb %%v1,%%v1\n\t" - "vlef %%v1,4(%[alpha]),1\n\t" - "vlef %%v1,4(%[alpha]),3\n\t" -#else - "vlef %%v0,0(%[alpha]),1\n\t" - "vlef %%v0,0(%[alpha]),3\n\t" - "vflcsb %%v0,%%v0\n\t" - "vlef %%v0,0(%[alpha]),0\n\t" - "vlef %%v0,0(%[alpha]),2\n\t" - "vlrepf %%v1,4(%[alpha])\n\t" -#endif - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v8,0(%%r1,%[x])\n\t" - "vl %%v9,16(%%r1,%[x])\n\t" - "vl %%v10,32(%%r1,%[x])\n\t" - "vl %%v11,48(%%r1,%[x])\n\t" - "vl %%v12,0(%%r1,%[y])\n\t" - "vl %%v13,16(%%r1,%[y])\n\t" - "vl %%v14,32(%%r1,%[y])\n\t" - "vl %%v15,48(%%r1,%[y])\n\t" - "vl %%v16,64(%%r1,%[x])\n\t" - "vl %%v17,80(%%r1,%[x])\n\t" - "vl %%v18,96(%%r1,%[x])\n\t" - "vl %%v19,112(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[y])\n\t" - "vl %%v21,80(%%r1,%[y])\n\t" - "vl %%v22,96(%%r1,%[y])\n\t" - "vl %%v23,112(%%r1,%[y])\n\t" - "verllg %%v24,%%v8,32\n\t" - "verllg %%v25,%%v9,32\n\t" - "verllg %%v26,%%v10,32\n\t" - "verllg %%v27,%%v11,32\n\t" - "verllg %%v28,%%v16,32\n\t" - "verllg %%v29,%%v17,32\n\t" - "verllg %%v30,%%v18,32\n\t" - "verllg %%v31,%%v19,32\n\t" - "vfmasb %%v8,%%v8,%%v0,%%v12\n\t" - "vfmasb %%v9,%%v9,%%v0,%%v13\n\t" - "vfmasb %%v10,%%v10,%%v0,%%v14\n\t" - "vfmasb %%v11,%%v11,%%v0,%%v15\n\t" - "vfmasb %%v16,%%v16,%%v0,%%v20\n\t" - "vfmasb %%v17,%%v17,%%v0,%%v21\n\t" - "vfmasb %%v18,%%v18,%%v0,%%v22\n\t" - "vfmasb %%v19,%%v19,%%v0,%%v23\n\t" - "vfmasb %%v8,%%v24,%%v1,%%v8\n\t" - "vfmasb %%v9,%%v25,%%v1,%%v9\n\t" - "vfmasb %%v10,%%v26,%%v1,%%v10\n\t" - "vfmasb %%v11,%%v27,%%v1,%%v11\n\t" - "vfmasb %%v16,%%v28,%%v1,%%v16\n\t" - "vfmasb %%v17,%%v29,%%v1,%%v17\n\t" - "vfmasb %%v18,%%v30,%%v1,%%v18\n\t" - "vfmasb %%v19,%%v31,%%v1,%%v19\n\t" - "vst %%v8,0(%%r1,%[y])\n\t" - "vst %%v9,16(%%r1,%[y])\n\t" - "vst %%v10,32(%%r1,%[y])\n\t" - "vst %%v11,48(%%r1,%[y])\n\t" - "vst %%v16,64(%%r1,%[y])\n\t" - "vst %%v17,80(%%r1,%[y])\n\t" - "vst %%v18,96(%%r1,%[y])\n\t" - "vst %%v19,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", - "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); -} - -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, - FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, - BLASLONG dummy2) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - FLOAT da[2] __attribute__ ((aligned(16))); - - if (n <= 0) - return (0); - - if ((inc_x == 1) && (inc_y == 1)) { - - BLASLONG n1 = n & -16; - - if (n1) { - da[0] = da_r; - da[1] = da_i; - caxpy_kernel_16(n1, x, y, da); - ix = 2 * n1; - } - i = n1; - while (i < n) { -#if !defined(CONJ) - y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); - y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); -#else - y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); - y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); -#endif - i++; - ix += 2; - - } - return (0); - - } - - inc_x *= 2; - inc_y *= 2; - - while (i < n) { - -#if !defined(CONJ) - y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); - y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); -#else - y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); - y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); -#endif - ix += inc_x; - iy += inc_y; - i++; - - } - return (0); - -} diff --git a/kernel/zarch/ccopy.c b/kernel/zarch/ccopy.c deleted file mode 100644 index 0a5e03992..000000000 --- a/kernel/zarch/ccopy.c +++ /dev/null @@ -1,88 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -static void ccopy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { - __asm__("srlg %[n],%[n],5\n\t" - "0:\n\t" - "pfd 1, 1024(%[x])\n\t" - "pfd 2, 1024(%[y])\n\t" - "mvc 0(256,%[y]),0(%[x])\n\t" - "la %[x],256(%[x])\n\t" - "la %[y],256(%[y])\n\t" - "brctg %[n],0b" - : "=m"(*(struct { FLOAT x[n * 2]; } *) y),[x] "+&a"(x),[y] "+&a"(y), - [n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x) - : "cc"); -} - -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - - if (n <= 0) - return (0); - - if ((inc_x == 1) && (inc_y == 1)) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - ccopy_kernel_32(n1, x, y); - i = n1; - ix = n1 * 2; - iy = n1 * 2; - } - - while (i < n) { - y[iy] = x[iy]; - y[iy + 1] = x[ix + 1]; - ix += 2; - iy += 2; - i++; - - } - - } else { - - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_y2 = 2 * inc_y; - - while (i < n) { - y[iy] = x[ix]; - y[iy + 1] = x[ix + 1]; - ix += inc_x2; - iy += inc_y2; - i++; - - } - - } - - return (0); -} diff --git a/kernel/zarch/cdot.c b/kernel/zarch/cdot.c deleted file mode 100644 index d90f9c871..000000000 --- a/kernel/zarch/cdot.c +++ /dev/null @@ -1,176 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -static void cdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { - __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "pfd 1, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "verllg %%v20,%%v16,32\n\t" - "verllg %%v21,%%v17,32\n\t" - "verllg %%v22,%%v18,32\n\t" - "verllg %%v23,%%v19,32\n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" - "vfmasb %%v25,%%v20,%%v0,%%v25\n\t" - "vfmasb %%v26,%%v17,%%v1,%%v26\n\t" - "vfmasb %%v27,%%v21,%%v1,%%v27\n\t" - "vfmasb %%v28,%%v18,%%v2,%%v28\n\t" - "vfmasb %%v29,%%v22,%%v2,%%v29\n\t" - "vfmasb %%v30,%%v19,%%v3,%%v30\n\t" - "vfmasb %%v31,%%v23,%%v3,%%v31\n\t" - "vl %%v16, 64(%%r1,%[x])\n\t" - "vl %%v17, 80(%%r1,%[x])\n\t" - "vl %%v18, 96(%%r1,%[x])\n\t" - "vl %%v19, 112(%%r1,%[x])\n\t" - "vl %%v0, 64(%%r1,%[y])\n\t" - "vl %%v1, 80(%%r1,%[y])\n\t" - "vl %%v2, 96(%%r1,%[y])\n\t" - "vl %%v3, 112(%%r1,%[y])\n\t" - "verllg %%v20,%%v16,32\n\t" - "verllg %%v21,%%v17,32\n\t" - "verllg %%v22,%%v18,32\n\t" - "verllg %%v23,%%v19,32\n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" - "vfmasb %%v25,%%v20,%%v0,%%v25\n\t" - "vfmasb %%v26,%%v17,%%v1,%%v26\n\t" - "vfmasb %%v27,%%v21,%%v1,%%v27\n\t" - "vfmasb %%v28,%%v18,%%v2,%%v28\n\t" - "vfmasb %%v29,%%v22,%%v2,%%v29\n\t" - "vfmasb %%v30,%%v19,%%v3,%%v30\n\t" - "vfmasb %%v31,%%v23,%%v3,%%v31\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b\n\t" - "vfasb %%v24,%%v24,%%v26\n\t" - "vfasb %%v24,%%v24,%%v28\n\t" - "vfasb %%v24,%%v24,%%v30\n\t" - "vrepg %%v26,%%v24,1\n\t" - "vfasb %%v24,%%v24,%%v26\n\t" - "vfasb %%v25,%%v25,%%v27\n\t" - "vfasb %%v25,%%v25,%%v29\n\t" - "vfasb %%v25,%%v25,%%v31\n\t" - "vrepg %%v27,%%v25,1\n\t" - "vfasb %%v25,%%v25,%%v27\n\t" - "vstef %%v24,0(%[d]),0\n\t" - "vstef %%v24,4(%[d]),1\n\t" - "vstef %%v25,8(%[d]),1\n\t" - "vstef %%v25,12(%[d]),0" - : "=m"(*(struct { FLOAT x[4]; } *) d),[n] "+&r"(n) - : [d] "a"(d), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[n * 2]; } *) y),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); -} - -OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, - BLASLONG inc_y) { - BLASLONG i; - BLASLONG ix, iy; - OPENBLAS_COMPLEX_FLOAT result; - FLOAT dot[4] __attribute__ ((aligned(16))) = { - 0.0, 0.0, 0.0, 0.0}; - - if (n <= 0) { - CREAL(result) = 0.0; - CIMAG(result) = 0.0; - return (result); - - } - - if ((inc_x == 1) && (inc_y == 1)) { - - BLASLONG n1 = n & -16; - - if (n1) - cdot_kernel_16(n1, x, y, dot); - - i = n1; - BLASLONG j = i * 2; - - while (i < n) { - - dot[0] += x[j] * y[j]; - dot[1] += x[j + 1] * y[j + 1]; - dot[2] += x[j] * y[j + 1]; - dot[3] += x[j + 1] * y[j]; - - j += 2; - i++; - - } - - } else { - i = 0; - ix = 0; - iy = 0; - inc_x <<= 1; - inc_y <<= 1; - while (i < n) { - - dot[0] += x[ix] * y[iy]; - dot[1] += x[ix + 1] * y[iy + 1]; - dot[2] += x[ix] * y[iy + 1]; - dot[3] += x[ix + 1] * y[iy]; - - ix += inc_x; - iy += inc_y; - i++; - - } - } - -#if !defined(CONJ) - CREAL(result) = dot[0] - dot[1]; - CIMAG(result) = dot[2] + dot[3]; -#else - CREAL(result) = dot[0] + dot[1]; - CIMAG(result) = dot[2] - dot[3]; - -#endif - - return (result); - -} diff --git a/kernel/zarch/cgemv_n_4.c b/kernel/zarch/cgemv_n_4.c deleted file mode 100644 index 5c36bc338..000000000 --- a/kernel/zarch/cgemv_n_4.c +++ /dev/null @@ -1,752 +0,0 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#define NBMAX 2048 - -static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - register FLOAT *ap0 = ap[0]; - register FLOAT *ap1 = ap[1]; - register FLOAT *ap2 = ap[2]; - register FLOAT *ap3 = ap[3]; - - __asm__("vlrepg %%v16,0(%[x])\n\t" - "vlrepg %%v17,8(%[x])\n\t" - "vlrepg %%v18,16(%[x])\n\t" - "vlrepg %%v19,24(%[x])\n\t" -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v20,4(%[x]),0\n\t" - "vlef %%v20,4(%[x]),2\n\t" - "vflcsb %%v20,%%v20\n\t" - "vlef %%v20,0(%[x]),1\n\t" - "vlef %%v20,0(%[x]),3\n\t" - "vlef %%v21,12(%[x]),0\n\t" - "vlef %%v21,12(%[x]),2\n\t" - "vflcsb %%v21,%%v21\n\t" - "vlef %%v21,8(%[x]),1\n\t" - "vlef %%v21,8(%[x]),3\n\t" - "vlef %%v22,20(%[x]),0\n\t" - "vlef %%v22,20(%[x]),2\n\t" - "vflcsb %%v22,%%v22\n\t" - "vlef %%v22,16(%[x]),1\n\t" - "vlef %%v22,16(%[x]),3\n\t" - "vlef %%v23,28(%[x]),0\n\t" - "vlef %%v23,28(%[x]),2\n\t" - "vflcsb %%v23,%%v23\n\t" - "vlef %%v23,24(%[x]),1\n\t" - "vlef %%v23,24(%[x]),3\n\t" -#else - "vlef %%v20,0(%[x]),1\n\t" - "vlef %%v20,0(%[x]),3\n\t" - "vflcsb %%v20,%%v20\n\t" - "vlef %%v20,4(%[x]),0\n\t" - "vlef %%v20,4(%[x]),2\n\t" - "vlef %%v21,8(%[x]),1\n\t" - "vlef %%v21,8(%[x]),3\n\t" - "vflcsb %%v21,%%v21\n\t" - "vlef %%v21,12(%[x]),0\n\t" - "vlef %%v21,12(%[x]),2\n\t" - "vlef %%v22,16(%[x]),1\n\t" - "vlef %%v22,16(%[x]),3\n\t" - "vflcsb %%v22,%%v22\n\t" - "vlef %%v22,20(%[x]),0\n\t" - "vlef %%v22,20(%[x]),2\n\t" - "vlef %%v23,24(%[x]),1\n\t" - "vlef %%v23,24(%[x]),3\n\t" - "vflcsb %%v23,%%v23\n\t" - "vlef %%v23,28(%[x]),0\n\t" - "vlef %%v23,28(%[x]),2\n\t" -#endif - "vleib %%v1,0,0\n\t" - "vleib %%v1,1,1\n\t" - "vleib %%v1,2,2\n\t" - "vleib %%v1,3,3\n\t" - "vleib %%v1,0,4\n\t" - "vleib %%v1,1,5\n\t" - "vleib %%v1,2,6\n\t" - "vleib %%v1,3,7\n\t" - "vleib %%v1,8,8\n\t" - "vleib %%v1,9,9\n\t" - "vleib %%v1,10,10\n\t" - "vleib %%v1,11,11\n\t" - "vleib %%v1,8,12\n\t" - "vleib %%v1,9,13\n\t" - "vleib %%v1,10,14\n\t" - "vleib %%v1,11,15\n\t" - "vleib %%v2,4,0\n\t" - "vleib %%v2,5,1\n\t" - "vleib %%v2,6,2\n\t" - "vleib %%v2,7,3\n\t" - "vleib %%v2,4,4\n\t" - "vleib %%v2,5,5\n\t" - "vleib %%v2,6,6\n\t" - "vleib %%v2,7,7\n\t" - "vleib %%v2,12,8\n\t" - "vleib %%v2,13,9\n\t" - "vleib %%v2,14,10\n\t" - "vleib %%v2,15,11\n\t" - "vleib %%v2,12,12\n\t" - "vleib %%v2,13,13\n\t" - "vleib %%v2,14,14\n\t" - "vleib %%v2,15,15\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vperm %%v25,%%v24,%%v24,%%v2\n\t" - "vperm %%v24,%%v24,%%v24,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap1])\n\t" - "vperm %%v27,%%v26,%%v26,%%v2\n\t" - "vperm %%v26,%%v26,%%v26,%%v1\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vfmasb %%v0,%%v24,%%v16,%%v0\n\t" - "vfmasb %%v0,%%v25,%%v20,%%v0\n\t" - "vfmasb %%v0,%%v26,%%v17,%%v0\n\t" - "vfmasb %%v0,%%v27,%%v21,%%v0\n\t" - "vl %%v28,0(%%r1,%[ap2])\n\t" - "vperm %%v29,%%v28,%%v28,%%v2\n\t" - "vperm %%v28,%%v28,%%v28,%%v1\n\t" - "vl %%v30,0(%%r1,%[ap3])\n\t" - "vperm %%v31,%%v30,%%v30,%%v2\n\t" - "vperm %%v30,%%v30,%%v30,%%v1\n\t" - "vfmasb %%v0,%%v28,%%v18,%%v0\n\t" - "vfmasb %%v0,%%v29,%%v22,%%v0\n\t" - "vfmasb %%v0,%%v30,%%v19,%%v0\n\t" - "vfmasb %%v0,%%v31,%%v23,%%v0\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "agfi %%r1,16\n\t" - "brctg %[n],0b\n\t" - : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3), - "m"(*(const struct { FLOAT x[8]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); -} - -static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - register FLOAT *ap0 = ap[0]; - register FLOAT *ap1 = ap[1]; - - __asm__("vlrepg %%v16,0(%[x])\n\t" - "vlrepg %%v17,8(%[x])\n\t" -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v18,4(%[x]),0\n\t" - "vlef %%v18,4(%[x]),2\n\t" - "vflcsb %%v18,%%v18\n\t" - "vlef %%v18,0(%[x]),1\n\t" - "vlef %%v18,0(%[x]),3\n\t" - "vlef %%v19,12(%[x]),0\n\t" - "vlef %%v19,12(%[x]),2\n\t" - "vflcsb %%v19,%%v19\n\t" - "vlef %%v19,8(%[x]),1\n\t" - "vlef %%v19,8(%[x]),3\n\t" -#else - "vlef %%v18,0(%[x]),1\n\t" - "vlef %%v18,0(%[x]),3\n\t" - "vflcsb %%v18,%%v18\n\t" - "vlef %%v18,4(%[x]),0\n\t" - "vlef %%v18,4(%[x]),2\n\t" - "vlef %%v19,8(%[x]),1\n\t" - "vlef %%v19,8(%[x]),3\n\t" - "vflcsb %%v19,%%v19\n\t" - "vlef %%v19,12(%[x]),0\n\t" - "vlef %%v19,12(%[x]),2\n\t" -#endif - "vleib %%v1,0,0\n\t" - "vleib %%v1,1,1\n\t" - "vleib %%v1,2,2\n\t" - "vleib %%v1,3,3\n\t" - "vleib %%v1,0,4\n\t" - "vleib %%v1,1,5\n\t" - "vleib %%v1,2,6\n\t" - "vleib %%v1,3,7\n\t" - "vleib %%v1,8,8\n\t" - "vleib %%v1,9,9\n\t" - "vleib %%v1,10,10\n\t" - "vleib %%v1,11,11\n\t" - "vleib %%v1,8,12\n\t" - "vleib %%v1,9,13\n\t" - "vleib %%v1,10,14\n\t" - "vleib %%v1,11,15\n\t" - "vleib %%v2,4,0\n\t" - "vleib %%v2,5,1\n\t" - "vleib %%v2,6,2\n\t" - "vleib %%v2,7,3\n\t" - "vleib %%v2,4,4\n\t" - "vleib %%v2,5,5\n\t" - "vleib %%v2,6,6\n\t" - "vleib %%v2,7,7\n\t" - "vleib %%v2,12,8\n\t" - "vleib %%v2,13,9\n\t" - "vleib %%v2,14,10\n\t" - "vleib %%v2,15,11\n\t" - "vleib %%v2,12,12\n\t" - "vleib %%v2,13,13\n\t" - "vleib %%v2,14,14\n\t" - "vleib %%v2,15,15\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v20,0(%%r1,%[ap0])\n\t" - "vperm %%v21,%%v20,%%v20,%%v2\n\t" - "vperm %%v20,%%v20,%%v20,%%v1\n\t" - "vl %%v22,0(%%r1,%[ap1])\n\t" - "vperm %%v23,%%v22,%%v22,%%v2\n\t" - "vperm %%v22,%%v22,%%v22,%%v1\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vfmasb %%v0,%%v20,%%v16,%%v0\n\t" - "vfmasb %%v0,%%v21,%%v18,%%v0\n\t" - "vfmasb %%v0,%%v22,%%v17,%%v0\n\t" - "vfmasb %%v0,%%v23,%%v19,%%v0\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "agfi %%r1,16\n\t" - "brctg %[n],0b\n\t" - : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23"); -} - -static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { - __asm__("vlrepg %%v16,0(%[x])\n\t" -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v17,4(%[x]),0\n\t" - "vlef %%v17,4(%[x]),2\n\t" - "vflcsb %%v17,%%v17\n\t" - "vlef %%v17,0(%[x]),1\n\t" - "vlef %%v17,0(%[x]),3\n\t" -#else - "vlef %%v17,0(%[x]),1\n\t" - "vlef %%v17,0(%[x]),3\n\t" - "vflcsb %%v17,%%v17\n\t" - "vlef %%v17,4(%[x]),0\n\t" - "vlef %%v17,4(%[x]),2\n\t" -#endif - "vleib %%v1,0,0\n\t" - "vleib %%v1,1,1\n\t" - "vleib %%v1,2,2\n\t" - "vleib %%v1,3,3\n\t" - "vleib %%v1,0,4\n\t" - "vleib %%v1,1,5\n\t" - "vleib %%v1,2,6\n\t" - "vleib %%v1,3,7\n\t" - "vleib %%v1,8,8\n\t" - "vleib %%v1,9,9\n\t" - "vleib %%v1,10,10\n\t" - "vleib %%v1,11,11\n\t" - "vleib %%v1,8,12\n\t" - "vleib %%v1,9,13\n\t" - "vleib %%v1,10,14\n\t" - "vleib %%v1,11,15\n\t" - "vleib %%v2,4,0\n\t" - "vleib %%v2,5,1\n\t" - "vleib %%v2,6,2\n\t" - "vleib %%v2,7,3\n\t" - "vleib %%v2,4,4\n\t" - "vleib %%v2,5,5\n\t" - "vleib %%v2,6,6\n\t" - "vleib %%v2,7,7\n\t" - "vleib %%v2,12,8\n\t" - "vleib %%v2,13,9\n\t" - "vleib %%v2,14,10\n\t" - "vleib %%v2,15,11\n\t" - "vleib %%v2,12,12\n\t" - "vleib %%v2,13,13\n\t" - "vleib %%v2,14,14\n\t" - "vleib %%v2,15,15\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v18,0(%%r1,%[ap])\n\t" - "vperm %%v19,%%v18,%%v18,%%v2\n\t" - "vperm %%v18,%%v18,%%v18,%%v1\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vfmasb %%v0,%%v18,%%v16,%%v0\n\t" - "vfmasb %%v0,%%v19,%%v17,%%v0\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "agfi %%r1,16\n\t" - "brctg %[n],0b\n\t" - : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap), - "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v16", "v17", "v18", "v19"); -} - -static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, - FLOAT alpha_i) { - __asm__( -#if !defined(XCONJ) - "vlrepf %%v0,%[alpha_r]\n\t" - "vlef %%v1,%[alpha_i],0\n\t" - "vlef %%v1,%[alpha_i],2\n\t" - "vflcsb %%v1,%%v1\n\t" - "vlef %%v1,%[alpha_i],1\n\t" - "vlef %%v1,%[alpha_i],3\n\t" -#else - "vlef %%v0,%[alpha_r],1\n\t" - "vlef %%v0,%[alpha_r],3\n\t" - "vflcsb %%v0,%%v0\n\t" - "vlef %%v0,%[alpha_r],0\n\t" - "vlef %%v0,%[alpha_r],2\n\t" - "vlrepf %%v1,%[alpha_i]\n\t" -#endif - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],2\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[src])\n\t" - "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v18,0(%%r1,%[dest])\n\t" - "vl %%v19,16(%%r1,%[dest])\n\t" - "verllg %%v20,%%v16,32\n\t" - "verllg %%v21,%%v17,32\n\t" - "vfmasb %%v22,%%v16,%%v0,%%v18\n\t" - "vfmasb %%v23,%%v17,%%v0,%%v19\n\t" - "vfmasb %%v22,%%v20,%%v1,%%v22\n\t" - "vfmasb %%v23,%%v21,%%v1,%%v23\n\t" - "vst %%v22,0(%%r1,%[dest])\n\t" - "vst %%v23,16(%%r1,%[dest])\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n) - : [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src), - [src] "a"(src),[alpha_r] "Q"(alpha_r),[alpha_i] "Q"(alpha_i) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23"); -} - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, - FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; - - if (inc_dest != 2) { - - FLOAT temp_r; - FLOAT temp_i; - for (i = 0; i < n; i++) { -#if !defined(XCONJ) - temp_r = alpha_r * src[0] - alpha_i * src[1]; - temp_i = alpha_r * src[1] + alpha_i * src[0]; -#else - temp_r = alpha_r * src[0] + alpha_i * src[1]; - temp_i = -alpha_r * src[1] + alpha_i * src[0]; -#endif - - *dest += temp_r; - *(dest + 1) += temp_i; - - src += 2; - dest += inc_dest; - } - return; - } - - add_y_4(n, src, dest, alpha_r, alpha_i); -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, - FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, - BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - FLOAT *ap[4]; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - BLASLONG lda4; - FLOAT xbuffer[8], *ybuffer; - - if (m < 1) - return (0); - if (n < 1) - return (0); - - ybuffer = buffer; - - inc_x *= 2; - inc_y *= 2; - lda *= 2; - lda4 = 4 * lda; - - n1 = n / 4; - n2 = n % 4; - - m3 = m % 4; - m1 = m - (m % 4); - m2 = (m % NBMAX) - (m % 4); - - y_ptr = y; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) - break; - NB = m2; - } - - a_ptr = a; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - x_ptr = x; - //zero_y(NB,ybuffer); - memset(ybuffer, 0, NB * 8); - - if (inc_x == 2) { - - for (i = 0; i < n1; i++) { - cgemv_kernel_4x4(NB, ap, x_ptr, ybuffer); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr += 8; - } - - if (n2 & 2) { - cgemv_kernel_4x2(NB, ap, x_ptr, ybuffer); - x_ptr += 4; - a_ptr += 2 * lda; - - } - - if (n2 & 1) { - cgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer); - /* x_ptr += 2; - a_ptr += lda; */ - - } - } else { - - for (i = 0; i < n1; i++) { - - xbuffer[0] = x_ptr[0]; - xbuffer[1] = x_ptr[1]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - xbuffer[3] = x_ptr[1]; - x_ptr += inc_x; - xbuffer[4] = x_ptr[0]; - xbuffer[5] = x_ptr[1]; - x_ptr += inc_x; - xbuffer[6] = x_ptr[0]; - xbuffer[7] = x_ptr[1]; - x_ptr += inc_x; - - cgemv_kernel_4x4(NB, ap, xbuffer, ybuffer); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for (i = 0; i < n2; i++) { - xbuffer[0] = x_ptr[0]; - xbuffer[1] = x_ptr[1]; - x_ptr += inc_x; - cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer); - a_ptr += 1 * lda; - - } - - } - - add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i); - a += 2 * NB; - y_ptr += NB * inc_y; - } - - if (m3 == 0) - return (0); - - if (m3 == 1) { - a_ptr = a; - x_ptr = x; - FLOAT temp_r = 0.0; - FLOAT temp_i = 0.0; - - if (lda == 2 && inc_x == 2) { - - for (i = 0; i < (n & -2); i += 2) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; - temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3]; - temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2]; -#else - temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; - temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3]; - temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2]; -#endif - - a_ptr += 4; - x_ptr += 4; - } - - for (; i < n; i++) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; -#else - temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; -#endif - - a_ptr += 2; - x_ptr += 2; - } - - } else { - - for (i = 0; i < n; i++) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; -#else - temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; -#endif - - a_ptr += lda; - x_ptr += inc_x; - } - - } -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; -#endif - return (0); - } - - if (m3 == 2) { - a_ptr = a; - x_ptr = x; - FLOAT temp_r0 = 0.0; - FLOAT temp_i0 = 0.0; - FLOAT temp_r1 = 0.0; - FLOAT temp_i1 = 0.0; - - if (lda == 4 && inc_x == 2) { - - for (i = 0; i < (n & -2); i += 2) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - - temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; - - temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3]; - temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2]; - temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3]; - temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2]; - -#else - temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; - - temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3]; - temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2]; - temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2]; - -#endif - - a_ptr += 8; - x_ptr += 4; - } - - for (; i < n; i++) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; -#else - temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; -#endif - - a_ptr += 4; - x_ptr += 2; - } - - } else { - - for (i = 0; i < n; i++) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; -#else - temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; -#endif - - a_ptr += lda; - x_ptr += inc_x; - } - - } -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; -#else - y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; -#endif - return (0); - } - - if (m3 == 3) { - a_ptr = a; - x_ptr = x; - FLOAT temp_r0 = 0.0; - FLOAT temp_i0 = 0.0; - FLOAT temp_r1 = 0.0; - FLOAT temp_i1 = 0.0; - FLOAT temp_r2 = 0.0; - FLOAT temp_i2 = 0.0; - - if (lda == 6 && inc_x == 2) { - - for (i = 0; i < n; i++) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; - temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; - temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; -#else - temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; - temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; -#endif - - a_ptr += 6; - x_ptr += 2; - } - - } else { - - for (i = 0; i < n; i++) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; - temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; - temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; -#else - temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; - temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; -#endif - - a_ptr += lda; - x_ptr += inc_x; - } - - } -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2; - y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2; -#else - y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2; - y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2; -#endif - return (0); - } - - return (0); -} diff --git a/kernel/zarch/cgemv_t_4.c b/kernel/zarch/cgemv_t_4.c deleted file mode 100644 index e10edfab0..000000000 --- a/kernel/zarch/cgemv_t_4.c +++ /dev/null @@ -1,724 +0,0 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#define NBMAX 2048 - -static void cgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, - FLOAT *alpha) { - register FLOAT *ap0 = ap[0]; - register FLOAT *ap1 = ap[1]; - register FLOAT *ap2 = ap[2]; - register FLOAT *ap3 = ap[3]; - - __asm__("vzero %%v16\n\t" - "vzero %%v17\n\t" - "vzero %%v18\n\t" - "vzero %%v19\n\t" - "vzero %%v20\n\t" - "vzero %%v21\n\t" - "vzero %%v22\n\t" - "vzero %%v23\n\t" - "vleib %%v2,0,0\n\t" - "vleib %%v2,1,1\n\t" - "vleib %%v2,2,2\n\t" - "vleib %%v2,3,3\n\t" - "vleib %%v2,0,4\n\t" - "vleib %%v2,1,5\n\t" - "vleib %%v2,2,6\n\t" - "vleib %%v2,3,7\n\t" - "vleib %%v2,8,8\n\t" - "vleib %%v2,9,9\n\t" - "vleib %%v2,10,10\n\t" - "vleib %%v2,11,11\n\t" - "vleib %%v2,8,12\n\t" - "vleib %%v2,9,13\n\t" - "vleib %%v2,10,14\n\t" - "vleib %%v2,11,15\n\t" - "vleib %%v3,4,0\n\t" - "vleib %%v3,5,1\n\t" - "vleib %%v3,6,2\n\t" - "vleib %%v3,7,3\n\t" - "vleib %%v3,4,4\n\t" - "vleib %%v3,5,5\n\t" - "vleib %%v3,6,6\n\t" - "vleib %%v3,7,7\n\t" - "vleib %%v3,12,8\n\t" - "vleib %%v3,13,9\n\t" - "vleib %%v3,14,10\n\t" - "vleib %%v3,15,11\n\t" - "vleib %%v3,12,12\n\t" - "vleib %%v3,13,13\n\t" - "vleib %%v3,14,14\n\t" - "vleib %%v3,15,15\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x])\n\t" -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v1,4(%%r1,%[x]),0\n\t" - "vlef %%v1,12(%%r1,%[x]),2\n\t" - "vflcsb %%v1,%%v1\n\t" - "vlef %%v1,0(%%r1,%[x]),1\n\t" - "vlef %%v1,8(%%r1,%[x]),3\n\t" -#else - "vlef %%v1,0(%%r1,%[x]),1\n\t" - "vlef %%v1,8(%%r1,%[x]),3\n\t" - "vflcsb %%v1,%%v1\n\t" - "vlef %%v1,4(%%r1,%[x]),0\n\t" - "vlef %%v1,12(%%r1,%[x]),2\n\t" -#endif - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vperm %%v25,%%v24,%%v24,%%v3\n\t" - "vperm %%v24,%%v24,%%v24,%%v2\n\t" - "vl %%v26,0(%%r1,%[ap1])\n\t" - "vperm %%v27,%%v26,%%v26,%%v3\n\t" - "vperm %%v26,%%v26,%%v26,%%v2\n\t" - "vl %%v28,0(%%r1,%[ap2])\n\t" - "vperm %%v29,%%v28,%%v28,%%v3\n\t" - "vperm %%v28,%%v28,%%v28,%%v2\n\t" - "vl %%v30,0(%%r1,%[ap3])\n\t" - "vperm %%v31,%%v30,%%v30,%%v3\n\t" - "vperm %%v30,%%v30,%%v30,%%v2\n\t" - "vfmasb %%v16,%%v24,%%v0,%%v16\n\t" - "vfmasb %%v20,%%v25,%%v1,%%v20\n\t" - "vfmasb %%v17,%%v26,%%v0,%%v17\n\t" - "vfmasb %%v21,%%v27,%%v1,%%v21\n\t" - "vfmasb %%v18,%%v28,%%v0,%%v18\n\t" - "vfmasb %%v22,%%v29,%%v1,%%v22\n\t" - "vfmasb %%v19,%%v30,%%v0,%%v19\n\t" - "vfmasb %%v23,%%v31,%%v1,%%v23\n\t" - "agfi %%r1,16\n\t" - "brctg %[n],0b\n\t" - "vfasb %%v16,%%v16,%%v20\n\t" - "vfasb %%v17,%%v17,%%v21\n\t" - "vfasb %%v18,%%v18,%%v22\n\t" - "vfasb %%v19,%%v19,%%v23\n\t" - "vrepg %%v20,%%v16,1\n\t" - "vrepg %%v21,%%v17,1\n\t" - "vrepg %%v22,%%v18,1\n\t" - "vrepg %%v23,%%v19,1\n\t" - "vfasb %%v16,%%v16,%%v20\n\t" - "vfasb %%v17,%%v17,%%v21\n\t" - "vfasb %%v18,%%v18,%%v22\n\t" - "vfasb %%v19,%%v19,%%v23\n\t" - "vmrhg %%v16,%%v16,%%v17\n\t" - "vmrhg %%v17,%%v18,%%v19\n\t" - "verllg %%v18,%%v16,32\n\t" - "verllg %%v19,%%v17,32\n\t" -#if !defined(XCONJ) - "vlrepf %%v20,0(%[alpha])\n\t" - "vlef %%v21,4(%[alpha]),0\n\t" - "vlef %%v21,4(%[alpha]),2\n\t" - "vflcsb %%v21,%%v21\n\t" - "vlef %%v21,4(%[alpha]),1\n\t" - "vlef %%v21,4(%[alpha]),3\n\t" -#else - "vlef %%v20,0(%[alpha]),1\n\t" - "vlef %%v20,0(%[alpha]),3\n\t" - "vflcsb %%v20,%%v20\n\t" - "vlef %%v20,0(%[alpha]),0\n\t" - "vlef %%v20,0(%[alpha]),2\n\t" - "vlrepf %%v21,4(%[alpha])\n\t" -#endif - "vl %%v22,0(%[y])\n\t" - "vl %%v23,16(%[y])\n\t" - "vfmasb %%v22,%%v16,%%v20,%%v22\n\t" - "vfmasb %%v22,%%v18,%%v21,%%v22\n\t" - "vfmasb %%v23,%%v17,%%v20,%%v23\n\t" - "vfmasb %%v23,%%v19,%%v21,%%v23\n\t" - "vst %%v22,0(%[y])\n\t" - "vst %%v23,16(%[y])" - : "+m"(*(struct { FLOAT x[8]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3), - "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); -} - -static void cgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, - FLOAT *alpha) { - register FLOAT *ap0 = ap[0]; - register FLOAT *ap1 = ap[1]; - - __asm__("vzero %%v16\n\t" - "vzero %%v17\n\t" - "vzero %%v18\n\t" - "vzero %%v19\n\t" - "vleib %%v2,0,0\n\t" - "vleib %%v2,1,1\n\t" - "vleib %%v2,2,2\n\t" - "vleib %%v2,3,3\n\t" - "vleib %%v2,0,4\n\t" - "vleib %%v2,1,5\n\t" - "vleib %%v2,2,6\n\t" - "vleib %%v2,3,7\n\t" - "vleib %%v2,8,8\n\t" - "vleib %%v2,9,9\n\t" - "vleib %%v2,10,10\n\t" - "vleib %%v2,11,11\n\t" - "vleib %%v2,8,12\n\t" - "vleib %%v2,9,13\n\t" - "vleib %%v2,10,14\n\t" - "vleib %%v2,11,15\n\t" - "vleib %%v3,4,0\n\t" - "vleib %%v3,5,1\n\t" - "vleib %%v3,6,2\n\t" - "vleib %%v3,7,3\n\t" - "vleib %%v3,4,4\n\t" - "vleib %%v3,5,5\n\t" - "vleib %%v3,6,6\n\t" - "vleib %%v3,7,7\n\t" - "vleib %%v3,12,8\n\t" - "vleib %%v3,13,9\n\t" - "vleib %%v3,14,10\n\t" - "vleib %%v3,15,11\n\t" - "vleib %%v3,12,12\n\t" - "vleib %%v3,13,13\n\t" - "vleib %%v3,14,14\n\t" - "vleib %%v3,15,15\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x])\n\t" -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v1,4(%%r1,%[x]),0\n\t" - "vlef %%v1,12(%%r1,%[x]),2\n\t" - "vflcsb %%v1,%%v1\n\t" - "vlef %%v1,0(%%r1,%[x]),1\n\t" - "vlef %%v1,8(%%r1,%[x]),3\n\t" -#else - "vlef %%v1,0(%%r1,%[x]),1\n\t" - "vlef %%v1,8(%%r1,%[x]),3\n\t" - "vflcsb %%v1,%%v1\n\t" - "vlef %%v1,4(%%r1,%[x]),0\n\t" - "vlef %%v1,12(%%r1,%[x]),2\n\t" -#endif - "vl %%v20,0(%%r1,%[ap0])\n\t" - "vperm %%v21,%%v20,%%v20,%%v3\n\t" - "vperm %%v20,%%v20,%%v20,%%v2\n\t" - "vl %%v22,0(%%r1,%[ap1])\n\t" - "vperm %%v23,%%v22,%%v22,%%v3\n\t" - "vperm %%v22,%%v22,%%v22,%%v2\n\t" - "vfmasb %%v16,%%v20,%%v0,%%v16\n\t" - "vfmasb %%v18,%%v21,%%v1,%%v18\n\t" - "vfmasb %%v17,%%v22,%%v0,%%v17\n\t" - "vfmasb %%v19,%%v23,%%v1,%%v19\n\t" - "agfi %%r1,16\n\t" - "brctg %[n],0b\n\t" - "vfasb %%v16,%%v16,%%v18\n\t" - "vfasb %%v17,%%v17,%%v19\n\t" - "vrepg %%v18,%%v16,1\n\t" - "vrepg %%v19,%%v17,1\n\t" - "vfasb %%v16,%%v16,%%v18\n\t" - "vfasb %%v17,%%v17,%%v19\n\t" - "vmrhg %%v16,%%v16,%%v17\n\t" - "verllg %%v17,%%v16,32\n\t" -#if !defined(XCONJ) - "vlrepf %%v18,0(%[alpha])\n\t" - "vlef %%v19,4(%[alpha]),0\n\t" - "vlef %%v19,4(%[alpha]),2\n\t" - "vflcsb %%v19,%%v19\n\t" - "vlef %%v19,4(%[alpha]),1\n\t" - "vlef %%v19,4(%[alpha]),3\n\t" -#else - "vlef %%v18,0(%[alpha]),1\n\t" - "vlef %%v18,0(%[alpha]),3\n\t" - "vflcsb %%v18,%%v18\n\t" - "vlef %%v18,0(%[alpha]),0\n\t" - "vlef %%v18,0(%[alpha]),2\n\t" - "vlrepf %%v19,4(%[alpha])\n\t" -#endif - "vl %%v20,0(%[y])\n\t" - "vfmasb %%v20,%%v16,%%v18,%%v20\n\t" - "vfmasb %%v20,%%v17,%%v19,%%v20\n\t" - "vst %%v20,0(%[y])" - : "+m"(*(struct { FLOAT x[4]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23"); -} - -static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, - FLOAT *alpha) { - __asm__("vzero %%v16\n\t" - "vzero %%v17\n\t" - "vleib %%v2,0,0\n\t" - "vleib %%v2,1,1\n\t" - "vleib %%v2,2,2\n\t" - "vleib %%v2,3,3\n\t" - "vleib %%v2,0,4\n\t" - "vleib %%v2,1,5\n\t" - "vleib %%v2,2,6\n\t" - "vleib %%v2,3,7\n\t" - "vleib %%v2,8,8\n\t" - "vleib %%v2,9,9\n\t" - "vleib %%v2,10,10\n\t" - "vleib %%v2,11,11\n\t" - "vleib %%v2,8,12\n\t" - "vleib %%v2,9,13\n\t" - "vleib %%v2,10,14\n\t" - "vleib %%v2,11,15\n\t" - "vleib %%v3,4,0\n\t" - "vleib %%v3,5,1\n\t" - "vleib %%v3,6,2\n\t" - "vleib %%v3,7,3\n\t" - "vleib %%v3,4,4\n\t" - "vleib %%v3,5,5\n\t" - "vleib %%v3,6,6\n\t" - "vleib %%v3,7,7\n\t" - "vleib %%v3,12,8\n\t" - "vleib %%v3,13,9\n\t" - "vleib %%v3,14,10\n\t" - "vleib %%v3,15,11\n\t" - "vleib %%v3,12,12\n\t" - "vleib %%v3,13,13\n\t" - "vleib %%v3,14,14\n\t" - "vleib %%v3,15,15\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x])\n\t" -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vlef %%v1,4(%%r1,%[x]),0\n\t" - "vlef %%v1,12(%%r1,%[x]),2\n\t" - "vflcsb %%v1,%%v1\n\t" - "vlef %%v1,0(%%r1,%[x]),1\n\t" - "vlef %%v1,8(%%r1,%[x]),3\n\t" -#else - "vlef %%v1,0(%%r1,%[x]),1\n\t" - "vlef %%v1,8(%%r1,%[x]),3\n\t" - "vflcsb %%v1,%%v1\n\t" - "vlef %%v1,4(%%r1,%[x]),0\n\t" - "vlef %%v1,12(%%r1,%[x]),2\n\t" -#endif - "vl %%v18,0(%%r1,%[ap])\n\t" - "vperm %%v19,%%v18,%%v18,%%v3\n\t" - "vperm %%v18,%%v18,%%v18,%%v2\n\t" - "vfmasb %%v16,%%v18,%%v0,%%v16\n\t" - "vfmasb %%v17,%%v19,%%v1,%%v17\n\t" - "agfi %%r1,16\n\t" - "brctg %[n],0b\n\t" - "vfasb %%v16,%%v16,%%v17\n\t" - "vrepg %%v17,%%v16,1\n\t" - "vfasb %%v16,%%v16,%%v17\n\t" - "verllg %%v17,%%v16,32\n\t" -#if !defined(XCONJ) - "vlrepf %%v18,0(%[alpha])\n\t" - "vlef %%v19,4(%[alpha]),0\n\t" - "vflcsb %%v19,%%v19\n\t" - "vlef %%v19,4(%[alpha]),1\n\t" -#else - "vlef %%v18,0(%[alpha]),1\n\t" - "vflcsb %%v18,%%v18\n\t" - "vlef %%v18,0(%[alpha]),0\n\t" - "vlrepf %%v19,4(%[alpha])\n\t" -#endif - "vleg %%v0,0(%[y]),0\n\t" - "vfmasb %%v0,%%v16,%%v18,%%v0\n\t" - "vfmasb %%v0,%%v17,%%v19,%%v0\n\t" - "vsteg %%v0,0(%[y]),0" - : "+m"(*(struct { FLOAT x[2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap), - "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19"); -} - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest = *src; - *(dest + 1) = *(src + 1); - dest += 2; - src += inc_src; - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, - FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, - BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i; - BLASLONG j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - FLOAT *ap[8]; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - BLASLONG lda4; - FLOAT ybuffer[8], *xbuffer; - FLOAT alpha[2]; - - if (m < 1) - return (0); - if (n < 1) - return (0); - - inc_x <<= 1; - inc_y <<= 1; - lda <<= 1; - lda4 = lda << 2; - - xbuffer = buffer; - - n1 = n >> 2; - n2 = n & 3; - - m3 = m & 3; - m1 = m - m3; - m2 = (m & (NBMAX - 1)) - m3; - - alpha[0] = alpha_r; - alpha[1] = alpha_i; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) - break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - if (inc_x != 2) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - if (inc_y == 2) { - - for (i = 0; i < n1; i++) { - cgemv_kernel_4x4(NB, ap, xbuffer, y_ptr, alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - y_ptr += 8; - - } - - if (n2 & 2) { - cgemv_kernel_4x2(NB, ap, xbuffer, y_ptr, alpha); - a_ptr += lda * 2; - y_ptr += 4; - - } - - if (n2 & 1) { - cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); - /* a_ptr += lda; - y_ptr += 2; */ - - } - - } else { - - for (i = 0; i < n1; i++) { - memset(ybuffer, 0, sizeof(ybuffer)); - cgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[2]; - y_ptr[1] += ybuffer[3]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[4]; - y_ptr[1] += ybuffer[5]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[6]; - y_ptr[1] += ybuffer[7]; - y_ptr += inc_y; - - } - - for (i = 0; i < n2; i++) { - memset(ybuffer, 0, sizeof(ybuffer)); - cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha); - a_ptr += lda; - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - - } - - } - a += 2 * NB; - x += NB * inc_x; - } - - if (m3 == 0) - return (0); - - x_ptr = x; - j = 0; - a_ptr = a; - y_ptr = y; - - if (m3 == 3) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x4 = x_ptr[0]; - FLOAT x5 = x_ptr[1]; - while (j < n) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; -#endif - -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; -#else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return (0); - } - - if (m3 == 2) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - FLOAT ar = alpha[0]; - FLOAT ai = alpha[1]; - - while (j < (n & -2)) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; -#endif - -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 - ai * temp_i1; - y_ptr[1] += ar * temp_i1 + ai * temp_r1; -#else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 + ai * temp_i1; - y_ptr[1] -= ar * temp_i1 - ai * temp_r1; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } - - while (j < n) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; -#endif - -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; -#else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j++; - } - - return (0); - } - - if (m3 == 1) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - FLOAT ar = alpha[0]; - FLOAT ai = alpha[1]; - - while (j < (n & -2)) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; -#endif - -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 - ai * temp_i1; - y_ptr[1] += ar * temp_i1 + ai * temp_r1; -#else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 + ai * temp_i1; - y_ptr[1] -= ar * temp_i1 - ai * temp_r1; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } - - while (j < n) { -#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; -#else - - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; -#endif - -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; -#else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; -#endif - - a_ptr += lda; - y_ptr += inc_y; - j++; - } - return (0); - } - - return (0); -} diff --git a/kernel/zarch/crot.c b/kernel/zarch/crot.c deleted file mode 100644 index aab155f8b..000000000 --- a/kernel/zarch/crot.c +++ /dev/null @@ -1,236 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -static void crot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { - __asm__("vlrepf %%v0,%[c]\n\t" - "vlrepf %%v1,%[s]\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x])\n\t" - "vl %%v25, 16(%%r1,%[x])\n\t" - "vl %%v26, 32(%%r1,%[x])\n\t" - "vl %%v27, 48(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[y])\n\t" - "vl %%v17, 16(%%r1,%[y])\n\t" - "vl %%v18, 32(%%r1,%[y])\n\t" - "vl %%v19, 48(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x])\n\t" - "vst %%v29, 16(%%r1,%[x])\n\t" - "vst %%v30, 32(%%r1,%[x])\n\t" - "vst %%v31, 48(%%r1,%[x])\n\t" - "vst %%v20, 0(%%r1,%[y])\n\t" - "vst %%v21, 16(%%r1,%[y])\n\t" - "vst %%v22, 32(%%r1,%[y])\n\t" - "vst %%v23, 48(%%r1,%[y])\n\t" - "vl %%v24, 64(%%r1,%[x])\n\t" - "vl %%v25, 80(%%r1,%[x])\n\t" - "vl %%v26, 96(%%r1,%[x])\n\t" - "vl %%v27, 112(%%r1,%[x])\n\t" - "vl %%v16, 64(%%r1,%[y])\n\t" - "vl %%v17, 80(%%r1,%[y])\n\t" - "vl %%v18, 96(%%r1,%[y])\n\t" - "vl %%v19, 112(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x])\n\t" - "vst %%v29, 80(%%r1,%[x])\n\t" - "vst %%v30, 96(%%r1,%[x])\n\t" - "vst %%v31, 112(%%r1,%[x])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v16, 128(%%r1,%[y])\n\t" - "vl %%v17, 144(%%r1,%[y])\n\t" - "vl %%v18, 160(%%r1,%[y])\n\t" - "vl %%v19, 176(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x])\n\t" - "vst %%v29, 144(%%r1,%[x])\n\t" - "vst %%v30, 160(%%r1,%[x])\n\t" - "vst %%v31, 176(%%r1,%[x])\n\t" - "vst %%v20, 128(%%r1,%[y])\n\t" - "vst %%v21, 144(%%r1,%[y])\n\t" - "vst %%v22, 160(%%r1,%[y])\n\t" - "vst %%v23, 176(%%r1,%[y])\n\t" - "vl %%v24, 192(%%r1,%[x])\n\t" - "vl %%v25, 208(%%r1,%[x])\n\t" - "vl %%v26, 224(%%r1,%[x])\n\t" - "vl %%v27, 240(%%r1,%[x])\n\t" - "vl %%v16, 192(%%r1,%[y])\n\t" - "vl %%v17, 208(%%r1,%[y])\n\t" - "vl %%v18, 224(%%r1,%[y])\n\t" - "vl %%v19, 240(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 192(%%r1,%[x])\n\t" - "vst %%v29, 208(%%r1,%[x])\n\t" - "vst %%v30, 224(%%r1,%[x])\n\t" - "vst %%v31, 240(%%r1,%[x])\n\t" - "vst %%v20, 192(%%r1,%[y])\n\t" - "vst %%v21, 208(%%r1,%[y])\n\t" - "vst %%v22, 224(%%r1,%[y])\n\t" - "vst %%v23, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x), - "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); -} - -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, - FLOAT c, FLOAT s) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - FLOAT temp[2]; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if (n <= 0) - return (0); - - if ((inc_x == 1) && (inc_y == 1)) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - FLOAT cosa, sina; - cosa = c; - sina = s; - crot_kernel_32(n1, x, y, &cosa, &sina); - i = n1; - ix = 2 * n1; - } - - while (i < n) { - temp[0] = c * x[ix] + s * y[ix]; - temp[1] = c * x[ix + 1] + s * y[ix + 1]; - y[ix] = c * y[ix] - s * x[ix]; - y[ix + 1] = c * y[ix + 1] - s * x[ix + 1]; - x[ix] = temp[0]; - x[ix + 1] = temp[1]; - - ix += 2; - i++; - - } - - } else { - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; - while (i < n) { - temp[0] = c * x[ix] + s * y[iy]; - temp[1] = c * x[ix + 1] + s * y[iy + 1]; - y[iy] = c * y[iy] - s * x[ix]; - y[iy + 1] = c * y[iy + 1] - s * x[ix + 1]; - x[ix] = temp[0]; - x[ix + 1] = temp[1]; - - ix += inc_x2; - iy += inc_y2; - i++; - - } - - } - return (0); - -} diff --git a/kernel/zarch/cscal.c b/kernel/zarch/cscal.c deleted file mode 100644 index 9fc54cf29..000000000 --- a/kernel/zarch/cscal.c +++ /dev/null @@ -1,429 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -static void cscal_kernel_16(BLASLONG n, FLOAT *alpha, FLOAT *x) { - __asm__("vlrepf %%v0,0(%[alpha])\n\t" - "vlef %%v1,4(%[alpha]),0\n\t" - "vlef %%v1,4(%[alpha]),2\n\t" - "vflcsb %%v1,%%v1\n\t" - "vlef %%v1,4(%[alpha]),1\n\t" - "vlef %%v1,4(%[alpha]),3\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "verllg %%v24,%%v16,32\n\t" - "verllg %%v25,%%v17,32\n\t" - "verllg %%v26,%%v18,32\n\t" - "verllg %%v27,%%v19,32\n\t" - "verllg %%v28,%%v20,32\n\t" - "verllg %%v29,%%v21,32\n\t" - "verllg %%v30,%%v22,32\n\t" - "verllg %%v31,%%v23,32\n\t" - "vfmsb %%v16,%%v16,%%v0\n\t" - "vfmsb %%v17,%%v17,%%v0\n\t" - "vfmsb %%v18,%%v18,%%v0\n\t" - "vfmsb %%v19,%%v19,%%v0\n\t" - "vfmsb %%v20,%%v20,%%v0\n\t" - "vfmsb %%v21,%%v21,%%v0\n\t" - "vfmsb %%v22,%%v22,%%v0\n\t" - "vfmsb %%v23,%%v23,%%v0\n\t" - "vfmasb %%v16,%%v24,%%v1,%%v16\n\t" - "vfmasb %%v17,%%v25,%%v1,%%v17\n\t" - "vfmasb %%v18,%%v26,%%v1,%%v18\n\t" - "vfmasb %%v19,%%v27,%%v1,%%v19\n\t" - "vfmasb %%v20,%%v28,%%v1,%%v20\n\t" - "vfmasb %%v21,%%v29,%%v1,%%v21\n\t" - "vfmasb %%v22,%%v30,%%v1,%%v22\n\t" - "vfmasb %%v23,%%v31,%%v1,%%v23\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), - [alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); -} - -static void cscal_kernel_16_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { - __asm__("vlef %%v0,4(%[alpha]),0\n\t" - "vlef %%v0,4(%[alpha]),2\n\t" - "vflcsb %%v0,%%v0\n\t" - "vlef %%v0,4(%[alpha]),1\n\t" - "vlef %%v0,4(%[alpha]),3\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "verllg %%v16,%%v16,32\n\t" - "verllg %%v17,%%v17,32\n\t" - "verllg %%v18,%%v18,32\n\t" - "verllg %%v19,%%v19,32\n\t" - "verllg %%v20,%%v20,32\n\t" - "verllg %%v21,%%v21,32\n\t" - "verllg %%v22,%%v22,32\n\t" - "verllg %%v23,%%v23,32\n\t" - "vfmsb %%v16,%%v16,%%v0\n\t" - "vfmsb %%v17,%%v17,%%v0\n\t" - "vfmsb %%v18,%%v18,%%v0\n\t" - "vfmsb %%v19,%%v19,%%v0\n\t" - "vfmsb %%v20,%%v20,%%v0\n\t" - "vfmsb %%v21,%%v21,%%v0\n\t" - "vfmsb %%v22,%%v22,%%v0\n\t" - "vfmsb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), - [alpha] "a"(alpha) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23"); -} - -static void cscal_kernel_16_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { - __asm__("vlrepf %%v0,0(%[alpha])\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfmsb %%v16,%%v16,%%v0\n\t" - "vfmsb %%v17,%%v17,%%v0\n\t" - "vfmsb %%v18,%%v18,%%v0\n\t" - "vfmsb %%v19,%%v19,%%v0\n\t" - "vfmsb %%v20,%%v20,%%v0\n\t" - "vfmsb %%v21,%%v21,%%v0\n\t" - "vfmsb %%v22,%%v22,%%v0\n\t" - "vfmsb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), - [alpha] "a"(alpha) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23"); -} - -static void cscal_kernel_16_zero(BLASLONG n, FLOAT *x) { - __asm__("vzero %%v0\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x])\n\t" - "vst %%v0,16(%%r1,%[x])\n\t" - "vst %%v0,32(%%r1,%[x])\n\t" - "vst %%v0,48(%%r1,%[x])\n\t" - "vst %%v0,64(%%r1,%[x])\n\t" - "vst %%v0,80(%%r1,%[x])\n\t" - "vst %%v0,96(%%r1,%[x])\n\t" - "vst %%v0,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "=m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) - : [x] "a"(x) - : "cc", "r1", "v0"); -} - -static void cscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, - BLASLONG inc_x) { - BLASLONG i; - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_x3 = inc_x2 + inc_x; - FLOAT t0, t1, t2, t3; - FLOAT da_r = alpha[0]; - FLOAT da_i = alpha[1]; - - for (i = 0; i < n; i += 4) { - t0 = da_r * x[0] - da_i * x[1]; - t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; - t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; - t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; - - x[1] = da_i * x[0] + da_r * x[1]; - x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; - x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; - x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; - - x[0] = t0; - x[inc_x] = t1; - x[inc_x2] = t2; - x[inc_x3] = t3; - - x += 4 * inc_x; - } -} - -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, - FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, - BLASLONG dummy2) { - BLASLONG i = 0, j = 0; - FLOAT temp0; - FLOAT temp1; - FLOAT alpha[2] __attribute__ ((aligned(16))); - - if (inc_x != 1) { - inc_x <<= 1; - - if (da_r == 0.0) { - - BLASLONG n1 = n & -2; - - if (da_i == 0.0) { - - while (j < n1) { - - x[i] = 0.0; - x[i + 1] = 0.0; - x[i + inc_x] = 0.0; - x[i + 1 + inc_x] = 0.0; - i += 2 * inc_x; - j += 2; - - } - - while (j < n) { - - x[i] = 0.0; - x[i + 1] = 0.0; - i += inc_x; - j++; - - } - - } else { - - while (j < n1) { - - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - temp1 = -da_i * x[i + 1 + inc_x]; - x[i + 1 + inc_x] = da_i * x[i + inc_x]; - x[i + inc_x] = temp1; - i += 2 * inc_x; - j += 2; - - } - - while (j < n) { - - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - i += inc_x; - j++; - - } - - } - - } else { - - if (da_i == 0.0) { - BLASLONG n1 = n & -2; - - while (j < n1) { - - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - temp1 = da_r * x[i + inc_x]; - x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; - x[i + inc_x] = temp1; - i += 2 * inc_x; - j += 2; - - } - - while (j < n) { - - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - i += inc_x; - j++; - - } - - } else { - - BLASLONG n1 = n & -8; - if (n1 > 0) { - alpha[0] = da_r; - alpha[1] = da_i; - cscal_kernel_inc_8(n1, alpha, x, inc_x); - j = n1; - i = n1 * inc_x; - } - - while (j < n) { - - temp0 = da_r * x[i] - da_i * x[i + 1]; - x[i + 1] = da_r * x[i + 1] + da_i * x[i]; - x[i] = temp0; - i += inc_x; - j++; - - } - - } - - } - - return (0); - } - - BLASLONG n1 = n & -16; - if (n1 > 0) { - - alpha[0] = da_r; - alpha[1] = da_i; - - if (da_r == 0.0) - if (da_i == 0) - cscal_kernel_16_zero(n1, x); - else - cscal_kernel_16_zero_r(n1, alpha, x); - else if (da_i == 0) - cscal_kernel_16_zero_i(n1, alpha, x); - else - cscal_kernel_16(n1, alpha, x); - - i = n1 << 1; - j = n1; - } - - if (da_r == 0.0) { - - if (da_i == 0.0) { - - while (j < n) { - - x[i] = 0.0; - x[i + 1] = 0.0; - i += 2; - j++; - - } - - } else { - - while (j < n) { - - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - i += 2; - j++; - - } - - } - - } else { - - if (da_i == 0.0) { - - while (j < n) { - - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - i += 2; - j++; - - } - - } else { - - while (j < n) { - - temp0 = da_r * x[i] - da_i * x[i + 1]; - x[i + 1] = da_r * x[i + 1] + da_i * x[i]; - x[i] = temp0; - i += 2; - j++; - - } - - } - - } - - return (0); -} diff --git a/kernel/zarch/csum.c b/kernel/zarch/csum.c deleted file mode 100644 index c0b8c6371..000000000 --- a/kernel/zarch/csum.c +++ /dev/null @@ -1,137 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -static FLOAT csum_kernel_32(BLASLONG n, FLOAT *x) { - FLOAT sum; - - __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vfasb %%v24,%%v24,%%v16\n\t" - "vfasb %%v25,%%v25,%%v17\n\t" - "vfasb %%v26,%%v26,%%v18\n\t" - "vfasb %%v27,%%v27,%%v19\n\t" - "vfasb %%v28,%%v28,%%v20\n\t" - "vfasb %%v29,%%v29,%%v21\n\t" - "vfasb %%v30,%%v30,%%v22\n\t" - "vfasb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x])\n\t" - "vl %%v17, 144(%%r1,%[x])\n\t" - "vl %%v18, 160(%%r1,%[x])\n\t" - "vl %%v19, 176(%%r1,%[x])\n\t" - "vl %%v20, 192(%%r1,%[x])\n\t" - "vl %%v21, 208(%%r1,%[x])\n\t" - "vl %%v22, 224(%%r1,%[x])\n\t" - "vl %%v23, 240(%%r1,%[x])\n\t" - "vfasb %%v24,%%v24,%%v16\n\t" - "vfasb %%v25,%%v25,%%v17\n\t" - "vfasb %%v26,%%v26,%%v18\n\t" - "vfasb %%v27,%%v27,%%v19\n\t" - "vfasb %%v28,%%v28,%%v20\n\t" - "vfasb %%v29,%%v29,%%v21\n\t" - "vfasb %%v30,%%v30,%%v22\n\t" - "vfasb %%v31,%%v31,%%v23\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vfasb %%v24,%%v24,%%v26\n\t" - "vfasb %%v24,%%v24,%%v27\n\t" - "vfasb %%v24,%%v24,%%v28\n\t" - "vfasb %%v24,%%v24,%%v29\n\t" - "vfasb %%v24,%%v24,%%v30\n\t" - "vfasb %%v24,%%v24,%%v31\n\t" - "veslg %%v25,%%v24,32\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vrepf %%v25,%%v24,2\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vstef %%v24,%[asum],0" - : [sum] "=Q"(sum),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) - : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - - return sum; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ip = 0; - FLOAT sumf = 0.0; - BLASLONG n1; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) - return (sumf); - - if (inc_x == 1) { - - n1 = n & -32; - if (n1 > 0) { - - sumf = csum_kernel_32(n1, x); - i = n1; - ip = 2 * n1; - } - - while (i < n) { - sumf += x[ip] + x[ip + 1]; - i++; - ip += 2; - } - - } else { - inc_x2 = 2 * inc_x; - - while (i < n) { - sumf += x[ip] + x[ip + 1]; - ip += inc_x2; - i++; - } - - } - return (sumf); -} diff --git a/kernel/zarch/cswap.c b/kernel/zarch/cswap.c deleted file mode 100644 index 198994e18..000000000 --- a/kernel/zarch/cswap.c +++ /dev/null @@ -1,169 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -static void cswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { - __asm__("srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v28, 192(%%r1,%[x])\n\t" - "vl %%v29, 208(%%r1,%[x])\n\t" - "vl %%v30, 224(%%r1,%[x])\n\t" - "vl %%v31, 240(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "vl %%v4, 64(%%r1,%[y])\n\t" - "vl %%v5, 80(%%r1,%[y])\n\t" - "vl %%v6, 96(%%r1,%[y])\n\t" - "vl %%v7, 112(%%r1,%[y])\n\t" - "vst %%v0, 0(%%r1,%[x])\n\t" - "vst %%v1, 16(%%r1,%[x])\n\t" - "vst %%v2, 32(%%r1,%[x])\n\t" - "vst %%v3, 48(%%r1,%[x])\n\t" - "vst %%v4, 64(%%r1,%[x])\n\t" - "vst %%v5, 80(%%r1,%[x])\n\t" - "vst %%v6, 96(%%r1,%[x])\n\t" - "vst %%v7, 112(%%r1,%[x])\n\t" - "vl %%v0, 128(%%r1,%[y])\n\t" - "vl %%v1, 144(%%r1,%[y])\n\t" - "vl %%v2, 160(%%r1,%[y])\n\t" - "vl %%v3, 176(%%r1,%[y])\n\t" - "vl %%v4, 192(%%r1,%[y])\n\t" - "vl %%v5, 208(%%r1,%[y])\n\t" - "vl %%v6, 224(%%r1,%[y])\n\t" - "vl %%v7, 240(%%r1,%[y])\n\t" - "vst %%v0, 128(%%r1,%[x])\n\t" - "vst %%v1, 144(%%r1,%[x])\n\t" - "vst %%v2, 160(%%r1,%[x])\n\t" - "vst %%v3, 176(%%r1,%[x])\n\t" - "vst %%v4, 192(%%r1,%[x])\n\t" - "vst %%v5, 208(%%r1,%[x])\n\t" - "vst %%v6, 224(%%r1,%[x])\n\t" - "vst %%v7, 240(%%r1,%[x])\n\t" - "vst %%v16, 0(%%r1,%[y])\n\t" - "vst %%v17, 16(%%r1,%[y])\n\t" - "vst %%v18, 32(%%r1,%[y])\n\t" - "vst %%v19, 48(%%r1,%[y])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vst %%v24, 128(%%r1,%[y])\n\t" - "vst %%v25, 144(%%r1,%[y])\n\t" - "vst %%v26, 160(%%r1,%[y])\n\t" - "vst %%v27, 176(%%r1,%[y])\n\t" - "vst %%v28, 192(%%r1,%[y])\n\t" - "vst %%v29, 208(%%r1,%[y])\n\t" - "vst %%v30, 224(%%r1,%[y])\n\t" - "vst %%v31, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x), - "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); -} - -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, - FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, - FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - FLOAT temp[2]; - BLASLONG inc_x2, inc_y2; - - if (n <= 0) - return (0); - - if ((inc_x == 1) && (inc_y == 1)) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - cswap_kernel_32(n1, x, y); - i = n1; - ix = 2 * n1; - iy = 2 * n1; - } - - while (i < n) { - - temp[0] = x[ix]; - temp[1] = x[ix + 1]; - x[ix] = y[iy]; - x[ix + 1] = y[iy + 1]; - y[iy] = temp[0]; - y[iy + 1] = temp[1]; - - ix += 2; - iy += 2; - i++; - - } - - } else { - - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; - - while (i < n) { - - temp[0] = x[ix]; - temp[1] = x[ix + 1]; - x[ix] = y[iy]; - x[ix + 1] = y[iy + 1]; - y[iy] = temp[0]; - y[iy + 1] = temp[1]; - - ix += inc_x2; - iy += inc_y2; - i++; - - } - - } - return (0); - -} diff --git a/kernel/zarch/damax.c b/kernel/zarch/damax.c deleted file mode 100644 index caacb50dc..000000000 --- a/kernel/zarch/damax.c +++ /dev/null @@ -1,150 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -#define ABS fabs - -static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { - FLOAT amax; - - __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfmaxdb %%v16,%%v16,%%v24,8\n\t" - "vfmaxdb %%v17,%%v17,%%v25,8\n\t" - "vfmaxdb %%v18,%%v18,%%v26,8\n\t" - "vfmaxdb %%v19,%%v19,%%v27,8\n\t" - "vfmaxdb %%v20,%%v20,%%v28,8\n\t" - "vfmaxdb %%v21,%%v21,%%v29,8\n\t" - "vfmaxdb %%v22,%%v22,%%v30,8\n\t" - "vfmaxdb %%v23,%%v23,%%v31,8\n\t" - "vfmaxdb %%v16,%%v16,%%v20,8\n\t" - "vfmaxdb %%v17,%%v17,%%v21,8\n\t" - "vfmaxdb %%v18,%%v18,%%v22,8\n\t" - "vfmaxdb %%v19,%%v19,%%v23,8\n\t" - "vfmaxdb %%v16,%%v16,%%v18,8\n\t" - "vfmaxdb %%v17,%%v17,%%v19,8\n\t" - "vfmaxdb %%v16,%%v16,%%v17,8\n\t" - "vfmaxdb %%v0,%%v0,%%v16,8\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfmaxdb %%v0,%%v0,%%v16,8\n\t" - "lpdr %[amax],%%f0" - : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - - return amax; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - - if (n <= 0 || inc_x <= 0) - return (maxf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - - maxf = damax_kernel_32(n1, x); - - i = n1; - } else { - maxf = ABS(x[0]); - i++; - } - - while (i < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i++; - } - return (maxf); - - } else { - - maxf = ABS(x[0]); - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - maxf = ABS(x[i + 3 * inc_x]); - } - - i += inc_x * 4; - - j += 4; - - } - - while (j < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (maxf); - } -} diff --git a/kernel/zarch/damax_z13.c b/kernel/zarch/damax_z13.c deleted file mode 100644 index f3db4c108..000000000 --- a/kernel/zarch/damax_z13.c +++ /dev/null @@ -1,184 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -#define ABS fabs - -static FLOAT damax_kernel_32(BLASLONG n, FLOAT *x) { - FLOAT amax; - - __asm__("vl %%v0,0(%[x])\n\t" - "vflpdb %%v0,%%v0\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchdb %%v24,%%v16,%%v17\n\t" - "vfchdb %%v25,%%v18,%%v19\n\t" - "vfchdb %%v26,%%v20,%%v21\n\t" - "vfchdb %%v27,%%v22,%%v23\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v24,%%v25\n\t" - "vfchdb %%v29,%%v26,%%v27\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v28,%%v29\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v30,%%v0\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchdb %%v24,%%v16,%%v17\n\t" - "vfchdb %%v25,%%v18,%%v19\n\t" - "vfchdb %%v26,%%v20,%%v21\n\t" - "vfchdb %%v27,%%v22,%%v23\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v24,%%v25\n\t" - "vfchdb %%v29,%%v26,%%v27\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v28,%%v29\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v30,%%v0\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfchdb %%v17,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v17\n\t" - "ldr %[amax],%%f0" - : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - - return amax; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - - if (n <= 0 || inc_x <= 0) - return (maxf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - - maxf = damax_kernel_32(n1, x); - - i = n1; - } else { - maxf = ABS(x[0]); - i++; - } - - while (i < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i++; - } - return (maxf); - - } else { - - maxf = ABS(x[0]); - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - maxf = ABS(x[i + 3 * inc_x]); - } - - i += inc_x * 4; - - j += 4; - - } - - while (j < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (maxf); - } -} diff --git a/kernel/zarch/damin.c b/kernel/zarch/damin.c deleted file mode 100644 index 0163a144b..000000000 --- a/kernel/zarch/damin.c +++ /dev/null @@ -1,150 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -#define ABS fabs - -static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { - FLOAT amin; - - __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfmindb %%v16,%%v16,%%v24,8\n\t" - "vfmindb %%v17,%%v17,%%v25,8\n\t" - "vfmindb %%v18,%%v18,%%v26,8\n\t" - "vfmindb %%v19,%%v19,%%v27,8\n\t" - "vfmindb %%v20,%%v20,%%v28,8\n\t" - "vfmindb %%v21,%%v21,%%v29,8\n\t" - "vfmindb %%v22,%%v22,%%v30,8\n\t" - "vfmindb %%v23,%%v23,%%v31,8\n\t" - "vfmindb %%v16,%%v16,%%v20,8\n\t" - "vfmindb %%v17,%%v17,%%v21,8\n\t" - "vfmindb %%v18,%%v18,%%v22,8\n\t" - "vfmindb %%v19,%%v19,%%v23,8\n\t" - "vfmindb %%v16,%%v16,%%v18,8\n\t" - "vfmindb %%v17,%%v17,%%v19,8\n\t" - "vfmindb %%v16,%%v16,%%v17,8\n\t" - "vfmindb %%v0,%%v0,%%v16,8\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfmindb %%v0,%%v0,%%v16,8\n\t" - "lpdr %[amin],%%f0" - : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - - return amin; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - - if (n <= 0 || inc_x <= 0) - return (minf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - - minf = damin_kernel_32(n1, x); - - i = n1; - } else { - minf = ABS(x[0]); - i++; - } - - while (i < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i++; - } - return (minf); - - } else { - - minf = ABS(x[0]); - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - minf = ABS(x[i + 3 * inc_x]); - } - - i += inc_x * 4; - - j += 4; - - } - - while (j < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (minf); - } -} diff --git a/kernel/zarch/damin_z13.c b/kernel/zarch/damin_z13.c deleted file mode 100644 index 4196b2e15..000000000 --- a/kernel/zarch/damin_z13.c +++ /dev/null @@ -1,184 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -#define ABS fabs - -static FLOAT damin_kernel_32(BLASLONG n, FLOAT *x) { - FLOAT amin; - - __asm__("vl %%v0,0(%[x])\n\t" - "vflpdb %%v0,%%v0\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchdb %%v24,%%v17,%%v16\n\t" - "vfchdb %%v25,%%v19,%%v18\n\t" - "vfchdb %%v26,%%v21,%%v20\n\t" - "vfchdb %%v27,%%v23,%%v22\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v25,%%v24\n\t" - "vfchdb %%v29,%%v27,%%v26\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v29,%%v28\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v0,%%v30\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchdb %%v24,%%v17,%%v16\n\t" - "vfchdb %%v25,%%v19,%%v18\n\t" - "vfchdb %%v26,%%v21,%%v20\n\t" - "vfchdb %%v27,%%v23,%%v22\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v25,%%v24\n\t" - "vfchdb %%v29,%%v27,%%v26\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v29,%%v28\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v0,%%v30\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfchdb %%v17,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v17\n\t" - "ldr %[amin],%%f0" - : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - - return amin; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - - if (n <= 0 || inc_x <= 0) - return (minf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - - minf = damin_kernel_32(n1, x); - - i = n1; - } else { - minf = ABS(x[0]); - i++; - } - - while (i < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i++; - } - return (minf); - - } else { - - minf = ABS(x[0]); - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - minf = ABS(x[i + 3 * inc_x]); - } - - i += inc_x * 4; - - j += 4; - - } - - while (j < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (minf); - } -} diff --git a/kernel/zarch/dasum.c b/kernel/zarch/dasum.c index aa1382b10..7a42a0863 100644 --- a/kernel/zarch/dasum.c +++ b/kernel/zarch/dasum.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project +Copyright (c) 2013-2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,144 +23,142 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ + *****************************************************************************/ + #include "common.h" #include -#define ABS fabs - -static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { - FLOAT asum; - - __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v24,%%v24,%%v16\n\t" - "vfadb %%v25,%%v25,%%v17\n\t" - "vfadb %%v26,%%v26,%%v18\n\t" - "vfadb %%v27,%%v27,%%v19\n\t" - "vfadb %%v28,%%v28,%%v20\n\t" - "vfadb %%v29,%%v29,%%v21\n\t" - "vfadb %%v30,%%v30,%%v22\n\t" - "vfadb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x])\n\t" - "vl %%v17, 144(%%r1,%[x])\n\t" - "vl %%v18, 160(%%r1,%[x])\n\t" - "vl %%v19, 176(%%r1,%[x])\n\t" - "vl %%v20, 192(%%r1,%[x])\n\t" - "vl %%v21, 208(%%r1,%[x])\n\t" - "vl %%v22, 224(%%r1,%[x])\n\t" - "vl %%v23, 240(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v24,%%v24,%%v16\n\t" - "vfadb %%v25,%%v25,%%v17\n\t" - "vfadb %%v26,%%v26,%%v18\n\t" - "vfadb %%v27,%%v27,%%v19\n\t" - "vfadb %%v28,%%v28,%%v20\n\t" - "vfadb %%v29,%%v29,%%v21\n\t" - "vfadb %%v30,%%v30,%%v22\n\t" - "vfadb %%v31,%%v31,%%v23\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vfadb %%v24,%%v24,%%v26\n\t" - "vfadb %%v24,%%v24,%%v27\n\t" - "vfadb %%v24,%%v24,%%v28\n\t" - "vfadb %%v24,%%v24,%%v29\n\t" - "vfadb %%v24,%%v24,%%v30\n\t" - "vfadb %%v24,%%v24,%%v31\n\t" - "vrepg %%v25,%%v24,1\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vsteg %%v24,%[asum],0" - : [asum] "=Q"(asum),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) - : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - - return asum; +#if defined(DOUBLE) +#define ABS fabs +#else +#define ABS fabsf +#endif + + + + +static FLOAT dasum_kernel_32(BLASLONG n, FLOAT *x) { + FLOAT asum ; + __asm__ ( + "pfd 1, 0(%[ptr_x]) \n\t" + "sllg %%r0,%[n],3 \n\t" + "agr %%r0,%[ptr_x] \n\t" + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v2 \n\t" + "vzero %%v3 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 1, 256(%[ptr_temp] ) \n\t" + "vlm %%v24,%%v31, 0(%[ptr_temp] ) \n\t" + + "vflpdb %%v24, %%v24 \n\t" + "vflpdb %%v25, %%v25 \n\t" + "vflpdb %%v26, %%v26 \n\t" + "vflpdb %%v27, %%v27 \n\t" + "vflpdb %%v28, %%v28 \n\t" + "vflpdb %%v29, %%v29 \n\t" + "vflpdb %%v30, %%v30 \n\t" + "vflpdb %%v31, %%v31 \n\t" + + "vfadb %%v0,%%v0,%%v24 \n\t" + "vfadb %%v1,%%v1,%%v25 \n\t" + "vfadb %%v2,%%v2,%%v26 \n\t" + "vfadb %%v3,%%v3,%%v27 \n\t" + "vfadb %%v0,%%v0,%%v28 \n\t" + "vfadb %%v1,%%v1,%%v29 \n\t" + "vfadb %%v2,%%v2,%%v30 \n\t" + "vfadb %%v3,%%v3,%%v31 \n\t" + + "vlm %%v24,%%v31, 128(%[ptr_temp]) \n\t" + + "vflpdb %%v24, %%v24 \n\t" + "vflpdb %%v25, %%v25 \n\t" + "vflpdb %%v26, %%v26 \n\t" + "vflpdb %%v27, %%v27 \n\t" + "vflpdb %%v28, %%v28 \n\t" + "vflpdb %%v29, %%v29 \n\t" + "vflpdb %%v30, %%v30 \n\t" + "vflpdb %%v31, %%v31 \n\t" + "la %[ptr_temp],256(%[ptr_temp]) \n\t" + "vfadb %%v0,%%v0,%%v24 \n\t" + "vfadb %%v1,%%v1,%%v25 \n\t" + "vfadb %%v2,%%v2,%%v26 \n\t" + "vfadb %%v3,%%v3,%%v27 \n\t" + "vfadb %%v0,%%v0,%%v28 \n\t" + "vfadb %%v1,%%v1,%%v29 \n\t" + "vfadb %%v2,%%v2,%%v30 \n\t" + "vfadb %%v3,%%v3,%%v31 \n\t" + + "clgrjl %[ptr_temp],%%r0,1b \n\t" + "vfadb %%v24,%%v0,%%v1 \n\t" + "vfadb %%v25,%%v2,%%v3 \n\t" + "vfadb %%v0,%%v25,%%v24 \n\t" + "vrepg %%v1,%%v0,1 \n\t" + "adbr %%f0,%%f1 \n\t" + "ldr %[asum],%%f0 \n\t" + : [asum] "=f"(asum),[ptr_temp] "+&a"(x) + : [mem] "m"( *(const double (*)[n])x ), [n] "r"(n), [ptr_x] "a"(x) + : "cc", "r0" ,"f0","f1","v0","v1","v2","v3","v24","v25","v26","v27","v28","v29","v30","v31" + ); + return asum; + } + + + FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT sumf = 0.0; - BLASLONG n1; + BLASLONG i = 0; + BLASLONG j = 0; + FLOAT sumf = 0.0; + BLASLONG n1; - if (n <= 0 || inc_x <= 0) - return sumf; + if (n <= 0 || inc_x <= 0) return sumf; - if (inc_x == 1) { + if (inc_x == 1) { - n1 = n & -32; + n1 = n & -32; + + if (n1 > 0) { - if (n1 > 0) { + sumf = dasum_kernel_32(n1, x); + i = n1; + } - sumf = dasum_kernel_32(n1, x); - i = n1; - } + while (i < n) { + sumf += ABS(x[i]); + i++; + } - while (i < n) { - sumf += ABS(x[i]); - i++; - } + } else { + BLASLONG n1 = n & -4; + register FLOAT sum1, sum2; + sum1 = 0.0; + sum2 = 0.0; + while (j < n1) { - } else { - BLASLONG n1 = n & -4; - register FLOAT sum1, sum2; - sum1 = 0.0; - sum2 = 0.0; - while (j < n1) { + sum1 += ABS(x[i]); + sum2 += ABS(x[i + inc_x]); + sum1 += ABS(x[i + 2 * inc_x]); + sum2 += ABS(x[i + 3 * inc_x]); - sum1 += ABS(x[i]); - sum2 += ABS(x[i + inc_x]); - sum1 += ABS(x[i + 2 * inc_x]); - sum2 += ABS(x[i + 3 * inc_x]); + i += inc_x * 4; + j += 4; - i += inc_x * 4; - j += 4; + } + sumf = sum1 + sum2; + while (j < n) { - } - sumf = sum1 + sum2; - while (j < n) { + sumf += ABS(x[i]); + i += inc_x; + j++; + } - sumf += ABS(x[i]); - i += inc_x; - j++; - } - } - return sumf; + } + return sumf; } + + diff --git a/kernel/zarch/daxpy.c b/kernel/zarch/daxpy.c index 5b0208c20..16f82a587 100644 --- a/kernel/zarch/daxpy.c +++ b/kernel/zarch/daxpy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project +Copyright (c) 2013-2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,143 +25,159 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ + #include "common.h" -static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { - __asm__("vlrepg %%v0,%[alpha]\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,0(%%r1,%[y])\n\t" - "vl %%v21,16(%%r1,%[y])\n\t" - "vl %%v22,32(%%r1,%[y])\n\t" - "vl %%v23,48(%%r1,%[y])\n\t" - "vl %%v24,64(%%r1,%[x])\n\t" - "vl %%v25,80(%%r1,%[x])\n\t" - "vl %%v26,96(%%r1,%[x])\n\t" - "vl %%v27,112(%%r1,%[x])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" - "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" - "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" - "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" - "vfmadb %%v19,%%v0,%%v19,%%v23\n\t" - "vfmadb %%v24,%%v0,%%v24,%%v28\n\t" - "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" - "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" - "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,0(%%r1,%[y])\n\t" - "vst %%v17,16(%%r1,%[y])\n\t" - "vst %%v18,32(%%r1,%[y])\n\t" - "vst %%v19,48(%%r1,%[y])\n\t" - "vst %%v24,64(%%r1,%[y])\n\t" - "vst %%v25,80(%%r1,%[y])\n\t" - "vst %%v26,96(%%r1,%[y])\n\t" - "vst %%v27,112(%%r1,%[y])\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,128(%%r1,%[y])\n\t" - "vl %%v21,144(%%r1,%[y])\n\t" - "vl %%v22,160(%%r1,%[y])\n\t" - "vl %%v23,176(%%r1,%[y])\n\t" - "vl %%v24,192(%%r1,%[x])\n\t" - "vl %%v25,208(%%r1,%[x])\n\t" - "vl %%v26,224(%%r1,%[x])\n\t" - "vl %%v27,240(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[y])\n\t" - "vl %%v29,208(%%r1,%[y])\n\t" - "vl %%v30,224(%%r1,%[y])\n\t" - "vl %%v31,240(%%r1,%[y])\n\t" - "vfmadb %%v16,%%v0,%%v16,%%v20\n\t" - "vfmadb %%v17,%%v0,%%v17,%%v21\n\t" - "vfmadb %%v18,%%v0,%%v18,%%v22\n\t" - "vfmadb %%v19,%%v0,%%v19,%%v23\n\t" - "vfmadb %%v24,%%v0,%%v24,%%v28\n\t" - "vfmadb %%v25,%%v0,%%v25,%%v29\n\t" - "vfmadb %%v26,%%v0,%%v26,%%v30\n\t" - "vfmadb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,128(%%r1,%[y])\n\t" - "vst %%v17,144(%%r1,%[y])\n\t" - "vst %%v18,160(%%r1,%[y])\n\t" - "vst %%v19,176(%%r1,%[y])\n\t" - "vst %%v24,192(%%r1,%[y])\n\t" - "vst %%v25,208(%%r1,%[y])\n\t" - "vst %%v26,224(%%r1,%[y])\n\t" - "vst %%v27,240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), - [alpha] "Q"(*alpha) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); +#define PREFETCH_INS 1 +#if defined(Z13_A) +#include + +static void daxpy_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) +{ + BLASLONG i = 0; + __vector double v_a = {alpha,alpha}; + __vector double * v_y=(__vector double *)y; + __vector double * v_x=(__vector double *)x; + + for(; i 0) { - dcopy_kernel_32(n1, x, y); - i = n1; - } + BLASLONG n1 = n & -32; + if (n1 > 0) { + dcopy_kernel_32(n1, x, y); + i = n1; + } - while (i < n) { - y[i] = x[i]; - i++; + while (i < n) { + y[i] = x[i]; + i++; - } + } + + + } else { + + BLASLONG n1 = n & -4; - } else { + while (i < n1) { - while (i < n) { + y[iy] = x[ix]; + y[iy + inc_y] = x[ix + inc_x]; + y[iy + 2 * inc_y] = x[ix + 2 * inc_x]; + y[iy + 3 * inc_y] = x[ix + 3 * inc_x]; - y[iy] = x[ix]; - ix += inc_x; - iy += inc_y; - i++; + ix += inc_x * 4; + iy += inc_y * 4; + i += 4; + + } + + while (i < n) { + + y[iy] = x[ix]; + ix += inc_x; + iy += inc_y; + i++; + + } } + return 0; - } - return 0; } + + diff --git a/kernel/zarch/ddot.c b/kernel/zarch/ddot.c index 9cad68f4b..c70cbd00d 100644 --- a/kernel/zarch/ddot.c +++ b/kernel/zarch/ddot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project +Copyright (c) 2013-2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,129 +25,184 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ + #include "common.h" -static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { - FLOAT dot; - - __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "pfd 1,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[y])\n\t" - "vl %%v25,16(%%r1,%[y])\n\t" - "vl %%v26,32(%%r1,%[y])\n\t" - "vl %%v27,48(%%r1,%[y])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" - "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" - "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" - "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" - "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v0,%%v0,%%v1\n\t" - "vfadb %%v0,%%v0,%%v2\n\t" - "vfadb %%v0,%%v0,%%v3\n\t" - "vfadb %%v0,%%v0,%%v4\n\t" - "vfadb %%v0,%%v0,%%v5\n\t" - "vfadb %%v0,%%v0,%%v6\n\t" - "vfadb %%v0,%%v0,%%v7\n\t" - "vrepg %%v1,%%v0,1\n\t" - "adbr %%f0,%%f1\n\t" - "ldr %[dot],%%f0" - : [dot] "=f"(dot),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); - - return dot; + +#if defined(Z13) +static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) +{ + FLOAT dot; + __asm__ volatile( + "pfd 1, 0(%[ptr_x_tmp]) \n\t" + "pfd 1, 0(%[ptr_y_tmp]) \n\t" + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "srlg %[n_tmp],%[n_tmp],4 \n\t" + "xgr %%r1,%%r1 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t" + "pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t" + "vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t" + "vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t" + "vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t" + "vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t" + + "vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t" + "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" + "vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t" + "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t" + + "vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t" + "vfmadb %%v26,%%v18,%%v30,%%v26 \n\t" + "vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t" + "vfmadb %%v27,%%v19,%%v31,%%v27 \n\t" + + "vl %%v16, 64(%%r1 ,%[ptr_x_tmp]) \n\t" + "vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t" + "vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t" + "vl %%v19, 112(%%r1,%[ptr_x_tmp]) \n\t" + + "vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t" + "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" + "vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t" + "vfmadb %%v25,%%v17,%%v29,%%v25 \n\t" + + + "vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t" + "vfmadb %%v26,%%v18,%%v30,%%v26 \n\t" + "vl %%v31, 112(%%r1,%[ptr_y_tmp]) \n\t" + "vfmadb %%v27,%%v19,%%v31,%%v27 \n\t" + + + "la %%r1,128(%%r1) \n\t" + "brctg %[n_tmp],1b \n\t" + "vfadb %%v24,%%v25,%%v24 \n\t" + "vfadb %%v24,%%v26,%%v24 \n\t" + "vfadb %%v24,%%v27,%%v24 \n\t" + "vrepg %%v1,%%v24,1 \n\t" + "vfadb %%v1,%%v24,%%v1 \n\t" + "ldr %[dot], %%f1 \n\t" + : [dot] "=f"(dot) ,[n_tmp] "+&r"(n) + : [mem_x] "m"( *(const double (*)[n])x), + [mem_y] "m"( *(const double (*)[n])y), + [ptr_x_tmp]"a"(x), [ptr_y_tmp] "a"(y) + :"cc" , "r1","f1","v16", "v17","v18","v19","v20","v21","v22","v23", + "v24","v25","v26","v27","v28","v29","v30","v31" + + ); + return dot; + } -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - FLOAT dot = 0.0; +#else + +static FLOAT ddot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y ) +{ + BLASLONG register i = 0; + FLOAT dot = 0.0; + + while(i < n) + { + dot += y[i] * x[i] + + y[i+1] * x[i+1] + + y[i+2] * x[i+2] + + y[i+3] * x[i+3] + + y[i+4] * x[i+4] + + y[i+5] * x[i+5] + + y[i+6] * x[i+6] + + y[i+7] * x[i+7] ; + dot += y[i+8] * x[i+8] + + y[i+9] * x[i+9] + + y[i+10] * x[i+10] + + y[i+11] * x[i+11] + + y[i+12] * x[i+12] + + y[i+13] * x[i+13] + + y[i+14] * x[i+14] + + y[i+15] * x[i+15] ; + + + i+=16 ; + + } + return dot; + +} - if (n <= 0) - return (dot); +#endif - if ((inc_x == 1) && (inc_y == 1)) { +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; - BLASLONG n1 = n & -16; + FLOAT dot = 0.0 ; - if (n1) - dot = ddot_kernel_16(n1, x, y); + if ( n <= 0 ) return(dot); - i = n1; - while (i < n) { + if ( (inc_x == 1) && (inc_y == 1) ) + { - dot += y[i] * x[i]; - i++; + BLASLONG n1 = n & -16; + + if ( n1 ){ + dot = ddot_kernel_16(n1, x, y ); + i = n1; + } - } - return (dot); + + while(i < n) + { - } + dot += y[i] * x[i] ; + i++ ; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; + } + return(dot); - BLASLONG n1 = n & -4; - while (i < n1) { + } - FLOAT m1 = y[iy] * x[ix]; - FLOAT m2 = y[iy + inc_y] * x[ix + inc_x]; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; - FLOAT m3 = y[iy + 2 * inc_y] * x[ix + 2 * inc_x]; - FLOAT m4 = y[iy + 3 * inc_y] * x[ix + 3 * inc_x]; + BLASLONG n1 = n & -4; - ix += inc_x * 4; - iy += inc_y * 4; + while(i < n1) + { - temp1 += m1 + m3; - temp2 += m2 + m4; + FLOAT m1 = y[iy] * x[ix] ; + FLOAT m2 = y[iy+inc_y] * x[ix+inc_x] ; - i += 4; + FLOAT m3 = y[iy+2*inc_y] * x[ix+2*inc_x] ; + FLOAT m4 = y[iy+3*inc_y] * x[ix+3*inc_x] ; - } + ix += inc_x*4 ; + iy += inc_y*4 ; - while (i < n) { + temp1 += m1+m3; + temp2 += m2+m4; - temp1 += y[iy] * x[ix]; - ix += inc_x; - iy += inc_y; - i++; + i+=4 ; - } - dot = temp1 + temp2; - return (dot); + } + + while(i < n) + { + + temp1 += y[iy] * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + dot = temp1 + temp2; + return(dot); } + + diff --git a/kernel/zarch/dgemv_n_4.c b/kernel/zarch/dgemv_n_4.c index 502ba837e..bb202e754 100644 --- a/kernel/zarch/dgemv_n_4.c +++ b/kernel/zarch/dgemv_n_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project +Copyright (c) 2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,593 +25,461 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ + #include "common.h" #define NBMAX 2048 -static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, - FLOAT *alpha) { - register FLOAT *ap0 = ap[0]; - register FLOAT *ap1 = ap[1]; - register FLOAT *ap2 = ap[2]; - register FLOAT *ap3 = ap[3]; - - __asm__("vlrepg %%v0,0(%[x])\n\t" - "vlrepg %%v1,8(%[x])\n\t" - "vlrepg %%v2,16(%[x])\n\t" - "vlrepg %%v3,24(%[x])\n\t" - "vlrepg %%v4,%[alpha]\n\t" - "vfmdb %%v0,%%v0,%%v4\n\t" - "vfmdb %%v1,%%v1,%%v4\n\t" - "vfmdb %%v2,%%v2,%%v4\n\t" - "vfmdb %%v3,%%v3,%%v4\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-16\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,4\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,0(%%r1,%[ap2])\n\t" - "vl %%v19,0(%%r1,%[ap3])\n\t" - "vl %%v20,16(%%r1,%[ap0])\n\t" - "vl %%v21,16(%%r1,%[ap1])\n\t" - "vl %%v22,16(%%r1,%[ap2])\n\t" - "vl %%v23,16(%%r1,%[ap3])\n\t" - "vl %%v24,32(%%r1,%[ap0])\n\t" - "vl %%v25,32(%%r1,%[ap1])\n\t" - "vl %%v26,32(%%r1,%[ap2])\n\t" - "vl %%v27,32(%%r1,%[ap3])\n\t" - "vl %%v28,48(%%r1,%[ap0])\n\t" - "vl %%v29,48(%%r1,%[ap1])\n\t" - "vl %%v30,48(%%r1,%[ap2])\n\t" - "vl %%v31,48(%%r1,%[ap3])\n\t" - "vl %%v4,0(%%r1,%[y])\n\t" - "vl %%v5,16(%%r1,%[y])\n\t" - "vl %%v6,32(%%r1,%[y])\n\t" - "vl %%v7,48(%%r1,%[y])\n\t" - "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" - "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" - "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" - "vfmadb %%v7,%%v28,%%v0,%%v7\n\t" - "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" - "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" - "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" - "vfmadb %%v7,%%v29,%%v1,%%v7\n\t" - "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" - "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" - "vfmadb %%v6,%%v26,%%v2,%%v6\n\t" - "vfmadb %%v7,%%v30,%%v2,%%v7\n\t" - "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" - "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" - "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" - "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,0(%%r1,%[y])\n\t" - "vst %%v5,16(%%r1,%[y])\n\t" - "vst %%v6,32(%%r1,%[y])\n\t" - "vst %%v7,48(%%r1,%[y])\n\t" - "vl %%v16,64(%%r1,%[ap0])\n\t" - "vl %%v17,64(%%r1,%[ap1])\n\t" - "vl %%v18,64(%%r1,%[ap2])\n\t" - "vl %%v19,64(%%r1,%[ap3])\n\t" - "vl %%v20,80(%%r1,%[ap0])\n\t" - "vl %%v21,80(%%r1,%[ap1])\n\t" - "vl %%v22,80(%%r1,%[ap2])\n\t" - "vl %%v23,80(%%r1,%[ap3])\n\t" - "vl %%v24,96(%%r1,%[ap0])\n\t" - "vl %%v25,96(%%r1,%[ap1])\n\t" - "vl %%v26,96(%%r1,%[ap2])\n\t" - "vl %%v27,96(%%r1,%[ap3])\n\t" - "vl %%v28,112(%%r1,%[ap0])\n\t" - "vl %%v29,112(%%r1,%[ap1])\n\t" - "vl %%v30,112(%%r1,%[ap2])\n\t" - "vl %%v31,112(%%r1,%[ap3])\n\t" - "vl %%v4,64(%%r1,%[y])\n\t" - "vl %%v5,80(%%r1,%[y])\n\t" - "vl %%v6,96(%%r1,%[y])\n\t" - "vl %%v7,112(%%r1,%[y])\n\t" - "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" - "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" - "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" - "vfmadb %%v7,%%v28,%%v0,%%v7\n\t" - "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" - "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" - "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" - "vfmadb %%v7,%%v29,%%v1,%%v7\n\t" - "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" - "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" - "vfmadb %%v6,%%v26,%%v2,%%v6\n\t" - "vfmadb %%v7,%%v30,%%v2,%%v7\n\t" - "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" - "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" - "vfmadb %%v6,%%v27,%%v3,%%v6\n\t" - "vfmadb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,64(%%r1,%[y])\n\t" - "vst %%v5,80(%%r1,%[y])\n\t" - "vst %%v6,96(%%r1,%[y])\n\t" - "vst %%v7,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,12\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,0(%%r1,%[ap2])\n\t" - "vl %%v19,0(%%r1,%[ap3])\n\t" - "vl %%v20,16(%%r1,%[ap0])\n\t" - "vl %%v21,16(%%r1,%[ap1])\n\t" - "vl %%v22,16(%%r1,%[ap2])\n\t" - "vl %%v23,16(%%r1,%[ap3])\n\t" - "vl %%v4,0(%%r1,%[y])\n\t" - "vl %%v5,16(%%r1,%[y])\n\t" - "vfmadb %%v4,%%v16,%%v0,%%v4\n\t" - "vfmadb %%v5,%%v20,%%v0,%%v5\n\t" - "vfmadb %%v4,%%v17,%%v1,%%v4\n\t" - "vfmadb %%v5,%%v21,%%v1,%%v5\n\t" - "vfmadb %%v4,%%v18,%%v2,%%v4\n\t" - "vfmadb %%v5,%%v22,%%v2,%%v5\n\t" - "vfmadb %%v4,%%v19,%%v3,%%v4\n\t" - "vfmadb %%v5,%%v23,%%v3,%%v5\n\t" - "vst %%v4,0(%%r1,%[y])\n\t" - "vst %%v5,16(%%r1,%[y])\n\t" - "agfi %%r1,32\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(struct { FLOAT x[n]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), - "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), - "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), - [n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); +#define HAVE_KERNEL_4x4_VEC 1 +#define HAVE_KERNEL_4x2_VEC 1 +#define HAVE_KERNEL_4x1_VEC 1 + +#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) + #include +#endif + +#ifdef HAVE_KERNEL_4x4 + +#elif HAVE_KERNEL_4x4_VEC + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT x0,x1,x2,x3; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + x2 = xo[2] * *alpha; + x3 = xo[3] * *alpha; + __vector double v_x0 = {x0,x0}; + __vector double v_x1 = {x1,x1}; + __vector double v_x2 = {x2,x2}; + __vector double v_x3 = {x3,x3}; + __vector double* v_y =(__vector double*)y; + __vector double* va0 = (__vector double*)ap[0]; + __vector double* va1 = (__vector double*)ap[1]; + __vector double* va2 = (__vector double*)ap[2]; + __vector double* va3 = (__vector double*)ap[3]; + + for ( i=0; i< n/2; i+=2 ) + { + v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] + v_x2 * va2[i] + v_x3 * va3[i] ; + v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] + v_x2 * va2[i+1] + v_x3 * va3[i+1] ; + } } -static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, - FLOAT *alpha) { - register FLOAT *ap0 = ap[0]; - register FLOAT *ap1 = ap[1]; - - __asm__("vlrepg %%v0,0(%[x])\n\t" - "vlrepg %%v1,8(%[x])\n\t" - "vlrepg %%v2,%[alpha]\n\t" - "vfmdb %%v0,%%v0,%%v2\n\t" - "vfmdb %%v1,%%v1,%%v2\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-16\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,4\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,16(%%r1,%[ap0])\n\t" - "vl %%v19,16(%%r1,%[ap1])\n\t" - "vl %%v20,32(%%r1,%[ap0])\n\t" - "vl %%v21,32(%%r1,%[ap1])\n\t" - "vl %%v22,48(%%r1,%[ap0])\n\t" - "vl %%v23,48(%%r1,%[ap1])\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" - "vl %%v26,80(%%r1,%[ap0])\n\t" - "vl %%v27,80(%%r1,%[ap1])\n\t" - "vl %%v28,96(%%r1,%[ap0])\n\t" - "vl %%v29,96(%%r1,%[ap1])\n\t" - "vl %%v30,112(%%r1,%[ap0])\n\t" - "vl %%v31,112(%%r1,%[ap1])\n\t" - "vl %%v2,0(%%r1,%[y])\n\t" - "vl %%v3,16(%%r1,%[y])\n\t" - "vl %%v4,32(%%r1,%[y])\n\t" - "vl %%v5,48(%%r1,%[y])\n\t" - "vl %%v6,64(%%r1,%[y])\n\t" - "vl %%v7,80(%%r1,%[y])\n\t" - "vl %%v8,96(%%r1,%[y])\n\t" - "vl %%v9,112(%%r1,%[y])\n\t" - "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" - "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" - "vfmadb %%v4,%%v20,%%v0,%%v4\n\t" - "vfmadb %%v5,%%v22,%%v0,%%v5\n\t" - "vfmadb %%v6,%%v24,%%v0,%%v6\n\t" - "vfmadb %%v7,%%v26,%%v0,%%v7\n\t" - "vfmadb %%v8,%%v28,%%v0,%%v8\n\t" - "vfmadb %%v9,%%v30,%%v0,%%v9\n\t" - "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" - "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" - "vfmadb %%v4,%%v21,%%v1,%%v4\n\t" - "vfmadb %%v5,%%v23,%%v1,%%v5\n\t" - "vfmadb %%v6,%%v25,%%v1,%%v6\n\t" - "vfmadb %%v7,%%v27,%%v1,%%v7\n\t" - "vfmadb %%v8,%%v29,%%v1,%%v8\n\t" - "vfmadb %%v9,%%v31,%%v1,%%v9\n\t" - "vst %%v2,0(%%r1,%[y])\n\t" - "vst %%v3,16(%%r1,%[y])\n\t" - "vst %%v4,32(%%r1,%[y])\n\t" - "vst %%v5,48(%%r1,%[y])\n\t" - "vst %%v6,64(%%r1,%[y])\n\t" - "vst %%v7,80(%%r1,%[y])\n\t" - "vst %%v8,96(%%r1,%[y])\n\t" - "vst %%v9,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,12\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,16(%%r1,%[ap0])\n\t" - "vl %%v19,16(%%r1,%[ap1])\n\t" - "vl %%v2,0(%%r1,%[y])\n\t" - "vl %%v3,16(%%r1,%[y])\n\t" - "vfmadb %%v2,%%v16,%%v0,%%v2\n\t" - "vfmadb %%v3,%%v18,%%v0,%%v3\n\t" - "vfmadb %%v2,%%v17,%%v1,%%v2\n\t" - "vfmadb %%v3,%%v19,%%v1,%%v3\n\t" - "vst %%v2,0(%%r1,%[y])\n\t" - "vst %%v3,16(%%r1,%[y])\n\t" - "agfi %%r1,32\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(struct { FLOAT x[n]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), - [n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); +#else + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + FLOAT x[4] __attribute__ ((aligned (16))); + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + + for ( i=0; i<4; i++) + x[i] = xo[i] * *alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; + } } -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, - FLOAT *alpha) { - __asm__("vlrepg %%v0,0(%[x])\n\t" - "vlrepg %%v16,%[alpha]\n\t" - "vfmdb %%v0,%%v0,%%v16\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-16\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,4\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[a0])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[a0])\n\t" - "vl %%v17,16(%%r1,%[a0])\n\t" - "vl %%v18,32(%%r1,%[a0])\n\t" - "vl %%v19,48(%%r1,%[a0])\n\t" - "vl %%v20,64(%%r1,%[a0])\n\t" - "vl %%v21,80(%%r1,%[a0])\n\t" - "vl %%v22,96(%%r1,%[a0])\n\t" - "vl %%v23,112(%%r1,%[a0])\n\t" - "vl %%v24,0(%%r1,%[y])\n\t" - "vl %%v25,16(%%r1,%[y])\n\t" - "vl %%v26,32(%%r1,%[y])\n\t" - "vl %%v27,48(%%r1,%[y])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" - "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" - "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" - "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" - "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" - "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" - "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" - "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v24,0(%%r1,%[y])\n\t" - "vst %%v25,16(%%r1,%[y])\n\t" - "vst %%v26,32(%%r1,%[y])\n\t" - "vst %%v27,48(%%r1,%[y])\n\t" - "vst %%v28,64(%%r1,%[y])\n\t" - "vst %%v29,80(%%r1,%[y])\n\t" - "vst %%v30,96(%%r1,%[y])\n\t" - "vst %%v31,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,12\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[a0])\n\t" - "vl %%v17,16(%%r1,%[a0])\n\t" - "vl %%v18,0(%%r1,%[y])\n\t" - "vl %%v19,16(%%r1,%[y])\n\t" - "vfmadb %%v18,%%v16,%%v0,%%v18\n\t" - "vfmadb %%v19,%%v17,%%v0,%%v19\n\t" - "vst %%v18,0(%%r1,%[y])\n\t" - "vst %%v19,16(%%r1,%[y])\n\t" - "agfi %%r1,32\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(struct { FLOAT x[n]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), - "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "Q"(*alpha), - [n] "r"(n) - : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); -} -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest += src[i]; - dest += inc_dest; - } -} +#endif -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, - BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, - FLOAT *buffer) { - BLASLONG i; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - FLOAT *ap[4]; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - BLASLONG lda4 = lda << 2; - FLOAT xbuffer[8], *ybuffer; - - if (m < 1) - return (0); - if (n < 1) - return (0); - - ybuffer = buffer; - - n1 = n >> 2; - n2 = n & 3; - - m3 = m & 3; - m1 = m & -4; - m2 = (m & (NBMAX - 1)) - m3; - - y_ptr = y; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) - break; - NB = m2; - } +#ifdef HAVE_KERNEL_4x2 + +#elif HAVE_KERNEL_4x2_VEC - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if (inc_y != 1) - memset(ybuffer, 0, NB * 8); - else - ybuffer = y_ptr; - - if (inc_x == 1) { - - for (i = 0; i < n1; i++) { - dgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr += 4; - } - - if (n2 & 2) { - dgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha); - a_ptr += lda * 2; - x_ptr += 2; - } - - if (n2 & 1) { - dgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha); - /* a_ptr += lda; - x_ptr += 1; */ - - } - - } else { - - for (i = 0; i < n1; i++) { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - dgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, &alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for (i = 0; i < n2; i++) { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - dgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha); - a_ptr += lda; - - } +static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT x0,x1; + x0 = xo[0] * *alpha; + x1 = xo[1] * *alpha; + __vector double v_x0 = {x0,x0}; + __vector double v_x1 = {x1,x1}; + __vector double* v_y =(__vector double*)y; + __vector double* va0 = (__vector double*)ap[0]; + __vector double* va1 = (__vector double*)ap[1]; + for ( i=0; i< n/2; i+=2 ) + { + v_y[i] += v_x0 * va0[i] + v_x1 * va1[i] ; + v_y[i+1] += v_x0 * va0[i+1] + v_x1 * va1[i+1] ; + } +} +#else + +static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0,*a1; + FLOAT x[4] __attribute__ ((aligned (16))); + a0 = ap[0]; + a1 = ap[1]; + + for ( i=0; i<2; i++) + x[i] = xo[i] * *alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0] + a1[i]*x[1]; + y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1]; + y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1]; + y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1]; } +} - a += NB; - if (inc_y != 1) { - add_y(NB, ybuffer, y_ptr, inc_y); - y_ptr += NB * inc_y; - } else - y_ptr += NB; - } +#endif - if (m3 == 0) - return (0); +#ifdef HAVE_KERNEL_4x1 - if (m3 == 3) { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - if (lda == 3 && inc_x == 1) { +#elif HAVE_KERNEL_4x1_VEC +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + + BLASLONG i; + FLOAT x0; + x0 = xo[0] * *alpha; + __vector double v_x0 = {x0,x0}; + __vector double* v_y =(__vector double*)y; + __vector double* va0 = (__vector double*)ap; - for (i = 0; i < (n & -4); i += 4) { + for ( i=0; i< n/2; i+=2 ) + { + v_y[i] += v_x0 * va0[i] ; + v_y[i+1] += v_x0 * va0[i+1] ; + } + + +} - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; +#else +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) +{ + BLASLONG i; + FLOAT *a0; + FLOAT x[4] __attribute__ ((aligned (16))); + a0 = ap; + + for ( i=0; i<1; i++) + x[i] = xo[i] * *alpha; + + for ( i=0; i< n; i+=4 ) + { + y[i] += a0[i]*x[0]; + y[i+1] += a0[i+1]*x[0]; + y[i+2] += a0[i+2]*x[0]; + y[i+3] += a0[i+3]*x[0]; + } +} - temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; - temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; - temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; - a_ptr += 12; - x_ptr += 4; - } +#endif - for (; i < n; i++) { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += 3; - x_ptr++; - } + - } else { +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) +{ + BLASLONG i; + + for ( i=0; i> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + y_ptr = y; + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; + } + + a_ptr = a; + x_ptr = x; + + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( inc_y != 1 ) + memset(ybuffer,0,NB*8); + else + ybuffer = y_ptr; + + if ( inc_x == 1 ) + { + + + for( i = 0; i < n1 ; i++) + { + dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + x_ptr += 4; + } + + if ( n2 & 2 ) + { + dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); + a_ptr += lda*2; + x_ptr += 2; + } + + + if ( n2 & 1 ) + { + dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); + a_ptr += lda; + x_ptr += 1; + + } + + + } + else + { + + for( i = 0; i < n1 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[1] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + x_ptr += inc_x; + xbuffer[3] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); + ap[0] += lda4; + ap[1] += lda4; + ap[2] += lda4; + ap[3] += lda4; + a_ptr += lda4; + } + + for( i = 0; i < n2 ; i++) + { + xbuffer[0] = x_ptr[0]; + x_ptr += inc_x; + dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); + a_ptr += lda; + + } + + } + + a += NB; + if ( inc_y != 1 ) + { + add_y(NB,ybuffer,y_ptr,inc_y); + y_ptr += NB * inc_y; + } + else + y_ptr += NB ; - } + } + if ( m3 == 0 ) return(0); + + if ( m3 == 3 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + if ( lda == 3 && inc_x ==1 ) + { + + for( i = 0; i < ( n & -4 ); i+=4 ) + { + + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; + temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + + temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; + temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; + temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; + + a_ptr += 12; + x_ptr += 4; + } + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += 3; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + temp2 += a_ptr[2] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + y_ptr += inc_y; + y_ptr[0] += alpha * temp2; + return(0); } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - return (0); - } - - if (m3 == 2) { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - if (lda == 2 && inc_x == 1) { - - for (i = 0; i < (n & -4); i += 4) { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - for (; i < n; i++) { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr++; - } - - } else { - - for (i = 0; i < n; i++) { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - } + + if ( m3 == 2 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + if ( lda == 2 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4) ; i+=4 ) + { + temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; + temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; + temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + a_ptr += 8; + x_ptr += 4; + + } + + + for( ; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += 2; + x_ptr ++; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp0 += a_ptr[0] * x_ptr[0]; + temp1 += a_ptr[1] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + + + } + + } + y_ptr[0] += alpha * temp0; + y_ptr += inc_y; + y_ptr[0] += alpha * temp1; + return(0); } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - return (0); - } - - if (m3 == 1) { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if (lda == 1 && inc_x == 1) { - - for (i = 0; i < (n & -4); i += 4) { - temp += - a_ptr[i] * x_ptr[i] + a_ptr[i + 1] * x_ptr[i + 1] + a_ptr[i + - 2] * - x_ptr[i + 2] + a_ptr[i + 3] * x_ptr[i + 3]; - - } - - for (; i < n; i++) { - temp += a_ptr[i] * x_ptr[i]; - } - - } else { - - for (i = 0; i < n; i++) { - temp += a_ptr[0] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } + if ( m3 == 1 ) + { + a_ptr = a; + x_ptr = x; + FLOAT temp = 0.0; + if ( lda == 1 && inc_x ==1 ) + { + + for( i = 0; i < (n & -4); i+=4 ) + { + temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; + + } + + for( ; i < n; i++ ) + { + temp += a_ptr[i] * x_ptr[i]; + } + + } + else + { + + for( i = 0; i < n; i++ ) + { + temp += a_ptr[0] * x_ptr[0]; + a_ptr += lda; + x_ptr += inc_x; + } + + } + y_ptr[0] += alpha * temp; + return(0); } - y_ptr[0] += alpha * temp; - return (0); - } - return (0); + + return(0); } + + diff --git a/kernel/zarch/dgemv_t_4.c b/kernel/zarch/dgemv_t_4.c index de72a1798..96af0139c 100644 --- a/kernel/zarch/dgemv_t_4.c +++ b/kernel/zarch/dgemv_t_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project +Copyright (c) 2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,736 +25,517 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ + #include "common.h" -#define NBMAX 2048 +#define HAVE_KERNEL_4x4_VEC 1 +#define HAVE_KERNEL_4x2_VEC 1 +#define HAVE_KERNEL_4x1_VEC 1 -static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - register FLOAT *ap0 = ap[0]; - register FLOAT *ap1 = ap[1]; - register FLOAT *ap2 = ap[2]; - register FLOAT *ap3 = ap[3]; - - __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-16\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,4\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2])\n\t" - "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3])\n\t" - "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" - "vl %%v28,16(%%r1,%[ap0])\n\t" - "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" - "vl %%v29,16(%%r1,%[ap1])\n\t" - "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" - "vl %%v30,16(%%r1,%[ap2])\n\t" - "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" - "vl %%v31,16(%%r1,%[ap3])\n\t" - "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" - "vl %%v24,32(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v18,%%v24,%%v0\n\t" - "vl %%v25,32(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v18,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[ap2])\n\t" - "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,32(%%r1,%[ap3])\n\t" - "vfmadb %%v3,%%v18,%%v27,%%v3\n\t" - "vl %%v28,48(%%r1,%[ap0])\n\t" - "vfmadb %%v4,%%v19,%%v28,%%v4\n\t" - "vl %%v29,48(%%r1,%[ap1])\n\t" - "vfmadb %%v5,%%v19,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap2])\n\t" - "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap3])\n\t" - "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" - "vl %%v26,64(%%r1,%[ap2])\n\t" - "vfmadb %%v2,%%v20,%%v26,%%v2\n\t" - "vl %%v27,64(%%r1,%[ap3])\n\t" - "vfmadb %%v3,%%v20,%%v27,%%v3\n\t" - "vl %%v28,80(%%r1,%[ap0])\n\t" - "vfmadb %%v4,%%v21,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[ap1])\n\t" - "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,80(%%r1,%[ap2])\n\t" - "vfmadb %%v6,%%v21,%%v30,%%v6\n\t" - "vl %%v31,80(%%r1,%[ap3])\n\t" - "vfmadb %%v7,%%v21,%%v31,%%v7\n\t" - "vl %%v24,96(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v22,%%v24,%%v0\n\t" - "vl %%v25,96(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v22,%%v25,%%v1\n\t" - "vl %%v26,96(%%r1,%[ap2])\n\t" - "vfmadb %%v2,%%v22,%%v26,%%v2\n\t" - "vl %%v27,96(%%r1,%[ap3])\n\t" - "vfmadb %%v3,%%v22,%%v27,%%v3\n\t" - "vl %%v28,112(%%r1,%[ap0])\n\t" - "vfmadb %%v4,%%v23,%%v28,%%v4\n\t" - "vl %%v29,112(%%r1,%[ap1])\n\t" - "vfmadb %%v5,%%v23,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap2])\n\t" - "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap3])\n\t" - "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,12\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2])\n\t" - "vfmadb %%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3])\n\t" - "vfmadb %%v3,%%v16,%%v27,%%v3\n\t" - "vl %%v28,16(%%r1,%[ap0])\n\t" - "vfmadb %%v4,%%v17,%%v28,%%v4\n\t" - "vl %%v29,16(%%r1,%[ap1])\n\t" - "vfmadb %%v5,%%v17,%%v29,%%v5\n\t" - "vl %%v30,16(%%r1,%[ap2])\n\t" - "vfmadb %%v6,%%v17,%%v30,%%v6\n\t" - "vl %%v31,16(%%r1,%[ap3])\n\t" - "vfmadb %%v7,%%v17,%%v31,%%v7\n\t" - "agfi %%r1,32\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "vfadb %%v0,%%v0,%%v4\n\t" - "vfadb %%v1,%%v1,%%v5\n\t" - "vfadb %%v2,%%v2,%%v6\n\t" - "vfadb %%v3,%%v3,%%v7\n\t" - "vrepg %%v4,%%v0,1\n\t" - "adbr %%f0,%%f4\n\t" - "std %%f0,0(%[y])\n\t" - "vrepg %%v4,%%v1,1\n\t" - "adbr %%f1,%%f4\n\t" - "std %%f1,8(%[y])\n\t" - "vrepg %%v4,%%v2,1\n\t" - "adbr %%f2,%%f4\n\t" - "std %%f2,16(%[y])\n\t" - "vrepg %%v4,%%v3,1\n\t" - "adbr %%f3,%%f4\n\t" - "std %%f3,24(%[y])" - : "=m"(*(struct { FLOAT x[4]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), - "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), - "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); -} +#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) + #include +#endif +#define NBMAX 2048 -static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - register FLOAT *ap0 = ap[0]; - register FLOAT *ap1 = ap[1]; - - __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-16\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,4\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,16(%%r1,%[ap0])\n\t" - "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" - "vl %%v27,16(%%r1,%[ap1])\n\t" - "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" - "vl %%v28,32(%%r1,%[ap0])\n\t" - "vfmadb %%v4,%%v18,%%v28,%%v4\n\t" - "vl %%v29,32(%%r1,%[ap1])\n\t" - "vfmadb %%v5,%%v18,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap0])\n\t" - "vfmadb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap1])\n\t" - "vfmadb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v20,%%v25,%%v1\n\t" - "vl %%v26,80(%%r1,%[ap0])\n\t" - "vfmadb %%v2,%%v21,%%v26,%%v2\n\t" - "vl %%v27,80(%%r1,%[ap1])\n\t" - "vfmadb %%v3,%%v21,%%v27,%%v3\n\t" - "vl %%v28,96(%%r1,%[ap0])\n\t" - "vfmadb %%v4,%%v22,%%v28,%%v4\n\t" - "vl %%v29,96(%%r1,%[ap1])\n\t" - "vfmadb %%v5,%%v22,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap0])\n\t" - "vfmadb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap1])\n\t" - "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,12\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmadb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,16(%%r1,%[ap0])\n\t" - "vfmadb %%v2,%%v17,%%v26,%%v2\n\t" - "vl %%v27,16(%%r1,%[ap1])\n\t" - "vfmadb %%v3,%%v17,%%v27,%%v3\n\t" - "agfi %%r1,32\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "vfadb %%v0,%%v0,%%v2\n\t" - "vfadb %%v0,%%v0,%%v4\n\t" - "vfadb %%v0,%%v0,%%v6\n\t" - "vfadb %%v1,%%v1,%%v3\n\t" - "vfadb %%v1,%%v1,%%v5\n\t" - "vfadb %%v1,%%v1,%%v7\n\t" - "vrepg %%v2,%%v0,1\n\t" - "adbr %%f0,%%f2\n\t" - "std %%f0,0(%[y])\n\t" - "vrepg %%v2,%%v1,1\n\t" - "adbr %%f1,%%f2\n\t" - "std %%f1,8(%[y])" - : "=m"(*(struct { FLOAT x[2]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); +#ifdef HAVE_KERNEL_4x4 + +#elif HAVE_KERNEL_4x4_VEC + +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + __vector double* va0 = (__vector double*)ap[0]; + __vector double* va1 = (__vector double*)ap[1]; + __vector double* va2 = (__vector double*)ap[2]; + __vector double* va3 = (__vector double*)ap[3]; + __vector double* v_x =(__vector double*)x; + __vector double temp0 = {0,0}; + __vector double temp1 = {0,0}; + __vector double temp2 = {0,0}; + __vector double temp3 = {0,0}; + + for ( i=0; i< n/2; i+=2 ) + { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; + temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1] ; + temp2 += v_x[i] * va2[i] + v_x[i+1] * va2[i+1] ; + temp3 += v_x[i] * va3[i] + v_x[i+1] * va3[i+1] ; + } + + y[0] = temp0[0] + temp0[1]; + y[1] = temp1[0] + temp1[1]; + y[2] = temp2[0] + temp2[1]; + y[3] = temp3[0] + temp3[1];; } - -static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { - __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-16\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,4\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[a0])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[a0])\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,16(%%r1,%[a0])\n\t" - "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[a0])\n\t" - "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,48(%%r1,%[a0])\n\t" - "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" - "vl %%v28,64(%%r1,%[a0])\n\t" - "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[a0])\n\t" - "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,96(%%r1,%[a0])\n\t" - "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[a0])\n\t" - "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,12\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[a0])\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,16(%%r1,%[a0])\n\t" - "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" - "agfi %%r1,32\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "vfadb %%v0,%%v0,%%v1\n\t" - "vfadb %%v0,%%v0,%%v2\n\t" - "vfadb %%v0,%%v0,%%v3\n\t" - "vfadb %%v0,%%v0,%%v4\n\t" - "vfadb %%v0,%%v0,%%v5\n\t" - "vfadb %%v0,%%v0,%%v6\n\t" - "vfadb %%v0,%%v0,%%v7\n\t" - "vrepg %%v1,%%v0,1\n\t" - "adbr %%f0,%%f1\n\t" - "std %%f0,0(%[y])" - : "=m"(*(FLOAT (*)[1]) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), - "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); +#else +static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + FLOAT *a0,*a1,*a2,*a3; + a0 = ap[0]; + a1 = ap[1]; + a2 = ap[2]; + a3 = ap[3]; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + FLOAT temp2 = 0.0; + FLOAT temp3 = 0.0; + + for ( i=0; i< n; i+=4 ) + { + temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; + temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; + temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; + } + y[0] = temp0; + y[1] = temp1; + y[2] = temp2; + y[3] = temp3; } - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - dest[i] = *src; - src += inc_src; - } + +#endif + +#ifdef HAVE_KERNEL_4x2 + +#elif HAVE_KERNEL_4x2_VEC + +static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + __vector double* va0 = (__vector double*)ap[0]; + __vector double* va1 = (__vector double*)ap[1]; + __vector double* v_x =(__vector double*)x; + __vector double temp0 = {0,0}; + __vector double temp1 = {0,0}; + + for ( i=0; i< n/2; i+=2 ) + { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; + temp1 += v_x[i] * va1[i] + v_x[i+1] * va1[i+1] ; + } + + y[0] = temp0[0] + temp0[1]; + y[1] = temp1[0] + temp1[1]; } +#else +static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) +{ -static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { - __asm__("vlrepg %%v0,%[da]\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-16\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,4\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[src])\n\t" - "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v18,32(%%r1,%[src])\n\t" - "vl %%v19,48(%%r1,%[src])\n\t" - "vl %%v20,64(%%r1,%[src])\n\t" - "vl %%v21,80(%%r1,%[src])\n\t" - "vl %%v22,96(%%r1,%[src])\n\t" - "vl %%v23,112(%%r1,%[src])\n\t" - "vl %%v24, 0(%%r1,%[dest])\n\t" - "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vst %%v24, 0(%%r1,%[dest])\n\t" - "vl %%v25, 16(%%r1,%[dest])\n\t" - "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" - "vst %%v25, 16(%%r1,%[dest])\n\t" - "vl %%v26, 32(%%r1,%[dest])\n\t" - "vfmadb %%v26,%%v18,%%v0,%%v26\n\t" - "vst %%v26, 32(%%r1,%[dest])\n\t" - "vl %%v27, 48(%%r1,%[dest])\n\t" - "vfmadb %%v27,%%v19,%%v0,%%v27\n\t" - "vst %%v27, 48(%%r1,%[dest])\n\t" - "vl %%v28, 64(%%r1,%[dest])\n\t" - "vfmadb %%v28,%%v20,%%v0,%%v28\n\t" - "vst %%v28, 64(%%r1,%[dest])\n\t" - "vl %%v29, 80(%%r1,%[dest])\n\t" - "vfmadb %%v29,%%v21,%%v0,%%v29\n\t" - "vst %%v29, 80(%%r1,%[dest])\n\t" - "vl %%v30, 96(%%r1,%[dest])\n\t" - "vfmadb %%v30,%%v22,%%v0,%%v30\n\t" - "vst %%v30, 96(%%r1,%[dest])\n\t" - "vl %%v31, 112(%%r1,%[dest])\n\t" - "vfmadb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v31, 112(%%r1,%[dest])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,12\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v24, 0(%%r1,%[dest])\n\t" - "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vst %%v24, 0(%%r1,%[dest])\n\t" - "vl %%v25, 16(%%r1,%[dest])\n\t" - "vfmadb %%v25,%%v17,%%v0,%%v25\n\t" - "vst %%v25, 16(%%r1,%[dest])\n\t" - "agfi %%r1,32\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(struct { FLOAT x[n]; } *) dest) - : [dest] "a"(dest),[da] "Q"(da), "m"(*(const struct { FLOAT x[n]; } *) src), - [src] "a"(src),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); -} -static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, - BLASLONG inc_dest) { - if (inc_dest == 1) - add_y_kernel_4(n, da, src, dest); - else { BLASLONG i; - for (i = 0; i < n; i++) { - *dest += src[i] * da; - dest += inc_dest; + FLOAT *a0,*a1; + a0 = ap[0]; + a1 = ap[1]; + FLOAT temp0 = 0.0; + FLOAT temp1 = 0.0; + + for ( i=0; i< n; i+=4 ) + { + temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; } - } + y[0] = temp0; + y[1] = temp1; + } +#endif -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, - BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, - FLOAT *buffer) { - BLASLONG register i; - BLASLONG register j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - BLASLONG n0; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - FLOAT ybuffer[2] __attribute__ ((aligned(16))); - FLOAT *xbuffer; - FLOAT *ytemp; - - if (m < 1) - return (0); - if (n < 1) - return (0); - - xbuffer = buffer; - ytemp = buffer + (m < NBMAX ? m : NBMAX); - - n0 = n / NBMAX; - n1 = (n % NBMAX) >> 2; - n2 = n & 3; - - m3 = m & 3; - m1 = m & -4; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) - break; - NB = m2; - } +#ifdef HAVE_KERNEL_4x1 - y_ptr = y; - a_ptr = a; - x_ptr = x; +#elif HAVE_KERNEL_4x1_VEC - if (inc_x == 1) - xbuffer = x_ptr; - else - copy_x(NB, x_ptr, xbuffer, inc_x); +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + __vector double* va0 = (__vector double*)a0; + __vector double* v_x =(__vector double*)x; + __vector double temp0 = {0,0}; - FLOAT *ap[4]; - FLOAT *yp; - BLASLONG register lda4 = 4 * lda; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; + for ( i=0; i< n/2; i+=2 ) + { + temp0 += v_x[i] * va0[i] + v_x[i+1] * va0[i+1] ; + } + + y[0] = temp0[0] + temp0[1]; +} +#else +static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) +{ + BLASLONG i; + + + FLOAT temp0 = 0.0; - if (n0 > 0) { - BLASLONG nb1 = NBMAX / 4; - for (j = 0; j < n0; j++) { + for ( i=0; i< n; i+=4 ) + { + temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; + } + y[0] = temp0; +} +#endif + +static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) +{ + BLASLONG i; + for ( i=0; i> 2 ; + n2 = n & 3 ; + + m3 = m & 3 ; + m1 = m & -4 ; + m2 = (m & (NBMAX-1)) - m3 ; + + + BLASLONG NB = NBMAX; + + while ( NB == NBMAX ) + { + + m1 -= NB; + if ( m1 < 0) + { + if ( m2 == 0 ) break; + NB = m2; } - add_y(nb1 * 4, alpha, ytemp, y_ptr, inc_y); - y_ptr += nb1 * inc_y * 4; - a_ptr += nb1 * lda4; + + y_ptr = y; + a_ptr = a; + x_ptr = x; + + if ( inc_x == 1 ) + xbuffer = x_ptr; + else + copy_x(NB,x_ptr,xbuffer,inc_x); + + + FLOAT *ap[4]; + FLOAT *yp; + BLASLONG register lda4 = 4 * lda; + ap[0] = a_ptr; + ap[1] = a_ptr + lda; + ap[2] = ap[1] + lda; + ap[3] = ap[2] + lda; + + if ( n0 > 0 ) + { + BLASLONG nb1 = NBMAX / 4; + for( j=0; j 0) { - add_y(n1 * 4, alpha, ytemp, y_ptr, inc_y); - y_ptr += n1 * inc_y * 4; - a_ptr += n1 * lda4; - } + for( i = 0; i < n1 ; i++) + { + dgemv_kernel_4x4(NB,ap,xbuffer,yp); + ap[0] += lda4 ; + ap[1] += lda4 ; + ap[2] += lda4 ; + ap[3] += lda4 ; + yp += 4; + } + if ( n1 > 0 ) + { + add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); + y_ptr += n1 * inc_y * 4; + a_ptr += n1 * lda4 ; + } - if (n2 & 2) { + if ( n2 & 2 ) + { - dgemv_kernel_4x2(NB, ap, xbuffer, ybuffer); - a_ptr += lda * 2; - *y_ptr += ybuffer[0] * alpha; - y_ptr += inc_y; - *y_ptr += ybuffer[1] * alpha; - y_ptr += inc_y; + dgemv_kernel_4x2(NB,ap,xbuffer,ybuffer); + a_ptr += lda * 2; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + *y_ptr += ybuffer[1] * alpha; + y_ptr += inc_y; - } + } - if (n2 & 1) { + if ( n2 & 1 ) + { - dgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer); - // a_ptr += lda; - *y_ptr += ybuffer[0] * alpha; - // y_ptr += inc_y; + dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); + a_ptr += lda; + *y_ptr += ybuffer[0] * alpha; + y_ptr += inc_y; + } + a += NB; + x += NB * inc_x; } - a += NB; - x += NB * inc_x; - } - - if (m3 == 0) - return (0); - - x_ptr = x; - a_ptr = a; - if (m3 == 3) { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - FLOAT *aj = a_ptr; - y_ptr = y; + if ( m3 == 0 ) return(0); - if (lda == 3 && inc_y == 1) { + x_ptr = x; + a_ptr = a; + if ( m3 == 3 ) + { + FLOAT xtemp0 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp1 = *x_ptr * alpha; + x_ptr += inc_x; + FLOAT xtemp2 = *x_ptr * alpha; + + FLOAT *aj = a_ptr; + y_ptr = y; + + if ( lda == 3 && inc_y == 1 ) + { + + for ( j=0; j< ( n & -4) ; j+=4 ) + { + + y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; + y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; + y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; + y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; + aj += 12; + } + + for ( ; j 0) { - - maxf = dmax_kernel_32(n1, x); - - i = n1; - } else { - maxf = x[0]; - i++; - } - - while (i < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i++; - } - return (maxf); - - } else { - - maxf = x[0]; - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (x[i] > maxf) { - maxf = x[i]; - } - if (x[i + inc_x] > maxf) { - maxf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] > maxf) { - maxf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] > maxf) { - maxf = x[i + 3 * inc_x]; - } - - i += inc_x * 4; - - j += 4; - - } - - while (j < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i += inc_x; - j++; - } - return (maxf); - } -} diff --git a/kernel/zarch/dmax_z13.c b/kernel/zarch/dmax_z13.c deleted file mode 100644 index c4e8d91f8..000000000 --- a/kernel/zarch/dmax_z13.c +++ /dev/null @@ -1,164 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -static FLOAT dmax_kernel_32(BLASLONG n, FLOAT *x) { - FLOAT max; - - __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfchdb %%v24,%%v16,%%v17\n\t" - "vfchdb %%v25,%%v18,%%v19\n\t" - "vfchdb %%v26,%%v20,%%v21\n\t" - "vfchdb %%v27,%%v22,%%v23\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v24,%%v25\n\t" - "vfchdb %%v29,%%v26,%%v27\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v28,%%v29\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v30,%%v0\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vfchdb %%v24,%%v16,%%v17\n\t" - "vfchdb %%v25,%%v18,%%v19\n\t" - "vfchdb %%v26,%%v20,%%v21\n\t" - "vfchdb %%v27,%%v22,%%v23\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v24,%%v25\n\t" - "vfchdb %%v29,%%v26,%%v27\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v28,%%v29\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v30,%%v0\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfchdb %%v17,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v17\n\t" - "ldr %[max],%%f0" - : [max] "=f"(max),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - - return max; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - - if (n <= 0 || inc_x <= 0) - return (maxf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - - maxf = dmax_kernel_32(n1, x); - - i = n1; - } else { - maxf = x[0]; - i++; - } - - while (i < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i++; - } - return (maxf); - - } else { - - maxf = x[0]; - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (x[i] > maxf) { - maxf = x[i]; - } - if (x[i + inc_x] > maxf) { - maxf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] > maxf) { - maxf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] > maxf) { - maxf = x[i + 3 * inc_x]; - } - - i += inc_x * 4; - - j += 4; - - } - - while (j < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i += inc_x; - j++; - } - return (maxf); - } -} diff --git a/kernel/zarch/dmin.c b/kernel/zarch/dmin.c deleted file mode 100644 index f9b129cbd..000000000 --- a/kernel/zarch/dmin.c +++ /dev/null @@ -1,147 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { - FLOAT min; - - __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfmindb %%v16,%%v16,%%v24,0\n\t" - "vfmindb %%v17,%%v17,%%v25,0\n\t" - "vfmindb %%v18,%%v18,%%v26,0\n\t" - "vfmindb %%v19,%%v19,%%v27,0\n\t" - "vfmindb %%v20,%%v20,%%v28,0\n\t" - "vfmindb %%v21,%%v21,%%v29,0\n\t" - "vfmindb %%v22,%%v22,%%v30,0\n\t" - "vfmindb %%v23,%%v23,%%v31,0\n\t" - "vfmindb %%v16,%%v16,%%v20,0\n\t" - "vfmindb %%v17,%%v17,%%v21,0\n\t" - "vfmindb %%v18,%%v18,%%v22,0\n\t" - "vfmindb %%v19,%%v19,%%v23,0\n\t" - "vfmindb %%v16,%%v16,%%v18,0\n\t" - "vfmindb %%v17,%%v17,%%v19,0\n\t" - "vfmindb %%v16,%%v16,%%v17,0\n\t" - "vfmindb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfmindb %%v0,%%v0,%%v16,0\n\t" - "ldr %[min],%%f0" - : [min] "=f"(min),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - - return min; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - - if (n <= 0 || inc_x <= 0) - return (minf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - - minf = dmin_kernel_32(n1, x); - - i = n1; - } else { - minf = x[0]; - i++; - } - - while (i < n) { - if (x[i] < minf) { - minf = x[i]; - } - i++; - } - return (minf); - - } else { - - minf = x[0]; - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (x[i] < minf) { - minf = x[i]; - } - if (x[i + inc_x] < minf) { - minf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] < minf) { - minf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] < minf) { - minf = x[i + 3 * inc_x]; - } - - i += inc_x * 4; - - j += 4; - - } - - while (j < n) { - if (x[i] < minf) { - minf = x[i]; - } - i += inc_x; - j++; - } - return (minf); - } -} diff --git a/kernel/zarch/dmin_z13.c b/kernel/zarch/dmin_z13.c deleted file mode 100644 index 77f021c1d..000000000 --- a/kernel/zarch/dmin_z13.c +++ /dev/null @@ -1,164 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -static FLOAT dmin_kernel_32(BLASLONG n, FLOAT *x) { - FLOAT min; - - __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfchdb %%v24,%%v17,%%v16\n\t" - "vfchdb %%v25,%%v19,%%v18\n\t" - "vfchdb %%v26,%%v21,%%v20\n\t" - "vfchdb %%v27,%%v23,%%v22\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v25,%%v24\n\t" - "vfchdb %%v29,%%v27,%%v26\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v29,%%v28\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v0,%%v30\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vfchdb %%v24,%%v17,%%v16\n\t" - "vfchdb %%v25,%%v19,%%v18\n\t" - "vfchdb %%v26,%%v21,%%v20\n\t" - "vfchdb %%v27,%%v23,%%v22\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vsel %%v26,%%v20,%%v21,%%v26\n\t" - "vsel %%v27,%%v22,%%v23,%%v27\n\t" - "vfchdb %%v28,%%v25,%%v24\n\t" - "vfchdb %%v29,%%v27,%%v26\n\t" - "vsel %%v28,%%v24,%%v25,%%v28\n\t" - "vsel %%v29,%%v26,%%v27,%%v29\n\t" - "vfchdb %%v30,%%v29,%%v28\n\t" - "vsel %%v30,%%v28,%%v29,%%v30\n\t" - "vfchdb %%v31,%%v0,%%v30\n\t" - "vsel %%v0,%%v30,%%v0,%%v31\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfchdb %%v17,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v17\n\t" - "ldr %[min],%%f0" - : [min] "=f"(min),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - - return min; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - - if (n <= 0 || inc_x <= 0) - return (minf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - - minf = dmin_kernel_32(n1, x); - - i = n1; - } else { - minf = x[0]; - i++; - } - - while (i < n) { - if (x[i] < minf) { - minf = x[i]; - } - i++; - } - return (minf); - - } else { - - minf = x[0]; - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (x[i] < minf) { - minf = x[i]; - } - if (x[i + inc_x] < minf) { - minf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] < minf) { - minf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] < minf) { - minf = x[i + 3 * inc_x]; - } - - i += inc_x * 4; - - j += 4; - - } - - while (j < n) { - if (x[i] < minf) { - minf = x[i]; - } - i += inc_x; - j++; - } - return (minf); - } -} diff --git a/kernel/zarch/drot.c b/kernel/zarch/drot.c index 11fbe15b6..bf29538c7 100644 --- a/kernel/zarch/drot.c +++ b/kernel/zarch/drot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project +Copyright (c) 2013-2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,200 +27,226 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { - __asm__("vlrepg %%v0,%[c]\n\t" - "vlrepg %%v1,%[s]\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x])\n\t" - "vl %%v25, 16(%%r1,%[x])\n\t" - "vl %%v26, 32(%%r1,%[x])\n\t" - "vl %%v27, 48(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[y])\n\t" - "vl %%v17, 16(%%r1,%[y])\n\t" - "vl %%v18, 32(%%r1,%[y])\n\t" - "vl %%v19, 48(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x])\n\t" - "vst %%v29, 16(%%r1,%[x])\n\t" - "vst %%v30, 32(%%r1,%[x])\n\t" - "vst %%v31, 48(%%r1,%[x])\n\t" - "vst %%v20, 0(%%r1,%[y])\n\t" - "vst %%v21, 16(%%r1,%[y])\n\t" - "vst %%v22, 32(%%r1,%[y])\n\t" - "vst %%v23, 48(%%r1,%[y])\n\t" - "vl %%v24, 64(%%r1,%[x])\n\t" - "vl %%v25, 80(%%r1,%[x])\n\t" - "vl %%v26, 96(%%r1,%[x])\n\t" - "vl %%v27, 112(%%r1,%[x])\n\t" - "vl %%v16, 64(%%r1,%[y])\n\t" - "vl %%v17, 80(%%r1,%[y])\n\t" - "vl %%v18, 96(%%r1,%[y])\n\t" - "vl %%v19, 112(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x])\n\t" - "vst %%v29, 80(%%r1,%[x])\n\t" - "vst %%v30, 96(%%r1,%[x])\n\t" - "vst %%v31, 112(%%r1,%[x])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v16, 128(%%r1,%[y])\n\t" - "vl %%v17, 144(%%r1,%[y])\n\t" - "vl %%v18, 160(%%r1,%[y])\n\t" - "vl %%v19, 176(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x])\n\t" - "vst %%v29, 144(%%r1,%[x])\n\t" - "vst %%v30, 160(%%r1,%[x])\n\t" - "vst %%v31, 176(%%r1,%[x])\n\t" - "vst %%v20, 128(%%r1,%[y])\n\t" - "vst %%v21, 144(%%r1,%[y])\n\t" - "vst %%v22, 160(%%r1,%[y])\n\t" - "vst %%v23, 176(%%r1,%[y])\n\t" - "vl %%v24, 192(%%r1,%[x])\n\t" - "vl %%v25, 208(%%r1,%[x])\n\t" - "vl %%v26, 224(%%r1,%[x])\n\t" - "vl %%v27, 240(%%r1,%[x])\n\t" - "vl %%v16, 192(%%r1,%[y])\n\t" - "vl %%v17, 208(%%r1,%[y])\n\t" - "vl %%v18, 224(%%r1,%[y])\n\t" - "vl %%v19, 240(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 192(%%r1,%[x])\n\t" - "vst %%v29, 208(%%r1,%[x])\n\t" - "vst %%v30, 224(%%r1,%[x])\n\t" - "vst %%v31, 240(%%r1,%[x])\n\t" - "vst %%v20, 192(%%r1,%[y])\n\t" - "vst %%v21, 208(%%r1,%[y])\n\t" - "vst %%v22, 224(%%r1,%[y])\n\t" - "vst %%v23, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), - [n] "+&r"(n) - : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); +static void drot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA) +{ + __asm__ ( + "pfd 2, 0(%[ptr_x]) \n\t" + "pfd 2, 0(%[ptr_y]) \n\t" + "lgdr %%r1,%[cos] \n\t" + "vlvgp %%v0,%%r1,%%r1 \n\t" + "lgdr %%r1,%[sin] \n\t" + "vlvgp %%v1,%%r1,%%r1 \n\t" + "srlg %[n_tmp],%[n_tmp],5 \n\t" + "xgr %%r1,%%r1 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 2, 256(%%r1,%[ptr_x]) \n\t" + "pfd 2, 256(%%r1,%[ptr_y]) \n\t" + "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" + "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" + "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" + "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" + "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" + "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" + "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" + "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 0(%%r1,%[ptr_x]) \n\t" + "vst %%v29, 16(%%r1,%[ptr_x]) \n\t" + "vst %%v30, 32(%%r1,%[ptr_x]) \n\t" + "vst %%v31, 48(%%r1,%[ptr_x]) \n\t" + "vst %%v20, 0(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 16(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 32(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 48(%%r1,%[ptr_y]) \n\t" + + "vl %%v24, 64(%%r1,%[ptr_x]) \n\t" + "vl %%v25, 80(%%r1,%[ptr_x]) \n\t" + "vl %%v26, 96(%%r1,%[ptr_x]) \n\t" + "vl %%v27, 112(%%r1,%[ptr_x]) \n\t" + "vl %%v16, 64(%%r1,%[ptr_y]) \n\t" + "vl %%v17, 80(%%r1,%[ptr_y]) \n\t" + "vl %%v18, 96(%%r1,%[ptr_y]) \n\t" + "vl %%v19, 112(%%r1,%[ptr_y]) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 64(%%r1,%[ptr_x]) \n\t" + "vst %%v29, 80(%%r1,%[ptr_x]) \n\t" + "vst %%v30, 96(%%r1,%[ptr_x]) \n\t" + "vst %%v31, 112(%%r1,%[ptr_x]) \n\t" + "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" + + "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" + "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" + "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" + "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" + "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" + "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" + "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" + "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 128(%%r1,%[ptr_x]) \n\t" + "vst %%v29, 144(%%r1,%[ptr_x]) \n\t" + "vst %%v30, 160(%%r1,%[ptr_x]) \n\t" + "vst %%v31, 176(%%r1,%[ptr_x]) \n\t" + "vst %%v20, 128(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 144(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 160(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 176(%%r1,%[ptr_y]) \n\t" + + "vl %%v24, 192(%%r1,%[ptr_x]) \n\t" + "vl %%v25, 208(%%r1,%[ptr_x]) \n\t" + "vl %%v26, 224(%%r1,%[ptr_x]) \n\t" + "vl %%v27, 240(%%r1,%[ptr_x]) \n\t" + "vl %%v16, 192(%%r1,%[ptr_y]) \n\t" + "vl %%v17, 208(%%r1,%[ptr_y]) \n\t" + "vl %%v18, 224(%%r1,%[ptr_y]) \n\t" + "vl %%v19, 240(%%r1,%[ptr_y]) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 192(%%r1,%[ptr_x]) \n\t" + "vst %%v29, 208(%%r1,%[ptr_x]) \n\t" + "vst %%v30, 224(%%r1,%[ptr_x]) \n\t" + "vst %%v31, 240(%%r1,%[ptr_x]) \n\t" + "vst %%v20, 192(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 208(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 224(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 240(%%r1,%[ptr_y]) \n\t" + + "la %%r1,256(%%r1) \n\t" + "brctg %[n_tmp],1b" + : [mem_x] "+m" (*(double (*)[n])x), + [mem_y] "+m" (*(double (*)[n])y), + [n_tmp] "+&r"(n) + : [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA) + : "cc", "r1" ,"v0","v1","v16", + "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + return; + } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, - FLOAT c, FLOAT s) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + FLOAT temp; - FLOAT temp; + if ( n <= 0 ) return(0); - if (n <= 0) - return (0); + if ( (inc_x == 1) && (inc_y == 1) ) + { - if ((inc_x == 1) && (inc_y == 1)) { + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + + drot_kernel_32(n1, x, y, c, s); + i=n1; + } - BLASLONG n1 = n & -32; - if (n1 > 0) { - FLOAT cosa, sina; - cosa = c; - sina = s; - drot_kernel_32(n1, x, y, &cosa, &sina); - i = n1; - } + while(i < n) + { + temp = c*x[i] + s*y[i] ; + y[i] = c*y[i] - s*x[i] ; + x[i] = temp ; - while (i < n) { - temp = c * x[i] + s * y[i]; - y[i] = c * y[i] - s * x[i]; - x[i] = temp; + i++ ; - i++; + } } + else + { - } else { + while(i < n) + { + temp = c*x[ix] + s*y[iy] ; + y[iy] = c*y[iy] - s*x[ix] ; + x[ix] = temp ; - while (i < n) { - temp = c * x[ix] + s * y[iy]; - y[iy] = c * y[iy] - s * x[ix]; - x[ix] = temp; + ix += inc_x ; + iy += inc_y ; + i++ ; - ix += inc_x; - iy += inc_y; - i++; + } } - - } - return (0); + return(0); } + diff --git a/kernel/zarch/dscal.c b/kernel/zarch/dscal.c index 2961eff20..e29f51012 100644 --- a/kernel/zarch/dscal.c +++ b/kernel/zarch/dscal.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project +Copyright (c) 2013-2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,151 +27,237 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void dscal_kernel_16(BLASLONG n, FLOAT da, FLOAT *x) { - __asm__("vlrepg %%v0,%[da]\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[x])\n\t" - "vfmdb %%v24,%%v24,%%v0\n\t" - "vst %%v24,0(%%r1,%[x])\n\t" - "vl %%v25,16(%%r1,%[x])\n\t" - "vfmdb %%v25,%%v25,%%v0\n\t" - "vst %%v25,16(%%r1,%[x])\n\t" - "vl %%v26,32(%%r1,%[x])\n\t" - "vfmdb %%v26,%%v26,%%v0\n\t" - "vst %%v26,32(%%r1,%[x])\n\t" - "vl %%v27,48(%%r1,%[x])\n\t" - "vfmdb %%v27,%%v27,%%v0\n\t" - "vst %%v27,48(%%r1,%[x])\n\t" - "vl %%v28,64(%%r1,%[x])\n\t" - "vfmdb %%v28,%%v28,%%v0\n\t" - "vst %%v28,64(%%r1,%[x])\n\t" - "vl %%v29,80(%%r1,%[x])\n\t" - "vfmdb %%v29,%%v29,%%v0\n\t" - "vst %%v29,80(%%r1,%[x])\n\t" - "vl %%v30,96(%%r1,%[x])\n\t" - "vfmdb %%v30,%%v30,%%v0\n\t" - "vst %%v30,96(%%r1,%[x])\n\t" - "vl %%v31,112(%%r1,%[x])\n\t" - "vfmdb %%v31,%%v31,%%v0\n\t" - "vst %%v31,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n) - : [x] "a"(x),[da] "Q"(da) - : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); -} +#ifdef Z13_A +static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x ) +{ + + + __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" + "lgdr %%r0,%[alpha] \n\t" + "vlvgp %%v0,%%r0,%%r0 \n\t" + "srlg %[n],%[n],4 \n\t" + "vlr %%v1,%%v0 \n\t" + "vlm %%v16,%%v23, 0(%[x_ptr]) \n\t" + "la %[x_ptr], 128(%[x_ptr]) \n\t" + "aghik %[n], %[n], -1 \n\t" + "jle 2f \n\t" + ".align 16 \n\t" + "1: \n\t" + "vfmdb %%v24, %%v16, %%v0 \n\t" + "vfmdb %%v25, %%v17, %%v0 \n\t" + "vfmdb %%v26, %%v18, %%v0 \n\t" + "vfmdb %%v27, %%v19, %%v1 \n\t" + "vlm %%v16,%%v19, 0(%[x_ptr]) \n\t" + "vfmdb %%v28, %%v20, %%v0 \n\t" + "vfmdb %%v29, %%v21, %%v1 \n\t" + "vfmdb %%v30, %%v22, %%v0 \n\t" + "vfmdb %%v31, %%v23, %%v1 \n\t" + "vlm %%v20,%%v23, 64(%[x_ptr]) \n\t" + "lay %[x_ptr], -128(%[x_ptr]) \n\t" + "vstm %%v24,%%v31, 0(%[x_ptr]) \n\t" + "la %[x_ptr],256(%[x_ptr]) \n\t" + "brctg %[n],1b \n\t" + "2: \n\t" + "vfmdb %%v24, %%v16, %%v0 \n\t" + "vfmdb %%v25, %%v17, %%v1 \n\t" + "vfmdb %%v26, %%v18, %%v0 \n\t" + "vfmdb %%v27, %%v19, %%v1 \n\t" + "lay %[x_ptr] , -128(%[x_ptr]) \n\t" + "vfmdb %%v28, %%v20, %%v0 \n\t" + "vfmdb %%v29, %%v21, %%v1 \n\t" + "vfmdb %%v30, %%v22, %%v0 \n\t" + "vfmdb %%v31, %%v23, %%v1 \n\t" + "vstm %%v24,%%v31, 0(%[x_ptr]) \n\t" + : [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x),[n] "+&r"(n) + : [alpha] "f"(da) + :"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21", + "v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + } +#else +static void dscal_kernel_32( BLASLONG n, FLOAT da , FLOAT *x ) +{ -static void dscal_kernel_16_zero(BLASLONG n, FLOAT *x) { - __asm__("vzero %%v0\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x])\n\t" - "vst %%v0,16(%%r1,%[x])\n\t" - "vst %%v0,32(%%r1,%[x])\n\t" - "vst %%v0,48(%%r1,%[x])\n\t" - "vst %%v0,64(%%r1,%[x])\n\t" - "vst %%v0,80(%%r1,%[x])\n\t" - "vst %%v0,96(%%r1,%[x])\n\t" - "vst %%v0,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "=m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n) - : [x] "a"(x) - : "cc", "r1", "v0"); + /* faster than sequence of triples(vl vfmd vst) (tested OPENBLAS_LOOPS=10000) */ + __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" + "lgdr %%r0,%[alpha] \n\t" + "vlvgp %%v0,%%r0,%%r0 \n\t" + "vlr %%v1,%%v0 \n\t" + "sllg %%r0,%[n],3 \n\t" + "agr %%r0,%[x_ptr] \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 2, 256(%[x_ptr]) \n\t" + "vlm %%v16,%%v23, 0(%[x_ptr]) \n\t" + "vfmdb %%v16,%%v16,%%v0 \n\t" + "vfmdb %%v17,%%v17,%%v1 \n\t" + "vfmdb %%v18,%%v18,%%v0 \n\t" + "vfmdb %%v19,%%v19,%%v1 \n\t" + "vfmdb %%v20,%%v20,%%v0 \n\t" + "vfmdb %%v21,%%v21,%%v1 \n\t" + "vfmdb %%v22,%%v22,%%v0 \n\t" + "vfmdb %%v23,%%v23,%%v1 \n\t" + "vstm %%v16,%%v23, 0(%[x_ptr]) \n\t" + "vlm %%v24,%%v31,128(%[x_ptr]) \n\t" + "vfmdb %%v24,%%v24,%%v0 \n\t" + "vfmdb %%v25,%%v25,%%v1 \n\t" + "vfmdb %%v26,%%v26,%%v0 \n\t" + "vfmdb %%v27,%%v27,%%v1 \n\t" + "vfmdb %%v28,%%v28,%%v0 \n\t" + "vfmdb %%v29,%%v29,%%v1 \n\t" + "vfmdb %%v30,%%v30,%%v0 \n\t" + "vfmdb %%v31,%%v31,%%v1 \n\t" + "vstm %%v24,%%v31,128(%[x_ptr]) \n\t" + "la %[x_ptr], 256(%[x_ptr]) \n\t" + "clgrjl %[x_ptr],%%r0,1b \n\t" + : [mem] "+m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x) + : [n] "r"(n),[alpha] "f"(da) + :"cc" , "r0","v0","v1","v16","v17","v18","v19","v20","v21", + "v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + } +#endif +static void dscal_kernel_32_zero( BLASLONG n, FLOAT *x ) +{ + + __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" + "vzero %%v24 \n\t" + "sllg %%r0,%[n],3 \n\t" + "vzero %%v25 \n\t" + "agr %%r0,%[x_ptr] \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 2, 256(%[x_ptr]) \n\t" + "vst %%v24, 0(%[x_ptr]) \n\t" + "vst %%v25, 16(%[x_ptr]) \n\t" + "vst %%v24, 32(%[x_ptr]) \n\t" + "vst %%v25, 48(%[x_ptr]) \n\t" + "vst %%v24, 64(%[x_ptr]) \n\t" + "vst %%v25, 80(%[x_ptr]) \n\t" + "vst %%v24, 96(%[x_ptr]) \n\t" + "vst %%v25, 112(%[x_ptr]) \n\t" + "vst %%v24, 128(%[x_ptr]) \n\t" + "vst %%v25, 144(%[x_ptr]) \n\t" + "vst %%v24, 160(%[x_ptr]) \n\t" + "vst %%v25, 176(%[x_ptr]) \n\t" + "vst %%v24, 192(%[x_ptr]) \n\t" + "vst %%v25, 208(%[x_ptr]) \n\t" + "vst %%v24, 224(%[x_ptr]) \n\t" + "vst %%v25, 240(%[x_ptr]) \n\t" + "la %[x_ptr],256(%[x_ptr]) \n\t" + "clgrjl %[x_ptr],%%r0,1b \n\t" + : [mem] "=m" (*(double (*)[n])x) ,[x_ptr] "+&a"(x) + : [n] "r"(n) + :"cc" , "r0", "v24" ,"v25" + ); } + + -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, - BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, - BLASLONG dummy2) { - BLASLONG i = 0, j = 0; - if (n <= 0 || inc_x <= 0) - return (0); - if (inc_x == 1) { +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0,j=0; + if ( n <= 0 || inc_x <=0 ) + return(0); - if (da == 0.0) { + + if ( inc_x == 1 ) + { - BLASLONG n1 = n & -16; - if (n1 > 0) { + if ( da == 0.0 ) + { - dscal_kernel_16_zero(n1, x); - j = n1; - } + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + + dscal_kernel_32_zero(n1 , x); + j=n1; + } - while (j < n) { + while(j < n) + { - x[j] = 0.0; - j++; - } + x[j]=0.0; + j++; + } - } else { + } + else + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + dscal_kernel_32(n1 , da , x); + j=n1; + } + while(j < n) + { + + x[j] = da * x[j] ; + j++; + } + } - BLASLONG n1 = n & -16; - if (n1 > 0) { - dscal_kernel_16(n1, da, x); - j = n1; - } - while (j < n) { - x[j] = da * x[j]; - j++; - } } + else + { - } else { + if ( da == 0.0 ) + { - if (da == 0.0) { + BLASLONG n1 = n & -4; - BLASLONG n1 = n & -4; + while (j < n1) { - while (j < n1) { + x[i]=0.0; + x[i + inc_x]=0.0; + x[i + 2 * inc_x]=0.0; + x[i + 3 * inc_x]=0.0; - x[i] = 0.0; - x[i + inc_x] = 0.0; - x[i + 2 * inc_x] = 0.0; - x[i + 3 * inc_x] = 0.0; + i += inc_x * 4; + j += 4; - i += inc_x * 4; - j += 4; + } + while(j < n) + { - } - while (j < n) { + x[i]=0.0; + i += inc_x ; + j++; + } - x[i] = 0.0; - i += inc_x; - j++; - } + } + else + { + BLASLONG n1 = n & -4; - } else { - BLASLONG n1 = n & -4; + while (j < n1) { - while (j < n1) { + x[i] = da * x[i] ; + x[i + inc_x] = da * x[i + inc_x]; + x[i + 2 * inc_x] = da * x[i + 2 * inc_x]; + x[i + 3 * inc_x] = da * x[i + 3 * inc_x]; - x[i] = da * x[i]; - x[i + inc_x] = da * x[i + inc_x]; - x[i + 2 * inc_x] = da * x[i + 2 * inc_x]; - x[i + 3 * inc_x] = da * x[i + 3 * inc_x]; + i += inc_x * 4; + j += 4; - i += inc_x * 4; - j += 4; + } - } + while(j < n) + { - while (j < n) { + x[i] = da * x[i] ; + i += inc_x ; + j++; + } + } - x[i] = da * x[i]; - i += inc_x; - j++; - } } + return 0; - } - return 0; - -} +} \ No newline at end of file diff --git a/kernel/zarch/dsdot.c b/kernel/zarch/dsdot.c deleted file mode 100644 index 5fa88c3b9..000000000 --- a/kernel/zarch/dsdot.c +++ /dev/null @@ -1,173 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019,The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms,with or without -modification,are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice,this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice,this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT LIMITED TO,THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,EXEMPLARY,OR CONSEQUENTIAL -DAMAGES (INCLUDING,BUT NOT LIMITED TO,PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE,DATA,OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -static double dsdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { - double dot; - - __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "pfd 1,1024(%%r1,%[y])\n\t" - "vlef %%v16,0(%%r1,%[x]),0\n\t" - "vlef %%v16,4(%%r1,%[x]),2\n\t" - "vlef %%v17,8(%%r1,%[x]),0\n\t" - "vlef %%v17,12(%%r1,%[x]),2\n\t" - "vlef %%v18,16(%%r1,%[x]),0\n\t" - "vlef %%v18,20(%%r1,%[x]),2\n\t" - "vlef %%v19,24(%%r1,%[x]),0\n\t" - "vlef %%v19,28(%%r1,%[x]),2\n\t" - "vlef %%v20,32(%%r1,%[x]),0\n\t" - "vlef %%v20,36(%%r1,%[x]),2\n\t" - "vlef %%v21,40(%%r1,%[x]),0\n\t" - "vlef %%v21,44(%%r1,%[x]),2\n\t" - "vlef %%v22,48(%%r1,%[x]),0\n\t" - "vlef %%v22,52(%%r1,%[x]),2\n\t" - "vlef %%v23,56(%%r1,%[x]),0\n\t" - "vlef %%v23,60(%%r1,%[x]),2\n\t" - "vflls %%v16,%%v16\n\t" - "vflls %%v17,%%v17\n\t" - "vflls %%v18,%%v18\n\t" - "vflls %%v19,%%v19\n\t" - "vflls %%v20,%%v20\n\t" - "vflls %%v21,%%v21\n\t" - "vflls %%v22,%%v22\n\t" - "vflls %%v23,%%v23\n\t" - "vlef %%v24,0(%%r1,%[y]),0\n\t" - "vlef %%v24,4(%%r1,%[y]),2\n\t" - "vflls %%v24,%%v24\n\t" - "vfmadb %%v0,%%v16,%%v24,%%v0\n\t" - "vlef %%v25,8(%%r1,%[y]),0\n\t" - "vlef %%v25,12(%%r1,%[y]),2\n\t" - "vflls %%v25,%%v25\n\t" - "vfmadb %%v1,%%v17,%%v25,%%v1\n\t" - "vlef %%v26,16(%%r1,%[y]),0\n\t" - "vlef %%v26,20(%%r1,%[y]),2\n\t" - "vflls %%v26,%%v26\n\t" - "vfmadb %%v2,%%v18,%%v26,%%v2\n\t" - "vlef %%v27,24(%%r1,%[y]),0\n\t" - "vlef %%v27,28(%%r1,%[y]),2\n\t" - "vflls %%v27,%%v27\n\t" - "vfmadb %%v3,%%v19,%%v27,%%v3\n\t" - "vlef %%v28,32(%%r1,%[y]),0\n\t" - "vlef %%v28,36(%%r1,%[y]),2\n\t" - "vflls %%v28,%%v28\n\t" - "vfmadb %%v4,%%v20,%%v28,%%v4\n\t" - "vlef %%v29,40(%%r1,%[y]),0\n\t" - "vlef %%v29,44(%%r1,%[y]),2\n\t" - "vflls %%v29,%%v29\n\t" - "vfmadb %%v5,%%v21,%%v29,%%v5\n\t" - "vlef %%v30,48(%%r1,%[y]),0\n\t" - "vlef %%v30,52(%%r1,%[y]),2\n\t" - "vflls %%v30,%%v30\n\t" - "vfmadb %%v6,%%v22,%%v30,%%v6\n\t" - "vlef %%v31,56(%%r1,%[y]),0\n\t" - "vlef %%v31,60(%%r1,%[y]),2\n\t" - "vflls %%v31,%%v31\n\t" - "vfmadb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,64\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v0,%%v0,%%v1\n\t" - "vfadb %%v0,%%v0,%%v2\n\t" - "vfadb %%v0,%%v0,%%v3\n\t" - "vfadb %%v0,%%v0,%%v4\n\t" - "vfadb %%v0,%%v0,%%v5\n\t" - "vfadb %%v0,%%v0,%%v6\n\t" - "vfadb %%v0,%%v0,%%v7\n\t" - "vrepg %%v1,%%v0,1\n\t" - "adbr %%f0,%%f1\n\t" - "ldr %[dot],%%f0" - : [dot] "=f"(dot),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); - - return dot; -} - -double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - - double dot = 0.0; - - if (n <= 0) - return (dot); - - if ((inc_x == 1) && (inc_y == 1)) { - - BLASLONG n1 = n & -16; - - if (n1) - dot = dsdot_kernel_16(n1, x, y); - - i = n1; - while (i < n) { - - dot += (double) y[i] * (double) x[i]; - i++; - - } - return (dot); - - } - - BLASLONG n1 = n & -2; - - while (i < n1) { - - dot += (double) y[iy] * (double) x[ix]; - dot += (double) y[iy + inc_y] * (double) x[ix + inc_x]; - ix += inc_x * 2; - iy += inc_y * 2; - i += 2; - - } - - while (i < n) { - - dot += (double) y[iy] * (double) x[ix]; - ix += inc_x; - iy += inc_y; - i++; - - } - return (dot); - -} diff --git a/kernel/zarch/dsum.c b/kernel/zarch/dsum.c deleted file mode 100644 index 178bc3462..000000000 --- a/kernel/zarch/dsum.c +++ /dev/null @@ -1,148 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -static FLOAT dsum_kernel_32(BLASLONG n, FLOAT *x) { - FLOAT sum; - - __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vfadb %%v24,%%v24,%%v16\n\t" - "vfadb %%v25,%%v25,%%v17\n\t" - "vfadb %%v26,%%v26,%%v18\n\t" - "vfadb %%v27,%%v27,%%v19\n\t" - "vfadb %%v28,%%v28,%%v20\n\t" - "vfadb %%v29,%%v29,%%v21\n\t" - "vfadb %%v30,%%v30,%%v22\n\t" - "vfadb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x])\n\t" - "vl %%v17, 144(%%r1,%[x])\n\t" - "vl %%v18, 160(%%r1,%[x])\n\t" - "vl %%v19, 176(%%r1,%[x])\n\t" - "vl %%v20, 192(%%r1,%[x])\n\t" - "vl %%v21, 208(%%r1,%[x])\n\t" - "vl %%v22, 224(%%r1,%[x])\n\t" - "vl %%v23, 240(%%r1,%[x])\n\t" - "vfadb %%v24,%%v24,%%v16\n\t" - "vfadb %%v25,%%v25,%%v17\n\t" - "vfadb %%v26,%%v26,%%v18\n\t" - "vfadb %%v27,%%v27,%%v19\n\t" - "vfadb %%v28,%%v28,%%v20\n\t" - "vfadb %%v29,%%v29,%%v21\n\t" - "vfadb %%v30,%%v30,%%v22\n\t" - "vfadb %%v31,%%v31,%%v23\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vfadb %%v24,%%v24,%%v26\n\t" - "vfadb %%v24,%%v24,%%v27\n\t" - "vfadb %%v24,%%v24,%%v28\n\t" - "vfadb %%v24,%%v24,%%v29\n\t" - "vfadb %%v24,%%v24,%%v30\n\t" - "vfadb %%v24,%%v24,%%v31\n\t" - "vrepg %%v25,%%v24,1\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vsteg %%v24,%[asum],0" - : [sum] "=Q"(sum),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) - : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - - return sum; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT sumf = 0.0; - BLASLONG n1; - - if (n <= 0 || inc_x <= 0) - return sumf; - - if (inc_x == 1) { - - n1 = n & -32; - - if (n1 > 0) { - - sumf = dsum_kernel_32(n1, x); - i = n1; - } - - while (i < n) { - sumf += x[i]; - i++; - } - - } else { - BLASLONG n1 = n & -4; - register FLOAT sum1, sum2; - sum1 = 0.0; - sum2 = 0.0; - while (j < n1) { - - sum1 += x[i]; - sum2 += x[i + inc_x]; - sum1 += x[i + 2 * inc_x]; - sum2 += x[i + 3 * inc_x]; - - i += inc_x * 4; - j += 4; - - } - sumf = sum1 + sum2; - while (j < n) { - - sumf += x[i]; - i += inc_x; - j++; - } - - } - return sumf; -} diff --git a/kernel/zarch/dswap.c b/kernel/zarch/dswap.c index f0c9ded51..d7e079147 100644 --- a/kernel/zarch/dswap.c +++ b/kernel/zarch/dswap.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project +Copyright (c) 2013-2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,127 +25,264 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ + + #include "common.h" -static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { - __asm__("srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v28, 192(%%r1,%[x])\n\t" - "vl %%v29, 208(%%r1,%[x])\n\t" - "vl %%v30, 224(%%r1,%[x])\n\t" - "vl %%v31, 240(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "vl %%v4, 64(%%r1,%[y])\n\t" - "vl %%v5, 80(%%r1,%[y])\n\t" - "vl %%v6, 96(%%r1,%[y])\n\t" - "vl %%v7, 112(%%r1,%[y])\n\t" - "vst %%v0, 0(%%r1,%[x])\n\t" - "vst %%v1, 16(%%r1,%[x])\n\t" - "vst %%v2, 32(%%r1,%[x])\n\t" - "vst %%v3, 48(%%r1,%[x])\n\t" - "vst %%v4, 64(%%r1,%[x])\n\t" - "vst %%v5, 80(%%r1,%[x])\n\t" - "vst %%v6, 96(%%r1,%[x])\n\t" - "vst %%v7, 112(%%r1,%[x])\n\t" - "vl %%v0, 128(%%r1,%[y])\n\t" - "vl %%v1, 144(%%r1,%[y])\n\t" - "vl %%v2, 160(%%r1,%[y])\n\t" - "vl %%v3, 176(%%r1,%[y])\n\t" - "vl %%v4, 192(%%r1,%[y])\n\t" - "vl %%v5, 208(%%r1,%[y])\n\t" - "vl %%v6, 224(%%r1,%[y])\n\t" - "vl %%v7, 240(%%r1,%[y])\n\t" - "vst %%v0, 128(%%r1,%[x])\n\t" - "vst %%v1, 144(%%r1,%[x])\n\t" - "vst %%v2, 160(%%r1,%[x])\n\t" - "vst %%v3, 176(%%r1,%[x])\n\t" - "vst %%v4, 192(%%r1,%[x])\n\t" - "vst %%v5, 208(%%r1,%[x])\n\t" - "vst %%v6, 224(%%r1,%[x])\n\t" - "vst %%v7, 240(%%r1,%[x])\n\t" - "vst %%v16, 0(%%r1,%[y])\n\t" - "vst %%v17, 16(%%r1,%[y])\n\t" - "vst %%v18, 32(%%r1,%[y])\n\t" - "vst %%v19, 48(%%r1,%[y])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vst %%v24, 128(%%r1,%[y])\n\t" - "vst %%v25, 144(%%r1,%[y])\n\t" - "vst %%v26, 160(%%r1,%[y])\n\t" - "vst %%v27, 176(%%r1,%[y])\n\t" - "vst %%v28, 192(%%r1,%[y])\n\t" - "vst %%v29, 208(%%r1,%[y])\n\t" - "vst %%v30, 224(%%r1,%[y])\n\t" - "vst %%v31, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), - [n] "+&r"(n) - : [x] "a"(x),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + + +#if defined(Z13_SWAP_A) +static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile( + "pfd 1, 0(%[ptr_x]) \n\t" + "pfd 2, 0(%[ptr_y]) \n\t" + "srlg %[n_tmp],%[n_tmp],5 \n\t" + "xgr %%r1,%%r1 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 2, 256(%%r1,%[ptr_x]) \n\t" + "pfd 2, 256(%%r1,%[ptr_y]) \n\t" + + "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" + "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" + "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" + "vst %%v16, 0(%%r1,%[ptr_x]) \n\t" + + "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" + "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" + "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" + "vst %%v17, 16(%%r1,%[ptr_x]) \n\t" + + "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" + "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" + "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" + "vst %%v18, 32(%%r1,%[ptr_x]) \n\t" + + "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" + "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" + "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" + "vst %%v19, 48(%%r1,%[ptr_x]) \n\t" + + "vl %%v28, 64(%%r1,%[ptr_x]) \n\t" + "vl %%v20, 64(%%r1,%[ptr_y]) \n\t" + "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" + "vst %%v20, 64(%%r1,%[ptr_x]) \n\t" + + "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" + "vl %%v21, 80(%%r1,%[ptr_y]) \n\t" + "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 80(%%r1,%[ptr_x]) \n\t" + + "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" + "vl %%v22, 96(%%r1,%[ptr_y]) \n\t" + "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 96(%%r1,%[ptr_x]) \n\t" + + "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" + "vl %%v23, 112(%%r1,%[ptr_y]) \n\t" + "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 112(%%r1,%[ptr_x]) \n\t" + + "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" + "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" + "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" + "vst %%v16, 128(%%r1,%[ptr_x]) \n\t" + + "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" + "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" + "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" + "vst %%v17, 144(%%r1,%[ptr_x]) \n\t" + + "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" + "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" + "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" + "vst %%v18, 160(%%r1,%[ptr_x]) \n\t" + + "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" + "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" + "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" + "vst %%v19, 176(%%r1,%[ptr_x]) \n\t" + + "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" + "vl %%v20, 192(%%r1,%[ptr_y]) \n\t" + "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" + "vst %%v20, 192(%%r1,%[ptr_x]) \n\t" + + "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" + "vl %%v21, 208(%%r1,%[ptr_y]) \n\t" + "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 208(%%r1,%[ptr_x]) \n\t" + + "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" + "vl %%v22, 224(%%r1,%[ptr_y]) \n\t" + "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 224(%%r1,%[ptr_x]) \n\t" + + "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" + "vl %%v23, 240(%%r1,%[ptr_y]) \n\t" + "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 240(%%r1,%[ptr_x]) \n\t" + + "la %%r1,256(%%r1) \n\t" + "brctg %[n_tmp],1b" + : [mem_x] "+m" (*(double (*)[n])x), + [mem_y] "+m" (*(double (*)[n])y), + [n_tmp] "+&r"(n) + : [ptr_x] "a"(x), [ptr_y] "a"(y) + : "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23" + ,"v24","v25","v26","v27","v28","v29","v30","v31" + ); + return; + } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, - BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, - BLASLONG dummy2) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - FLOAT temp; +#else - if (n <= 0) - return (0); +static void dswap_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile( + "pfd 2, 0(%[ptr_x]) \n\t" + "pfd 2, 0(%[ptr_y]) \n\t" + "srlg %[n_tmp],%[n_tmp],5 \n\t" + "xgr %%r1,%%r1 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 2, 256(%%r1,%[ptr_x]) \n\t" + "pfd 2, 256(%%r1,%[ptr_y]) \n\t" + + "vl %%v16, 0(%%r1,%[ptr_x]) \n\t" + "vl %%v17, 16(%%r1,%[ptr_x]) \n\t" + "vl %%v18, 32(%%r1,%[ptr_x]) \n\t" + "vl %%v19, 48(%%r1,%[ptr_x]) \n\t" + "vl %%v20, 64(%%r1,%[ptr_x]) \n\t" + "vl %%v21, 80(%%r1,%[ptr_x]) \n\t" + "vl %%v22, 96(%%r1,%[ptr_x]) \n\t" + "vl %%v23, 112(%%r1,%[ptr_x]) \n\t" + "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" + "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" + "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" + "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" + "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" + "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" + "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" + "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - if ((inc_x == 1) && (inc_y == 1)) { - BLASLONG n1 = n & -32; - if (n1 > 0) { - dswap_kernel_32(n1, x, y); - i = n1; - } + "vl %%v0, 0(%%r1,%[ptr_y]) \n\t" + "vl %%v1, 16(%%r1,%[ptr_y]) \n\t" + "vl %%v2, 32(%%r1,%[ptr_y]) \n\t" + "vl %%v3, 48(%%r1,%[ptr_y]) \n\t" + "vl %%v4, 64(%%r1,%[ptr_y]) \n\t" + "vl %%v5, 80(%%r1,%[ptr_y]) \n\t" + "vl %%v6, 96(%%r1,%[ptr_y]) \n\t" + "vl %%v7, 112(%%r1,%[ptr_y]) \n\t" + "vst %%v0, 0(%%r1,%[ptr_x]) \n\t" + "vst %%v1, 16(%%r1,%[ptr_x]) \n\t" + "vst %%v2, 32(%%r1,%[ptr_x]) \n\t" + "vst %%v3, 48(%%r1,%[ptr_x]) \n\t" + "vst %%v4, 64(%%r1,%[ptr_x]) \n\t" + "vst %%v5, 80(%%r1,%[ptr_x]) \n\t" + "vst %%v6, 96(%%r1,%[ptr_x]) \n\t" + "vst %%v7, 112(%%r1,%[ptr_x]) \n\t" - while (i < n) { - temp = y[i]; - y[i] = x[i]; - x[i] = temp; - i++; + "vl %%v0, 128(%%r1,%[ptr_y]) \n\t" + "vl %%v1, 144(%%r1,%[ptr_y]) \n\t" + "vl %%v2, 160(%%r1,%[ptr_y]) \n\t" + "vl %%v3, 176(%%r1,%[ptr_y]) \n\t" + "vl %%v4, 192(%%r1,%[ptr_y]) \n\t" + "vl %%v5, 208(%%r1,%[ptr_y]) \n\t" + "vl %%v6, 224(%%r1,%[ptr_y]) \n\t" + "vl %%v7, 240(%%r1,%[ptr_y]) \n\t" + "vst %%v0, 128(%%r1,%[ptr_x]) \n\t" + "vst %%v1, 144(%%r1,%[ptr_x]) \n\t" + "vst %%v2, 160(%%r1,%[ptr_x]) \n\t" + "vst %%v3, 176(%%r1,%[ptr_x]) \n\t" + "vst %%v4, 192(%%r1,%[ptr_x]) \n\t" + "vst %%v5, 208(%%r1,%[ptr_x]) \n\t" + "vst %%v6, 224(%%r1,%[ptr_x]) \n\t" + "vst %%v7, 240(%%r1,%[ptr_x]) \n\t" - } + "vst %%v16, 0(%%r1,%[ptr_y]) \n\t" + "vst %%v17, 16(%%r1,%[ptr_y]) \n\t" + "vst %%v18, 32(%%r1,%[ptr_y]) \n\t" + "vst %%v19, 48(%%r1,%[ptr_y]) \n\t" + "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" + "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" + "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" + "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" + "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" + "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" + "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" + "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" + "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" + + + "la %%r1,256(%%r1) \n\t" + "brctg %[n_tmp],1b" + : [mem_x] "+m" (*(double (*)[n])x), + [mem_y] "+m" (*(double (*)[n])y), + [n_tmp] "+&r"(n) + : [ptr_x] "a"(x), [ptr_y] "a"(y) + : "cc", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16", + "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + return; + +} + +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n <= 0 ) return(0); - } else { + if ( (inc_x == 1) && (inc_y == 1 )) + { + + BLASLONG n1 = n & -32; + if ( n1 > 0 ) + { + dswap_kernel_32(n1, x, y); + i=n1; + } + + while(i < n) + { + temp = y[i]; + y[i] = x[i] ; + x[i] = temp; + i++ ; + + } - while (i < n) { - temp = y[iy]; - y[iy] = x[ix]; - x[ix] = temp; - ix += inc_x; - iy += inc_y; - i++; } + else + { + + while(i < n) + { + temp = y[iy]; + y[iy] = x[ix] ; + x[ix] = temp; + ix += inc_x ; + iy += inc_y ; + i++ ; - } - return (0); + } + + } + return(0); + } + + diff --git a/kernel/zarch/icamax.c b/kernel/zarch/icamax.c deleted file mode 100644 index a2546b812..000000000 --- a/kernel/zarch/icamax.c +++ /dev/null @@ -1,302 +0,0 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1])) - -static BLASLONG icamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { - BLASLONG iamax; - - __asm__("vlef %%v0,0(%[x]),0\n\t" - "vlef %%v1,4(%[x]),0\n\t" - "vlef %%v0,8(%[x]),1\n\t" - "vlef %%v1,12(%[x]),1\n\t" - "vlef %%v0,16(%[x]),2\n\t" - "vlef %%v1,20(%[x]),2\n\t" - "vlef %%v0,24(%[x]),3\n\t" - "vlef %%v1,28(%[x]),3\n\t" - "vflpsb %%v0,%%v0\n\t" - "vflpsb %%v1,%%v1\n\t" - "vfasb %%v0,%%v0,%%v1\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,2,1\n\t" - "vleig %%v2,1,0\n\t" - "vleig %%v2,3,1\n\t" - "vrepig %%v3,16\n\t" - "vzero %%v4\n\t" - "vleib %%v9,0,0\n\t" - "vleib %%v9,1,1\n\t" - "vleib %%v9,2,2\n\t" - "vleib %%v9,3,3\n\t" - "vleib %%v9,8,4\n\t" - "vleib %%v9,9,5\n\t" - "vleib %%v9,10,6\n\t" - "vleib %%v9,11,7\n\t" - "vleib %%v9,16,8\n\t" - "vleib %%v9,17,9\n\t" - "vleib %%v9,18,10\n\t" - "vleib %%v9,19,11\n\t" - "vleib %%v9,24,12\n\t" - "vleib %%v9,25,13\n\t" - "vleib %%v9,26,14\n\t" - "vleib %%v9,27,15\n\t" - "vleif %%v24,0,0\n\t" - "vleif %%v24,1,1\n\t" - "vleif %%v24,2,2\n\t" - "vleif %%v24,3,3\n\t" - "vleif %%v25,4,0\n\t" - "vleif %%v25,5,1\n\t" - "vleif %%v25,6,2\n\t" - "vleif %%v25,7,3\n\t" - "vleif %%v26,8,0\n\t" - "vleif %%v26,9,1\n\t" - "vleif %%v26,10,2\n\t" - "vleif %%v26,11,3\n\t" - "vleif %%v27,12,0\n\t" - "vleif %%v27,13,1\n\t" - "vleif %%v27,14,2\n\t" - "vleif %%v27,15,3\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v28,16(%%r1,%[x])\n\t" - "vpkg %%v17,%%v16,%%v28\n\t" - "vperm %%v16,%%v16,%%v28,%%v9\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v29,48(%%r1,%[x])\n\t" - "vpkg %%v19,%%v18,%%v29\n\t" - "vperm %%v18,%%v18,%%v29,%%v9\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v30,80(%%r1,%[x])\n\t" - "vpkg %%v21,%%v20,%%v30\n\t" - "vperm %%v20,%%v20,%%v30,%%v9\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v31,112(%%r1,%[x])\n\t" - "vpkg %%v23,%%v22,%%v31\n\t" - "vperm %%v22,%%v22,%%v31,%%v9\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v16,%%v16,%%v17\n\t" - "vfasb %%v17,%%v18,%%v19\n\t" - "vfasb %%v18,%%v20,%%v21\n\t" - "vfasb %%v19,%%v22,%%v23\n\t" - "vfchesb %%v5,%%v16,%%v17\n\t" - "vfchesb %%v6,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vfchesb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v28,144(%%r1,%[x])\n\t" - "vpkg %%v17,%%v16,%%v28\n\t" - "vperm %%v16,%%v16,%%v28,%%v9\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v29,176(%%r1,%[x])\n\t" - "vpkg %%v19,%%v18,%%v29\n\t" - "vperm %%v18,%%v18,%%v29,%%v9\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v30,208(%%r1,%[x])\n\t" - "vpkg %%v21,%%v20,%%v30\n\t" - "vperm %%v20,%%v20,%%v30,%%v9\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vpkg %%v23,%%v22,%%v31\n\t" - "vperm %%v22,%%v22,%%v31,%%v9\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v16,%%v16,%%v17\n\t" - "vfasb %%v17,%%v18,%%v19\n\t" - "vfasb %%v18,%%v20,%%v21\n\t" - "vfasb %%v19,%%v22,%%v23\n\t" - "vfchesb %%v5,%%v16,%%v17\n\t" - "vfchesb %%v6,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vfchesb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v3,%%v0,32\n\t" - "vfchsb %%v4,%%v0,%%v3\n\t" - "vchlg %%v5,%%v2,%%v1\n\t" - "vfcesb %%v6,%%v0,%%v3\n\t" - "vn %%v5,%%v5,%%v6\n\t" - "vo %%v4,%%v4,%%v5\n\t" - "vsel %%v0,%%v0,%%v3,%%v4\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v1,%%v2,%%v4\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcsb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vstef %%v0,%[amax],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamax],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchsb %%v4,%%v2,%%v0\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "ste %%f0,%[amax]\n\t" - "vlgvg %[iamax],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", - "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", - "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - - return iamax; -} - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0; - BLASLONG max = 0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) - return (max); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - - max = icamax_kernel_32(n1, x, &maxf); - ix = n1 * 2; - i = n1; - } else { - maxf = CABS1(x, 0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x, ix) > maxf) { - max = i; - maxf = CABS1(x, ix); - } - ix += 2; - i++; - } - return (max + 1); - - } else { - - max = 0; - maxf = CABS1(x, 0); - inc_x2 = 2 * inc_x; - - BLASLONG n1 = n & -4; - while (i < n1) { - - if (CABS1(x, ix) > maxf) { - max = i; - maxf = CABS1(x, ix); - } - if (CABS1(x, ix + inc_x2) > maxf) { - max = i + 1; - maxf = CABS1(x, ix + inc_x2); - } - if (CABS1(x, ix + 2 * inc_x2) > maxf) { - max = i + 2; - maxf = CABS1(x, ix + 2 * inc_x2); - } - if (CABS1(x, ix + 3 * inc_x2) > maxf) { - max = i + 3; - maxf = CABS1(x, ix + 3 * inc_x2); - } - - ix += inc_x2 * 4; - - i += 4; - - } - - while (i < n) { - if (CABS1(x, ix) > maxf) { - max = i; - maxf = CABS1(x, ix); - } - ix += inc_x2; - i++; - } - return (max + 1); - } -} diff --git a/kernel/zarch/icamin.c b/kernel/zarch/icamin.c deleted file mode 100644 index 09654b742..000000000 --- a/kernel/zarch/icamin.c +++ /dev/null @@ -1,302 +0,0 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -#define CABS1(x,i) (fabsf(x[i]) + fabsf(x[i + 1])) - -static BLASLONG icamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { - BLASLONG iamin; - - __asm__("vlef %%v0,0(%[x]),0\n\t" - "vlef %%v1,4(%[x]),0\n\t" - "vlef %%v0,8(%[x]),1\n\t" - "vlef %%v1,12(%[x]),1\n\t" - "vlef %%v0,16(%[x]),2\n\t" - "vlef %%v1,20(%[x]),2\n\t" - "vlef %%v0,24(%[x]),3\n\t" - "vlef %%v1,28(%[x]),3\n\t" - "vflpsb %%v0,%%v0\n\t" - "vflpsb %%v1,%%v1\n\t" - "vfasb %%v0,%%v0,%%v1\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,2,1\n\t" - "vleig %%v2,1,0\n\t" - "vleig %%v2,3,1\n\t" - "vrepig %%v3,16\n\t" - "vzero %%v4\n\t" - "vleib %%v9,0,0\n\t" - "vleib %%v9,1,1\n\t" - "vleib %%v9,2,2\n\t" - "vleib %%v9,3,3\n\t" - "vleib %%v9,8,4\n\t" - "vleib %%v9,9,5\n\t" - "vleib %%v9,10,6\n\t" - "vleib %%v9,11,7\n\t" - "vleib %%v9,16,8\n\t" - "vleib %%v9,17,9\n\t" - "vleib %%v9,18,10\n\t" - "vleib %%v9,19,11\n\t" - "vleib %%v9,24,12\n\t" - "vleib %%v9,25,13\n\t" - "vleib %%v9,26,14\n\t" - "vleib %%v9,27,15\n\t" - "vleif %%v24,0,0\n\t" - "vleif %%v24,1,1\n\t" - "vleif %%v24,2,2\n\t" - "vleif %%v24,3,3\n\t" - "vleif %%v25,4,0\n\t" - "vleif %%v25,5,1\n\t" - "vleif %%v25,6,2\n\t" - "vleif %%v25,7,3\n\t" - "vleif %%v26,8,0\n\t" - "vleif %%v26,9,1\n\t" - "vleif %%v26,10,2\n\t" - "vleif %%v26,11,3\n\t" - "vleif %%v27,12,0\n\t" - "vleif %%v27,13,1\n\t" - "vleif %%v27,14,2\n\t" - "vleif %%v27,15,3\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v28,16(%%r1,%[x])\n\t" - "vpkg %%v17,%%v16,%%v28\n\t" - "vperm %%v16,%%v16,%%v28,%%v9\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v29,48(%%r1,%[x])\n\t" - "vpkg %%v19,%%v18,%%v29\n\t" - "vperm %%v18,%%v18,%%v29,%%v9\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v30,80(%%r1,%[x])\n\t" - "vpkg %%v21,%%v20,%%v30\n\t" - "vperm %%v20,%%v20,%%v30,%%v9\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v31,112(%%r1,%[x])\n\t" - "vpkg %%v23,%%v22,%%v31\n\t" - "vperm %%v22,%%v22,%%v31,%%v9\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v16,%%v16,%%v17\n\t" - "vfasb %%v17,%%v18,%%v19\n\t" - "vfasb %%v18,%%v20,%%v21\n\t" - "vfasb %%v19,%%v22,%%v23\n\t" - "vfchesb %%v5,%%v17,%%v16\n\t" - "vfchesb %%v6,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vfchesb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v28,144(%%r1,%[x])\n\t" - "vpkg %%v17,%%v16,%%v28\n\t" - "vperm %%v16,%%v16,%%v28,%%v9\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v29,176(%%r1,%[x])\n\t" - "vpkg %%v19,%%v18,%%v29\n\t" - "vperm %%v18,%%v18,%%v29,%%v9\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v30,208(%%r1,%[x])\n\t" - "vpkg %%v21,%%v20,%%v30\n\t" - "vperm %%v20,%%v20,%%v30,%%v9\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vpkg %%v23,%%v22,%%v31\n\t" - "vperm %%v22,%%v22,%%v31,%%v9\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v16,%%v16,%%v17\n\t" - "vfasb %%v17,%%v18,%%v19\n\t" - "vfasb %%v18,%%v20,%%v21\n\t" - "vfasb %%v19,%%v22,%%v23\n\t" - "vfchesb %%v5,%%v17,%%v16\n\t" - "vfchesb %%v6,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vfchesb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v3,%%v0,32\n\t" - "vfchsb %%v4,%%v3,%%v0\n\t" - "vchlg %%v5,%%v2,%%v1\n\t" - "vfcesb %%v6,%%v0,%%v3\n\t" - "vn %%v5,%%v5,%%v6\n\t" - "vo %%v4,%%v4,%%v5\n\t" - "vsel %%v0,%%v0,%%v3,%%v4\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v1,%%v2,%%v4\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcsb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vstef %%v0,%[amin],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamin],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchsb %%v4,%%v0,%%v2\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "ste %%f0,%[amin]\n\t" - "vlgvg %[iamin],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", - "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", - "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - - return iamin; -} - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT minf = 0; - BLASLONG min = 0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) - return (min); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - - min = icamin_kernel_32(n1, x, &minf); - ix = n1 * 2; - i = n1; - } else { - minf = CABS1(x, 0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x, ix) < minf) { - min = i; - minf = CABS1(x, ix); - } - ix += 2; - i++; - } - return (min + 1); - - } else { - - min = 0; - minf = CABS1(x, 0); - inc_x2 = 2 * inc_x; - - BLASLONG n1 = n & -4; - while (i < n1) { - - if (CABS1(x, ix) < minf) { - min = i; - minf = CABS1(x, ix); - } - if (CABS1(x, ix + inc_x2) < minf) { - min = i + 1; - minf = CABS1(x, ix + inc_x2); - } - if (CABS1(x, ix + 2 * inc_x2) < minf) { - min = i + 2; - minf = CABS1(x, ix + 2 * inc_x2); - } - if (CABS1(x, ix + 3 * inc_x2) < minf) { - min = i + 3; - minf = CABS1(x, ix + 3 * inc_x2); - } - - ix += inc_x2 * 4; - - i += 4; - - } - - while (i < n) { - if (CABS1(x, ix) < minf) { - min = i; - minf = CABS1(x, ix); - } - ix += inc_x2; - i++; - } - return (min + 1); - } -} diff --git a/kernel/zarch/idamax.c b/kernel/zarch/idamax.c index b292c1d15..b67091148 100644 --- a/kernel/zarch/idamax.c +++ b/kernel/zarch/idamax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project +Copyright (c) 2013-2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,223 +23,225 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - + *****************************************************************************/ #include "common.h" #include -#define ABS fabs - -static BLASLONG idamax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amax) { - BLASLONG iamax; - - __asm__("vl %%v0,0(%[x])\n\t" - "vflpdb %%v0,%%v0\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,1,1\n\t" - "vrepig %%v2,16\n\t" - "vzero %%v3\n\t" - "vleig %%v24,0,0\n\t" - "vleig %%v24,1,1\n\t" - "vleig %%v25,2,0\n\t" - "vleig %%v25,3,1\n\t" - "vleig %%v26,4,0\n\t" - "vleig %%v26,5,1\n\t" - "vleig %%v27,6,0\n\t" - "vleig %%v27,7,1\n\t" - "vleig %%v28,8,0\n\t" - "vleig %%v28,9,1\n\t" - "vleig %%v29,10,0\n\t" - "vleig %%v29,11,1\n\t" - "vleig %%v30,12,0\n\t" - "vleig %%v30,13,1\n\t" - "vleig %%v31,14,0\n\t" - "vleig %%v31,15,1\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchedb %%v4,%%v16,%%v17\n\t" - "vfchedb %%v5,%%v18,%%v19\n\t" - "vfchedb %%v6,%%v20,%%v21\n\t" - "vfchedb %%v7,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v16,%%v17\n\t" - "vfchedb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchedb %%v4,%%v16,%%v17\n\t" - "vfchedb %%v5,%%v18,%%v19\n\t" - "vfchedb %%v6,%%v20,%%v21\n\t" - "vfchedb %%v7,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v16,%%v17\n\t" - "vfchedb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v2,%%v0,1\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcdb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vsteg %%v0,%[amax],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamax],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchdb %%v4,%%v2,%%v0\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "std %%f0,%[amax]\n\t" - "vlgvg %[iamax],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); - - return iamax; -} - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; - - if (n <= 0 || inc_x <= 0) - return (max); +#if defined(DOUBLE) - if (inc_x == 1) { +#define ABS fabs - BLASLONG n1 = n & -32; - if (n1 > 0) { +#else + +#define ABS fabsf + +#endif + + +/** + * Find maximum index + * Warning: requirements n>0 and n % 32 == 0 + * @param n + * @param x pointer to the vector + * @param maxf (out) maximum absolute value .( only for output ) + * @return index + */ +static BLASLONG diamax_kernel_32_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { + BLASLONG index; + __asm__( + "pfd 1, 0(%[ptr_x]) \n\t" + "sllg %%r0,%[n],3 \n\t" + "agr %%r0,%[ptr_x] \n\t" + "vleig %%v20,0,0 \n\t" + "vleig %%v20,1,1 \n\t" + "vleig %%v21,2,0 \n\t" + "vleig %%v21,3,1 \n\t" + "vleig %%v22,4,0 \n\t" + "vleig %%v22,5,1 \n\t" + "vleig %%v23,6,0 \n\t" + "vleig %%v23,7,1 \n\t" + "vrepig %%v4,8 \n\t" + "vzero %%v5 \n\t" + "vzero %%v18 \n\t" + "vzero %%v19 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 1, 256(%[ptr_tmp] ) \n\t" + "vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t" + "vflpdb %%v24, %%v24 \n\t" + "vflpdb %%v25, %%v25 \n\t" + "vflpdb %%v26, %%v26 \n\t" + "vflpdb %%v27, %%v27 \n\t" + "vflpdb %%v28, %%v28 \n\t" + "vflpdb %%v29, %%v29 \n\t" + "vflpdb %%v30, %%v30 \n\t" + "vflpdb %%v31, %%v31 \n\t" + "vfchdb %%v16,%%v25,%%v24 \n\t " + "vfchdb %%v17,%%v27,%%v26 \n\t " + "vsel %%v1,%%v21,%%v20,%%v16 \n\t" + "vsel %%v0,%%v25,%%v24,%%v16 \n\t" + "vsel %%v2,%%v23,%%v22,%%v17 \n\t" + "vsel %%v3,%%v27,%%v26,%%v17 \n\t" + "vfchdb %%v16,%%v29,%%v28 \n\t " + "vfchdb %%v17,%%v31,%%v30 \n\t" + "vsel %%v24,%%v21,%%v20,%%v16 \n\t" + "vsel %%v25,%%v29,%%v28,%%v16 \n\t" + "vsel %%v26,%%v23,%%v22,%%v17 \n\t" + "vsel %%v27,%%v31,%%v30,%%v17 \n\t" + + "vfchdb %%v28, %%v3,%%v0 \n\t" + "vfchdb %%v29,%%v27, %%v25 \n\t" + "vsel %%v1,%%v2,%%v1,%%v28 \n\t" + "vsel %%v0,%%v3,%%v0,%%v28 \n\t" + "vsel %%v24,%%v26,%%v24,%%v29 \n\t" + "vsel %%v25,%%v27,%%v25,%%v29 \n\t" + "vag %%v1,%%v1,%%v5 \n\t" + "vag %%v24,%%v24,%%v5 \n\t" + "vag %%v24,%%v24,%%v4 \n\t" + "vfchdb %%v16,%%v25 , %%v0 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vsel %%v29,%%v25,%%v0,%%v16 \n\t" + "vsel %%v28,%%v24,%%v1,%%v16 \n\t" + "vfchdb %%v17, %%v29,%%v18 \n\t" + "vsel %%v19,%%v28,%%v19,%%v17 \n\t" + "vsel %%v18,%%v29,%%v18,%%v17 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t" + "vflpdb %%v24, %%v24 \n\t" + "vflpdb %%v25, %%v25 \n\t" + "vflpdb %%v26, %%v26 \n\t" + "vflpdb %%v27, %%v27 \n\t" + "vflpdb %%v28, %%v28 \n\t" + "vflpdb %%v29, %%v29 \n\t" + "vflpdb %%v30, %%v30 \n\t" + "vflpdb %%v31, %%v31 \n\t" + "vfchdb %%v16,%%v25,%%v24 \n\t " + "vfchdb %%v17,%%v27,%%v26 \n\t " + "vsel %%v1,%%v21,%%v20,%%v16 \n\t" + "vsel %%v0,%%v25,%%v24,%%v16 \n\t" + "vsel %%v2,%%v23,%%v22,%%v17 \n\t" + "vsel %%v3,%%v27,%%v26,%%v17 \n\t" + "vfchdb %%v16,%%v29,%%v28 \n\t " + "vfchdb %%v17,%%v31,%%v30 \n\t" + "vsel %%v24,%%v21,%%v20,%%v16 \n\t" + "vsel %%v25,%%v29,%%v28,%%v16 \n\t" + "vsel %%v26,%%v23,%%v22,%%v17 \n\t" + "vsel %%v27,%%v31,%%v30,%%v17 \n\t" + + "vfchdb %%v28, %%v3,%%v0 \n\t" + "vfchdb %%v29,%%v27, %%v25 \n\t" + "vsel %%v1,%%v2,%%v1,%%v28 \n\t" + "vsel %%v0,%%v3,%%v0,%%v28 \n\t" + "vsel %%v24,%%v26,%%v24,%%v29 \n\t" + "vsel %%v25,%%v27,%%v25,%%v29 \n\t" + "vag %%v1,%%v1,%%v5 \n\t" + "vag %%v24,%%v24,%%v5 \n\t" + "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" + "vag %%v24,%%v24,%%v4 \n\t" + "vfchdb %%v16,%%v25 , %%v0 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vsel %%v29,%%v25,%%v0,%%v16 \n\t" + "vsel %%v28,%%v24,%%v1,%%v16 \n\t" + "vfchdb %%v17, %%v29,%%v18 \n\t" + "vsel %%v19,%%v28,%%v19,%%v17 \n\t" + "vsel %%v18,%%v29,%%v18,%%v17 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "clgrjl %[ptr_tmp],%%r0,1b \n\t" + + "vrepg %%v26,%%v18,1 \n\t" + "vrepg %%v5,%%v19,1 \n\t" + "wfcdb %%v26,%%v18 \n\t" + "jne 2f \n\t" + "vsteg %%v18,%[maxf],0 \n\t" + "vmnlg %%v1,%%v5,%%v19 \n\t" + "j 3f \n\t" + + "2: \n\t" + "wfchdb %%v16,%%v26,%%v18 \n\t" + "vsel %%v1,%%v5,%%v19,%%v16 \n\t" + "vsel %%v0,%%v26,%%v18,%%v16 \n\t" + "std %%f0,%[maxf] \n\t" + + "3: \n\t" + "vlgvg %[index],%%v1,0 \n\t" + : [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x) + : [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x) + : "cc", "r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", + "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + return index; - max = idamax_kernel_32(n1, x, &maxf); +} - i = n1; - } else { - maxf = ABS(x[0]); - i++; - } - while (i < n) { - if (ABS(x[i]) > maxf) { - max = i; - maxf = ABS(x[i]); - } - i++; - } - return (max + 1); + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + BLASLONG ix = 0; + FLOAT maxf = 0.0; + BLASLONG max = 0; - } else { + if (n <= 0 || inc_x <= 0) return (max); - max = 0; - maxf = ABS(x[0]); + if (inc_x == 1) { - BLASLONG n1 = n & -4; - while (j < n1) { + BLASLONG n1 = n & -32; + if (n1 > 0) { - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - max = j + 1; - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - max = j + 2; - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - max = j + 3; - maxf = ABS(x[i + 3 * inc_x]); - } + max = diamax_kernel_32_TUNED(n1, x, &maxf); - i += inc_x * 4; + i = n1; + } - j += 4; + while (i < n) { + if (ABS(x[i]) > maxf) { + max = i; + maxf = ABS(x[i]); + } + i++; + } + return (max + 1); - } + } else { - while (j < n) { - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - i += inc_x; - j++; + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) > maxf) { + max = j + 1; + maxf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) > maxf) { + max = j + 2; + maxf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) > maxf) { + max = j + 3; + maxf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } + + + while (j < n) { + if (ABS(x[i]) > maxf) { + max = j; + maxf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (max + 1); } - return (max + 1); - } } diff --git a/kernel/zarch/idamin.c b/kernel/zarch/idamin.c index f9a8119e1..8a7ff1659 100644 --- a/kernel/zarch/idamin.c +++ b/kernel/zarch/idamin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project +Copyright (c) 2013-2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,223 +23,241 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - + *****************************************************************************/ #include "common.h" #include +#if defined(DOUBLE) + #define ABS fabs -static BLASLONG idamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *amin) { - BLASLONG iamin; - - __asm__("vl %%v0,0(%[x])\n\t" - "vflpdb %%v0,%%v0\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,1,1\n\t" - "vrepig %%v2,16\n\t" - "vzero %%v3\n\t" - "vleig %%v24,0,0\n\t" - "vleig %%v24,1,1\n\t" - "vleig %%v25,2,0\n\t" - "vleig %%v25,3,1\n\t" - "vleig %%v26,4,0\n\t" - "vleig %%v26,5,1\n\t" - "vleig %%v27,6,0\n\t" - "vleig %%v27,7,1\n\t" - "vleig %%v28,8,0\n\t" - "vleig %%v28,9,1\n\t" - "vleig %%v29,10,0\n\t" - "vleig %%v29,11,1\n\t" - "vleig %%v30,12,0\n\t" - "vleig %%v30,13,1\n\t" - "vleig %%v31,14,0\n\t" - "vleig %%v31,15,1\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchedb %%v4,%%v17,%%v16\n\t" - "vfchedb %%v5,%%v19,%%v18\n\t" - "vfchedb %%v6,%%v21,%%v20\n\t" - "vfchedb %%v7,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v17,%%v16\n\t" - "vfchedb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfchedb %%v4,%%v17,%%v16\n\t" - "vfchedb %%v5,%%v19,%%v18\n\t" - "vfchedb %%v6,%%v21,%%v20\n\t" - "vfchedb %%v7,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v17,%%v16\n\t" - "vfchedb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v2,%%v0,1\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcdb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vsteg %%v0,%[amin],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamin],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchdb %%v4,%%v0,%%v2\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "std %%f0,%[amin]\n\t" - "vlgvg %[iamin],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); - - return iamin; -} +#else -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - BLASLONG min = 0; +#define ABS fabsf - if (n <= 0 || inc_x <= 0) - return (min); +#endif - if (inc_x == 1) { +/** + * Find minimum index + * Warning: requirements n>0 and n % 32 == 0 + * @param n + * @param x pointer to the vector + * @param minf (out) minimum absolute value .( only for output ) + * @return minimum index + */ +static BLASLONG diamin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *minf) { + BLASLONG index; + __asm__( + "pfd 1, 0(%[ptr_x]) \n\t" + "sllg %%r0,%[n],3 \n\t" + "agr %%r0,%[ptr_x] \n\t" + "vleig %%v20,0,0 \n\t" + "vleig %%v20,1,1 \n\t" + "vleig %%v21,2,0 \n\t" + "vleig %%v21,3,1 \n\t" + "vleig %%v22,4,0 \n\t" + "vleig %%v22,5,1 \n\t" + "vleig %%v23,6,0 \n\t" + "vleig %%v23,7,1 \n\t" + "vrepig %%v4,8 \n\t" + "vlrepg %%v18,0(%[ptr_x]) \n\t" + "vzero %%v5 \n\t" + "vflpdb %%v18, %%v18 \n\t" + "vzero %%v19 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 1, 256(%[ptr_tmp] ) \n\t" + "vlm %%v24,%%v31, 0(%[ptr_tmp] ) \n\t" - BLASLONG n1 = n & -32; - if (n1 > 0) { + "vflpdb %%v24, %%v24 \n\t" + "vflpdb %%v25, %%v25 \n\t" + "vflpdb %%v26, %%v26 \n\t" + "vflpdb %%v27, %%v27 \n\t" + "vflpdb %%v28, %%v28 \n\t" + "vflpdb %%v29, %%v29 \n\t" + "vflpdb %%v30, %%v30 \n\t" + "vflpdb %%v31, %%v31 \n\t" - min = idamin_kernel_32(n1, x, &minf); + "vfchdb %%v16,%%v24,%%v25 \n\t " + "vfchdb %%v17,%%v26 ,%%v27 \n\t " + "vsel %%v1,%%v21,%%v20,%%v16 \n\t" + "vsel %%v0,%%v25,%%v24,%%v16 \n\t" + "vsel %%v2,%%v23,%%v22,%%v17 \n\t" + "vsel %%v3,%%v27,%%v26,%%v17 \n\t" + "vfchdb %%v16,%%v28, %%v29 \n\t " + "vfchdb %%v17,%%v30,%%v31 \n\t" + "vsel %%v24,%%v21,%%v20,%%v16 \n\t" + "vsel %%v25,%%v29,%%v28,%%v16 \n\t" + "vsel %%v26,%%v23,%%v22,%%v17 \n\t" + "vsel %%v27,%%v31,%%v30,%%v17 \n\t" - i = n1; - } else { - minf = ABS(x[0]); - i++; - } - while (i < n) { - if (ABS(x[i]) < minf) { - min = i; - minf = ABS(x[i]); - } - i++; - } - return (min + 1); + "vfchdb %%v28,%%v0 , %%v3 \n\t" + "vfchdb %%v29, %%v25,%%v27 \n\t" + "vsel %%v1,%%v2,%%v1,%%v28 \n\t" + "vsel %%v0,%%v3,%%v0,%%v28 \n\t" + "vsel %%v24,%%v26,%%v24,%%v29 \n\t" + "vsel %%v25,%%v27,%%v25,%%v29 \n\t" - } else { + "vag %%v1,%%v1,%%v5 \n\t" + "vag %%v24,%%v24,%%v5 \n\t" + "vag %%v24,%%v24,%%v4 \n\t" - min = 0; - minf = ABS(x[0]); + "vfchdb %%v16, %%v0,%%v25 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vsel %%v29,%%v25,%%v0,%%v16 \n\t" + "vsel %%v28,%%v24,%%v1,%%v16 \n\t" - BLASLONG n1 = n & -4; - while (j < n1) { + "vfchdb %%v17,%%v18, %%v29 \n\t" + "vsel %%v19,%%v28,%%v19,%%v17 \n\t" + "vsel %%v18,%%v29,%%v18,%%v17 \n\t" - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - min = j + 1; - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - min = j + 2; - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - min = j + 3; - minf = ABS(x[i + 3 * inc_x]); - } + "vag %%v5,%%v5,%%v4 \n\t" - i += inc_x * 4; + "vlm %%v24,%%v31,128(%[ptr_tmp] ) \n\t" + "vflpdb %%v24, %%v24 \n\t" + "vflpdb %%v25, %%v25 \n\t" + "vflpdb %%v26, %%v26 \n\t" + "vflpdb %%v27, %%v27 \n\t" + "vflpdb %%v28, %%v28 \n\t" + "vflpdb %%v29, %%v29 \n\t" + "vflpdb %%v30, %%v30 \n\t" + "vflpdb %%v31, %%v31 \n\t" - j += 4; + "vfchdb %%v16,%%v24,%%v25 \n\t" + "vfchdb %%v17,%%v26 ,%%v27 \n\t" + "vsel %%v1,%%v21,%%v20,%%v16 \n\t" + "vsel %%v0,%%v25,%%v24,%%v16 \n\t" + "vsel %%v2,%%v23,%%v22,%%v17 \n\t" + "vsel %%v3,%%v27,%%v26,%%v17 \n\t" + "vfchdb %%v16,%%v28 ,%%v29 \n\t" + "vfchdb %%v17,%%v30,%%v31 \n\t" + "vsel %%v24,%%v21,%%v20,%%v16 \n\t" + "vsel %%v25,%%v29,%%v28,%%v16 \n\t" + "vsel %%v26,%%v23,%%v22,%%v17 \n\t" + "vsel %%v27,%%v31,%%v30,%%v17 \n\t" + + + "vfchdb %%v28,%%v0 , %%v3 \n\t" + "vfchdb %%v29, %%v25,%%v27 \n\t" + "vsel %%v1,%%v2,%%v1,%%v28 \n\t" + "vsel %%v0,%%v3,%%v0,%%v28 \n\t" + "vsel %%v24,%%v26,%%v24,%%v29 \n\t" + "vsel %%v25,%%v27,%%v25,%%v29 \n\t" + + "vag %%v1,%%v1,%%v5 \n\t" + "vag %%v24,%%v24,%%v5 \n\t" + "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" + "vag %%v24,%%v24,%%v4 \n\t" + + "vfchdb %%v16, %%v0,%%v25 \n\t" + "vag %%v5,%%v5,%%v4 \n\t" + "vsel %%v29,%%v25,%%v0,%%v16 \n\t" + "vsel %%v28,%%v24,%%v1,%%v16 \n\t" + + "vfchdb %%v17,%%v18, %%v29 \n\t" + "vsel %%v19,%%v28,%%v19,%%v17 \n\t" + "vsel %%v18,%%v29,%%v18,%%v17 \n\t" + + "vag %%v5,%%v5,%%v4 \n\t" + + "clgrjl %[ptr_tmp],%%r0,1b \n\t" + + + "vrepg %%v26,%%v18,1 \n\t" + "vrepg %%v5,%%v19,1 \n\t" + "wfcdb %%v26,%%v18 \n\t" + "jne 2f \n\t" + "vsteg %%v18,%[minf],0 \n\t" + "vmnlg %%v1,%%v5,%%v19 \n\t" + "j 3f \n\t" + + "2: \n\t" + "wfchdb %%v16,%%v18 ,%%v26 \n\t " + "vsel %%v1,%%v5,%%v19,%%v16 \n\t" + "vsel %%v0,%%v26,%%v18,%%v16 \n\t" + "std %%f0,%[minf] \n\t" + + "3: \n\t" + "vlgvg %[index],%%v1,0 \n\t" + + : [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x) + : [mem] "m"( *(const double (*)[n])x), [n] "r"(n), [ptr_x] "r"(x) + : "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", + "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + + ); + + return index; + +} + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { + BLASLONG i = 0; + BLASLONG j = 0; + BLASLONG ix = 0; + BLASLONG min = 0; + FLOAT minf = 0.0; + + if (n <= 0 || inc_x <= 0) return (min); + minf = ABS(x[0]); //index's not incremented,though it will make first comparision redundant + if (inc_x == 1) { + + BLASLONG n1 = n & -32; + if (n1 > 0) { + + min = diamin_kernel_32(n1, x, &minf); + i = n1; + } + + while (i < n) { + if (ABS(x[i]) < minf) { + min = i; + minf = ABS(x[i]); + } + i++; + } + return (min + 1); + + } else { + + BLASLONG n1 = n & -4; + while (j < n1) { + + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + if (ABS(x[i + inc_x]) < minf) { + min = j + 1; + minf = ABS(x[i + inc_x]); + } + if (ABS(x[i + 2 * inc_x]) < minf) { + min = j + 2; + minf = ABS(x[i + 2 * inc_x]); + } + if (ABS(x[i + 3 * inc_x]) < minf) { + min = j + 3; + minf = ABS(x[i + 3 * inc_x]); + } + + i += inc_x * 4; + + j += 4; + + } - } - while (j < n) { - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - i += inc_x; - j++; + while (j < n) { + if (ABS(x[i]) < minf) { + min = j; + minf = ABS(x[i]); + } + i += inc_x; + j++; + } + return (min + 1); } - return (min + 1); - } } diff --git a/kernel/zarch/idmax.c b/kernel/zarch/idmax.c deleted file mode 100644 index 8f283bc17..000000000 --- a/kernel/zarch/idmax.c +++ /dev/null @@ -1,225 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -static BLASLONG idmax_kernel_32(BLASLONG n, FLOAT *x, FLOAT *max) { - BLASLONG imax; - - __asm__("vl %%v0,0(%[x])\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,1,1\n\t" - "vrepig %%v2,16\n\t" - "vzero %%v3\n\t" - "vleig %%v24,0,0\n\t" - "vleig %%v24,1,1\n\t" - "vleig %%v25,2,0\n\t" - "vleig %%v25,3,1\n\t" - "vleig %%v26,4,0\n\t" - "vleig %%v26,5,1\n\t" - "vleig %%v27,6,0\n\t" - "vleig %%v27,7,1\n\t" - "vleig %%v28,8,0\n\t" - "vleig %%v28,9,1\n\t" - "vleig %%v29,10,0\n\t" - "vleig %%v29,11,1\n\t" - "vleig %%v30,12,0\n\t" - "vleig %%v30,13,1\n\t" - "vleig %%v31,14,0\n\t" - "vleig %%v31,15,1\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfchedb %%v4,%%v16,%%v17\n\t" - "vfchedb %%v5,%%v18,%%v19\n\t" - "vfchedb %%v6,%%v20,%%v21\n\t" - "vfchedb %%v7,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v16,%%v17\n\t" - "vfchedb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vfchedb %%v4,%%v16,%%v17\n\t" - "vfchedb %%v5,%%v18,%%v19\n\t" - "vfchedb %%v6,%%v20,%%v21\n\t" - "vfchedb %%v7,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v16,%%v17\n\t" - "vfchedb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v2,%%v0,1\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcdb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vsteg %%v0,%[max],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[imax],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchdb %%v4,%%v2,%%v0\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "std %%f0,%[max]\n\t" - "vlgvg %[imax],%%v1,0\n\t" - "2:\n\t" - "nop" - : [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); - - return imax; -} - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; - - if (n <= 0 || inc_x <= 0) - return (max); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - - max = idmax_kernel_32(n1, x, &maxf); - - i = n1; - } else { - maxf = x[0]; - i++; - } - - while (i < n) { - if (x[i] > maxf) { - max = i; - maxf = x[i]; - } - i++; - } - return (max + 1); - - } else { - - max = 0; - maxf = x[0]; - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (x[i] > maxf) { - max = j; - maxf = x[i]; - } - if (x[i + inc_x] > maxf) { - max = j + 1; - maxf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] > maxf) { - max = j + 2; - maxf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] > maxf) { - max = j + 3; - maxf = x[i + 3 * inc_x]; - } - - i += inc_x * 4; - - j += 4; - - } - - while (j < n) { - if (x[i] > maxf) { - max = j; - maxf = x[i]; - } - i += inc_x; - j++; - } - return (max + 1); - } -} diff --git a/kernel/zarch/idmin.c b/kernel/zarch/idmin.c deleted file mode 100644 index e4b7bb4fe..000000000 --- a/kernel/zarch/idmin.c +++ /dev/null @@ -1,225 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -static BLASLONG idmin_kernel_32(BLASLONG n, FLOAT *x, FLOAT *min) { - BLASLONG imin; - - __asm__("vl %%v0,0(%[x])\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,1,1\n\t" - "vrepig %%v2,16\n\t" - "vzero %%v3\n\t" - "vleig %%v24,0,0\n\t" - "vleig %%v24,1,1\n\t" - "vleig %%v25,2,0\n\t" - "vleig %%v25,3,1\n\t" - "vleig %%v26,4,0\n\t" - "vleig %%v26,5,1\n\t" - "vleig %%v27,6,0\n\t" - "vleig %%v27,7,1\n\t" - "vleig %%v28,8,0\n\t" - "vleig %%v28,9,1\n\t" - "vleig %%v29,10,0\n\t" - "vleig %%v29,11,1\n\t" - "vleig %%v30,12,0\n\t" - "vleig %%v30,13,1\n\t" - "vleig %%v31,14,0\n\t" - "vleig %%v31,15,1\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfchedb %%v4,%%v17,%%v16\n\t" - "vfchedb %%v5,%%v19,%%v18\n\t" - "vfchedb %%v6,%%v21,%%v20\n\t" - "vfchedb %%v7,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v17,%%v16\n\t" - "vfchedb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vfchedb %%v4,%%v17,%%v16\n\t" - "vfchedb %%v5,%%v19,%%v18\n\t" - "vfchedb %%v6,%%v21,%%v20\n\t" - "vfchedb %%v7,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vsel %%v18,%%v20,%%v21,%%v6\n\t" - "vsel %%v6,%%v28,%%v29,%%v6\n\t" - "vsel %%v19,%%v22,%%v23,%%v7\n\t" - "vsel %%v7,%%v30,%%v31,%%v7\n\t" - "vfchedb %%v20,%%v17,%%v16\n\t" - "vfchedb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v4,%%v4,%%v5,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v5,%%v6,%%v7,%%v21\n\t" - "vfchedb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v2,%%v0,1\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcdb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vsteg %%v0,%[min],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[imin],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchdb %%v4,%%v0,%%v2\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "std %%f0,%[min]\n\t" - "vlgvg %[imin],%%v1,0\n\t" - "2:\n\t" - "nop" - : [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); - - return imin; -} - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - BLASLONG min = 0; - - if (n <= 0 || inc_x <= 0) - return (min); - - if (inc_x == 1) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - - min = idmin_kernel_32(n1, x, &minf); - - i = n1; - } else { - minf = x[0]; - i++; - } - - while (i < n) { - if (x[i] < minf) { - min = i; - minf = x[i]; - } - i++; - } - return (min + 1); - - } else { - - min = 0; - minf = x[0]; - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (x[i] < minf) { - min = j; - minf = x[i]; - } - if (x[i + inc_x] < minf) { - min = j + 1; - minf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] < minf) { - min = j + 2; - minf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] < minf) { - min = j + 3; - minf = x[i + 3 * inc_x]; - } - - i += inc_x * 4; - - j += 4; - - } - - while (j < n) { - if (x[i] < minf) { - min = j; - minf = x[i]; - } - i += inc_x; - j++; - } - return (min + 1); - } -} diff --git a/kernel/zarch/isamax.c b/kernel/zarch/isamax.c deleted file mode 100644 index ac86435d7..000000000 --- a/kernel/zarch/isamax.c +++ /dev/null @@ -1,289 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -#define ABS fabsf - -static BLASLONG isamax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amax) { - BLASLONG iamax; - - __asm__("vl %%v0,0(%[x])\n\t" - "vflpsb %%v0,%%v0\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,2,1\n\t" - "vleig %%v2,1,0\n\t" - "vleig %%v2,3,1\n\t" - "vrepig %%v3,32\n\t" - "vzero %%v4\n\t" - "vleif %%v24,0,0\n\t" - "vleif %%v24,1,1\n\t" - "vleif %%v24,2,2\n\t" - "vleif %%v24,3,3\n\t" - "vleif %%v25,4,0\n\t" - "vleif %%v25,5,1\n\t" - "vleif %%v25,6,2\n\t" - "vleif %%v25,7,3\n\t" - "vleif %%v26,8,0\n\t" - "vleif %%v26,9,1\n\t" - "vleif %%v26,10,2\n\t" - "vleif %%v26,11,3\n\t" - "vleif %%v27,12,0\n\t" - "vleif %%v27,13,1\n\t" - "vleif %%v27,14,2\n\t" - "vleif %%v27,15,3\n\t" - "vleif %%v28,16,0\n\t" - "vleif %%v28,17,1\n\t" - "vleif %%v28,18,2\n\t" - "vleif %%v28,19,3\n\t" - "vleif %%v29,20,0\n\t" - "vleif %%v29,21,1\n\t" - "vleif %%v29,22,2\n\t" - "vleif %%v29,23,3\n\t" - "vleif %%v30,24,0\n\t" - "vleif %%v30,25,1\n\t" - "vleif %%v30,26,2\n\t" - "vleif %%v30,27,3\n\t" - "vleif %%v31,28,0\n\t" - "vleif %%v31,29,1\n\t" - "vleif %%v31,30,2\n\t" - "vleif %%v31,31,3\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfchesb %%v5,%%v16,%%v17\n\t" - "vfchesb %%v6,%%v18,%%v19\n\t" - "vfchesb %%v7,%%v20,%%v21\n\t" - "vfchesb %%v8,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v16,%%v17\n\t" - "vfchesb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfchesb %%v5,%%v16,%%v17\n\t" - "vfchesb %%v6,%%v18,%%v19\n\t" - "vfchesb %%v7,%%v20,%%v21\n\t" - "vfchesb %%v8,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v16,%%v17\n\t" - "vfchesb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v3,%%v0,32\n\t" - "vfchsb %%v4,%%v0,%%v3\n\t" - "vchlg %%v5,%%v2,%%v1\n\t" - "vfcesb %%v6,%%v0,%%v3\n\t" - "vn %%v5,%%v5,%%v6\n\t" - "vo %%v4,%%v4,%%v5\n\t" - "vsel %%v0,%%v0,%%v3,%%v4\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v1,%%v2,%%v4\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcsb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vstef %%v0,%[amax],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamax],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchsb %%v4,%%v2,%%v0\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "ste %%f0,%[amax]\n\t" - "vlgvg %[iamax],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); - - return iamax; -} - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; - - if (n <= 0 || inc_x <= 0) - return (max); - - if (inc_x == 1) { - - BLASLONG n1 = n & -64; - if (n1 > 0) { - - max = isamax_kernel_64(n1, x, &maxf); - - i = n1; - } else { - maxf = ABS(x[0]); - i++; - } - - while (i < n) { - if (ABS(x[i]) > maxf) { - max = i; - maxf = ABS(x[i]); - } - i++; - } - return (max + 1); - - } else { - - max = 0; - maxf = ABS(x[0]); - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - max = j + 1; - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - max = j + 2; - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - max = j + 3; - maxf = ABS(x[i + 3 * inc_x]); - } - - i += inc_x * 4; - - j += 4; - - } - - while (j < n) { - if (ABS(x[i]) > maxf) { - max = j; - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (max + 1); - } -} diff --git a/kernel/zarch/isamin.c b/kernel/zarch/isamin.c deleted file mode 100644 index 3f2d039eb..000000000 --- a/kernel/zarch/isamin.c +++ /dev/null @@ -1,289 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -#define ABS fabsf - -static BLASLONG isamin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *amin) { - BLASLONG iamin; - - __asm__("vl %%v0,0(%[x])\n\t" - "vflpsb %%v0,%%v0\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,2,1\n\t" - "vleig %%v2,1,0\n\t" - "vleig %%v2,3,1\n\t" - "vrepig %%v3,32\n\t" - "vzero %%v4\n\t" - "vleif %%v24,0,0\n\t" - "vleif %%v24,1,1\n\t" - "vleif %%v24,2,2\n\t" - "vleif %%v24,3,3\n\t" - "vleif %%v25,4,0\n\t" - "vleif %%v25,5,1\n\t" - "vleif %%v25,6,2\n\t" - "vleif %%v25,7,3\n\t" - "vleif %%v26,8,0\n\t" - "vleif %%v26,9,1\n\t" - "vleif %%v26,10,2\n\t" - "vleif %%v26,11,3\n\t" - "vleif %%v27,12,0\n\t" - "vleif %%v27,13,1\n\t" - "vleif %%v27,14,2\n\t" - "vleif %%v27,15,3\n\t" - "vleif %%v28,16,0\n\t" - "vleif %%v28,17,1\n\t" - "vleif %%v28,18,2\n\t" - "vleif %%v28,19,3\n\t" - "vleif %%v29,20,0\n\t" - "vleif %%v29,21,1\n\t" - "vleif %%v29,22,2\n\t" - "vleif %%v29,23,3\n\t" - "vleif %%v30,24,0\n\t" - "vleif %%v30,25,1\n\t" - "vleif %%v30,26,2\n\t" - "vleif %%v30,27,3\n\t" - "vleif %%v31,28,0\n\t" - "vleif %%v31,29,1\n\t" - "vleif %%v31,30,2\n\t" - "vleif %%v31,31,3\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfchesb %%v5,%%v17,%%v16\n\t" - "vfchesb %%v6,%%v19,%%v18\n\t" - "vfchesb %%v7,%%v21,%%v20\n\t" - "vfchesb %%v8,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v17,%%v16\n\t" - "vfchesb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfchesb %%v5,%%v17,%%v16\n\t" - "vfchesb %%v6,%%v19,%%v18\n\t" - "vfchesb %%v7,%%v21,%%v20\n\t" - "vfchesb %%v8,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v17,%%v16\n\t" - "vfchesb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v3,%%v0,32\n\t" - "vfchsb %%v4,%%v3,%%v0\n\t" - "vchlg %%v5,%%v2,%%v1\n\t" - "vfcesb %%v6,%%v0,%%v3\n\t" - "vn %%v5,%%v5,%%v6\n\t" - "vo %%v4,%%v4,%%v5\n\t" - "vsel %%v0,%%v0,%%v3,%%v4\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v1,%%v2,%%v4\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcsb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vstef %%v0,%[amin],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamin],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchsb %%v4,%%v0,%%v2\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "ste %%f0,%[amin]\n\t" - "vlgvg %[iamin],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); - - return iamin; -} - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - BLASLONG min = 0; - - if (n <= 0 || inc_x <= 0) - return (min); - - if (inc_x == 1) { - - BLASLONG n1 = n & -64; - if (n1 > 0) { - - min = isamin_kernel_64(n1, x, &minf); - - i = n1; - } else { - minf = ABS(x[0]); - i++; - } - - while (i < n) { - if (ABS(x[i]) < minf) { - min = i; - minf = ABS(x[i]); - } - i++; - } - return (min + 1); - - } else { - - min = 0; - minf = ABS(x[0]); - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - min = j + 1; - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - min = j + 2; - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - min = j + 3; - minf = ABS(x[i + 3 * inc_x]); - } - - i += inc_x * 4; - - j += 4; - - } - - while (j < n) { - if (ABS(x[i]) < minf) { - min = j; - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (min + 1); - } -} diff --git a/kernel/zarch/ismax.c b/kernel/zarch/ismax.c deleted file mode 100644 index 41172c1bd..000000000 --- a/kernel/zarch/ismax.c +++ /dev/null @@ -1,269 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -static BLASLONG ismax_kernel_64(BLASLONG n, FLOAT *x, FLOAT *max) { - BLASLONG imax; - - __asm__("vl %%v0,0(%[x])\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,2,1\n\t" - "vleig %%v2,1,0\n\t" - "vleig %%v2,3,1\n\t" - "vrepig %%v3,32\n\t" - "vzero %%v4\n\t" - "vleif %%v24,0,0\n\t" - "vleif %%v24,1,1\n\t" - "vleif %%v24,2,2\n\t" - "vleif %%v24,3,3\n\t" - "vleif %%v25,4,0\n\t" - "vleif %%v25,5,1\n\t" - "vleif %%v25,6,2\n\t" - "vleif %%v25,7,3\n\t" - "vleif %%v26,8,0\n\t" - "vleif %%v26,9,1\n\t" - "vleif %%v26,10,2\n\t" - "vleif %%v26,11,3\n\t" - "vleif %%v27,12,0\n\t" - "vleif %%v27,13,1\n\t" - "vleif %%v27,14,2\n\t" - "vleif %%v27,15,3\n\t" - "vleif %%v28,16,0\n\t" - "vleif %%v28,17,1\n\t" - "vleif %%v28,18,2\n\t" - "vleif %%v28,19,3\n\t" - "vleif %%v29,20,0\n\t" - "vleif %%v29,21,1\n\t" - "vleif %%v29,22,2\n\t" - "vleif %%v29,23,3\n\t" - "vleif %%v30,24,0\n\t" - "vleif %%v30,25,1\n\t" - "vleif %%v30,26,2\n\t" - "vleif %%v30,27,3\n\t" - "vleif %%v31,28,0\n\t" - "vleif %%v31,29,1\n\t" - "vleif %%v31,30,2\n\t" - "vleif %%v31,31,3\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfchesb %%v5,%%v16,%%v17\n\t" - "vfchesb %%v6,%%v18,%%v19\n\t" - "vfchesb %%v7,%%v20,%%v21\n\t" - "vfchesb %%v8,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v16,%%v17\n\t" - "vfchesb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vfchesb %%v5,%%v16,%%v17\n\t" - "vfchesb %%v6,%%v18,%%v19\n\t" - "vfchesb %%v7,%%v20,%%v21\n\t" - "vfchesb %%v8,%%v22,%%v23\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v16,%%v17\n\t" - "vfchesb %%v21,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v3,%%v0,32\n\t" - "vfchsb %%v4,%%v0,%%v3\n\t" - "vchlg %%v5,%%v2,%%v1\n\t" - "vfcesb %%v6,%%v0,%%v3\n\t" - "vn %%v5,%%v5,%%v6\n\t" - "vo %%v4,%%v4,%%v5\n\t" - "vsel %%v0,%%v0,%%v3,%%v4\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v1,%%v2,%%v4\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcsb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vstef %%v0,%[max],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[imax],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchsb %%v4,%%v2,%%v0\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "ste %%f0,%[max]\n\t" - "vlgvg %[imax],%%v1,0\n\t" - "2:\n\t" - "nop" - : [imax] "=r"(imax),[max] "=Q"(*max),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); - - return imax; -} - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - BLASLONG max = 0; - - if (n <= 0 || inc_x <= 0) - return (max); - - if (inc_x == 1) { - - BLASLONG n1 = n & -64; - if (n1 > 0) { - - max = ismax_kernel_64(n1, x, &maxf); - - i = n1; - } else { - maxf = x[0]; - i++; - } - - while (i < n) { - if (x[i] > maxf) { - max = i; - maxf = x[i]; - } - i++; - } - return (max + 1); - - } else { - - max = 0; - maxf = x[0]; - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (x[i] > maxf) { - max = j; - maxf = x[i]; - } - if (x[i + inc_x] > maxf) { - max = j + 1; - maxf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] > maxf) { - max = j + 2; - maxf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] > maxf) { - max = j + 3; - maxf = x[i + 3 * inc_x]; - } - - i += inc_x * 4; - - j += 4; - - } - - while (j < n) { - if (x[i] > maxf) { - max = j; - maxf = x[i]; - } - i += inc_x; - j++; - } - return (max + 1); - } -} diff --git a/kernel/zarch/ismin.c b/kernel/zarch/ismin.c deleted file mode 100644 index e2684df41..000000000 --- a/kernel/zarch/ismin.c +++ /dev/null @@ -1,269 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -static BLASLONG ismin_kernel_64(BLASLONG n, FLOAT *x, FLOAT *min) { - BLASLONG imin; - - __asm__("vl %%v0,0(%[x])\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,2,1\n\t" - "vleig %%v2,1,0\n\t" - "vleig %%v2,3,1\n\t" - "vrepig %%v3,32\n\t" - "vzero %%v4\n\t" - "vleif %%v24,0,0\n\t" - "vleif %%v24,1,1\n\t" - "vleif %%v24,2,2\n\t" - "vleif %%v24,3,3\n\t" - "vleif %%v25,4,0\n\t" - "vleif %%v25,5,1\n\t" - "vleif %%v25,6,2\n\t" - "vleif %%v25,7,3\n\t" - "vleif %%v26,8,0\n\t" - "vleif %%v26,9,1\n\t" - "vleif %%v26,10,2\n\t" - "vleif %%v26,11,3\n\t" - "vleif %%v27,12,0\n\t" - "vleif %%v27,13,1\n\t" - "vleif %%v27,14,2\n\t" - "vleif %%v27,15,3\n\t" - "vleif %%v28,16,0\n\t" - "vleif %%v28,17,1\n\t" - "vleif %%v28,18,2\n\t" - "vleif %%v28,19,3\n\t" - "vleif %%v29,20,0\n\t" - "vleif %%v29,21,1\n\t" - "vleif %%v29,22,2\n\t" - "vleif %%v29,23,3\n\t" - "vleif %%v30,24,0\n\t" - "vleif %%v30,25,1\n\t" - "vleif %%v30,26,2\n\t" - "vleif %%v30,27,3\n\t" - "vleif %%v31,28,0\n\t" - "vleif %%v31,29,1\n\t" - "vleif %%v31,30,2\n\t" - "vleif %%v31,31,3\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfchesb %%v5,%%v17,%%v16\n\t" - "vfchesb %%v6,%%v19,%%v18\n\t" - "vfchesb %%v7,%%v21,%%v20\n\t" - "vfchesb %%v8,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v17,%%v16\n\t" - "vfchesb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,192(%%r1,%[x])\n\t" - "vl %%v21,208(%%r1,%[x])\n\t" - "vl %%v22,224(%%r1,%[x])\n\t" - "vl %%v23,240(%%r1,%[x])\n\t" - "vfchesb %%v5,%%v17,%%v16\n\t" - "vfchesb %%v6,%%v19,%%v18\n\t" - "vfchesb %%v7,%%v21,%%v20\n\t" - "vfchesb %%v8,%%v23,%%v22\n\t" - "vsel %%v16,%%v16,%%v17,%%v5\n\t" - "vsel %%v5,%%v24,%%v25,%%v5\n\t" - "vsel %%v17,%%v18,%%v19,%%v6\n\t" - "vsel %%v6,%%v26,%%v27,%%v6\n\t" - "vsel %%v18,%%v20,%%v21,%%v7\n\t" - "vsel %%v7,%%v28,%%v29,%%v7\n\t" - "vsel %%v19,%%v22,%%v23,%%v8\n\t" - "vsel %%v8,%%v30,%%v31,%%v8\n\t" - "vfchesb %%v20,%%v17,%%v16\n\t" - "vfchesb %%v21,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v20\n\t" - "vsel %%v5,%%v5,%%v6,%%v20\n\t" - "vsel %%v17,%%v18,%%v19,%%v21\n\t" - "vsel %%v6,%%v7,%%v8,%%v21\n\t" - "vfchesb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v5,%%v5,%%v6,%%v18\n\t" - "vsegf %%v6,%%v5\n\t" - "vesrlg %%v5,%%v5,32\n\t" - "vag %%v5,%%v5,%%v4\n\t" - "vag %%v6,%%v6,%%v4\n\t" - "vfchesb %%v7,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v7\n\t" - "vsegf %%v8,%%v7\n\t" - "vesrlg %%v7,%%v7,32\n\t" - "vsegf %%v7,%%v7\n\t" - "vsel %%v1,%%v1,%%v5,%%v7\n\t" - "vsel %%v2,%%v2,%%v6,%%v8\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v3,%%v0,32\n\t" - "vfchsb %%v4,%%v3,%%v0\n\t" - "vchlg %%v5,%%v2,%%v1\n\t" - "vfcesb %%v6,%%v0,%%v3\n\t" - "vn %%v5,%%v5,%%v6\n\t" - "vo %%v4,%%v4,%%v5\n\t" - "vsel %%v0,%%v0,%%v3,%%v4\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v1,%%v2,%%v4\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcsb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vstef %%v0,%[min],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[imin],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchsb %%v4,%%v0,%%v2\n\t" - "vesrlg %%v4,%%v4,32\n\t" - "vsegf %%v4,%%v4\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "ste %%f0,%[min]\n\t" - "vlgvg %[imin],%%v1,0\n\t" - "2:\n\t" - "nop" - : [imin] "=r"(imin),[min] "=Q"(*min),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v4", "v5", "v6", "v7", "v8", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); - - return imin; -} - -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - BLASLONG min = 0; - - if (n <= 0 || inc_x <= 0) - return (min); - - if (inc_x == 1) { - - BLASLONG n1 = n & -64; - if (n1 > 0) { - - min = ismin_kernel_64(n1, x, &minf); - - i = n1; - } else { - minf = x[0]; - i++; - } - - while (i < n) { - if (x[i] < minf) { - min = i; - minf = x[i]; - } - i++; - } - return (min + 1); - - } else { - - min = 0; - minf = x[0]; - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (x[i] < minf) { - min = j; - minf = x[i]; - } - if (x[i + inc_x] < minf) { - min = j + 1; - minf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] < minf) { - min = j + 2; - minf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] < minf) { - min = j + 3; - minf = x[i + 3 * inc_x]; - } - - i += inc_x * 4; - - j += 4; - - } - - while (j < n) { - if (x[i] < minf) { - min = j; - minf = x[i]; - } - i += inc_x; - j++; - } - return (min + 1); - } -} diff --git a/kernel/zarch/izamax.c b/kernel/zarch/izamax.c index daca1d6f7..216c3414a 100644 --- a/kernel/zarch/izamax.c +++ b/kernel/zarch/izamax.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project +Copyright (c) 2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -24,222 +24,243 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ + #include "common.h" #include + +#define ABS fabs +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) -#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) - -static BLASLONG izamax_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amax) { - BLASLONG iamax; - - __asm__("vleg %%v0,0(%[x]),0\n\t" - "vleg %%v1,8(%[x]),0\n\t" - "vleg %%v0,16(%[x]),1\n\t" - "vleg %%v1,24(%[x]),1\n\t" - "vflpdb %%v0,%%v0\n\t" - "vflpdb %%v1,%%v1\n\t" - "vfadb %%v0,%%v0,%%v1\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,1,1\n\t" - "vrepig %%v2,8\n\t" - "vzero %%v3\n\t" - "vleig %%v24,0,0\n\t" - "vleig %%v24,1,1\n\t" - "vleig %%v25,2,0\n\t" - "vleig %%v25,3,1\n\t" - "vleig %%v26,4,0\n\t" - "vleig %%v26,5,1\n\t" - "vleig %%v27,6,0\n\t" - "vleig %%v27,7,1\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vleg %%v16,0(%%r1,%[x]),0\n\t" - "vleg %%v17,8(%%r1,%[x]),0\n\t" - "vleg %%v16,16(%%r1,%[x]),1\n\t" - "vleg %%v17,24(%%r1,%[x]),1\n\t" - "vleg %%v18,32(%%r1,%[x]),0\n\t" - "vleg %%v19,40(%%r1,%[x]),0\n\t" - "vleg %%v18,48(%%r1,%[x]),1\n\t" - "vleg %%v19,56(%%r1,%[x]),1\n\t" - "vleg %%v20,64(%%r1,%[x]),0\n\t" - "vleg %%v21,72(%%r1,%[x]),0\n\t" - "vleg %%v20,80(%%r1,%[x]),1\n\t" - "vleg %%v21,88(%%r1,%[x]),1\n\t" - "vleg %%v22,96(%%r1,%[x]),0\n\t" - "vleg %%v23,104(%%r1,%[x]),0\n\t" - "vleg %%v22,112(%%r1,%[x]),1\n\t" - "vleg %%v23,120(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchedb %%v4,%%v16,%%v17\n\t" - "vfchedb %%v5,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vfchedb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "vleg %%v16,128(%%r1,%[x]),0\n\t" - "vleg %%v17,136(%%r1,%[x]),0\n\t" - "vleg %%v16,144(%%r1,%[x]),1\n\t" - "vleg %%v17,152(%%r1,%[x]),1\n\t" - "vleg %%v18,160(%%r1,%[x]),0\n\t" - "vleg %%v19,168(%%r1,%[x]),0\n\t" - "vleg %%v18,176(%%r1,%[x]),1\n\t" - "vleg %%v19,184(%%r1,%[x]),1\n\t" - "vleg %%v20,192(%%r1,%[x]),0\n\t" - "vleg %%v21,200(%%r1,%[x]),0\n\t" - "vleg %%v20,208(%%r1,%[x]),1\n\t" - "vleg %%v21,216(%%r1,%[x]),1\n\t" - "vleg %%v22,224(%%r1,%[x]),0\n\t" - "vleg %%v23,232(%%r1,%[x]),0\n\t" - "vleg %%v22,240(%%r1,%[x]),1\n\t" - "vleg %%v23,248(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchedb %%v4,%%v16,%%v17\n\t" - "vfchedb %%v5,%%v18,%%v19\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vfchedb %%v18,%%v16,%%v17\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v2,%%v0,1\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcdb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vsteg %%v0,%[amax],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamax],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchdb %%v4,%%v2,%%v0\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "std %%f0,%[amax]\n\t" - "vlgvg %[iamax],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamax] "=r"(iamax),[amax] "=Q"(*amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); - - return iamax; -} -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0; - BLASLONG max = 0; - BLASLONG inc_x2; - if (n <= 0 || inc_x <= 0) - return (max); + +/** + * Find maximum index + * Warning: requirements n>0 and n % 16 == 0 + * @param n + * @param x pointer to the vector + * @param maxf (out) maximum absolute value .( only for output ) + * @return index + */ +static BLASLONG ziamax_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *maxf) { + BLASLONG index; + __asm__( + "pfd 1, 0(%[ptr_x]) \n\t" + "vleig %%v16,0,0 \n\t" + "vleig %%v16,1,1 \n\t" + "vleig %%v17,2,0 \n\t" + "vleig %%v17,3,1 \n\t" + "vleig %%v18,4,0 \n\t" + "vleig %%v18,5,1 \n\t" + "vleig %%v19,6,0 \n\t" + "vleig %%v19,7,1 \n\t" + "vleig %%v20,8,0 \n\t" + "vleig %%v20,9,1 \n\t" + "vleig %%v21,10,0 \n\t" + "vleig %%v21,11,1 \n\t" + "vleig %%v22,12,0 \n\t" + "vleig %%v22,13,1 \n\t" + "vleig %%v23,14,0 \n\t" + "vleig %%v23,15,1 \n\t" + + + "sllg %%r0,%[n],4 \n\t" + "agr %%r0,%[ptr_x] \n\t" + "vzero %%v6 \n\t" + "vzero %%v7 \n\t" + "vrepig %%v4,16 \n\t" + "vzero %%v5 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 1, 256(%[ptr_tmp] ) \n\t" + + "vleg %%v24 , 0(%[ptr_tmp]),0 \n\t" + "vleg %%v25 , 8(%[ptr_tmp]),0 \n\t" + "vleg %%v24 , 16(%[ptr_tmp]),1 \n\t" + "vleg %%v25 , 24(%[ptr_tmp]),1 \n\t" + "vleg %%v26 , 32(%[ptr_tmp]),0 \n\t" + "vleg %%v27 , 40(%[ptr_tmp]),0 \n\t" + "vleg %%v26 , 48(%[ptr_tmp]),1 \n\t" + "vleg %%v27 , 56(%[ptr_tmp]),1 \n\t" + "vleg %%v28 , 64(%[ptr_tmp]),0 \n\t" + "vleg %%v29 , 72(%[ptr_tmp]),0 \n\t" + "vleg %%v28 , 80(%[ptr_tmp]),1 \n\t" + "vleg %%v29 , 88(%[ptr_tmp]),1 \n\t" + "vleg %%v30 , 96(%[ptr_tmp]),0 \n\t" + "vleg %%v31 ,104(%[ptr_tmp]),0 \n\t" + "vleg %%v30 ,112(%[ptr_tmp]),1 \n\t" + "vleg %%v31 ,120(%[ptr_tmp]),1 \n\t" + "vflpdb %%v24, %%v24 \n\t" + "vflpdb %%v25, %%v25 \n\t" + "vflpdb %%v26, %%v26 \n\t" + "vflpdb %%v27, %%v27 \n\t" + "vflpdb %%v28, %%v28 \n\t" + "vflpdb %%v29, %%v29 \n\t" + "vflpdb %%v30, %%v30 \n\t" + "vflpdb %%v31, %%v31 \n\t" + + "vfadb %%v0,%%v24,%%v25 \n\t" + "vfadb %%v1,%%v26,%%v27 \n\t" + "vfadb %%v2,%%v28,%%v29 \n\t" + "vfadb %%v3,%%v30,%%v31 \n\t" + + + "vleg %%v24 , 128(%[ptr_tmp]),0 \n\t" + "vleg %%v25 , 136(%[ptr_tmp]),0 \n\t" + "vleg %%v24 , 144(%[ptr_tmp]),1 \n\t" + "vleg %%v25 , 152(%[ptr_tmp]),1 \n\t" + "vleg %%v26 , 160(%[ptr_tmp]),0 \n\t" + "vleg %%v27 , 168(%[ptr_tmp]),0 \n\t" + "vleg %%v26 , 176(%[ptr_tmp]),1 \n\t" + "vleg %%v27 , 184(%[ptr_tmp]),1 \n\t" + "vleg %%v28 , 192(%[ptr_tmp]),0 \n\t" + "vleg %%v29 , 200(%[ptr_tmp]),0 \n\t" + "vleg %%v28 , 208(%[ptr_tmp]),1 \n\t" + "vleg %%v29 , 216(%[ptr_tmp]),1 \n\t" + "vleg %%v30 , 224(%[ptr_tmp]),0 \n\t" + "vleg %%v31 , 232(%[ptr_tmp]),0 \n\t" + "vleg %%v30 , 240(%[ptr_tmp]),1 \n\t" + "vleg %%v31 , 248(%[ptr_tmp]),1 \n\t" + "vflpdb %%v24, %%v24 \n\t" + "vflpdb %%v25, %%v25 \n\t" + "vflpdb %%v26, %%v26 \n\t" + "vflpdb %%v27, %%v27 \n\t" + "vflpdb %%v28, %%v28 \n\t" + "vflpdb %%v29, %%v29 \n\t" + "vflpdb %%v30, %%v30 \n\t" + "vflpdb %%v31, %%v31 \n\t" + + "vfadb %%v24,%%v24,%%v25 \n\t" + "vfadb %%v26,%%v26,%%v27 \n\t" + "vfadb %%v28,%%v28,%%v29 \n\t" + "vfadb %%v30,%%v30,%%v31 \n\t" + + "vfchdb %%v25,%%v1,%%v0 \n\t" + "vsel %%v29,%%v17,%%v16,%%v25 \n\t" + "vsel %%v31,%%v1,%%v0,%%v25 \n\t" + + "vfchdb %%v27,%%v3,%%v2 \n\t " + "vsel %%v0,%%v19,%%v18,%%v27 \n\t" + "vsel %%v1,%%v3,%%v2,%%v27 \n\t" + + "vfchdb %%v25,%%v26,%%v24 \n\t" + "vsel %%v2,%%v21,%%v20,%%v25 \n\t" + "vsel %%v3,%%v26,%%v24,%%v25 \n\t" + + "vfchdb %%v27,%%v30,%%v28 \n\t" + "vsel %%v25,%%v23,%%v22,%%v27 \n\t" + "vsel %%v27,%%v30,%%v28,%%v27 \n\t" + + "vfchdb %%v24, %%v1,%%v31 \n\t" + "vsel %%v26,%%v0,%%v29,%%v24 \n\t" + "vsel %%v28,%%v1,%%v31,%%v24 \n\t" + + "vfchdb %%v30, %%v27,%%v3 \n\t" + "vsel %%v29,%%v25,%%v2,%%v30 \n\t" + "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t" + + "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" + + "vfchdb %%v0, %%v31,%%v28 \n\t" + "vsel %%v25,%%v29,%%v26,%%v0 \n\t" + "vsel %%v27,%%v31,%%v28,%%v0 \n\t" + + "vag %%v25,%%v25,%%v5 \n\t" + + //cmp with previous + "vfchdb %%v30, %%v27,%%v6 \n\t" + "vsel %%v7,%%v25,%%v7,%%v30 \n\t" + "vsel %%v6,%%v27,%%v6,%%v30 \n\t" + + "vag %%v5,%%v5,%%v4 \n\t" + + "clgrjl %[ptr_tmp],%%r0,1b \n\t" - if (inc_x == 1) { + //xtract index + "vrepg %%v26,%%v6,1 \n\t" + "vrepg %%v5,%%v7,1 \n\t" + "wfcdb %%v26,%%v6 \n\t" + "jne 2f \n\t" + "vsteg %%v6,%[maxf],0 \n\t" + "vmnlg %%v1,%%v5,%%v7 \n\t" + "vlgvg %[index],%%v1,0 \n\t" + "j 3 \n\t" + "2: \n\t" + "wfchdb %%v16,%%v26,%%v6 \n\t" + "vsel %%v1,%%v5,%%v7,%%v16 \n\t" + "vsel %%v0,%%v26,%%v6,%%v16 \n\t" + "vlgvg %[index],%%v1,0 \n\t" + "std %%f0,%[maxf] \n\t" + "3: \n\t" + : [index] "+r"(index) ,[maxf] "=m"(*maxf), [ptr_tmp] "+&a"(x) + : [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x) + : "cc","r0", "f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", + "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" - BLASLONG n1 = n & -16; - if (n1 > 0) { + ); + return index; - max = izamax_kernel_16(n1, x, &maxf); - ix = n1 * 2; - i = n1; - } else { - maxf = CABS1(x, 0); - ix += 2; - i++; - } +} - while (i < n) { - if (CABS1(x, ix) > maxf) { - max = i; - maxf = CABS1(x, ix); - } - ix += 2; - i++; - } - return (max + 1); + - } else { + + - max = 0; - maxf = CABS1(x, 0); - inc_x2 = 2 * inc_x; +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i = 0; + BLASLONG ix = 0; + FLOAT maxf = 0; + BLASLONG max = 0; + BLASLONG inc_x2; - BLASLONG n1 = n & -4; - while (i < n1) { + if (n <= 0 || inc_x <= 0) return(max); + + if (inc_x == 1) { - if (CABS1(x, ix) > maxf) { - max = i; - maxf = CABS1(x, ix); - } - if (CABS1(x, ix + inc_x2) > maxf) { - max = i + 1; - maxf = CABS1(x, ix + inc_x2); - } - if (CABS1(x, ix + 2 * inc_x2) > maxf) { - max = i + 2; - maxf = CABS1(x, ix + 2 * inc_x2); - } - if (CABS1(x, ix + 3 * inc_x2) > maxf) { - max = i + 3; - maxf = CABS1(x, ix + 3 * inc_x2); + BLASLONG n1 = n & -16; + if (n1 > 0) { + + max = ziamax_kernel_16_TUNED(n1, x, &maxf); + i = n1; + ix = n1 << 1; } - ix += inc_x2 * 4; + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (max + 1); - i += 4; + } else { + + inc_x2 = 2 * inc_x; - } + maxf = CABS1(x,0); + ix += inc_x2; + i++; - while (i < n) { - if (CABS1(x, ix) > maxf) { - max = i; - maxf = CABS1(x, ix); - } - ix += inc_x2; - i++; + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (max + 1); } - return (max + 1); - } + } + + diff --git a/kernel/zarch/izamin.c b/kernel/zarch/izamin.c index 9ababb91f..9b2a653a7 100644 --- a/kernel/zarch/izamin.c +++ b/kernel/zarch/izamin.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project +Copyright (c) 2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -24,222 +24,253 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ + #include "common.h" #include + +#define ABS fabs +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) -#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) - -static BLASLONG izamin_kernel_16(BLASLONG n, FLOAT *x, FLOAT *amin) { - BLASLONG iamin; - - __asm__("vleg %%v0,0(%[x]),0\n\t" - "vleg %%v1,8(%[x]),0\n\t" - "vleg %%v0,16(%[x]),1\n\t" - "vleg %%v1,24(%[x]),1\n\t" - "vflpdb %%v0,%%v0\n\t" - "vflpdb %%v1,%%v1\n\t" - "vfadb %%v0,%%v0,%%v1\n\t" - "vleig %%v1,0,0\n\t" - "vleig %%v1,1,1\n\t" - "vrepig %%v2,8\n\t" - "vzero %%v3\n\t" - "vleig %%v24,0,0\n\t" - "vleig %%v24,1,1\n\t" - "vleig %%v25,2,0\n\t" - "vleig %%v25,3,1\n\t" - "vleig %%v26,4,0\n\t" - "vleig %%v26,5,1\n\t" - "vleig %%v27,6,0\n\t" - "vleig %%v27,7,1\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vleg %%v16,0(%%r1,%[x]),0\n\t" - "vleg %%v17,8(%%r1,%[x]),0\n\t" - "vleg %%v16,16(%%r1,%[x]),1\n\t" - "vleg %%v17,24(%%r1,%[x]),1\n\t" - "vleg %%v18,32(%%r1,%[x]),0\n\t" - "vleg %%v19,40(%%r1,%[x]),0\n\t" - "vleg %%v18,48(%%r1,%[x]),1\n\t" - "vleg %%v19,56(%%r1,%[x]),1\n\t" - "vleg %%v20,64(%%r1,%[x]),0\n\t" - "vleg %%v21,72(%%r1,%[x]),0\n\t" - "vleg %%v20,80(%%r1,%[x]),1\n\t" - "vleg %%v21,88(%%r1,%[x]),1\n\t" - "vleg %%v22,96(%%r1,%[x]),0\n\t" - "vleg %%v23,104(%%r1,%[x]),0\n\t" - "vleg %%v22,112(%%r1,%[x]),1\n\t" - "vleg %%v23,120(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchedb %%v4,%%v17,%%v16\n\t" - "vfchedb %%v5,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vfchedb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "vleg %%v16,128(%%r1,%[x]),0\n\t" - "vleg %%v17,136(%%r1,%[x]),0\n\t" - "vleg %%v16,144(%%r1,%[x]),1\n\t" - "vleg %%v17,152(%%r1,%[x]),1\n\t" - "vleg %%v18,160(%%r1,%[x]),0\n\t" - "vleg %%v19,168(%%r1,%[x]),0\n\t" - "vleg %%v18,176(%%r1,%[x]),1\n\t" - "vleg %%v19,184(%%r1,%[x]),1\n\t" - "vleg %%v20,192(%%r1,%[x]),0\n\t" - "vleg %%v21,200(%%r1,%[x]),0\n\t" - "vleg %%v20,208(%%r1,%[x]),1\n\t" - "vleg %%v21,216(%%r1,%[x]),1\n\t" - "vleg %%v22,224(%%r1,%[x]),0\n\t" - "vleg %%v23,232(%%r1,%[x]),0\n\t" - "vleg %%v22,240(%%r1,%[x]),1\n\t" - "vleg %%v23,248(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchedb %%v4,%%v17,%%v16\n\t" - "vfchedb %%v5,%%v19,%%v18\n\t" - "vsel %%v16,%%v16,%%v17,%%v4\n\t" - "vsel %%v4,%%v24,%%v25,%%v4\n\t" - "vsel %%v17,%%v18,%%v19,%%v5\n\t" - "vsel %%v5,%%v26,%%v27,%%v5\n\t" - "vfchedb %%v18,%%v17,%%v16\n\t" - "vsel %%v16,%%v16,%%v17,%%v18\n\t" - "vsel %%v4,%%v4,%%v5,%%v18\n\t" - "vag %%v4,%%v4,%%v3\n\t" - "vfchedb %%v5,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v5\n\t" - "vsel %%v1,%%v1,%%v4,%%v5\n\t" - "vag %%v3,%%v3,%%v2\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v2,%%v0,1\n\t" - "vrepg %%v3,%%v1,1\n\t" - "wfcdb %%v2,%%v0\n\t" - "jne 1f\n\t" - "vsteg %%v0,%[amin],0\n\t" - "vmnlg %%v0,%%v1,%%v3\n\t" - "vlgvg %[iamin],%%v0,0\n\t" - "j 2f\n\t" - "1:\n\t" - "wfchdb %%v4,%%v0,%%v2\n\t" - "vsel %%v1,%%v3,%%v1,%%v4\n\t" - "vsel %%v0,%%v2,%%v0,%%v4\n\t" - "std %%f0,%[amin]\n\t" - "vlgvg %[iamin],%%v1,0\n\t" - "2:\n\t" - "nop" - : [iamin] "=r"(iamin),[amin] "=Q"(*amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", - "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27"); - - return iamin; + +/** + * Find minimum index + * Warning: requirements n>0 and n % 16 == 0 + * @param n + * @param x pointer to the vector + * @param minf (out) minimum absolute value .( only for output ) + * @return minimum index + */ +static BLASLONG ziamin_kernel_16_TUNED(BLASLONG n, FLOAT *x, FLOAT *minf) { + BLASLONG index ; + __asm__( + "pfd 1, 0(%[ptr_x]) \n\t" + "vleig %%v16,0,0 \n\t" + "vleig %%v16,1,1 \n\t" + "vleig %%v17,2,0 \n\t" + "vleig %%v17,3,1 \n\t" + "vleig %%v18,4,0 \n\t" + "vleig %%v18,5,1 \n\t" + "vleig %%v19,6,0 \n\t" + "vleig %%v19,7,1 \n\t" + "vleig %%v20,8,0 \n\t" + "vleig %%v20,9,1 \n\t" + "vleig %%v21,10,0 \n\t" + "vleig %%v21,11,1 \n\t" + "vleig %%v22,12,0 \n\t" + "vleig %%v22,13,1 \n\t" + "vleig %%v23,14,0 \n\t" + "vleig %%v23,15,1 \n\t" + "ld %%f6,0(%[ptr_x]) \n\t" + "lpdbr %%f6,%%f6 \n\t" + "ld %%f7,8(%[ptr_x]) \n\t" + "lpdbr %%f7,%%f7 \n\t" + "adbr %%f6,%%f7 \n\t" + "sllg %%r0,%[n],4 \n\t" + "agr %%r0,%[ptr_x] \n\t" + "vrepg %%v6,%%v6,0 \n\t" + "vzero %%v7 \n\t" + "vrepig %%v4,16 \n\t" + "vzero %%v5 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 1, 256(%[ptr_tmp] ) \n\t" + + "vleg %%v24 , 0(%[ptr_tmp]),0 \n\t" + "vleg %%v25 , 8(%[ptr_tmp]),0 \n\t" + "vleg %%v24 , 16(%[ptr_tmp]),1 \n\t" + "vleg %%v25 , 24(%[ptr_tmp]),1 \n\t" + "vleg %%v26 , 32(%[ptr_tmp]),0 \n\t" + "vleg %%v27 , 40(%[ptr_tmp]),0 \n\t" + "vleg %%v26 , 48(%[ptr_tmp]),1 \n\t" + "vleg %%v27 , 56(%[ptr_tmp]),1 \n\t" + "vleg %%v28 , 64(%[ptr_tmp]),0 \n\t" + "vleg %%v29 , 72(%[ptr_tmp]),0 \n\t" + "vleg %%v28 , 80(%[ptr_tmp]),1 \n\t" + "vleg %%v29 , 88(%[ptr_tmp]),1 \n\t" + "vleg %%v30 , 96(%[ptr_tmp]),0 \n\t" + "vleg %%v31 ,104(%[ptr_tmp]),0 \n\t" + "vleg %%v30 ,112(%[ptr_tmp]),1 \n\t" + "vleg %%v31 ,120(%[ptr_tmp]),1 \n\t" + "vflpdb %%v24, %%v24 \n\t" + "vflpdb %%v25, %%v25 \n\t" + "vflpdb %%v26, %%v26 \n\t" + "vflpdb %%v27, %%v27 \n\t" + "vflpdb %%v28, %%v28 \n\t" + "vflpdb %%v29, %%v29 \n\t" + "vflpdb %%v30, %%v30 \n\t" + "vflpdb %%v31, %%v31 \n\t" + + "vfadb %%v0,%%v24,%%v25 \n\t" + "vfadb %%v1,%%v26,%%v27 \n\t" + "vfadb %%v2,%%v28,%%v29 \n\t" + "vfadb %%v3,%%v30,%%v31 \n\t" + + + "vleg %%v24 ,128(%[ptr_tmp]),0 \n\t" + "vleg %%v25 ,136(%[ptr_tmp]),0 \n\t" + "vleg %%v24 ,144(%[ptr_tmp]),1 \n\t" + "vleg %%v25 ,152(%[ptr_tmp]),1 \n\t" + "vleg %%v26 ,160(%[ptr_tmp]),0 \n\t" + "vleg %%v27 ,168(%[ptr_tmp]),0 \n\t" + "vleg %%v26 ,176(%[ptr_tmp]),1 \n\t" + "vleg %%v27 ,184(%[ptr_tmp]),1 \n\t" + "vleg %%v28 ,192(%[ptr_tmp]),0 \n\t" + "vleg %%v29 ,200(%[ptr_tmp]),0 \n\t" + "vleg %%v28 ,208(%[ptr_tmp]),1 \n\t" + "vleg %%v29 ,216(%[ptr_tmp]),1 \n\t" + "vleg %%v30 ,224(%[ptr_tmp]),0 \n\t" + "vleg %%v31 ,232(%[ptr_tmp]),0 \n\t" + "vleg %%v30 ,240(%[ptr_tmp]),1 \n\t" + "vleg %%v31 ,248(%[ptr_tmp]),1 \n\t" + "vflpdb %%v24, %%v24 \n\t" + "vflpdb %%v25, %%v25 \n\t" + "vflpdb %%v26, %%v26 \n\t" + "vflpdb %%v27, %%v27 \n\t" + "vflpdb %%v28, %%v28 \n\t" + "vflpdb %%v29, %%v29 \n\t" + "vflpdb %%v30, %%v30 \n\t" + "vflpdb %%v31, %%v31 \n\t" + + "vfadb %%v24,%%v24,%%v25 \n\t" + "vfadb %%v26,%%v26,%%v27 \n\t" + "vfadb %%v28,%%v28,%%v29 \n\t" + "vfadb %%v30,%%v30,%%v31 \n\t" + + + "vfchdb %%v25,%%v0 ,%%v1 \n\t" + "vsel %%v29,%%v17,%%v16,%%v25 \n\t" + "vsel %%v31,%%v1,%%v0,%%v25 \n\t" + + "vfchdb %%v27,%%v2,%%v3 \n\t" + "vsel %%v0,%%v19,%%v18,%%v27 \n\t" + "vsel %%v1,%%v3,%%v2,%%v27 \n\t" + + "vfchdb %%v25,%%v24,%%v26 \n\t" + "vsel %%v2,%%v21,%%v20,%%v25 \n\t" + "vsel %%v3,%%v26,%%v24,%%v25 \n\t" + + "vfchdb %%v27,%%v28,%%v30 \n\t" + "vsel %%v25,%%v23,%%v22,%%v27 \n\t" + "vsel %%v27,%%v30,%%v28,%%v27 \n\t" + + "vfchdb %%v24,%%v31, %%v1 \n\t" + "vsel %%v26,%%v0,%%v29,%%v24 \n\t" + "vsel %%v28,%%v1,%%v31,%%v24 \n\t" + + "vfchdb %%v30,%%v3, %%v27 \n\t" + "vsel %%v29,%%v25,%%v2,%%v30 \n\t" + "vsel %%v31,%%v27,%%v3 ,%%v30 \n\t" + + "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" + + "vfchdb %%v0,%%v28, %%v31 \n\t" + "vsel %%v25,%%v29,%%v26,%%v0 \n\t" + "vsel %%v27,%%v31,%%v28,%%v0 \n\t" + + "vag %%v25,%%v25,%%v5 \n\t" + + //cmp with previous + "vfchdb %%v30,%%v6 , %%v27 \n\t" + "vsel %%v7,%%v25,%%v7,%%v30 \n\t" + "vsel %%v6,%%v27,%%v6,%%v30 \n\t" + + "vag %%v5,%%v5,%%v4 \n\t" + + "clgrjl %[ptr_tmp],%%r0,1b \n\t" + + //xtract index + "vrepg %%v26,%%v6,1 \n\t" + "vrepg %%v5,%%v7,1 \n\t" + "wfcdb %%v26,%%v6 \n\t" + "jne 2f \n\t" + "vsteg %%v6,%[minf],0 \n\t" + "vmnlg %%v1,%%v5,%%v7 \n\t" + "vlgvg %[index],%%v1,0 \n\t" + "j 3f \n\t" + "2: \n\t" + "wfchdb %%v16,%%v6 ,%%v26 \n\t" + "vsel %%v1,%%v5,%%v7,%%v16 \n\t" + "vsel %%v0,%%v26,%%v6,%%v16 \n\t" + "vlgvg %[index],%%v1,0 \n\t" + "std %%f0,%[minf] \n\t" + "3: \n\t" + + : [index] "+r"(index) ,[minf] "=m"(*minf), [ptr_tmp] "+&a"(x) + : [mem] "m"( *(const double (*)[2*n])x), [n] "r"(n), [ptr_x] "r"(x) + : "cc","r0","f0","v0","v1","v2","v3","v4","v5","v6","v7","v16", + "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + + ); + + return index; } -BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT minf = 0; - BLASLONG min = 0; - BLASLONG inc_x2; + - if (n <= 0 || inc_x <= 0) - return (min); + + - if (inc_x == 1) { +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf; + BLASLONG min=0; + BLASLONG inc_x2; - BLASLONG n1 = n & -16; - if (n1 > 0) { + if (n <= 0 || inc_x <= 0) return(min); + - min = izamin_kernel_16(n1, x, &minf); - ix = n1 * 2; - i = n1; - } else { - minf = CABS1(x, 0); - ix += 2; - i++; - } + if (inc_x == 1) { - while (i < n) { - if (CABS1(x, ix) < minf) { - min = i; - minf = CABS1(x, ix); - } - ix += 2; - i++; - } - return (min + 1); - - } else { - - min = 0; - minf = CABS1(x, 0); - inc_x2 = 2 * inc_x; - - BLASLONG n1 = n & -4; - while (i < n1) { - - if (CABS1(x, ix) < minf) { - min = i; - minf = CABS1(x, ix); - } - if (CABS1(x, ix + inc_x2) < minf) { - min = i + 1; - minf = CABS1(x, ix + inc_x2); - } - if (CABS1(x, ix + 2 * inc_x2) < minf) { - min = i + 2; - minf = CABS1(x, ix + 2 * inc_x2); - } - if (CABS1(x, ix + 3 * inc_x2) < minf) { - min = i + 3; - minf = CABS1(x, ix + 3 * inc_x2); - } - - ix += inc_x2 * 4; - - i += 4; + BLASLONG n1 = n & -16; + if (n1 > 0) { - } + min = ziamin_kernel_16_TUNED(n1, x, &minf); + i = n1; + ix = n1 << 1; + } + else { + //assign minf + minf = CABS1(x,0); + ix += 2; + i++; + } + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += 2; + i++; + } + return (min + 1); + + } else { + + inc_x2 = 2 * inc_x; - while (i < n) { - if (CABS1(x, ix) < minf) { - min = i; - minf = CABS1(x, ix); - } - ix += inc_x2; - i++; + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return (min + 1); } - return (min + 1); - } + } + + diff --git a/kernel/zarch/samax.c b/kernel/zarch/samax.c deleted file mode 100644 index fdda6dd32..000000000 --- a/kernel/zarch/samax.c +++ /dev/null @@ -1,152 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -#define ABS fabsf - -static FLOAT samax_kernel_64(BLASLONG n, FLOAT *x) { - FLOAT amax; - - __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfmaxsb %%v16,%%v16,%%v24,8\n\t" - "vfmaxsb %%v17,%%v17,%%v25,8\n\t" - "vfmaxsb %%v18,%%v18,%%v26,8\n\t" - "vfmaxsb %%v19,%%v19,%%v27,8\n\t" - "vfmaxsb %%v20,%%v20,%%v28,8\n\t" - "vfmaxsb %%v21,%%v21,%%v29,8\n\t" - "vfmaxsb %%v22,%%v22,%%v30,8\n\t" - "vfmaxsb %%v23,%%v23,%%v31,8\n\t" - "vfmaxsb %%v16,%%v16,%%v20,8\n\t" - "vfmaxsb %%v17,%%v17,%%v21,8\n\t" - "vfmaxsb %%v18,%%v18,%%v22,8\n\t" - "vfmaxsb %%v19,%%v19,%%v23,8\n\t" - "vfmaxsb %%v16,%%v16,%%v18,8\n\t" - "vfmaxsb %%v17,%%v17,%%v19,8\n\t" - "vfmaxsb %%v16,%%v16,%%v17,8\n\t" - "vfmaxsb %%v0,%%v0,%%v16,8\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v16,%%v0,32\n\t" - "vfmaxsb %%v0,%%v0,%%v16,8\n\t" - "vrepf %%v16,%%v0,2\n\t" - "wfmaxsb %%v0,%%v0,%%v16,8\n\t" - "lper %[amax],%%f0" - : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - - return amax; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - - if (n <= 0 || inc_x <= 0) - return (maxf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -64; - if (n1 > 0) { - - maxf = samax_kernel_64(n1, x); - - i = n1; - } else { - maxf = ABS(x[0]); - i++; - } - - while (i < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i++; - } - return (maxf); - - } else { - - maxf = ABS(x[0]); - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) > maxf) { - maxf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) > maxf) { - maxf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) > maxf) { - maxf = ABS(x[i + 3 * inc_x]); - } - - i += inc_x * 4; - - j += 4; - - } - - while (j < n) { - if (ABS(x[i]) > maxf) { - maxf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (maxf); - } -} diff --git a/kernel/zarch/samin.c b/kernel/zarch/samin.c deleted file mode 100644 index f05e851f9..000000000 --- a/kernel/zarch/samin.c +++ /dev/null @@ -1,152 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -#define ABS fabsf - -static FLOAT samin_kernel_64(BLASLONG n, FLOAT *x) { - FLOAT amin; - - __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfminsb %%v16,%%v16,%%v24,8\n\t" - "vfminsb %%v17,%%v17,%%v25,8\n\t" - "vfminsb %%v18,%%v18,%%v26,8\n\t" - "vfminsb %%v19,%%v19,%%v27,8\n\t" - "vfminsb %%v20,%%v20,%%v28,8\n\t" - "vfminsb %%v21,%%v21,%%v29,8\n\t" - "vfminsb %%v22,%%v22,%%v30,8\n\t" - "vfminsb %%v23,%%v23,%%v31,8\n\t" - "vfminsb %%v16,%%v16,%%v20,8\n\t" - "vfminsb %%v17,%%v17,%%v21,8\n\t" - "vfminsb %%v18,%%v18,%%v22,8\n\t" - "vfminsb %%v19,%%v19,%%v23,8\n\t" - "vfminsb %%v16,%%v16,%%v18,8\n\t" - "vfminsb %%v17,%%v17,%%v19,8\n\t" - "vfminsb %%v16,%%v16,%%v17,8\n\t" - "vfminsb %%v0,%%v0,%%v16,8\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v16,%%v0,32\n\t" - "vfminsb %%v0,%%v0,%%v16,8\n\t" - "vrepf %%v16,%%v0,2\n\t" - "wfminsb %%v0,%%v0,%%v16,8\n\t" - "lper %[amin],%%f0" - : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - - return amin; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - - if (n <= 0 || inc_x <= 0) - return (minf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -64; - if (n1 > 0) { - - minf = samin_kernel_64(n1, x); - - i = n1; - } else { - minf = ABS(x[0]); - i++; - } - - while (i < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i++; - } - return (minf); - - } else { - - minf = ABS(x[0]); - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - if (ABS(x[i + inc_x]) < minf) { - minf = ABS(x[i + inc_x]); - } - if (ABS(x[i + 2 * inc_x]) < minf) { - minf = ABS(x[i + 2 * inc_x]); - } - if (ABS(x[i + 3 * inc_x]) < minf) { - minf = ABS(x[i + 3 * inc_x]); - } - - i += inc_x * 4; - - j += 4; - - } - - while (j < n) { - if (ABS(x[i]) < minf) { - minf = ABS(x[i]); - } - i += inc_x; - j++; - } - return (minf); - } -} diff --git a/kernel/zarch/sasum.c b/kernel/zarch/sasum.c deleted file mode 100644 index d56f2697b..000000000 --- a/kernel/zarch/sasum.c +++ /dev/null @@ -1,168 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -#define ABS fabsf - -static FLOAT sasum_kernel_64(BLASLONG n, FLOAT *x) { - FLOAT asum; - - __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v24,%%v24,%%v16\n\t" - "vfasb %%v25,%%v25,%%v17\n\t" - "vfasb %%v26,%%v26,%%v18\n\t" - "vfasb %%v27,%%v27,%%v19\n\t" - "vfasb %%v28,%%v28,%%v20\n\t" - "vfasb %%v29,%%v29,%%v21\n\t" - "vfasb %%v30,%%v30,%%v22\n\t" - "vfasb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x])\n\t" - "vl %%v17, 144(%%r1,%[x])\n\t" - "vl %%v18, 160(%%r1,%[x])\n\t" - "vl %%v19, 176(%%r1,%[x])\n\t" - "vl %%v20, 192(%%r1,%[x])\n\t" - "vl %%v21, 208(%%r1,%[x])\n\t" - "vl %%v22, 224(%%r1,%[x])\n\t" - "vl %%v23, 240(%%r1,%[x])\n\t" - "vflpsb %%v16, %%v16\n\t" - "vflpsb %%v17, %%v17\n\t" - "vflpsb %%v18, %%v18\n\t" - "vflpsb %%v19, %%v19\n\t" - "vflpsb %%v20, %%v20\n\t" - "vflpsb %%v21, %%v21\n\t" - "vflpsb %%v22, %%v22\n\t" - "vflpsb %%v23, %%v23\n\t" - "vfasb %%v24,%%v24,%%v16\n\t" - "vfasb %%v25,%%v25,%%v17\n\t" - "vfasb %%v26,%%v26,%%v18\n\t" - "vfasb %%v27,%%v27,%%v19\n\t" - "vfasb %%v28,%%v28,%%v20\n\t" - "vfasb %%v29,%%v29,%%v21\n\t" - "vfasb %%v30,%%v30,%%v22\n\t" - "vfasb %%v31,%%v31,%%v23\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vfasb %%v24,%%v24,%%v26\n\t" - "vfasb %%v24,%%v24,%%v27\n\t" - "vfasb %%v24,%%v24,%%v28\n\t" - "vfasb %%v24,%%v24,%%v29\n\t" - "vfasb %%v24,%%v24,%%v30\n\t" - "vfasb %%v24,%%v24,%%v31\n\t" - "veslg %%v25,%%v24,32\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vrepf %%v25,%%v24,2\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vstef %%v24,%[asum],0" - : [asum] "=Q"(asum),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) - : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - - return asum; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT sumf = 0.0; - BLASLONG n1; - - if (n <= 0 || inc_x <= 0) - return sumf; - - if (inc_x == 1) { - - n1 = n & -64; - - if (n1 > 0) { - - sumf = sasum_kernel_64(n1, x); - i = n1; - } - - while (i < n) { - sumf += ABS(x[i]); - i++; - } - - } else { - BLASLONG n1 = n & -4; - register FLOAT sum1, sum2; - sum1 = 0.0; - sum2 = 0.0; - while (j < n1) { - - sum1 += ABS(x[i]); - sum2 += ABS(x[i + inc_x]); - sum1 += ABS(x[i + 2 * inc_x]); - sum2 += ABS(x[i + 3 * inc_x]); - - i += inc_x * 4; - j += 4; - - } - sumf = sum1 + sum2; - while (j < n) { - - sumf += ABS(x[i]); - i += inc_x; - j++; - } - - } - return sumf; -} diff --git a/kernel/zarch/saxpy.c b/kernel/zarch/saxpy.c deleted file mode 100644 index ca34a47ff..000000000 --- a/kernel/zarch/saxpy.c +++ /dev/null @@ -1,167 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { - __asm__("vlrepf %%v0,%[alpha]\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,0(%%r1,%[y])\n\t" - "vl %%v21,16(%%r1,%[y])\n\t" - "vl %%v22,32(%%r1,%[y])\n\t" - "vl %%v23,48(%%r1,%[y])\n\t" - "vl %%v24,64(%%r1,%[x])\n\t" - "vl %%v25,80(%%r1,%[x])\n\t" - "vl %%v26,96(%%r1,%[x])\n\t" - "vl %%v27,112(%%r1,%[x])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" - "vfmasb %%v16,%%v0,%%v16,%%v20\n\t" - "vfmasb %%v17,%%v0,%%v17,%%v21\n\t" - "vfmasb %%v18,%%v0,%%v18,%%v22\n\t" - "vfmasb %%v19,%%v0,%%v19,%%v23\n\t" - "vfmasb %%v24,%%v0,%%v24,%%v28\n\t" - "vfmasb %%v25,%%v0,%%v25,%%v29\n\t" - "vfmasb %%v26,%%v0,%%v26,%%v30\n\t" - "vfmasb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,0(%%r1,%[y])\n\t" - "vst %%v17,16(%%r1,%[y])\n\t" - "vst %%v18,32(%%r1,%[y])\n\t" - "vst %%v19,48(%%r1,%[y])\n\t" - "vst %%v24,64(%%r1,%[y])\n\t" - "vst %%v25,80(%%r1,%[y])\n\t" - "vst %%v26,96(%%r1,%[y])\n\t" - "vst %%v27,112(%%r1,%[y])\n\t" - "vl %%v16,128(%%r1,%[x])\n\t" - "vl %%v17,144(%%r1,%[x])\n\t" - "vl %%v18,160(%%r1,%[x])\n\t" - "vl %%v19,176(%%r1,%[x])\n\t" - "vl %%v20,128(%%r1,%[y])\n\t" - "vl %%v21,144(%%r1,%[y])\n\t" - "vl %%v22,160(%%r1,%[y])\n\t" - "vl %%v23,176(%%r1,%[y])\n\t" - "vl %%v24,192(%%r1,%[x])\n\t" - "vl %%v25,208(%%r1,%[x])\n\t" - "vl %%v26,224(%%r1,%[x])\n\t" - "vl %%v27,240(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[y])\n\t" - "vl %%v29,208(%%r1,%[y])\n\t" - "vl %%v30,224(%%r1,%[y])\n\t" - "vl %%v31,240(%%r1,%[y])\n\t" - "vfmasb %%v16,%%v0,%%v16,%%v20\n\t" - "vfmasb %%v17,%%v0,%%v17,%%v21\n\t" - "vfmasb %%v18,%%v0,%%v18,%%v22\n\t" - "vfmasb %%v19,%%v0,%%v19,%%v23\n\t" - "vfmasb %%v24,%%v0,%%v24,%%v28\n\t" - "vfmasb %%v25,%%v0,%%v25,%%v29\n\t" - "vfmasb %%v26,%%v0,%%v26,%%v30\n\t" - "vfmasb %%v27,%%v0,%%v27,%%v31\n\t" - "vst %%v16,128(%%r1,%[y])\n\t" - "vst %%v17,144(%%r1,%[y])\n\t" - "vst %%v18,160(%%r1,%[y])\n\t" - "vst %%v19,176(%%r1,%[y])\n\t" - "vst %%v24,192(%%r1,%[y])\n\t" - "vst %%v25,208(%%r1,%[y])\n\t" - "vst %%v26,224(%%r1,%[y])\n\t" - "vst %%v27,240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), - [alpha] "Q"(*alpha) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); -} - -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, - BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, - BLASLONG dummy2) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - - if (n <= 0) - return 0; - - if ((inc_x == 1) && (inc_y == 1)) { - - BLASLONG n1 = n & -64; - - if (n1) - saxpy_kernel_64(n1, x, y, &da); - - i = n1; - while (i < n) { - - y[i] += da * x[i]; - i++; - - } - return 0; - - } - - BLASLONG n1 = n & -4; - - while (i < n1) { - - FLOAT m1 = da * x[ix]; - FLOAT m2 = da * x[ix + inc_x]; - FLOAT m3 = da * x[ix + 2 * inc_x]; - FLOAT m4 = da * x[ix + 3 * inc_x]; - - y[iy] += m1; - y[iy + inc_y] += m2; - y[iy + 2 * inc_y] += m3; - y[iy + 3 * inc_y] += m4; - - ix += inc_x * 4; - iy += inc_y * 4; - i += 4; - - } - - while (i < n) { - - y[iy] += da * x[ix]; - ix += inc_x; - iy += inc_y; - i++; - - } - return 0; - -} diff --git a/kernel/zarch/scopy.c b/kernel/zarch/scopy.c deleted file mode 100644 index 5c453cfbb..000000000 --- a/kernel/zarch/scopy.c +++ /dev/null @@ -1,79 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -static void scopy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { - __asm__("srlg %[n],%[n],6\n\t" - "0:\n\t" - "pfd 1, 1024(%[x])\n\t" - "pfd 2, 1024(%[y])\n\t" - "mvc 0(256,%[y]),0(%[x])\n\t" - "la %[x],256(%[x])\n\t" - "la %[y],256(%[y])\n\t" - "brctg %[n],0b" - : "=m"(*(struct { FLOAT x[n]; } *) y),[x] "+&a"(x),[y] "+&a"(y),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x) - : "cc"); -} - -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - - if (n <= 0) - return 0; - - if ((inc_x == 1) && (inc_y == 1)) { - - BLASLONG n1 = n & -64; - if (n1 > 0) { - scopy_kernel_64(n1, x, y); - i = n1; - } - - while (i < n) { - y[i] = x[i]; - i++; - - } - - } else { - - while (i < n) { - - y[iy] = x[ix]; - ix += inc_x; - iy += inc_y; - i++; - - } - - } - return 0; - -} diff --git a/kernel/zarch/sdot.c b/kernel/zarch/sdot.c deleted file mode 100644 index d870b30f0..000000000 --- a/kernel/zarch/sdot.c +++ /dev/null @@ -1,144 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019,The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms,with or without -modification,are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice,this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice,this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES,INCLUDING,BUT NOT LIMITED TO,THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT,INDIRECT,INCIDENTAL,SPECIAL,EXEMPLARY,OR CONSEQUENTIAL -DAMAGES (INCLUDING,BUT NOT LIMITED TO,PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE,DATA,OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY,WHETHER IN CONTRACT,STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE,EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -static FLOAT sdot_kernel_32(BLASLONG n, FLOAT *x, FLOAT *y) { - FLOAT dot; - - __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "pfd 1,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[y])\n\t" - "vl %%v25,16(%%r1,%[y])\n\t" - "vl %%v26,32(%%r1,%[y])\n\t" - "vl %%v27,48(%%r1,%[y])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "vfmasb %%v1,%%v17,%%v25,%%v1\n\t" - "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" - "vfmasb %%v3,%%v19,%%v27,%%v3\n\t" - "vfmasb %%v4,%%v20,%%v28,%%v4\n\t" - "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" - "vfmasb %%v6,%%v22,%%v30,%%v6\n\t" - "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b\n\t" - "vfasb %%v0,%%v0,%%v1\n\t" - "vfasb %%v0,%%v0,%%v2\n\t" - "vfasb %%v0,%%v0,%%v3\n\t" - "vfasb %%v0,%%v0,%%v4\n\t" - "vfasb %%v0,%%v0,%%v5\n\t" - "vfasb %%v0,%%v0,%%v6\n\t" - "vfasb %%v0,%%v0,%%v7\n\t" - "vrepf %%v1,%%v0,1\n\t" - "vrepf %%v2,%%v0,2\n\t" - "vrepf %%v3,%%v0,3\n\t" - "aebr %%f0,%%f1\n\t" - "aebr %%f0,%%f2\n\t" - "aebr %%f0,%%f3\n\t" - "ler %[dot],%%f0" - : [dot] "=f"(dot),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[n]; } *) y),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); - - return dot; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - - FLOAT dot = 0.0; - - if (n <= 0) - return (dot); - - if ((inc_x == 1) && (inc_y == 1)) { - - BLASLONG n1 = n & -32; - - if (n1) - dot = sdot_kernel_32(n1, x, y); - - i = n1; - while (i < n) { - - dot += y[i] * x[i]; - i++; - - } - return (dot); - - } - - BLASLONG n1 = n & -2; - - while (i < n1) { - - dot += y[iy] * x[ix] + y[iy + inc_y] * x[ix + inc_x]; - ix += inc_x * 2; - iy += inc_y * 2; - i += 2; - - } - - while (i < n) { - - dot += y[iy] * x[ix]; - ix += inc_x; - iy += inc_y; - i++; - - } - return (dot); - -} diff --git a/kernel/zarch/sgemv_n_4.c b/kernel/zarch/sgemv_n_4.c deleted file mode 100644 index a1efef373..000000000 --- a/kernel/zarch/sgemv_n_4.c +++ /dev/null @@ -1,597 +0,0 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#define NBMAX 2048 - -static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, - FLOAT *alpha) { - register FLOAT *ap0 = ap[0]; - register FLOAT *ap1 = ap[1]; - register FLOAT *ap2 = ap[2]; - register FLOAT *ap3 = ap[3]; - - __asm__("vlrepf %%v0,0(%[x])\n\t" - "vlrepf %%v1,4(%[x])\n\t" - "vlrepf %%v2,8(%[x])\n\t" - "vlrepf %%v3,12(%[x])\n\t" - "vlrepf %%v4,%[alpha]\n\t" - "vfmsb %%v0,%%v0,%%v4\n\t" - "vfmsb %%v1,%%v1,%%v4\n\t" - "vfmsb %%v2,%%v2,%%v4\n\t" - "vfmsb %%v3,%%v3,%%v4\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,0(%%r1,%[ap2])\n\t" - "vl %%v19,0(%%r1,%[ap3])\n\t" - "vl %%v20,16(%%r1,%[ap0])\n\t" - "vl %%v21,16(%%r1,%[ap1])\n\t" - "vl %%v22,16(%%r1,%[ap2])\n\t" - "vl %%v23,16(%%r1,%[ap3])\n\t" - "vl %%v24,32(%%r1,%[ap0])\n\t" - "vl %%v25,32(%%r1,%[ap1])\n\t" - "vl %%v26,32(%%r1,%[ap2])\n\t" - "vl %%v27,32(%%r1,%[ap3])\n\t" - "vl %%v28,48(%%r1,%[ap0])\n\t" - "vl %%v29,48(%%r1,%[ap1])\n\t" - "vl %%v30,48(%%r1,%[ap2])\n\t" - "vl %%v31,48(%%r1,%[ap3])\n\t" - "vl %%v4,0(%%r1,%[y])\n\t" - "vl %%v5,16(%%r1,%[y])\n\t" - "vl %%v6,32(%%r1,%[y])\n\t" - "vl %%v7,48(%%r1,%[y])\n\t" - "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" - "vfmasb %%v5,%%v20,%%v0,%%v5\n\t" - "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" - "vfmasb %%v7,%%v28,%%v0,%%v7\n\t" - "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" - "vfmasb %%v5,%%v21,%%v1,%%v5\n\t" - "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" - "vfmasb %%v7,%%v29,%%v1,%%v7\n\t" - "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" - "vfmasb %%v5,%%v22,%%v2,%%v5\n\t" - "vfmasb %%v6,%%v26,%%v2,%%v6\n\t" - "vfmasb %%v7,%%v30,%%v2,%%v7\n\t" - "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" - "vfmasb %%v5,%%v23,%%v3,%%v5\n\t" - "vfmasb %%v6,%%v27,%%v3,%%v6\n\t" - "vfmasb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,0(%%r1,%[y])\n\t" - "vst %%v5,16(%%r1,%[y])\n\t" - "vst %%v6,32(%%r1,%[y])\n\t" - "vst %%v7,48(%%r1,%[y])\n\t" - "vl %%v16,64(%%r1,%[ap0])\n\t" - "vl %%v17,64(%%r1,%[ap1])\n\t" - "vl %%v18,64(%%r1,%[ap2])\n\t" - "vl %%v19,64(%%r1,%[ap3])\n\t" - "vl %%v20,80(%%r1,%[ap0])\n\t" - "vl %%v21,80(%%r1,%[ap1])\n\t" - "vl %%v22,80(%%r1,%[ap2])\n\t" - "vl %%v23,80(%%r1,%[ap3])\n\t" - "vl %%v24,96(%%r1,%[ap0])\n\t" - "vl %%v25,96(%%r1,%[ap1])\n\t" - "vl %%v26,96(%%r1,%[ap2])\n\t" - "vl %%v27,96(%%r1,%[ap3])\n\t" - "vl %%v28,112(%%r1,%[ap0])\n\t" - "vl %%v29,112(%%r1,%[ap1])\n\t" - "vl %%v30,112(%%r1,%[ap2])\n\t" - "vl %%v31,112(%%r1,%[ap3])\n\t" - "vl %%v4,64(%%r1,%[y])\n\t" - "vl %%v5,80(%%r1,%[y])\n\t" - "vl %%v6,96(%%r1,%[y])\n\t" - "vl %%v7,112(%%r1,%[y])\n\t" - "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" - "vfmasb %%v5,%%v20,%%v0,%%v5\n\t" - "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" - "vfmasb %%v7,%%v28,%%v0,%%v7\n\t" - "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" - "vfmasb %%v5,%%v21,%%v1,%%v5\n\t" - "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" - "vfmasb %%v7,%%v29,%%v1,%%v7\n\t" - "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" - "vfmasb %%v5,%%v22,%%v2,%%v5\n\t" - "vfmasb %%v6,%%v26,%%v2,%%v6\n\t" - "vfmasb %%v7,%%v30,%%v2,%%v7\n\t" - "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" - "vfmasb %%v5,%%v23,%%v3,%%v5\n\t" - "vfmasb %%v6,%%v27,%%v3,%%v6\n\t" - "vfmasb %%v7,%%v31,%%v3,%%v7\n\t" - "vst %%v4,64(%%r1,%[y])\n\t" - "vst %%v5,80(%%r1,%[y])\n\t" - "vst %%v6,96(%%r1,%[y])\n\t" - "vst %%v7,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,0(%%r1,%[ap2])\n\t" - "vl %%v19,0(%%r1,%[ap3])\n\t" - "vl %%v4,0(%%r1,%[y])\n\t" - "vfmasb %%v4,%%v16,%%v0,%%v4\n\t" - "vfmasb %%v4,%%v17,%%v1,%%v4\n\t" - "vfmasb %%v4,%%v18,%%v2,%%v4\n\t" - "vfmasb %%v4,%%v19,%%v3,%%v4\n\t" - "vst %%v4,0(%%r1,%[y])\n\t" - "agfi %%r1,16\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(struct { FLOAT x[n]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), - "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), - "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), - [n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); -} - -static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, - FLOAT *alpha) { - register FLOAT *ap0 = ap[0]; - register FLOAT *ap1 = ap[1]; - - __asm__("vlrepf %%v0,0(%[x])\n\t" - "vlrepf %%v1,4(%[x])\n\t" - "vlrepf %%v2,%[alpha]\n\t" - "vfmsb %%v0,%%v0,%%v2\n\t" - "vfmsb %%v1,%%v1,%%v2\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v18,16(%%r1,%[ap0])\n\t" - "vl %%v19,16(%%r1,%[ap1])\n\t" - "vl %%v20,32(%%r1,%[ap0])\n\t" - "vl %%v21,32(%%r1,%[ap1])\n\t" - "vl %%v22,48(%%r1,%[ap0])\n\t" - "vl %%v23,48(%%r1,%[ap1])\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" - "vl %%v26,80(%%r1,%[ap0])\n\t" - "vl %%v27,80(%%r1,%[ap1])\n\t" - "vl %%v28,96(%%r1,%[ap0])\n\t" - "vl %%v29,96(%%r1,%[ap1])\n\t" - "vl %%v30,112(%%r1,%[ap0])\n\t" - "vl %%v31,112(%%r1,%[ap1])\n\t" - "vl %%v2,0(%%r1,%[y])\n\t" - "vl %%v3,16(%%r1,%[y])\n\t" - "vl %%v4,32(%%r1,%[y])\n\t" - "vl %%v5,48(%%r1,%[y])\n\t" - "vl %%v6,64(%%r1,%[y])\n\t" - "vl %%v7,80(%%r1,%[y])\n\t" - "vl %%v8,96(%%r1,%[y])\n\t" - "vl %%v9,112(%%r1,%[y])\n\t" - "vfmasb %%v2,%%v16,%%v0,%%v2\n\t" - "vfmasb %%v3,%%v18,%%v0,%%v3\n\t" - "vfmasb %%v4,%%v20,%%v0,%%v4\n\t" - "vfmasb %%v5,%%v22,%%v0,%%v5\n\t" - "vfmasb %%v6,%%v24,%%v0,%%v6\n\t" - "vfmasb %%v7,%%v26,%%v0,%%v7\n\t" - "vfmasb %%v8,%%v28,%%v0,%%v8\n\t" - "vfmasb %%v9,%%v30,%%v0,%%v9\n\t" - "vfmasb %%v2,%%v17,%%v1,%%v2\n\t" - "vfmasb %%v3,%%v19,%%v1,%%v3\n\t" - "vfmasb %%v4,%%v21,%%v1,%%v4\n\t" - "vfmasb %%v5,%%v23,%%v1,%%v5\n\t" - "vfmasb %%v6,%%v25,%%v1,%%v6\n\t" - "vfmasb %%v7,%%v27,%%v1,%%v7\n\t" - "vfmasb %%v8,%%v29,%%v1,%%v8\n\t" - "vfmasb %%v9,%%v31,%%v1,%%v9\n\t" - "vst %%v2,0(%%r1,%[y])\n\t" - "vst %%v3,16(%%r1,%[y])\n\t" - "vst %%v4,32(%%r1,%[y])\n\t" - "vst %%v5,48(%%r1,%[y])\n\t" - "vst %%v6,64(%%r1,%[y])\n\t" - "vst %%v7,80(%%r1,%[y])\n\t" - "vst %%v8,96(%%r1,%[y])\n\t" - "vst %%v9,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[ap0])\n\t" - "vl %%v17,0(%%r1,%[ap1])\n\t" - "vl %%v2,0(%%r1,%[y])\n\t" - "vfmasb %%v2,%%v16,%%v0,%%v2\n\t" - "vfmasb %%v2,%%v17,%%v1,%%v2\n\t" - "vst %%v2,0(%%r1,%[y])\n\t" - "agfi %%r1,16\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(struct { FLOAT x[n]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x),[alpha] "Q"(*alpha), - [n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v8", "v9", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); -} - -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y, - FLOAT *alpha) { - __asm__("vlrepf %%v0,0(%[x])\n\t" - "vlrepf %%v16,%[alpha]\n\t" - "vfmsb %%v0,%%v0,%%v16\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[a0])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v16,0(%%r1,%[a0])\n\t" - "vl %%v17,16(%%r1,%[a0])\n\t" - "vl %%v18,32(%%r1,%[a0])\n\t" - "vl %%v19,48(%%r1,%[a0])\n\t" - "vl %%v20,64(%%r1,%[a0])\n\t" - "vl %%v21,80(%%r1,%[a0])\n\t" - "vl %%v22,96(%%r1,%[a0])\n\t" - "vl %%v23,112(%%r1,%[a0])\n\t" - "vl %%v24,0(%%r1,%[y])\n\t" - "vl %%v25,16(%%r1,%[y])\n\t" - "vl %%v26,32(%%r1,%[y])\n\t" - "vl %%v27,48(%%r1,%[y])\n\t" - "vl %%v28,64(%%r1,%[y])\n\t" - "vl %%v29,80(%%r1,%[y])\n\t" - "vl %%v30,96(%%r1,%[y])\n\t" - "vl %%v31,112(%%r1,%[y])\n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" - "vfmasb %%v25,%%v17,%%v0,%%v25\n\t" - "vfmasb %%v26,%%v18,%%v0,%%v26\n\t" - "vfmasb %%v27,%%v19,%%v0,%%v27\n\t" - "vfmasb %%v28,%%v20,%%v0,%%v28\n\t" - "vfmasb %%v29,%%v21,%%v0,%%v29\n\t" - "vfmasb %%v30,%%v22,%%v0,%%v30\n\t" - "vfmasb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v24,0(%%r1,%[y])\n\t" - "vst %%v25,16(%%r1,%[y])\n\t" - "vst %%v26,32(%%r1,%[y])\n\t" - "vst %%v27,48(%%r1,%[y])\n\t" - "vst %%v28,64(%%r1,%[y])\n\t" - "vst %%v29,80(%%r1,%[y])\n\t" - "vst %%v30,96(%%r1,%[y])\n\t" - "vst %%v31,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[a0])\n\t" - "vl %%v17,0(%%r1,%[y])\n\t" - "vfmasb %%v17,%%v16,%%v0,%%v17\n\t" - "vst %%v17,0(%%r1,%[y])\n\t" - "agfi %%r1,16\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(struct { FLOAT x[n]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), - "m"(*(const FLOAT (*)[1]) x),[x] "a"(x),[alpha] "Q"(*alpha), - [n] "r"(n) - : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); -} - -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest += src[i]; - dest += inc_dest; - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, - BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, - FLOAT *buffer) { - BLASLONG i; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - FLOAT *ap[4]; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - BLASLONG lda4 = lda << 2; - FLOAT xbuffer[8], *ybuffer; - - if (m < 1) - return (0); - if (n < 1) - return (0); - - ybuffer = buffer; - - n1 = n >> 2; - n2 = n & 3; - - m3 = m & 3; - m1 = m & -4; - m2 = (m & (NBMAX - 1)) - m3; - - y_ptr = y; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) - break; - NB = m2; - } - - a_ptr = a; - x_ptr = x; - - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if (inc_y != 1) - memset(ybuffer, 0, NB * 4); - else - ybuffer = y_ptr; - - if (inc_x == 1) { - - for (i = 0; i < n1; i++) { - sgemv_kernel_4x4(NB, ap, x_ptr, ybuffer, &alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr += 4; - } - - if (n2 & 2) { - sgemv_kernel_4x2(NB, ap, x_ptr, ybuffer, &alpha); - a_ptr += lda * 2; - x_ptr += 2; - } - - if (n2 & 1) { - sgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer, &alpha); - /* a_ptr += lda; - x_ptr += 1; */ - - } - - } else { - - for (i = 0; i < n1; i++) { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[1] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - x_ptr += inc_x; - xbuffer[3] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, &alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for (i = 0; i < n2; i++) { - xbuffer[0] = x_ptr[0]; - x_ptr += inc_x; - sgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, &alpha); - a_ptr += lda; - - } - - } - - a += NB; - if (inc_y != 1) { - add_y(NB, ybuffer, y_ptr, inc_y); - y_ptr += NB * inc_y; - } else - y_ptr += NB; - - } - - if (m3 == 0) - return (0); - - if (m3 == 3) { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - FLOAT temp2 = 0.0; - if (lda == 3 && inc_x == 1) { - - for (i = 0; i < (n & -4); i += 4) { - - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; - temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - - temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; - temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; - temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; - - a_ptr += 12; - x_ptr += 4; - } - - for (; i < n; i++) { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += 3; - x_ptr++; - } - - } else { - - for (i = 0; i < n; i++) { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - temp2 += a_ptr[2] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - y_ptr += inc_y; - y_ptr[0] += alpha * temp2; - return (0); - } - - if (m3 == 2) { - a_ptr = a; - x_ptr = x; - FLOAT temp0 = 0.0; - FLOAT temp1 = 0.0; - if (lda == 2 && inc_x == 1) { - - for (i = 0; i < (n & -4); i += 4) { - temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; - temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; - temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - a_ptr += 8; - x_ptr += 4; - - } - - for (; i < n; i++) { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += 2; - x_ptr++; - } - - } else { - - for (i = 0; i < n; i++) { - temp0 += a_ptr[0] * x_ptr[0]; - temp1 += a_ptr[1] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - - } - - } - y_ptr[0] += alpha * temp0; - y_ptr += inc_y; - y_ptr[0] += alpha * temp1; - return (0); - } - - if (m3 == 1) { - a_ptr = a; - x_ptr = x; - FLOAT temp = 0.0; - if (lda == 1 && inc_x == 1) { - - for (i = 0; i < (n & -4); i += 4) { - temp += - a_ptr[i] * x_ptr[i] + a_ptr[i + 1] * x_ptr[i + 1] + a_ptr[i + - 2] * - x_ptr[i + 2] + a_ptr[i + 3] * x_ptr[i + 3]; - - } - - for (; i < n; i++) { - temp += a_ptr[i] * x_ptr[i]; - } - - } else { - - for (i = 0; i < n; i++) { - temp += a_ptr[0] * x_ptr[0]; - a_ptr += lda; - x_ptr += inc_x; - } - - } - y_ptr[0] += alpha * temp; - return (0); - } - - return (0); -} diff --git a/kernel/zarch/sgemv_t_4.c b/kernel/zarch/sgemv_t_4.c deleted file mode 100644 index 81d7c9fe7..000000000 --- a/kernel/zarch/sgemv_t_4.c +++ /dev/null @@ -1,753 +0,0 @@ -/*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#define NBMAX 2048 - -static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - register FLOAT *ap0 = ap[0]; - register FLOAT *ap1 = ap[1]; - register FLOAT *ap2 = ap[2]; - register FLOAT *ap3 = ap[3]; - - __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2])\n\t" - "vfmasb %%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3])\n\t" - "vfmasb %%v3,%%v16,%%v27,%%v3\n\t" - "vl %%v28,16(%%r1,%[ap0])\n\t" - "vfmasb %%v4,%%v17,%%v28,%%v4\n\t" - "vl %%v29,16(%%r1,%[ap1])\n\t" - "vfmasb %%v5,%%v17,%%v29,%%v5\n\t" - "vl %%v30,16(%%r1,%[ap2])\n\t" - "vfmasb %%v6,%%v17,%%v30,%%v6\n\t" - "vl %%v31,16(%%r1,%[ap3])\n\t" - "vfmasb %%v7,%%v17,%%v31,%%v7\n\t" - "vl %%v24,32(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v18,%%v24,%%v0\n\t" - "vl %%v25,32(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v18,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[ap2])\n\t" - "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,32(%%r1,%[ap3])\n\t" - "vfmasb %%v3,%%v18,%%v27,%%v3\n\t" - "vl %%v28,48(%%r1,%[ap0])\n\t" - "vfmasb %%v4,%%v19,%%v28,%%v4\n\t" - "vl %%v29,48(%%r1,%[ap1])\n\t" - "vfmasb %%v5,%%v19,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap2])\n\t" - "vfmasb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap3])\n\t" - "vfmasb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v20,%%v25,%%v1\n\t" - "vl %%v26,64(%%r1,%[ap2])\n\t" - "vfmasb %%v2,%%v20,%%v26,%%v2\n\t" - "vl %%v27,64(%%r1,%[ap3])\n\t" - "vfmasb %%v3,%%v20,%%v27,%%v3\n\t" - "vl %%v28,80(%%r1,%[ap0])\n\t" - "vfmasb %%v4,%%v21,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[ap1])\n\t" - "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,80(%%r1,%[ap2])\n\t" - "vfmasb %%v6,%%v21,%%v30,%%v6\n\t" - "vl %%v31,80(%%r1,%[ap3])\n\t" - "vfmasb %%v7,%%v21,%%v31,%%v7\n\t" - "vl %%v24,96(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v22,%%v24,%%v0\n\t" - "vl %%v25,96(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v22,%%v25,%%v1\n\t" - "vl %%v26,96(%%r1,%[ap2])\n\t" - "vfmasb %%v2,%%v22,%%v26,%%v2\n\t" - "vl %%v27,96(%%r1,%[ap3])\n\t" - "vfmasb %%v3,%%v22,%%v27,%%v3\n\t" - "vl %%v28,112(%%r1,%[ap0])\n\t" - "vfmasb %%v4,%%v23,%%v28,%%v4\n\t" - "vl %%v29,112(%%r1,%[ap1])\n\t" - "vfmasb %%v5,%%v23,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap2])\n\t" - "vfmasb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap3])\n\t" - "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,0(%%r1,%[ap2])\n\t" - "vfmasb %%v2,%%v16,%%v26,%%v2\n\t" - "vl %%v27,0(%%r1,%[ap3])\n\t" - "vfmasb %%v3,%%v16,%%v27,%%v3\n\t" - "agfi %%r1,16\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "vfasb %%v0,%%v0,%%v4\n\t" - "vfasb %%v1,%%v1,%%v5\n\t" - "vfasb %%v2,%%v2,%%v6\n\t" - "vfasb %%v3,%%v3,%%v7\n\t" - "veslg %%v4,%%v0,32\n\t" - "vfasb %%v0,%%v0,%%v4\n\t" - "vrepg %%v4,%%v0,1\n\t" - "aebr %%f0,%%f4\n\t" - "ste %%f0,0(%[y])\n\t" - "veslg %%v4,%%v1,32\n\t" - "vfasb %%v1,%%v1,%%v4\n\t" - "vrepg %%v4,%%v1,1\n\t" - "aebr %%f1,%%f4\n\t" - "ste %%f1,4(%[y])\n\t" - "veslg %%v4,%%v2,32\n\t" - "vfasb %%v2,%%v2,%%v4\n\t" - "vrepg %%v4,%%v2,1\n\t" - "aebr %%f2,%%f4\n\t" - "ste %%f2,8(%[y])\n\t" - "veslg %%v4,%%v3,32\n\t" - "vfasb %%v3,%%v3,%%v4\n\t" - "vrepg %%v4,%%v3,1\n\t" - "aebr %%f3,%%f4\n\t" - "ste %%f3,12(%[y])" - : "=m"(*(struct { FLOAT x[4]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n]; } *) ap2),[ap2] "a"(ap2), - "m"(*(const struct { FLOAT x[n]; } *) ap3),[ap3] "a"(ap3), - "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); -} - -static void sgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - register FLOAT *ap0 = ap[0]; - register FLOAT *ap1 = ap[1]; - - __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" - "vl %%v26,16(%%r1,%[ap0])\n\t" - "vfmasb %%v2,%%v17,%%v26,%%v2\n\t" - "vl %%v27,16(%%r1,%[ap1])\n\t" - "vfmasb %%v3,%%v17,%%v27,%%v3\n\t" - "vl %%v28,32(%%r1,%[ap0])\n\t" - "vfmasb %%v4,%%v18,%%v28,%%v4\n\t" - "vl %%v29,32(%%r1,%[ap1])\n\t" - "vfmasb %%v5,%%v18,%%v29,%%v5\n\t" - "vl %%v30,48(%%r1,%[ap0])\n\t" - "vfmasb %%v6,%%v19,%%v30,%%v6\n\t" - "vl %%v31,48(%%r1,%[ap1])\n\t" - "vfmasb %%v7,%%v19,%%v31,%%v7\n\t" - "vl %%v24,64(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v20,%%v24,%%v0\n\t" - "vl %%v25,64(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v20,%%v25,%%v1\n\t" - "vl %%v26,80(%%r1,%[ap0])\n\t" - "vfmasb %%v2,%%v21,%%v26,%%v2\n\t" - "vl %%v27,80(%%r1,%[ap1])\n\t" - "vfmasb %%v3,%%v21,%%v27,%%v3\n\t" - "vl %%v28,96(%%r1,%[ap0])\n\t" - "vfmasb %%v4,%%v22,%%v28,%%v4\n\t" - "vl %%v29,96(%%r1,%[ap1])\n\t" - "vfmasb %%v5,%%v22,%%v29,%%v5\n\t" - "vl %%v30,112(%%r1,%[ap0])\n\t" - "vfmasb %%v6,%%v23,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[ap1])\n\t" - "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[ap0])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,0(%%r1,%[ap1])\n\t" - "vfmasb %%v1,%%v16,%%v25,%%v1\n\t" - "agfi %%r1,16\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "vfasb %%v0,%%v0,%%v2\n\t" - "vfasb %%v0,%%v0,%%v4\n\t" - "vfasb %%v0,%%v0,%%v6\n\t" - "vfasb %%v1,%%v1,%%v3\n\t" - "vfasb %%v1,%%v1,%%v5\n\t" - "vfasb %%v1,%%v1,%%v7\n\t" - "veslg %%v2,%%v0,32\n\t" - "vfasb %%v0,%%v0,%%v2\n\t" - "vrepg %%v2,%%v0,1\n\t" - "aebr %%f0,%%f2\n\t" - "ste %%f0,0(%[y])\n\t" - "veslg %%v2,%%v1,32\n\t" - "vfasb %%v1,%%v1,%%v2\n\t" - "vrepg %%v2,%%v1,1\n\t" - "aebr %%f1,%%f2\n\t" - "ste %%f1,4(%[y])" - : "=m"(*(struct { FLOAT x[2]; } *) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); -} - -static void sgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *x, FLOAT *y) { - __asm__("vzero %%v0\n\t" - "vzero %%v1\n\t" - "vzero %%v2\n\t" - "vzero %%v3\n\t" - "vzero %%v4\n\t" - "vzero %%v5\n\t" - "vzero %%v6\n\t" - "vzero %%v7\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[a0])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[a0])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "vl %%v25,16(%%r1,%[a0])\n\t" - "vfmasb %%v1,%%v17,%%v25,%%v1\n\t" - "vl %%v26,32(%%r1,%[a0])\n\t" - "vfmasb %%v2,%%v18,%%v26,%%v2\n\t" - "vl %%v27,48(%%r1,%[a0])\n\t" - "vfmasb %%v3,%%v19,%%v27,%%v3\n\t" - "vl %%v28,64(%%r1,%[a0])\n\t" - "vfmasb %%v4,%%v20,%%v28,%%v4\n\t" - "vl %%v29,80(%%r1,%[a0])\n\t" - "vfmasb %%v5,%%v21,%%v29,%%v5\n\t" - "vl %%v30,96(%%r1,%[a0])\n\t" - "vfmasb %%v6,%%v22,%%v30,%%v6\n\t" - "vl %%v31,112(%%r1,%[a0])\n\t" - "vfmasb %%v7,%%v23,%%v31,%%v7\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[a0])\n\t" - "vfmasb %%v0,%%v16,%%v24,%%v0\n\t" - "agfi %%r1,16\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "vfasb %%v0,%%v0,%%v1\n\t" - "vfasb %%v0,%%v0,%%v2\n\t" - "vfasb %%v0,%%v0,%%v3\n\t" - "vfasb %%v0,%%v0,%%v4\n\t" - "vfasb %%v0,%%v0,%%v5\n\t" - "vfasb %%v0,%%v0,%%v6\n\t" - "vfasb %%v0,%%v0,%%v7\n\t" - "veslg %%v1,%%v0,32\n\t" - "vfasb %%v0,%%v0,%%v1\n\t" - "vrepg %%v1,%%v0,1\n\t" - "aebr %%f0,%%f1\n\t" - "ste %%f0,0(%[y])" - : "=m"(*(FLOAT (*)[1]) y) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n]; } *) a0),[a0] "a"(a0), - "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28", "v29", "v30", "v31"); -} - -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - dest[i] = *src; - src += inc_src; - } -} - -static void add_y_kernel_4(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) { - __asm__("vlrepf %%v0,%[da]\n\t" - "xgr %%r1,%%r1\n\t" - "lghi %%r0,-32\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 1f\n\t" - "srlg %%r0,%%r0,5\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[src])\n\t" - "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v18,32(%%r1,%[src])\n\t" - "vl %%v19,48(%%r1,%[src])\n\t" - "vl %%v20,64(%%r1,%[src])\n\t" - "vl %%v21,80(%%r1,%[src])\n\t" - "vl %%v22,96(%%r1,%[src])\n\t" - "vl %%v23,112(%%r1,%[src])\n\t" - "vl %%v24, 0(%%r1,%[dest])\n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" - "vst %%v24, 0(%%r1,%[dest])\n\t" - "vl %%v25, 16(%%r1,%[dest])\n\t" - "vfmasb %%v25,%%v17,%%v0,%%v25\n\t" - "vst %%v25, 16(%%r1,%[dest])\n\t" - "vl %%v26, 32(%%r1,%[dest])\n\t" - "vfmasb %%v26,%%v18,%%v0,%%v26\n\t" - "vst %%v26, 32(%%r1,%[dest])\n\t" - "vl %%v27, 48(%%r1,%[dest])\n\t" - "vfmasb %%v27,%%v19,%%v0,%%v27\n\t" - "vst %%v27, 48(%%r1,%[dest])\n\t" - "vl %%v28, 64(%%r1,%[dest])\n\t" - "vfmasb %%v28,%%v20,%%v0,%%v28\n\t" - "vst %%v28, 64(%%r1,%[dest])\n\t" - "vl %%v29, 80(%%r1,%[dest])\n\t" - "vfmasb %%v29,%%v21,%%v0,%%v29\n\t" - "vst %%v29, 80(%%r1,%[dest])\n\t" - "vl %%v30, 96(%%r1,%[dest])\n\t" - "vfmasb %%v30,%%v22,%%v0,%%v30\n\t" - "vst %%v30, 96(%%r1,%[dest])\n\t" - "vl %%v31, 112(%%r1,%[dest])\n\t" - "vfmasb %%v31,%%v23,%%v0,%%v31\n\t" - "vst %%v31, 112(%%r1,%[dest])\n\t" - "agfi %%r1,128\n\t" - "brctg %%r0,0b\n\t" - "1:\n\t" - "lghi %%r0,28\n\t" - "ngr %%r0,%[n]\n\t" - "ltgr %%r0,%%r0\n\t" - "jz 3f\n\t" - "srlg %%r0,%%r0,2\n\t" - "2:\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v24, 0(%%r1,%[dest])\n\t" - "vfmasb %%v24,%%v16,%%v0,%%v24\n\t" - "vst %%v24, 0(%%r1,%[dest])\n\t" - "agfi %%r1,16\n\t" - "brctg %%r0,2b\n\t" - "3:\n\t" - "nop" - : "+m"(*(struct { FLOAT x[n]; } *) dest) - : [dest] "a"(dest),[da] "Q"(da), "m"(*(const struct { FLOAT x[n]; } *) src), - [src] "a"(src),[n] "r"(n) - : "cc", "r0", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); -} -static void add_y(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest, - BLASLONG inc_dest) { - if (inc_dest == 1) - add_y_kernel_4(n, da, src, dest); - else { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest += src[i] * da; - dest += inc_dest; - } - } -} - -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, - BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, - FLOAT *buffer) { - BLASLONG register i; - BLASLONG register j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - BLASLONG n0; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - FLOAT ybuffer[2] __attribute__ ((aligned(16))); - FLOAT *xbuffer; - FLOAT *ytemp; - - if (m < 1) - return (0); - if (n < 1) - return (0); - - xbuffer = buffer; - ytemp = buffer + (m < NBMAX ? m : NBMAX); - - n0 = n / NBMAX; - n1 = (n % NBMAX) >> 2; - n2 = n & 3; - - m3 = m & 3; - m1 = m & -4; - m2 = (m & (NBMAX - 1)) - m3; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) - break; - NB = m2; - } - - y_ptr = y; - a_ptr = a; - x_ptr = x; - - if (inc_x == 1) - xbuffer = x_ptr; - else - copy_x(NB, x_ptr, xbuffer, inc_x); - - FLOAT *ap[4]; - FLOAT *yp; - BLASLONG register lda4 = 4 * lda; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - - if (n0 > 0) { - BLASLONG nb1 = NBMAX / 4; - for (j = 0; j < n0; j++) { - - yp = ytemp; - for (i = 0; i < nb1; i++) { - sgemv_kernel_4x4(NB, ap, xbuffer, yp); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - yp += 4; - } - add_y(nb1 * 4, alpha, ytemp, y_ptr, inc_y); - y_ptr += nb1 * inc_y * 4; - a_ptr += nb1 * lda4; - - } - - } - - yp = ytemp; - - for (i = 0; i < n1; i++) { - sgemv_kernel_4x4(NB, ap, xbuffer, yp); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - yp += 4; - } - if (n1 > 0) { - add_y(n1 * 4, alpha, ytemp, y_ptr, inc_y); - y_ptr += n1 * inc_y * 4; - a_ptr += n1 * lda4; - } - - if (n2 & 2) { - - sgemv_kernel_4x2(NB, ap, xbuffer, ybuffer); - a_ptr += lda * 2; - *y_ptr += ybuffer[0] * alpha; - y_ptr += inc_y; - *y_ptr += ybuffer[1] * alpha; - y_ptr += inc_y; - - } - - if (n2 & 1) { - - sgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer); - // a_ptr += lda; - *y_ptr += ybuffer[0] * alpha; - // y_ptr += inc_y; - - } - a += NB; - x += NB * inc_x; - } - - if (m3 == 0) - return (0); - - x_ptr = x; - a_ptr = a; - if (m3 == 3) { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp2 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if (lda == 3 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - y_ptr[j + 1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; - y_ptr[j + 2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; - y_ptr[j + 3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; - aj += 12; - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; - aj += 3; - } - - } else { - - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; - y_ptr[j + 1] += - *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1 + *(aj + lda + - 2) * xtemp2; - y_ptr[j + 2] += - *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1 + *(aj + lda2 + - 2) * xtemp2; - y_ptr[j + 3] += - *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1 + *(aj + lda3 + - 2) * xtemp2; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; - aj += lda; - } - - } else { - - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1 + *(aj + 2) * xtemp2; - y_ptr += inc_y; - aj += lda; - } - - } - - } - return (0); - } - - if (m3 == 2) { - FLOAT xtemp0 = *x_ptr * alpha; - x_ptr += inc_x; - FLOAT xtemp1 = *x_ptr * alpha; - - FLOAT *aj = a_ptr; - y_ptr = y; - - if (lda == 2 && inc_y == 1) { - - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - y_ptr[j + 1] += aj[2] * xtemp0 + aj[3] * xtemp1; - y_ptr[j + 2] += aj[4] * xtemp0 + aj[5] * xtemp1; - y_ptr[j + 3] += aj[6] * xtemp0 + aj[7] * xtemp1; - aj += 8; - - } - - for (; j < n; j++) { - y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1; - aj += 2; - } - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - - for (j = 0; j < (n & -4); j += 4) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr[j + 1] += *(aj + lda) * xtemp0 + *(aj + lda + 1) * xtemp1; - y_ptr[j + 2] += *(aj + lda2) * xtemp0 + *(aj + lda2 + 1) * xtemp1; - y_ptr[j + 3] += *(aj + lda3) * xtemp0 + *(aj + lda3 + 1) * xtemp1; - aj += lda4; - } - - for (; j < n; j++) { - - y_ptr[j] += *aj * xtemp0 + *(aj + 1) * xtemp1; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp0 + *(aj + 1) * xtemp1; - y_ptr += inc_y; - aj += lda; - } - } - - } - return (0); - - } - - FLOAT xtemp = *x_ptr * alpha; - FLOAT *aj = a_ptr; - y_ptr = y; - if (lda == 1 && inc_y == 1) { - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += aj[j] * xtemp; - y_ptr[j + 1] += aj[j + 1] * xtemp; - y_ptr[j + 2] += aj[j + 2] * xtemp; - y_ptr[j + 3] += aj[j + 3] * xtemp; - } - for (; j < n; j++) { - y_ptr[j] += aj[j] * xtemp; - } - - } else { - if (inc_y == 1) { - - BLASLONG register lda2 = lda << 1; - BLASLONG register lda4 = lda << 2; - BLASLONG register lda3 = lda2 + lda; - for (j = 0; j < (n & -4); j += 4) { - y_ptr[j] += *aj * xtemp; - y_ptr[j + 1] += *(aj + lda) * xtemp; - y_ptr[j + 2] += *(aj + lda2) * xtemp; - y_ptr[j + 3] += *(aj + lda3) * xtemp; - aj += lda4; - } - - for (; j < n; j++) { - y_ptr[j] += *aj * xtemp; - aj += lda; - } - - } else { - for (j = 0; j < n; j++) { - *y_ptr += *aj * xtemp; - y_ptr += inc_y; - aj += lda; - } - - } - } - - return (0); -} diff --git a/kernel/zarch/smax.c b/kernel/zarch/smax.c deleted file mode 100644 index 7015aaa1d..000000000 --- a/kernel/zarch/smax.c +++ /dev/null @@ -1,149 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -static FLOAT smax_kernel_64(BLASLONG n, FLOAT *x) { - FLOAT max; - - __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfmaxsb %%v16,%%v16,%%v24,0\n\t" - "vfmaxsb %%v17,%%v17,%%v25,0\n\t" - "vfmaxsb %%v18,%%v18,%%v26,0\n\t" - "vfmaxsb %%v19,%%v19,%%v27,0\n\t" - "vfmaxsb %%v20,%%v20,%%v28,0\n\t" - "vfmaxsb %%v21,%%v21,%%v29,0\n\t" - "vfmaxsb %%v22,%%v22,%%v30,0\n\t" - "vfmaxsb %%v23,%%v23,%%v31,0\n\t" - "vfmaxsb %%v16,%%v16,%%v20,0\n\t" - "vfmaxsb %%v17,%%v17,%%v21,0\n\t" - "vfmaxsb %%v18,%%v18,%%v22,0\n\t" - "vfmaxsb %%v19,%%v19,%%v23,0\n\t" - "vfmaxsb %%v16,%%v16,%%v18,0\n\t" - "vfmaxsb %%v17,%%v17,%%v19,0\n\t" - "vfmaxsb %%v16,%%v16,%%v17,0\n\t" - "vfmaxsb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v16,%%v0,32\n\t" - "vfmaxsb %%v0,%%v0,%%v16,0\n\t" - "vrepf %%v16,%%v0,2\n\t" - "wfmaxsb %%v0,%%v0,%%v16,0\n\t" - "ler %[max],%%f0" - : [max] "=f"(max),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - - return max; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT maxf = 0.0; - - if (n <= 0 || inc_x <= 0) - return (maxf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -64; - if (n1 > 0) { - - maxf = smax_kernel_64(n1, x); - - i = n1; - } else { - maxf = x[0]; - i++; - } - - while (i < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i++; - } - return (maxf); - - } else { - - maxf = x[0]; - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (x[i] > maxf) { - maxf = x[i]; - } - if (x[i + inc_x] > maxf) { - maxf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] > maxf) { - maxf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] > maxf) { - maxf = x[i + 3 * inc_x]; - } - - i += inc_x * 4; - - j += 4; - - } - - while (j < n) { - if (x[i] > maxf) { - maxf = x[i]; - } - i += inc_x; - j++; - } - return (maxf); - } -} diff --git a/kernel/zarch/smin.c b/kernel/zarch/smin.c deleted file mode 100644 index b6875c5c6..000000000 --- a/kernel/zarch/smin.c +++ /dev/null @@ -1,149 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -static FLOAT smin_kernel_64(BLASLONG n, FLOAT *x) { - FLOAT min; - - __asm__("vl %%v0,0(%[x])\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vl %%v24,128(%%r1,%[x])\n\t" - "vl %%v25,144(%%r1,%[x])\n\t" - "vl %%v26,160(%%r1,%[x])\n\t" - "vl %%v27,176(%%r1,%[x])\n\t" - "vl %%v28,192(%%r1,%[x])\n\t" - "vl %%v29,208(%%r1,%[x])\n\t" - "vl %%v30,224(%%r1,%[x])\n\t" - "vl %%v31,240(%%r1,%[x])\n\t" - "vfminsb %%v16,%%v16,%%v24,0\n\t" - "vfminsb %%v17,%%v17,%%v25,0\n\t" - "vfminsb %%v18,%%v18,%%v26,0\n\t" - "vfminsb %%v19,%%v19,%%v27,0\n\t" - "vfminsb %%v20,%%v20,%%v28,0\n\t" - "vfminsb %%v21,%%v21,%%v29,0\n\t" - "vfminsb %%v22,%%v22,%%v30,0\n\t" - "vfminsb %%v23,%%v23,%%v31,0\n\t" - "vfminsb %%v16,%%v16,%%v20,0\n\t" - "vfminsb %%v17,%%v17,%%v21,0\n\t" - "vfminsb %%v18,%%v18,%%v22,0\n\t" - "vfminsb %%v19,%%v19,%%v23,0\n\t" - "vfminsb %%v16,%%v16,%%v18,0\n\t" - "vfminsb %%v17,%%v17,%%v19,0\n\t" - "vfminsb %%v16,%%v16,%%v17,0\n\t" - "vfminsb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "veslg %%v16,%%v0,32\n\t" - "vfminsb %%v0,%%v0,%%v16,0\n\t" - "vrepf %%v16,%%v0,2\n\t" - "wfminsb %%v0,%%v0,%%v16,0\n\t" - "ler %[min],%%f0" - : [min] "=f"(min),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - - return min; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT minf = 0.0; - - if (n <= 0 || inc_x <= 0) - return (minf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -64; - if (n1 > 0) { - - minf = smin_kernel_64(n1, x); - - i = n1; - } else { - minf = x[0]; - i++; - } - - while (i < n) { - if (x[i] < minf) { - minf = x[i]; - } - i++; - } - return (minf); - - } else { - - minf = x[0]; - - BLASLONG n1 = n & -4; - while (j < n1) { - - if (x[i] < minf) { - minf = x[i]; - } - if (x[i + inc_x] < minf) { - minf = x[i + inc_x]; - } - if (x[i + 2 * inc_x] < minf) { - minf = x[i + 2 * inc_x]; - } - if (x[i + 3 * inc_x] < minf) { - minf = x[i + 3 * inc_x]; - } - - i += inc_x * 4; - - j += 4; - - } - - while (j < n) { - if (x[i] < minf) { - minf = x[i]; - } - i += inc_x; - j++; - } - return (minf); - } -} diff --git a/kernel/zarch/srot.c b/kernel/zarch/srot.c deleted file mode 100644 index 4f471d866..000000000 --- a/kernel/zarch/srot.c +++ /dev/null @@ -1,226 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -static void srot_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { - __asm__("vlrepf %%v0,%[c]\n\t" - "vlrepf %%v1,%[s]\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x])\n\t" - "vl %%v25, 16(%%r1,%[x])\n\t" - "vl %%v26, 32(%%r1,%[x])\n\t" - "vl %%v27, 48(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[y])\n\t" - "vl %%v17, 16(%%r1,%[y])\n\t" - "vl %%v18, 32(%%r1,%[y])\n\t" - "vl %%v19, 48(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x])\n\t" - "vst %%v29, 16(%%r1,%[x])\n\t" - "vst %%v30, 32(%%r1,%[x])\n\t" - "vst %%v31, 48(%%r1,%[x])\n\t" - "vst %%v20, 0(%%r1,%[y])\n\t" - "vst %%v21, 16(%%r1,%[y])\n\t" - "vst %%v22, 32(%%r1,%[y])\n\t" - "vst %%v23, 48(%%r1,%[y])\n\t" - "vl %%v24, 64(%%r1,%[x])\n\t" - "vl %%v25, 80(%%r1,%[x])\n\t" - "vl %%v26, 96(%%r1,%[x])\n\t" - "vl %%v27, 112(%%r1,%[x])\n\t" - "vl %%v16, 64(%%r1,%[y])\n\t" - "vl %%v17, 80(%%r1,%[y])\n\t" - "vl %%v18, 96(%%r1,%[y])\n\t" - "vl %%v19, 112(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x])\n\t" - "vst %%v29, 80(%%r1,%[x])\n\t" - "vst %%v30, 96(%%r1,%[x])\n\t" - "vst %%v31, 112(%%r1,%[x])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v16, 128(%%r1,%[y])\n\t" - "vl %%v17, 144(%%r1,%[y])\n\t" - "vl %%v18, 160(%%r1,%[y])\n\t" - "vl %%v19, 176(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x])\n\t" - "vst %%v29, 144(%%r1,%[x])\n\t" - "vst %%v30, 160(%%r1,%[x])\n\t" - "vst %%v31, 176(%%r1,%[x])\n\t" - "vst %%v20, 128(%%r1,%[y])\n\t" - "vst %%v21, 144(%%r1,%[y])\n\t" - "vst %%v22, 160(%%r1,%[y])\n\t" - "vst %%v23, 176(%%r1,%[y])\n\t" - "vl %%v24, 192(%%r1,%[x])\n\t" - "vl %%v25, 208(%%r1,%[x])\n\t" - "vl %%v26, 224(%%r1,%[x])\n\t" - "vl %%v27, 240(%%r1,%[x])\n\t" - "vl %%v16, 192(%%r1,%[y])\n\t" - "vl %%v17, 208(%%r1,%[y])\n\t" - "vl %%v18, 224(%%r1,%[y])\n\t" - "vl %%v19, 240(%%r1,%[y])\n\t" - "vfmsb %%v28,%%v24,%%v0\n\t" - "vfmsb %%v29,%%v25,%%v0\n\t" - "vfmsb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v30,%%v26,%%v0\n\t" - "vfmsb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmsb %%v31,%%v27,%%v0\n\t" - "vfmsb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmasb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmssb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmasb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmssb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmasb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmssb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmasb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmssb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 192(%%r1,%[x])\n\t" - "vst %%v29, 208(%%r1,%[x])\n\t" - "vst %%v30, 224(%%r1,%[x])\n\t" - "vst %%v31, 240(%%r1,%[x])\n\t" - "vst %%v20, 192(%%r1,%[y])\n\t" - "vst %%v21, 208(%%r1,%[y])\n\t" - "vst %%v22, 224(%%r1,%[y])\n\t" - "vst %%v23, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), - [n] "+&r"(n) - : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); -} - -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, - FLOAT c, FLOAT s) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - - FLOAT temp; - - if (n <= 0) - return (0); - - if ((inc_x == 1) && (inc_y == 1)) { - - BLASLONG n1 = n & -64; - if (n1 > 0) { - FLOAT cosa, sina; - cosa = c; - sina = s; - srot_kernel_64(n1, x, y, &cosa, &sina); - i = n1; - } - - while (i < n) { - temp = c * x[i] + s * y[i]; - y[i] = c * y[i] - s * x[i]; - x[i] = temp; - - i++; - - } - - } else { - - while (i < n) { - temp = c * x[ix] + s * y[iy]; - y[iy] = c * y[iy] - s * x[ix]; - x[ix] = temp; - - ix += inc_x; - iy += inc_y; - i++; - - } - - } - return (0); - -} diff --git a/kernel/zarch/sscal.c b/kernel/zarch/sscal.c deleted file mode 100644 index 9b9930dc8..000000000 --- a/kernel/zarch/sscal.c +++ /dev/null @@ -1,173 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -static void sscal_kernel_32(BLASLONG n, FLOAT da, FLOAT *x) { - __asm__("vlrepf %%v0,%[da]\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v24,0(%%r1,%[x])\n\t" - "vfmsb %%v24,%%v24,%%v0\n\t" - "vst %%v24,0(%%r1,%[x])\n\t" - "vl %%v25,16(%%r1,%[x])\n\t" - "vfmsb %%v25,%%v25,%%v0\n\t" - "vst %%v25,16(%%r1,%[x])\n\t" - "vl %%v26,32(%%r1,%[x])\n\t" - "vfmsb %%v26,%%v26,%%v0\n\t" - "vst %%v26,32(%%r1,%[x])\n\t" - "vl %%v27,48(%%r1,%[x])\n\t" - "vfmsb %%v27,%%v27,%%v0\n\t" - "vst %%v27,48(%%r1,%[x])\n\t" - "vl %%v28,64(%%r1,%[x])\n\t" - "vfmsb %%v28,%%v28,%%v0\n\t" - "vst %%v28,64(%%r1,%[x])\n\t" - "vl %%v29,80(%%r1,%[x])\n\t" - "vfmsb %%v29,%%v29,%%v0\n\t" - "vst %%v29,80(%%r1,%[x])\n\t" - "vl %%v30,96(%%r1,%[x])\n\t" - "vfmsb %%v30,%%v30,%%v0\n\t" - "vst %%v30,96(%%r1,%[x])\n\t" - "vl %%v31,112(%%r1,%[x])\n\t" - "vfmsb %%v31,%%v31,%%v0\n\t" - "vst %%v31,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n) - : [x] "a"(x),[da] "Q"(da) - : "cc", "r1", "v0", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); -} - -static void sscal_kernel_32_zero(BLASLONG n, FLOAT *x) { - __asm__("vzero %%v0\n\t" - "srlg %[n],%[n],5\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x])\n\t" - "vst %%v0,16(%%r1,%[x])\n\t" - "vst %%v0,32(%%r1,%[x])\n\t" - "vst %%v0,48(%%r1,%[x])\n\t" - "vst %%v0,64(%%r1,%[x])\n\t" - "vst %%v0,80(%%r1,%[x])\n\t" - "vst %%v0,96(%%r1,%[x])\n\t" - "vst %%v0,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "=m"(*(struct { FLOAT x[n]; } *) x),[n] "+&r"(n) - : [x] "a"(x) - : "cc", "r1", "v0"); -} - -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, - BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, - BLASLONG dummy2) { - BLASLONG i = 0, j = 0; - if (n <= 0 || inc_x <= 0) - return (0); - - if (inc_x == 1) { - - if (da == 0.0) { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - - sscal_kernel_32_zero(n1, x); - j = n1; - } - - while (j < n) { - - x[j] = 0.0; - j++; - } - - } else { - - BLASLONG n1 = n & -32; - if (n1 > 0) { - sscal_kernel_32(n1, da, x); - j = n1; - } - while (j < n) { - - x[j] = da * x[j]; - j++; - } - } - - } else { - - if (da == 0.0) { - - BLASLONG n1 = n & -2; - - while (j < n1) { - - x[i] = 0.0; - x[i + inc_x] = 0.0; - - i += inc_x * 2; - j += 2; - - } - while (j < n) { - - x[i] = 0.0; - i += inc_x; - j++; - } - - } else { - BLASLONG n1 = n & -2; - - while (j < n1) { - - x[i] = da * x[i]; - x[i + inc_x] = da * x[i + inc_x]; - - i += inc_x * 2; - j += 2; - - } - - while (j < n) { - - x[i] = da * x[i]; - i += inc_x; - j++; - } - } - - } - return 0; - -} diff --git a/kernel/zarch/ssum.c b/kernel/zarch/ssum.c deleted file mode 100644 index a433ab592..000000000 --- a/kernel/zarch/ssum.c +++ /dev/null @@ -1,151 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - - -static FLOAT ssum_kernel_64(BLASLONG n, FLOAT *x) { - FLOAT sum; - - __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vfasb %%v24,%%v24,%%v16\n\t" - "vfasb %%v25,%%v25,%%v17\n\t" - "vfasb %%v26,%%v26,%%v18\n\t" - "vfasb %%v27,%%v27,%%v19\n\t" - "vfasb %%v28,%%v28,%%v20\n\t" - "vfasb %%v29,%%v29,%%v21\n\t" - "vfasb %%v30,%%v30,%%v22\n\t" - "vfasb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x])\n\t" - "vl %%v17, 144(%%r1,%[x])\n\t" - "vl %%v18, 160(%%r1,%[x])\n\t" - "vl %%v19, 176(%%r1,%[x])\n\t" - "vl %%v20, 192(%%r1,%[x])\n\t" - "vl %%v21, 208(%%r1,%[x])\n\t" - "vl %%v22, 224(%%r1,%[x])\n\t" - "vl %%v23, 240(%%r1,%[x])\n\t" - "vfasb %%v24,%%v24,%%v16\n\t" - "vfasb %%v25,%%v25,%%v17\n\t" - "vfasb %%v26,%%v26,%%v18\n\t" - "vfasb %%v27,%%v27,%%v19\n\t" - "vfasb %%v28,%%v28,%%v20\n\t" - "vfasb %%v29,%%v29,%%v21\n\t" - "vfasb %%v30,%%v30,%%v22\n\t" - "vfasb %%v31,%%v31,%%v23\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vfasb %%v24,%%v24,%%v26\n\t" - "vfasb %%v24,%%v24,%%v27\n\t" - "vfasb %%v24,%%v24,%%v28\n\t" - "vfasb %%v24,%%v24,%%v29\n\t" - "vfasb %%v24,%%v24,%%v30\n\t" - "vfasb %%v24,%%v24,%%v31\n\t" - "veslg %%v25,%%v24,32\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vrepf %%v25,%%v24,2\n\t" - "vfasb %%v24,%%v24,%%v25\n\t" - "vstef %%v24,%[asum],0" - : [sum] "=Q"(sum),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n]; } *) x),[x] "a"(x) - : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - - return sum; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG j = 0; - FLOAT sumf = 0.0; - BLASLONG n1; - - if (n <= 0 || inc_x <= 0) - return sumf; - - if (inc_x == 1) { - - n1 = n & -64; - - if (n1 > 0) { - - sumf = ssum_kernel_64(n1, x); - i = n1; - } - - while (i < n) { - sumf += x[i]; - i++; - } - - } else { - BLASLONG n1 = n & -4; - register FLOAT sum1, sum2; - sum1 = 0.0; - sum2 = 0.0; - while (j < n1) { - - sum1 += x[i]; - sum2 += x[i + inc_x]; - sum1 += x[i + 2 * inc_x]; - sum2 += x[i + 3 * inc_x]; - - i += inc_x * 4; - j += 4; - - } - sumf = sum1 + sum2; - while (j < n) { - - sumf += x[i]; - i += inc_x; - j++; - } - - } - return sumf; -} diff --git a/kernel/zarch/sswap.c b/kernel/zarch/sswap.c deleted file mode 100644 index 0c62f189d..000000000 --- a/kernel/zarch/sswap.c +++ /dev/null @@ -1,151 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -static void sswap_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y) { - __asm__("srlg %[n],%[n],6\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v28, 192(%%r1,%[x])\n\t" - "vl %%v29, 208(%%r1,%[x])\n\t" - "vl %%v30, 224(%%r1,%[x])\n\t" - "vl %%v31, 240(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "vl %%v4, 64(%%r1,%[y])\n\t" - "vl %%v5, 80(%%r1,%[y])\n\t" - "vl %%v6, 96(%%r1,%[y])\n\t" - "vl %%v7, 112(%%r1,%[y])\n\t" - "vst %%v0, 0(%%r1,%[x])\n\t" - "vst %%v1, 16(%%r1,%[x])\n\t" - "vst %%v2, 32(%%r1,%[x])\n\t" - "vst %%v3, 48(%%r1,%[x])\n\t" - "vst %%v4, 64(%%r1,%[x])\n\t" - "vst %%v5, 80(%%r1,%[x])\n\t" - "vst %%v6, 96(%%r1,%[x])\n\t" - "vst %%v7, 112(%%r1,%[x])\n\t" - "vl %%v0, 128(%%r1,%[y])\n\t" - "vl %%v1, 144(%%r1,%[y])\n\t" - "vl %%v2, 160(%%r1,%[y])\n\t" - "vl %%v3, 176(%%r1,%[y])\n\t" - "vl %%v4, 192(%%r1,%[y])\n\t" - "vl %%v5, 208(%%r1,%[y])\n\t" - "vl %%v6, 224(%%r1,%[y])\n\t" - "vl %%v7, 240(%%r1,%[y])\n\t" - "vst %%v0, 128(%%r1,%[x])\n\t" - "vst %%v1, 144(%%r1,%[x])\n\t" - "vst %%v2, 160(%%r1,%[x])\n\t" - "vst %%v3, 176(%%r1,%[x])\n\t" - "vst %%v4, 192(%%r1,%[x])\n\t" - "vst %%v5, 208(%%r1,%[x])\n\t" - "vst %%v6, 224(%%r1,%[x])\n\t" - "vst %%v7, 240(%%r1,%[x])\n\t" - "vst %%v16, 0(%%r1,%[y])\n\t" - "vst %%v17, 16(%%r1,%[y])\n\t" - "vst %%v18, 32(%%r1,%[y])\n\t" - "vst %%v19, 48(%%r1,%[y])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vst %%v24, 128(%%r1,%[y])\n\t" - "vst %%v25, 144(%%r1,%[y])\n\t" - "vst %%v26, 160(%%r1,%[y])\n\t" - "vst %%v27, 176(%%r1,%[y])\n\t" - "vst %%v28, 192(%%r1,%[y])\n\t" - "vst %%v29, 208(%%r1,%[y])\n\t" - "vst %%v30, 224(%%r1,%[y])\n\t" - "vst %%v31, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n]; } *) x), "+m"(*(struct { FLOAT x[n]; } *) y), - [n] "+&r"(n) - : [x] "a"(x),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); -} - -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, - BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, - BLASLONG dummy2) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - FLOAT temp; - - if (n <= 0) - return (0); - - if ((inc_x == 1) && (inc_y == 1)) { - - BLASLONG n1 = n & -64; - if (n1 > 0) { - sswap_kernel_64(n1, x, y); - i = n1; - } - - while (i < n) { - temp = y[i]; - y[i] = x[i]; - x[i] = temp; - i++; - - } - - } else { - - while (i < n) { - temp = y[iy]; - y[iy] = x[ix]; - x[ix] = temp; - ix += inc_x; - iy += inc_y; - i++; - - } - - } - return (0); - -} diff --git a/kernel/zarch/zamax.c b/kernel/zarch/zamax.c deleted file mode 100644 index aa04ab91f..000000000 --- a/kernel/zarch/zamax.c +++ /dev/null @@ -1,192 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) - -static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) { - FLOAT amax; - - __asm__("vleg %%v0,0(%[x]),0\n\t" - "vleg %%v16,8(%[x]),0\n\t" - "vleg %%v0,16(%[x]),1\n\t" - "vleg %%v16,24(%[x]),1\n\t" - "vflpdb %%v0,%%v0\n\t" - "vflpdb %%v16,%%v16\n\t" - "vfadb %%v0,%%v0,%%v16\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vleg %%v16,0(%%r1,%[x]),0\n\t" - "vleg %%v17,8(%%r1,%[x]),0\n\t" - "vleg %%v16,16(%%r1,%[x]),1\n\t" - "vleg %%v17,24(%%r1,%[x]),1\n\t" - "vleg %%v18,32(%%r1,%[x]),0\n\t" - "vleg %%v19,40(%%r1,%[x]),0\n\t" - "vleg %%v18,48(%%r1,%[x]),1\n\t" - "vleg %%v19,56(%%r1,%[x]),1\n\t" - "vleg %%v20,64(%%r1,%[x]),0\n\t" - "vleg %%v21,72(%%r1,%[x]),0\n\t" - "vleg %%v20,80(%%r1,%[x]),1\n\t" - "vleg %%v21,88(%%r1,%[x]),1\n\t" - "vleg %%v22,96(%%r1,%[x]),0\n\t" - "vleg %%v23,104(%%r1,%[x]),0\n\t" - "vleg %%v22,112(%%r1,%[x]),1\n\t" - "vleg %%v23,120(%%r1,%[x]),1\n\t" - "vleg %%v24,128(%%r1,%[x]),0\n\t" - "vleg %%v25,136(%%r1,%[x]),0\n\t" - "vleg %%v24,144(%%r1,%[x]),1\n\t" - "vleg %%v25,152(%%r1,%[x]),1\n\t" - "vleg %%v26,160(%%r1,%[x]),0\n\t" - "vleg %%v27,168(%%r1,%[x]),0\n\t" - "vleg %%v26,176(%%r1,%[x]),1\n\t" - "vleg %%v27,184(%%r1,%[x]),1\n\t" - "vleg %%v28,192(%%r1,%[x]),0\n\t" - "vleg %%v29,200(%%r1,%[x]),0\n\t" - "vleg %%v28,208(%%r1,%[x]),1\n\t" - "vleg %%v29,216(%%r1,%[x]),1\n\t" - "vleg %%v30,224(%%r1,%[x]),0\n\t" - "vleg %%v31,232(%%r1,%[x]),0\n\t" - "vleg %%v30,240(%%r1,%[x]),1\n\t" - "vleg %%v31,248(%%r1,%[x]),1\n\t" - "vflpdb %%v16,%%v16\n\t" - "vflpdb %%v17,%%v17\n\t" - "vflpdb %%v18,%%v18\n\t" - "vflpdb %%v19,%%v19\n\t" - "vflpdb %%v20,%%v20\n\t" - "vflpdb %%v21,%%v21\n\t" - "vflpdb %%v22,%%v22\n\t" - "vflpdb %%v23,%%v23\n\t" - "vflpdb %%v24,%%v24\n\t" - "vflpdb %%v25,%%v25\n\t" - "vflpdb %%v26,%%v26\n\t" - "vflpdb %%v27,%%v27\n\t" - "vflpdb %%v28,%%v28\n\t" - "vflpdb %%v29,%%v29\n\t" - "vflpdb %%v30,%%v30\n\t" - "vflpdb %%v31,%%v31\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v18,%%v18,%%v19\n\t" - "vfadb %%v20,%%v20,%%v21\n\t" - "vfadb %%v22,%%v22,%%v23\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vfadb %%v26,%%v26,%%v27\n\t" - "vfadb %%v28,%%v28,%%v29\n\t" - "vfadb %%v30,%%v30,%%v31\n\t" - "vfmaxdb %%v16,%%v16,%%v24,0\n\t" - "vfmaxdb %%v18,%%v18,%%v26,0\n\t" - "vfmaxdb %%v20,%%v20,%%v28,0\n\t" - "vfmaxdb %%v22,%%v22,%%v30,0\n\t" - "vfmaxdb %%v16,%%v16,%%v20,0\n\t" - "vfmaxdb %%v18,%%v18,%%v22,0\n\t" - "vfmaxdb %%v16,%%v16,%%v18,0\n\t" - "vfmaxdb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfmaxdb %%v0,%%v0,%%v16,0\n\t" - "ldr %[amax],%%f0" - : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - - return amax; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) - return (maxf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -16; - if (n1 > 0) { - - maxf = zamax_kernel_16(n1, x); - ix = n1 * 2; - i = n1; - } else { - maxf = CABS1(x, 0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x, ix) > maxf) { - maxf = CABS1(x, ix); - } - ix += 2; - i++; - } - return (maxf); - - } else { - - maxf = CABS1(x, 0); - inc_x2 = 2 * inc_x; - - BLASLONG n1 = n & -4; - while (i < n1) { - - if (CABS1(x, ix) > maxf) { - maxf = CABS1(x, ix); - } - if (CABS1(x, ix + inc_x2) > maxf) { - maxf = CABS1(x, ix + inc_x2); - } - if (CABS1(x, ix + inc_x2 * 2) > maxf) { - maxf = CABS1(x, ix + inc_x2 * 2); - } - if (CABS1(x, ix + inc_x2 * 3) > maxf) { - maxf = CABS1(x, ix + inc_x2 * 3); - } - - ix += inc_x2 * 4; - - i += 4; - - } - - while (i < n) { - if (CABS1(x, ix) > maxf) { - maxf = CABS1(x, ix); - } - ix += inc_x2; - i++; - } - return (maxf); - } -} diff --git a/kernel/zarch/zamax_z13.c b/kernel/zarch/zamax_z13.c deleted file mode 100644 index 37278d6db..000000000 --- a/kernel/zarch/zamax_z13.c +++ /dev/null @@ -1,201 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) - -static FLOAT zamax_kernel_16(BLASLONG n, FLOAT *x) { - FLOAT amax; - - __asm__("vleg %%v0,0(%[x]),0\n\t" - "vleg %%v16,8(%[x]),0\n\t" - "vleg %%v0,16(%[x]),1\n\t" - "vleg %%v16,24(%[x]),1\n\t" - "vflpdb %%v0,%%v0\n\t" - "vflpdb %%v16,%%v16\n\t" - "vfadb %%v0,%%v0,%%v16\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vleg %%v16,0(%%r1,%[x]),0\n\t" - "vleg %%v17,8(%%r1,%[x]),0\n\t" - "vleg %%v16,16(%%r1,%[x]),1\n\t" - "vleg %%v17,24(%%r1,%[x]),1\n\t" - "vleg %%v18,32(%%r1,%[x]),0\n\t" - "vleg %%v19,40(%%r1,%[x]),0\n\t" - "vleg %%v18,48(%%r1,%[x]),1\n\t" - "vleg %%v19,56(%%r1,%[x]),1\n\t" - "vleg %%v20,64(%%r1,%[x]),0\n\t" - "vleg %%v21,72(%%r1,%[x]),0\n\t" - "vleg %%v20,80(%%r1,%[x]),1\n\t" - "vleg %%v21,88(%%r1,%[x]),1\n\t" - "vleg %%v22,96(%%r1,%[x]),0\n\t" - "vleg %%v23,104(%%r1,%[x]),0\n\t" - "vleg %%v22,112(%%r1,%[x]),1\n\t" - "vleg %%v23,120(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchdb %%v24,%%v16,%%v17\n\t" - "vfchdb %%v25,%%v18,%%v19\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vfchdb %%v26,%%v24,%%v25\n\t" - "vsel %%v26,%%v24,%%v25,%%v26\n\t" - "vfchdb %%v27,%%v26,%%v0\n\t" - "vsel %%v0,%%v26,%%v0,%%v27\n\t" - "vleg %%v16,128(%%r1,%[x]),0\n\t" - "vleg %%v17,136(%%r1,%[x]),0\n\t" - "vleg %%v16,144(%%r1,%[x]),1\n\t" - "vleg %%v17,152(%%r1,%[x]),1\n\t" - "vleg %%v18,160(%%r1,%[x]),0\n\t" - "vleg %%v19,168(%%r1,%[x]),0\n\t" - "vleg %%v18,176(%%r1,%[x]),1\n\t" - "vleg %%v19,184(%%r1,%[x]),1\n\t" - "vleg %%v20,192(%%r1,%[x]),0\n\t" - "vleg %%v21,200(%%r1,%[x]),0\n\t" - "vleg %%v20,208(%%r1,%[x]),1\n\t" - "vleg %%v21,216(%%r1,%[x]),1\n\t" - "vleg %%v22,224(%%r1,%[x]),0\n\t" - "vleg %%v23,232(%%r1,%[x]),0\n\t" - "vleg %%v22,240(%%r1,%[x]),1\n\t" - "vleg %%v23,248(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchdb %%v24,%%v16,%%v17\n\t" - "vfchdb %%v25,%%v18,%%v19\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vfchdb %%v26,%%v24,%%v25\n\t" - "vsel %%v26,%%v24,%%v25,%%v26\n\t" - "vfchdb %%v27,%%v26,%%v0\n\t" - "vsel %%v0,%%v26,%%v0,%%v27\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfchdb %%v17,%%v0,%%v16\n\t" - "vsel %%v0,%%v0,%%v16,%%v17\n\t" - "ldr %[amax],%%f0" - : [amax] "=f"(amax),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27"); - - return amax; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT maxf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) - return (maxf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -16; - if (n1 > 0) { - - maxf = zamax_kernel_16(n1, x); - ix = n1 * 2; - i = n1; - } else { - maxf = CABS1(x, 0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x, ix) > maxf) { - maxf = CABS1(x, ix); - } - ix += 2; - i++; - } - return (maxf); - - } else { - - maxf = CABS1(x, 0); - inc_x2 = 2 * inc_x; - - BLASLONG n1 = n & -4; - while (i < n1) { - - if (CABS1(x, ix) > maxf) { - maxf = CABS1(x, ix); - } - if (CABS1(x, ix + inc_x2) > maxf) { - maxf = CABS1(x, ix + inc_x2); - } - if (CABS1(x, ix + inc_x2 * 2) > maxf) { - maxf = CABS1(x, ix + inc_x2 * 2); - } - if (CABS1(x, ix + inc_x2 * 3) > maxf) { - maxf = CABS1(x, ix + inc_x2 * 3); - } - - ix += inc_x2 * 4; - - i += 4; - - } - - while (i < n) { - if (CABS1(x, ix) > maxf) { - maxf = CABS1(x, ix); - } - ix += inc_x2; - i++; - } - return (maxf); - } -} diff --git a/kernel/zarch/zamin.c b/kernel/zarch/zamin.c deleted file mode 100644 index 0b5402853..000000000 --- a/kernel/zarch/zamin.c +++ /dev/null @@ -1,192 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) - -static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) { - FLOAT amin; - - __asm__("vleg %%v0,0(%[x]),0\n\t" - "vleg %%v16,8(%[x]),0\n\t" - "vleg %%v0,16(%[x]),1\n\t" - "vleg %%v16,24(%[x]),1\n\t" - "vflpdb %%v0,%%v0\n\t" - "vflpdb %%v16,%%v16\n\t" - "vfadb %%v0,%%v0,%%v16\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vleg %%v16,0(%%r1,%[x]),0\n\t" - "vleg %%v17,8(%%r1,%[x]),0\n\t" - "vleg %%v16,16(%%r1,%[x]),1\n\t" - "vleg %%v17,24(%%r1,%[x]),1\n\t" - "vleg %%v18,32(%%r1,%[x]),0\n\t" - "vleg %%v19,40(%%r1,%[x]),0\n\t" - "vleg %%v18,48(%%r1,%[x]),1\n\t" - "vleg %%v19,56(%%r1,%[x]),1\n\t" - "vleg %%v20,64(%%r1,%[x]),0\n\t" - "vleg %%v21,72(%%r1,%[x]),0\n\t" - "vleg %%v20,80(%%r1,%[x]),1\n\t" - "vleg %%v21,88(%%r1,%[x]),1\n\t" - "vleg %%v22,96(%%r1,%[x]),0\n\t" - "vleg %%v23,104(%%r1,%[x]),0\n\t" - "vleg %%v22,112(%%r1,%[x]),1\n\t" - "vleg %%v23,120(%%r1,%[x]),1\n\t" - "vleg %%v24,128(%%r1,%[x]),0\n\t" - "vleg %%v25,136(%%r1,%[x]),0\n\t" - "vleg %%v24,144(%%r1,%[x]),1\n\t" - "vleg %%v25,152(%%r1,%[x]),1\n\t" - "vleg %%v26,160(%%r1,%[x]),0\n\t" - "vleg %%v27,168(%%r1,%[x]),0\n\t" - "vleg %%v26,176(%%r1,%[x]),1\n\t" - "vleg %%v27,184(%%r1,%[x]),1\n\t" - "vleg %%v28,192(%%r1,%[x]),0\n\t" - "vleg %%v29,200(%%r1,%[x]),0\n\t" - "vleg %%v28,208(%%r1,%[x]),1\n\t" - "vleg %%v29,216(%%r1,%[x]),1\n\t" - "vleg %%v30,224(%%r1,%[x]),0\n\t" - "vleg %%v31,232(%%r1,%[x]),0\n\t" - "vleg %%v30,240(%%r1,%[x]),1\n\t" - "vleg %%v31,248(%%r1,%[x]),1\n\t" - "vflpdb %%v16,%%v16\n\t" - "vflpdb %%v17,%%v17\n\t" - "vflpdb %%v18,%%v18\n\t" - "vflpdb %%v19,%%v19\n\t" - "vflpdb %%v20,%%v20\n\t" - "vflpdb %%v21,%%v21\n\t" - "vflpdb %%v22,%%v22\n\t" - "vflpdb %%v23,%%v23\n\t" - "vflpdb %%v24,%%v24\n\t" - "vflpdb %%v25,%%v25\n\t" - "vflpdb %%v26,%%v26\n\t" - "vflpdb %%v27,%%v27\n\t" - "vflpdb %%v28,%%v28\n\t" - "vflpdb %%v29,%%v29\n\t" - "vflpdb %%v30,%%v30\n\t" - "vflpdb %%v31,%%v31\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v18,%%v18,%%v19\n\t" - "vfadb %%v20,%%v20,%%v21\n\t" - "vfadb %%v22,%%v22,%%v23\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vfadb %%v26,%%v26,%%v27\n\t" - "vfadb %%v28,%%v28,%%v29\n\t" - "vfadb %%v30,%%v30,%%v31\n\t" - "vfmindb %%v16,%%v16,%%v24,0\n\t" - "vfmindb %%v18,%%v18,%%v26,0\n\t" - "vfmindb %%v20,%%v20,%%v28,0\n\t" - "vfmindb %%v22,%%v22,%%v30,0\n\t" - "vfmindb %%v16,%%v16,%%v20,0\n\t" - "vfmindb %%v18,%%v18,%%v22,0\n\t" - "vfmindb %%v16,%%v16,%%v18,0\n\t" - "vfmindb %%v0,%%v0,%%v16,0\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfmindb %%v0,%%v0,%%v16,0\n\t" - "ldr %[amin],%%f0" - : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - - return amin; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT minf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) - return (minf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -16; - if (n1 > 0) { - - minf = zamin_kernel_16(n1, x); - ix = n1 * 2; - i = n1; - } else { - minf = CABS1(x, 0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x, ix) < minf) { - minf = CABS1(x, ix); - } - ix += 2; - i++; - } - return (minf); - - } else { - - minf = CABS1(x, 0); - inc_x2 = 2 * inc_x; - - BLASLONG n1 = n & -4; - while (i < n1) { - - if (CABS1(x, ix) < minf) { - minf = CABS1(x, ix); - } - if (CABS1(x, ix + inc_x2) < minf) { - minf = CABS1(x, ix + inc_x2); - } - if (CABS1(x, ix + inc_x2 * 2) < minf) { - minf = CABS1(x, ix + inc_x2 * 2); - } - if (CABS1(x, ix + inc_x2 * 3) < minf) { - minf = CABS1(x, ix + inc_x2 * 3); - } - - ix += inc_x2 * 4; - - i += 4; - - } - - while (i < n) { - if (CABS1(x, ix) < minf) { - minf = CABS1(x, ix); - } - ix += inc_x2; - i++; - } - return (minf); - } -} diff --git a/kernel/zarch/zamin_z13.c b/kernel/zarch/zamin_z13.c deleted file mode 100644 index e37bb2236..000000000 --- a/kernel/zarch/zamin_z13.c +++ /dev/null @@ -1,201 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -#define CABS1(x,i) (fabs(x[i]) + fabs(x[i + 1])) - -static FLOAT zamin_kernel_16(BLASLONG n, FLOAT *x) { - FLOAT amin; - - __asm__("vleg %%v0,0(%[x]),0\n\t" - "vleg %%v16,8(%[x]),0\n\t" - "vleg %%v0,16(%[x]),1\n\t" - "vleg %%v16,24(%[x]),1\n\t" - "vflpdb %%v0,%%v0\n\t" - "vflpdb %%v16,%%v16\n\t" - "vfadb %%v0,%%v0,%%v16\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vleg %%v16,0(%%r1,%[x]),0\n\t" - "vleg %%v17,8(%%r1,%[x]),0\n\t" - "vleg %%v16,16(%%r1,%[x]),1\n\t" - "vleg %%v17,24(%%r1,%[x]),1\n\t" - "vleg %%v18,32(%%r1,%[x]),0\n\t" - "vleg %%v19,40(%%r1,%[x]),0\n\t" - "vleg %%v18,48(%%r1,%[x]),1\n\t" - "vleg %%v19,56(%%r1,%[x]),1\n\t" - "vleg %%v20,64(%%r1,%[x]),0\n\t" - "vleg %%v21,72(%%r1,%[x]),0\n\t" - "vleg %%v20,80(%%r1,%[x]),1\n\t" - "vleg %%v21,88(%%r1,%[x]),1\n\t" - "vleg %%v22,96(%%r1,%[x]),0\n\t" - "vleg %%v23,104(%%r1,%[x]),0\n\t" - "vleg %%v22,112(%%r1,%[x]),1\n\t" - "vleg %%v23,120(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchdb %%v24,%%v17,%%v16\n\t" - "vfchdb %%v25,%%v19,%%v18\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vfchdb %%v26,%%v25,%%v24\n\t" - "vsel %%v26,%%v24,%%v25,%%v26\n\t" - "vfchdb %%v27,%%v0,%%v26\n\t" - "vsel %%v0,%%v26,%%v0,%%v27\n\t" - "vleg %%v16,128(%%r1,%[x]),0\n\t" - "vleg %%v17,136(%%r1,%[x]),0\n\t" - "vleg %%v16,144(%%r1,%[x]),1\n\t" - "vleg %%v17,152(%%r1,%[x]),1\n\t" - "vleg %%v18,160(%%r1,%[x]),0\n\t" - "vleg %%v19,168(%%r1,%[x]),0\n\t" - "vleg %%v18,176(%%r1,%[x]),1\n\t" - "vleg %%v19,184(%%r1,%[x]),1\n\t" - "vleg %%v20,192(%%r1,%[x]),0\n\t" - "vleg %%v21,200(%%r1,%[x]),0\n\t" - "vleg %%v20,208(%%r1,%[x]),1\n\t" - "vleg %%v21,216(%%r1,%[x]),1\n\t" - "vleg %%v22,224(%%r1,%[x]),0\n\t" - "vleg %%v23,232(%%r1,%[x]),0\n\t" - "vleg %%v22,240(%%r1,%[x]),1\n\t" - "vleg %%v23,248(%%r1,%[x]),1\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vfadb %%v17,%%v18,%%v19\n\t" - "vfadb %%v18,%%v20,%%v21\n\t" - "vfadb %%v19,%%v22,%%v23\n\t" - "vfchdb %%v24,%%v17,%%v16\n\t" - "vfchdb %%v25,%%v19,%%v18\n\t" - "vsel %%v24,%%v16,%%v17,%%v24\n\t" - "vsel %%v25,%%v18,%%v19,%%v25\n\t" - "vfchdb %%v26,%%v25,%%v24\n\t" - "vsel %%v26,%%v24,%%v25,%%v26\n\t" - "vfchdb %%v27,%%v0,%%v26\n\t" - "vsel %%v0,%%v26,%%v0,%%v27\n\t" - "agfi %%r1, 256\n\t" - "brctg %[n], 0b\n\t" - "vrepg %%v16,%%v0,1\n\t" - "wfchdb %%v17,%%v16,%%v0\n\t" - "vsel %%v0,%%v0,%%v16,%%v17\n\t" - "ldr %[amin],%%f0" - : [amin] "=f"(amin),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23", "v24", "v25", "v26", "v27"); - - return amin; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ix = 0; - FLOAT minf = 0.0; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) - return (minf); - - if (inc_x == 1) { - - BLASLONG n1 = n & -16; - if (n1 > 0) { - - minf = zamin_kernel_16(n1, x); - ix = n1 * 2; - i = n1; - } else { - minf = CABS1(x, 0); - ix += 2; - i++; - } - - while (i < n) { - if (CABS1(x, ix) < minf) { - minf = CABS1(x, ix); - } - ix += 2; - i++; - } - return (minf); - - } else { - - minf = CABS1(x, 0); - inc_x2 = 2 * inc_x; - - BLASLONG n1 = n & -4; - while (i < n1) { - - if (CABS1(x, ix) < minf) { - minf = CABS1(x, ix); - } - if (CABS1(x, ix + inc_x2) < minf) { - minf = CABS1(x, ix + inc_x2); - } - if (CABS1(x, ix + inc_x2 * 2) < minf) { - minf = CABS1(x, ix + inc_x2 * 2); - } - if (CABS1(x, ix + inc_x2 * 3) < minf) { - minf = CABS1(x, ix + inc_x2 * 3); - } - - ix += inc_x2 * 4; - - i += 4; - - } - - while (i < n) { - if (CABS1(x, ix) < minf) { - minf = CABS1(x, ix); - } - ix += inc_x2; - i++; - } - return (minf); - } -} diff --git a/kernel/zarch/zasum.c b/kernel/zarch/zasum.c index aeef8d77e..0fc5c9ecb 100644 --- a/kernel/zarch/zasum.c +++ b/kernel/zarch/zasum.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project +Copyright (c) 2013-2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,129 +25,135 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ + #include "common.h" #include +#if defined(DOUBLE) + #define ABS fabs +#else + +#define ABS fabsf + +#endif + + static FLOAT zasum_kernel_16(BLASLONG n, FLOAT *x) { - FLOAT asum; - - __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v24,%%v24,%%v16\n\t" - "vfadb %%v25,%%v25,%%v17\n\t" - "vfadb %%v26,%%v26,%%v18\n\t" - "vfadb %%v27,%%v27,%%v19\n\t" - "vfadb %%v28,%%v28,%%v20\n\t" - "vfadb %%v29,%%v29,%%v21\n\t" - "vfadb %%v30,%%v30,%%v22\n\t" - "vfadb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x])\n\t" - "vl %%v17, 144(%%r1,%[x])\n\t" - "vl %%v18, 160(%%r1,%[x])\n\t" - "vl %%v19, 176(%%r1,%[x])\n\t" - "vl %%v20, 192(%%r1,%[x])\n\t" - "vl %%v21, 208(%%r1,%[x])\n\t" - "vl %%v22, 224(%%r1,%[x])\n\t" - "vl %%v23, 240(%%r1,%[x])\n\t" - "vflpdb %%v16, %%v16\n\t" - "vflpdb %%v17, %%v17\n\t" - "vflpdb %%v18, %%v18\n\t" - "vflpdb %%v19, %%v19\n\t" - "vflpdb %%v20, %%v20\n\t" - "vflpdb %%v21, %%v21\n\t" - "vflpdb %%v22, %%v22\n\t" - "vflpdb %%v23, %%v23\n\t" - "vfadb %%v24,%%v24,%%v16\n\t" - "vfadb %%v25,%%v25,%%v17\n\t" - "vfadb %%v26,%%v26,%%v18\n\t" - "vfadb %%v27,%%v27,%%v19\n\t" - "vfadb %%v28,%%v28,%%v20\n\t" - "vfadb %%v29,%%v29,%%v21\n\t" - "vfadb %%v30,%%v30,%%v22\n\t" - "vfadb %%v31,%%v31,%%v23\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vfadb %%v24,%%v24,%%v26\n\t" - "vfadb %%v24,%%v24,%%v27\n\t" - "vfadb %%v24,%%v24,%%v28\n\t" - "vfadb %%v24,%%v24,%%v29\n\t" - "vfadb %%v24,%%v24,%%v30\n\t" - "vfadb %%v24,%%v24,%%v31\n\t" - "vrepg %%v25,%%v24,1\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vsteg %%v24,%[asum],0" - : [asum] "=Q"(asum),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) - : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - - return asum; + + FLOAT asum; + __asm__ ( + "pfd 1, 0(%[ptr_x]) \n\t" + "sllg %%r0,%[n],4 \n\t" + "agr %%r0,%[ptr_x] \n\t" + "vzero %%v0 \n\t" + "vzero %%v1 \n\t" + "vzero %%v22 \n\t" + "vzero %%v23 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 1, 256(%[ptr_tmp] ) \n\t" + "vlm %%v24,%%v31,0(%[ptr_tmp]) \n\t" + + "vflpdb %%v24, %%v24 \n\t" + "vflpdb %%v25, %%v25 \n\t" + "vflpdb %%v26, %%v26 \n\t" + "vflpdb %%v27, %%v27 \n\t" + "vflpdb %%v28, %%v28 \n\t" + "vflpdb %%v29, %%v29 \n\t" + "vflpdb %%v30, %%v30 \n\t" + "vflpdb %%v31, %%v31 \n\t" + + "vfadb %%v0,%%v0,%%v24 \n\t" + "vfadb %%v1,%%v1,%%v25 \n\t" + "vfadb %%v23,%%v23,%%v26 \n\t" + "vfadb %%v22,%%v22,%%v27 \n\t" + "vfadb %%v0,%%v0,%%v28 \n\t" + "vfadb %%v1,%%v1,%%v29 \n\t" + "vfadb %%v23,%%v23,%%v30 \n\t" + "vfadb %%v22,%%v22,%%v31 \n\t" + + "vlm %%v24,%%v31, 128(%[ptr_tmp]) \n\t" + + "vflpdb %%v24, %%v24 \n\t" + "vflpdb %%v25, %%v25 \n\t" + "vflpdb %%v26, %%v26 \n\t" + "vflpdb %%v27, %%v27 \n\t" + "vflpdb %%v28, %%v28 \n\t" + "vflpdb %%v29, %%v29 \n\t" + "vflpdb %%v30, %%v30 \n\t" + "vflpdb %%v31, %%v31 \n\t" + "la %[ptr_tmp],256(%[ptr_tmp]) \n\t" + "vfadb %%v0,%%v0,%%v24 \n\t" + "vfadb %%v1,%%v1,%%v25 \n\t" + "vfadb %%v23,%%v23,%%v26 \n\t" + "vfadb %%v22,%%v22,%%v27 \n\t" + "vfadb %%v0,%%v0,%%v28 \n\t" + "vfadb %%v1,%%v1,%%v29 \n\t" + "vfadb %%v23,%%v23,%%v30 \n\t" + "vfadb %%v22,%%v22,%%v31 \n\t" + + "clgrjl %[ptr_tmp],%%r0,1b \n\t" + "vfadb %%v24,%%v0,%%v1 \n\t" + "vfadb %%v25,%%v23,%%v22 \n\t" + "vfadb %%v0,%%v25,%%v24 \n\t" + "vrepg %%v1,%%v0,1 \n\t" + "adbr %%f0,%%f1 \n\t" + "ldr %[asum] ,%%f0" + : [asum] "=f"(asum),[ptr_tmp] "+&a"(x) + : [mem] "m"( *(const double (*)[2*n])x ), [n] "r"(n), [ptr_x] "a"(x) + : "cc", "r0","f0","f1","v0","v1","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + return asum; + } -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ip = 0; - FLOAT sumf = 0.0; - BLASLONG n1; - BLASLONG inc_x2; + - if (n <= 0 || inc_x <= 0) - return (sumf); +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ip=0; + FLOAT sumf = 0.0; + BLASLONG n1; + BLASLONG inc_x2; - if (inc_x == 1) { + if (n <= 0 || inc_x <= 0) return(sumf); - n1 = n & -16; - if (n1 > 0) { + if ( inc_x == 1 ) + { - sumf = zasum_kernel_16(n1, x); - i = n1; - ip = 2 * n1; - } + n1 = n & -16; + if ( n1 > 0 ) + { - while (i < n) { - sumf += ABS(x[ip]) + ABS(x[ip + 1]); - i++; - ip += 2; - } + sumf=zasum_kernel_16(n1, x ); + i=n1; + ip=2*n1; + } - } else { - inc_x2 = 2 * inc_x; + while(i < n) + { + sumf += ABS(x[ip]) + ABS(x[ip+1]); + i++; + ip+=2; + } - while (i < n) { - sumf += ABS(x[ip]) + ABS(x[ip + 1]); - ip += inc_x2; - i++; } + else + { + inc_x2 = 2* inc_x; + + while(i < n) + { + sumf += ABS(x[ip]) + ABS(x[ip+1]); + ip+=inc_x2; + i++; + } - } - return (sumf); + } + return(sumf); } + + diff --git a/kernel/zarch/zaxpy.c b/kernel/zarch/zaxpy.c index 9363ec32d..212de25c8 100644 --- a/kernel/zarch/zaxpy.c +++ b/kernel/zarch/zaxpy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project +Copyright (c) 2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,140 +23,190 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ + *****************************************************************************/ + #include "common.h" -static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) { - __asm__( + +static void zaxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) { + + BLASLONG tempR1 ; + __asm__ ("pfd 1, 0(%[x_tmp]) \n\t" + "pfd 2, 0(%[y_tmp]) \n\t" #if !defined(CONJ) - "vlrepg %%v0,0(%[alpha])\n\t" - "vleg %%v1,8(%[alpha]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,8(%[alpha]),1\n\t" + "lgdr %[t1],%[alpha_r] \n\t" + "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint + "lgdr %[t1],%[alpha_i] \n\t" + "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint + "vflcdb %%v29,%%v29 \n\t" //complement both + "vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i} + #else - "vleg %%v0,0(%[alpha]),1\n\t" - "vflcdb %%v0,%%v0\n\t" - "vleg %%v0,0(%[alpha]),0\n\t" - "vlrepg %%v1,8(%[alpha])\n\t" -#endif - "srlg %[n],%[n],3\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v8,0(%%r1,%[x])\n\t" - "vl %%v9,16(%%r1,%[x])\n\t" - "vl %%v10,32(%%r1,%[x])\n\t" - "vl %%v11,48(%%r1,%[x])\n\t" - "vl %%v12,0(%%r1,%[y])\n\t" - "vl %%v13,16(%%r1,%[y])\n\t" - "vl %%v14,32(%%r1,%[y])\n\t" - "vl %%v15,48(%%r1,%[y])\n\t" - "vl %%v16,64(%%r1,%[x])\n\t" - "vl %%v17,80(%%r1,%[x])\n\t" - "vl %%v18,96(%%r1,%[x])\n\t" - "vl %%v19,112(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[y])\n\t" - "vl %%v21,80(%%r1,%[y])\n\t" - "vl %%v22,96(%%r1,%[y])\n\t" - "vl %%v23,112(%%r1,%[y])\n\t" - "vpdi %%v24,%%v8,%%v8,4\n\t" - "vpdi %%v25,%%v9,%%v9,4\n\t" - "vpdi %%v26,%%v10,%%v10,4\n\t" - "vpdi %%v27,%%v11,%%v11,4\n\t" - "vpdi %%v28,%%v16,%%v16,4\n\t" - "vpdi %%v29,%%v17,%%v17,4\n\t" - "vpdi %%v30,%%v18,%%v18,4\n\t" - "vpdi %%v31,%%v19,%%v19,4\n\t" - "vfmadb %%v8,%%v8,%%v0,%%v12\n\t" - "vfmadb %%v9,%%v9,%%v0,%%v13\n\t" - "vfmadb %%v10,%%v10,%%v0,%%v14\n\t" - "vfmadb %%v11,%%v11,%%v0,%%v15\n\t" - "vfmadb %%v16,%%v16,%%v0,%%v20\n\t" - "vfmadb %%v17,%%v17,%%v0,%%v21\n\t" - "vfmadb %%v18,%%v18,%%v0,%%v22\n\t" - "vfmadb %%v19,%%v19,%%v0,%%v23\n\t" - "vfmadb %%v8,%%v24,%%v1,%%v8\n\t" - "vfmadb %%v9,%%v25,%%v1,%%v9\n\t" - "vfmadb %%v10,%%v26,%%v1,%%v10\n\t" - "vfmadb %%v11,%%v27,%%v1,%%v11\n\t" - "vfmadb %%v16,%%v28,%%v1,%%v16\n\t" - "vfmadb %%v17,%%v29,%%v1,%%v17\n\t" - "vfmadb %%v18,%%v30,%%v1,%%v18\n\t" - "vfmadb %%v19,%%v31,%%v1,%%v19\n\t" - "vst %%v8,0(%%r1,%[y])\n\t" - "vst %%v9,16(%%r1,%[y])\n\t" - "vst %%v10,32(%%r1,%[y])\n\t" - "vst %%v11,48(%%r1,%[y])\n\t" - "vst %%v16,64(%%r1,%[y])\n\t" - "vst %%v17,80(%%r1,%[y])\n\t" - "vst %%v18,96(%%r1,%[y])\n\t" - "vst %%v19,112(%%r1,%[y])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v8", "v9", "v10", "v11", "v12", "v13", - "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); + "lgdr %[t1],%[alpha_i] \n\t" + "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint + "lgdr %[t1],%[alpha_r] \n\t" + "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint + "vflcdb %%v28,%%v28 \n\t" //complement both + "vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r} +#endif + + "xgr %[t1],%[t1] \n\t" + "sllg %[tmp],%[tmp],4 \n\t" + "vl %%v30 , 0(%[t1],%[y_tmp]) \n\t" + "vl %%v31 , 16(%[t1],%[y_tmp]) \n\t" + "vl %%v6 , 32(%[t1],%[y_tmp]) \n\t" + "vl %%v7 , 48(%[t1],%[y_tmp]) \n\t" + "vl %%v20 , 0(%[t1],%[x_tmp]) \n\t" + "vl %%v21 , 16(%[t1],%[x_tmp]) \n\t" + "vl %%v22 , 32(%[t1],%[x_tmp]) \n\t" + "vl %%v23 , 48(%[t1],%[x_tmp]) \n\t" + "lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition + "j 2f \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vpdi %%v24 , %%v20, %%v20, 4 \n\t" + "vpdi %%v25 , %%v21, %%v21, 4 \n\t" + "vpdi %%v26 , %%v22, %%v22, 4 \n\t" + "vpdi %%v27 , %%v23, %%v23, 4 \n\t" + "vfmadb %%v16, %%v20, %%v28, %%v16 \n\t" + "vfmadb %%v17, %%v21, %%v28, %%v17 \n\t" + "vfmadb %%v18, %%v22, %%v28, %%v18 \n\t" + "vfmadb %%v19, %%v23, %%v28, %%v19 \n\t" + "vl %%v30, 64(%[t1],%[y_tmp]) \n\t" + "vl %%v31, 80(%[t1],%[y_tmp]) \n\t" + "vl %%v6 , 96(%[t1],%[y_tmp]) \n\t" + "vl %%v7 , 112(%[t1],%[y_tmp]) \n\t" + "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" + "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" + "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" + "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" + "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" + "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" + "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" + "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" + + "vst %%v16 , 0(%[t1],%[y_tmp]) \n\t" + "vst %%v17 , 16(%[t1],%[y_tmp]) \n\t" + "vst %%v18 , 32(%[t1],%[y_tmp]) \n\t" + "vst %%v19 , 48(%[t1],%[y_tmp]) \n\t" + + "la %[t1],64(%[t1] ) \n\t" + "2: \n\t" + "pfd 1, 256(%[t1],%[x_tmp]) \n\t" + "pfd 2, 256(%[t1],%[y_tmp]) \n\t" + "vpdi %%v24 , %%v20, %%v20, 4 \n\t" + "vpdi %%v25 , %%v21, %%v21, 4 \n\t" + "vpdi %%v26 , %%v22, %%v22, 4 \n\t" + "vpdi %%v27 , %%v23, %%v23, 4 \n\t" + + "vfmadb %%v30, %%v20, %%v28, %%v30 \n\t" + "vfmadb %%v31, %%v21, %%v28, %%v31 \n\t" + "vfmadb %%v6, %%v22, %%v28, %%v6 \n\t" + "vfmadb %%v7, %%v23, %%v28, %%v7 \n\t" + "vl %%v16, 64(%[t1],%[y_tmp]) \n\t" + "vl %%v17, 80(%[t1],%[y_tmp]) \n\t" + "vl %%v18, 96(%[t1],%[y_tmp]) \n\t" + "vl %%v19, 112(%[t1],%[y_tmp]) \n\t" + "vfmadb %%v30, %%v24, %%v29, %%v30 \n\t" + "vfmadb %%v31, %%v25, %%v29, %%v31 \n\t" + "vfmadb %%v6, %%v26, %%v29, %%v6 \n\t" + "vfmadb %%v7, %%v27, %%v29, %%v7 \n\t" + + "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" + "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" + "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" + "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" + + "vst %%v30 , 0(%[t1],%[y_tmp]) \n\t" + "vst %%v31 , 16(%[t1],%[y_tmp]) \n\t" + "vst %%v6 , 32(%[t1],%[y_tmp]) \n\t" + "vst %%v7 , 48(%[t1],%[y_tmp]) \n\t" + + "la %[t1],64(%[t1] ) \n\t" + + + "clgrjl %[t1],%[tmp],1b \n\t" +//---------------------------------------------------------------------- + "vfmadb %%v16, %%v20, %%v28, %%v16 \n\t" + "vfmadb %%v17, %%v21, %%v28, %%v17 \n\t" + "vfmadb %%v18, %%v22, %%v28, %%v18 \n\t" + "vfmadb %%v19, %%v23, %%v28, %%v19 \n\t" + "vpdi %%v24 , %%v20, %%v20, 4 \n\t" + "vpdi %%v25 , %%v21, %%v21, 4 \n\t" + "vpdi %%v26 , %%v22, %%v22, 4 \n\t" + "vpdi %%v27 , %%v23, %%v23, 4 \n\t" + "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" + "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" + "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" + "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" + + "vst %%v16 , 0(%[t1],%[y_tmp]) \n\t" + "vst %%v17 , 16(%[t1],%[y_tmp]) \n\t" + "vst %%v18 , 32(%[t1],%[y_tmp]) \n\t" + "vst %%v19 , 48(%[t1],%[y_tmp]) \n\t" + + : [mem_y] "+m" (*(double (*)[2*n])y),[tmp]"+&r"(n) , [t1] "=&a" (tempR1) + : [mem_x] "m" (*(const double (*)[2*n])x), [x_tmp] "a"(x), [y_tmp] "a"(y), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i) + : "cc", "v6","v7", "v16", + "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, - FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, - BLASLONG dummy2) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - FLOAT da[2] __attribute__ ((aligned(16))); - if (n <= 0) - return (0); +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0; + BLASLONG ix = 0, iy = 0; - if ((inc_x == 1) && (inc_y == 1)) { + if (n <= 0) return (0); - BLASLONG n1 = n & -8; + if ((inc_x == 1) && (inc_y == 1)) { - if (n1) { - da[0] = da_r; - da[1] = da_i; - zaxpy_kernel_8(n1, x, y, da); - ix = 2 * n1; - } - i = n1; - while (i < n) { + BLASLONG n1 = n & -8; + + if (n1) { + zaxpy_kernel_8(n1, x, y, da_r,da_i); + ix = 2 * n1; + } + i = n1; + while (i < n) { #if !defined(CONJ) - y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); - y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); + y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); + y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); #else - y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); - y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); + y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); + y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); #endif - i++; - ix += 2; + i++; + ix += 2; + + } + return (0); - } - return (0); - } + } - inc_x *= 2; - inc_y *= 2; + inc_x *= 2; + inc_y *= 2; - while (i < n) { + while (i < n) { #if !defined(CONJ) - y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); - y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); + y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); + y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); #else - y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); - y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); + y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); + y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); #endif - ix += inc_x; - iy += inc_y; - i++; + ix += inc_x; + iy += inc_y; + i++; - } - return (0); + } + return (0); } + + diff --git a/kernel/zarch/zcopy.c b/kernel/zarch/zcopy.c index 5a46aec1c..b5bf383f7 100644 --- a/kernel/zarch/zcopy.c +++ b/kernel/zarch/zcopy.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project +Copyright (c) 2013-2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -24,65 +24,122 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ - + #include "common.h" + +static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { + + __asm__ volatile( + "pfd 1, 0(%[ptr_x]) \n\t" + "pfd 2, 0(%[ptr_y]) \n\t" + "srlg %[n_tmp],%[n_tmp],4 \n\t" + "xgr %%r1,%%r1 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 1, 256(%%r1,%[ptr_x]) \n\t" + "pfd 2, 256(%%r1,%[ptr_y]) \n\t" + + "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" + "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" + "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" + "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" + "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" + "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" + "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" + "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" + + "vl %%v28, 64(%%r1,%[ptr_x]) \n\t" + "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" + "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" + "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" + "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" + "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" + "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" + "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" + + + "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" + "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" + + "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" + "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" + + "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" + "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" + + "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" + "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" + + "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" + "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" + "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" + "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" + "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" + "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" + "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" + "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" + "la %%r1,256(%%r1) \n\t" + "brctg %[n_tmp],1b" + : [mem_y] "=m" (*(double (*)[2*n])y), [n_tmp] "+&r"(n) + : [mem_x] "m" (*(const double (*)[2*n])x), [ptr_x] "a"(x), [ptr_y] "a"(y) + : "cc", "r1", "v24","v25","v26","v27","v28","v29","v30","v31" + ); + return; -static void zcopy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { - __asm__("srlg %[n],%[n],4\n\t" - "0:\n\t" - "pfd 1, 1024(%[x])\n\t" - "pfd 2, 1024(%[y])\n\t" - "mvc 0(256,%[y]),0(%[x])\n\t" - "la %[x],256(%[x])\n\t" - "la %[y],256(%[y])\n\t" - "brctg %[n],0b" - : "=m"(*(struct { FLOAT x[n * 2]; } *) y),[x] "+&a"(x),[y] "+&a"(y), - [n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x) - : "cc"); } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - if (n <= 0) - return (0); +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; - if ((inc_x == 1) && (inc_y == 1)) { + if ( n <= 0 ) return(0); - BLASLONG n1 = n & -16; - if (n1 > 0) { - zcopy_kernel_16(n1, x, y); - i = n1; - ix = n1 * 2; - iy = n1 * 2; - } + if ( (inc_x == 1) && (inc_y == 1 )) + { + + BLASLONG n1 = n & -16; + if ( n1 > 0 ) + { + zcopy_kernel_16(n1, x, y); + i=n1; + ix=n1*2; + iy=n1*2; + } + + while(i < n) + { + y[iy] = x[iy] ; + y[iy+1] = x[ix+1] ; + ix+=2; + iy+=2; + i++ ; + + } - while (i < n) { - y[iy] = x[iy]; - y[iy + 1] = x[ix + 1]; - ix += 2; - iy += 2; - i++; } + else + { - } else { + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_y2 = 2 * inc_y; - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_y2 = 2 * inc_y; + while(i < n) + { + y[iy] = x[ix] ; + y[iy+1] = x[ix+1] ; + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; - while (i < n) { - y[iy] = x[ix]; - y[iy + 1] = x[ix + 1]; - ix += inc_x2; - iy += inc_y2; - i++; + } } + return(0); + - } - - return (0); } + + diff --git a/kernel/zarch/zdot.c b/kernel/zarch/zdot.c index ac6e69c23..61c5d6b98 100644 --- a/kernel/zarch/zdot.c +++ b/kernel/zarch/zdot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project +Copyright (c) 2013-2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,150 +23,203 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ + *****************************************************************************/ + #include "common.h" +#if defined(Z13) static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { - __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],3\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "pfd 1, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "vpdi %%v20,%%v16,%%v16,4\n\t" - "vpdi %%v21,%%v17,%%v17,4\n\t" - "vpdi %%v22,%%v18,%%v18,4\n\t" - "vpdi %%v23,%%v19,%%v19,4\n\t" - "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vfmadb %%v25,%%v20,%%v0,%%v25\n\t" - "vfmadb %%v26,%%v17,%%v1,%%v26\n\t" - "vfmadb %%v27,%%v21,%%v1,%%v27\n\t" - "vfmadb %%v28,%%v18,%%v2,%%v28\n\t" - "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" - "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" - "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" - "vl %%v16, 64(%%r1,%[x])\n\t" - "vl %%v17, 80(%%r1,%[x])\n\t" - "vl %%v18, 96(%%r1,%[x])\n\t" - "vl %%v19, 112(%%r1,%[x])\n\t" - "vl %%v0, 64(%%r1,%[y])\n\t" - "vl %%v1, 80(%%r1,%[y])\n\t" - "vl %%v2, 96(%%r1,%[y])\n\t" - "vl %%v3, 112(%%r1,%[y])\n\t" - "vpdi %%v20,%%v16,%%v16,4\n\t" - "vpdi %%v21,%%v17,%%v17,4\n\t" - "vpdi %%v22,%%v18,%%v18,4\n\t" - "vpdi %%v23,%%v19,%%v19,4\n\t" - "vfmadb %%v24,%%v16,%%v0,%%v24\n\t" - "vfmadb %%v25,%%v20,%%v0,%%v25\n\t" - "vfmadb %%v26,%%v17,%%v1,%%v26\n\t" - "vfmadb %%v27,%%v21,%%v1,%%v27\n\t" - "vfmadb %%v28,%%v18,%%v2,%%v28\n\t" - "vfmadb %%v29,%%v22,%%v2,%%v29\n\t" - "vfmadb %%v30,%%v19,%%v3,%%v30\n\t" - "vfmadb %%v31,%%v23,%%v3,%%v31\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v24,%%v24,%%v26\n\t" - "vfadb %%v24,%%v24,%%v28\n\t" - "vfadb %%v24,%%v24,%%v30\n\t" - "vfadb %%v25,%%v25,%%v27\n\t" - "vfadb %%v25,%%v25,%%v29\n\t" - "vfadb %%v25,%%v25,%%v31\n\t" - "vsteg %%v24,0(%[d]),0\n\t" - "vsteg %%v24,8(%[d]),1\n\t" - "vsteg %%v25,16(%[d]),1\n\t" - "vsteg %%v25,24(%[d]),0" - : "=m"(*(struct { FLOAT x[4]; } *) d),[n] "+&r"(n) - : [d] "a"(d), "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[n * 2]; } *) y),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", - "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + + __asm__ volatile( + "pfd 1, 0(%[ptr_x_tmp]) \n\t" + "pfd 1, 0(%[ptr_y_tmp]) \n\t" + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "srlg %[n_tmp],%[n_tmp],3 \n\t" + "xgr %%r1,%%r1 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 1, 256(%%r1,%[ptr_x_tmp]) \n\t" + "pfd 1, 256(%%r1,%[ptr_y_tmp]) \n\t" + "vl %%v16, 0(%%r1,%[ptr_x_tmp]) \n\t" + "vl %%v17, 16(%%r1,%[ptr_x_tmp]) \n\t" + "vl %%v18, 32(%%r1,%[ptr_x_tmp]) \n\t" + "vl %%v19, 48(%%r1,%[ptr_x_tmp]) \n\t" + "vl %%v28, 0(%%r1,%[ptr_y_tmp]) \n\t" + "vl %%v29, 16(%%r1,%[ptr_y_tmp]) \n\t" + "vl %%v30, 32(%%r1,%[ptr_y_tmp]) \n\t" + "vl %%v31, 48(%%r1,%[ptr_y_tmp]) \n\t" + "vpdi %%v20,%%v16,%%v16,4 \n\t" + "vpdi %%v21,%%v17,%%v17,4 \n\t" + "vpdi %%v22,%%v18,%%v18,4 \n\t" + "vpdi %%v23,%%v19,%%v19,4 \n\t" + + + "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" + "vfmadb %%v25,%%v20,%%v28,%%v25 \n\t" + "vfmadb %%v26,%%v17,%%v29,%%v26 \n\t" + "vfmadb %%v27,%%v21,%%v29,%%v27 \n\t" + "vfmadb %%v24,%%v18,%%v30,%%v24 \n\t" + "vfmadb %%v25,%%v22,%%v30,%%v25 \n\t" + "vfmadb %%v26,%%v19,%%v31,%%v26 \n\t" + "vfmadb %%v27,%%v23,%%v31,%%v27 \n\t" + + + + "vl %%v16, 64(%%r1,%[ptr_x_tmp]) \n\t" + "vl %%v17, 80(%%r1,%[ptr_x_tmp]) \n\t" + "vl %%v18, 96(%%r1,%[ptr_x_tmp]) \n\t" + "vl %%v19,112(%%r1,%[ptr_x_tmp]) \n\t" + "vl %%v28, 64(%%r1,%[ptr_y_tmp]) \n\t" + "vl %%v29, 80(%%r1,%[ptr_y_tmp]) \n\t" + "vl %%v30, 96(%%r1,%[ptr_y_tmp]) \n\t" + "vl %%v31,112(%%r1,%[ptr_y_tmp]) \n\t" + "vpdi %%v20,%%v16,%%v16,4 \n\t" + "vpdi %%v21,%%v17,%%v17,4 \n\t" + "vpdi %%v22,%%v18,%%v18,4 \n\t" + "vpdi %%v23,%%v19,%%v19,4 \n\t" + "vfmadb %%v24,%%v16,%%v28,%%v24 \n\t" + "vfmadb %%v25,%%v20,%%v28,%%v25 \n\t" + "vfmadb %%v26,%%v17,%%v29,%%v26 \n\t" + "vfmadb %%v27,%%v21,%%v29,%%v27 \n\t" + "vfmadb %%v24,%%v18,%%v30,%%v24 \n\t" + "vfmadb %%v25,%%v22,%%v30,%%v25 \n\t" + "vfmadb %%v26,%%v19,%%v31,%%v26 \n\t" + "vfmadb %%v27,%%v23,%%v31,%%v27 \n\t" + + + "la %%r1,128(%%r1) \n\t" + "brctg %[n_tmp],1b \n\t" + "vfadb %%v24,%%v26,%%v24 \n\t" + "vfadb %%v25,%%v25,%%v27 \n\t" + "vsteg %%v24, 0(%[ptr_d]),0 \n\t" + "vsteg %%v24, 8(%[ptr_d]),1 \n\t" + "vsteg %%v25,16(%[ptr_d]),1 \n\t" + "vsteg %%v25,24(%[ptr_d]),0 \n\t" + : [mem_out] "=m"(*(double (*)[4])d ) ,[n_tmp] "+&r"(n) + : [mem_x] "m"( *(const double (*)[2*n])x), + [mem_y] "m"( *(const double (*)[2*n])y), + [ptr_x_tmp] "a"(x), [ptr_y_tmp] "a"(y), [ptr_d] "a"(d) + : "cc", "r1","v16", + "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + } -OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, - BLASLONG inc_y) { - BLASLONG i; - BLASLONG ix, iy; - OPENBLAS_COMPLEX_FLOAT result; - FLOAT dot[4] __attribute__ ((aligned(16))) = { - 0.0, 0.0, 0.0, 0.0}; - - if (n <= 0) { - CREAL(result) = 0.0; - CIMAG(result) = 0.0; - return (result); +#else - } +static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) { + BLASLONG register i = 0; + FLOAT dot[4] = {0.0, 0.0, 0.0, 0.0}; + BLASLONG j = 0; - if ((inc_x == 1) && (inc_y == 1)) { + while (i < n) { - BLASLONG n1 = n & -8; + dot[0] += x[j] * y[j]; + dot[1] += x[j + 1] * y[j + 1]; + dot[2] += x[j] * y[j + 1]; + dot[3] += x[j + 1] * y[j]; - if (n1) - zdot_kernel_8(n1, x, y, dot); + dot[0] += x[j + 2] * y[j + 2]; + dot[1] += x[j + 3] * y[j + 3]; + dot[2] += x[j + 2] * y[j + 3]; + dot[3] += x[j + 3] * y[j + 2]; - i = n1; - BLASLONG j = i * 2; + dot[0] += x[j + 4] * y[j + 4]; + dot[1] += x[j + 5] * y[j + 5]; + dot[2] += x[j + 4] * y[j + 5]; + dot[3] += x[j + 5] * y[j + 4]; - while (i < n) { + dot[0] += x[j + 6] * y[j + 6]; + dot[1] += x[j + 7] * y[j + 7]; + dot[2] += x[j + 6] * y[j + 7]; + dot[3] += x[j + 7] * y[j + 6]; + + j += 8; + i += 4; + + } + d[0] = dot[0]; + d[1] = dot[1]; + d[2] = dot[2]; + d[3] = dot[3]; + +} + +#endif - dot[0] += x[j] * y[j]; - dot[1] += x[j + 1] * y[j + 1]; - dot[2] += x[j] * y[j + 1]; - dot[3] += x[j + 1] * y[j]; +OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { + BLASLONG i = 0; + BLASLONG ix=0, iy=0; + OPENBLAS_COMPLEX_FLOAT result; + FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; - j += 2; - i++; + if (n <= 0) { + CREAL(result) = 0.0; + CIMAG(result) = 0.0; + return (result); } - } else { - i = 0; - ix = 0; - iy = 0; - inc_x <<= 1; - inc_y <<= 1; - while (i < n) { + if ((inc_x == 1) && (inc_y == 1)) { + + BLASLONG n1 = n & -8; + BLASLONG j=0; + + if (n1){ + zdot_kernel_8(n1, x, y, dot); + i = n1; + j = n1 <<1; + } + + + while (i < n) { + + dot[0] += x[j] * y[j]; + dot[1] += x[j + 1] * y[j + 1]; + dot[2] += x[j] * y[j + 1]; + dot[3] += x[j + 1] * y[j]; + + j += 2; + i++; - dot[0] += x[ix] * y[iy]; - dot[1] += x[ix + 1] * y[iy + 1]; - dot[2] += x[ix] * y[iy + 1]; - dot[3] += x[ix + 1] * y[iy]; + } - ix += inc_x; - iy += inc_y; - i++; + } else { + i = 0; + ix = 0; + iy = 0; + inc_x <<= 1; + inc_y <<= 1; + while (i < n) { + + dot[0] += x[ix] * y[iy]; + dot[1] += x[ix + 1] * y[iy + 1]; + dot[2] += x[ix] * y[iy + 1]; + dot[3] += x[ix + 1] * y[iy]; + + ix += inc_x; + iy += inc_y; + i++; + + } } - } #if !defined(CONJ) - CREAL(result) = dot[0] - dot[1]; - CIMAG(result) = dot[2] + dot[3]; + CREAL(result) = dot[0] - dot[1]; + CIMAG(result) = dot[2] + dot[3]; #else - CREAL(result) = dot[0] + dot[1]; - CIMAG(result) = dot[2] - dot[3]; + CREAL(result) = dot[0] + dot[1]; + CIMAG(result) = dot[2] - dot[3]; #endif - return (result); + return (result); } + + diff --git a/kernel/zarch/zgemv_n_4.c b/kernel/zarch/zgemv_n_4.c index 13045a359..484db3073 100644 --- a/kernel/zarch/zgemv_n_4.c +++ b/kernel/zarch/zgemv_n_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project +Copyright (c) 2018, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,642 +23,898 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ + *****************************************************************************/ +#include +#include #include "common.h" +#define HAVE_KERNEL_4x4_VEC 1 +#define HAVE_KERNEL_4x2_VEC 1 +#define HAVE_KERNEL_4x1_VEC 1 +#define HAVE_KERNEL_ADDY 1 + +#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) +#include +#endif + +// #define NBMAX 1024 -static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - register FLOAT *ap0 = ap[0]; - register FLOAT *ap1 = ap[1]; - register FLOAT *ap2 = ap[2]; - register FLOAT *ap3 = ap[3]; +#ifdef HAVE_KERNEL_4x4_VEC_ASM + +#elif HAVE_KERNEL_4x4_VEC + +static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; - __asm__("vl %%v16,0(%[x])\n\t" - "vl %%v17,16(%[x])\n\t" - "vl %%v18,32(%[x])\n\t" - "vl %%v19,48(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v20,8(%[x]),0\n\t" - "wflcdb %%v20,%%v20\n\t" - "vleg %%v20,0(%[x]),1\n\t" - "vleg %%v21,24(%[x]),0\n\t" - "wflcdb %%v21,%%v21\n\t" - "vleg %%v21,16(%[x]),1\n\t" - "vleg %%v22,40(%[x]),0\n\t" - "wflcdb %%v22,%%v22\n\t" - "vleg %%v22,32(%[x]),1\n\t" - "vleg %%v23,56(%[x]),0\n\t" - "wflcdb %%v23,%%v23\n\t" - "vleg %%v23,48(%[x]),1\n\t" + + register __vector double vx0_r = {x[0], x[0]}; + register __vector double vx0_i = {-x[1], x[1]}; + register __vector double vx1_r = {x[2], x[2]}; + register __vector double vx1_i = {-x[3], x[3]}; + register __vector double vx2_r = {x[4], x[4]}; + register __vector double vx2_i = {-x[5], x[5]}; + register __vector double vx3_r = {x[6], x[6]}; + register __vector double vx3_i = {-x[7], x[7]}; + #else - "vleg %%v20,0(%[x]),1\n\t" - "vflcdb %%v20,%%v20\n\t" - "vleg %%v20,8(%[x]),0\n\t" - "vleg %%v21,16(%[x]),1\n\t" - "vflcdb %%v21,%%v21\n\t" - "vleg %%v21,24(%[x]),0\n\t" - "vleg %%v22,32(%[x]),1\n\t" - "vflcdb %%v22,%%v22\n\t" - "vleg %%v22,40(%[x]),0\n\t" - "vleg %%v23,48(%[x]),1\n\t" - "vflcdb %%v23,%%v23\n\t" - "vleg %%v23,56(%[x]),0\n\t" + register __vector double vx0_r = {x[0], -x[0]}; + register __vector double vx0_i = {x[1], x[1]}; + register __vector double vx1_r = {x[2], -x[2]}; + register __vector double vx1_i = {x[3], x[3]}; + register __vector double vx2_r = {x[4], -x[4]}; + register __vector double vx2_i = {x[5], x[5]}; + register __vector double vx3_r = {x[6], -x[6]}; + register __vector double vx3_i = {x[7], x[7]}; #endif - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vl %%v1,16(%%r1,%[y])\n\t" - "vlrepg %%v24,0(%%r1,%[ap0])\n\t" - "vlrepg %%v25,8(%%r1,%[ap0])\n\t" - "vlrepg %%v26,0(%%r1,%[ap1])\n\t" - "vlrepg %%v27,8(%%r1,%[ap1])\n\t" - "vlrepg %%v28,16(%%r1,%[ap0])\n\t" - "vlrepg %%v29,24(%%r1,%[ap0])\n\t" - "vlrepg %%v30,16(%%r1,%[ap1])\n\t" - "vlrepg %%v31,24(%%r1,%[ap1])\n\t" - "vfmadb %%v0,%%v24,%%v16,%%v0\n\t" - "vfmadb %%v1,%%v28,%%v16,%%v1\n\t" - "vfmadb %%v0,%%v25,%%v20,%%v0\n\t" - "vfmadb %%v1,%%v29,%%v20,%%v1\n\t" - "vfmadb %%v0,%%v26,%%v17,%%v0\n\t" - "vfmadb %%v1,%%v30,%%v17,%%v1\n\t" - "vfmadb %%v0,%%v27,%%v21,%%v0\n\t" - "vfmadb %%v1,%%v31,%%v21,%%v1\n\t" - "vlrepg %%v24,0(%%r1,%[ap2])\n\t" - "vlrepg %%v25,8(%%r1,%[ap2])\n\t" - "vlrepg %%v26,0(%%r1,%[ap3])\n\t" - "vlrepg %%v27,8(%%r1,%[ap3])\n\t" - "vlrepg %%v28,16(%%r1,%[ap2])\n\t" - "vlrepg %%v29,24(%%r1,%[ap2])\n\t" - "vlrepg %%v30,16(%%r1,%[ap3])\n\t" - "vlrepg %%v31,24(%%r1,%[ap3])\n\t" - "vfmadb %%v0,%%v24,%%v18,%%v0\n\t" - "vfmadb %%v1,%%v28,%%v18,%%v1\n\t" - "vfmadb %%v0,%%v25,%%v22,%%v0\n\t" - "vfmadb %%v1,%%v29,%%v22,%%v1\n\t" - "vfmadb %%v0,%%v26,%%v19,%%v0\n\t" - "vfmadb %%v1,%%v30,%%v19,%%v1\n\t" - "vfmadb %%v0,%%v27,%%v23,%%v0\n\t" - "vfmadb %%v1,%%v31,%%v23,%%v1\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "vst %%v1,16(%%r1,%[y])\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3), - "m"(*(const struct { FLOAT x[8]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + + register __vector double *vy = (__vector double *) y; + register __vector double *vptr_a0 = (__vector double *) a0; + register __vector double *vptr_a1 = (__vector double *) a1; + register __vector double *vptr_a2 = (__vector double *) a2; + register __vector double *vptr_a3 = (__vector double *) a3; + + for (i = 0; i < n; i += 4) { + + register __vector double vy_0 = vy[i]; + register __vector double vy_1 = vy[i + 1]; + register __vector double vy_2 = vy[i + 2]; + register __vector double vy_3 = vy[i + 3]; + + register __vector double va0 = vptr_a0[i]; + register __vector double va0_1 = vptr_a0[i + 1]; + register __vector double va0_2 = vptr_a0[i + 2]; + register __vector double va0_3 = vptr_a0[i + 3]; + + register __vector double va1 = vptr_a1[i]; + register __vector double va1_1 = vptr_a1[i + 1]; + register __vector double va1_2 = vptr_a1[i + 2]; + register __vector double va1_3 = vptr_a1[i + 3]; + + register __vector double va2 = vptr_a2[i]; + register __vector double va2_1 = vptr_a2[i + 1]; + register __vector double va2_2 = vptr_a2[i + 2]; + register __vector double va2_3 = vptr_a2[i + 3]; + + register __vector double va3 = vptr_a3[i]; + register __vector double va3_1 = vptr_a3[i + 1]; + register __vector double va3_2 = vptr_a3[i + 2]; + register __vector double va3_3 = vptr_a3[i + 3]; + + vy_0 += va0*vx0_r; + vy_1 += va0_1*vx0_r; + vy_2 += va0_2*vx0_r; + vy_3 += va0_3*vx0_r; + + vy_0 += va1*vx1_r; + vy_1 += va1_1*vx1_r; + vy_2 += va1_2*vx1_r; + vy_3 += va1_3*vx1_r; + + va0 = vec_permi(va0, va0, 2); + va0_1 = vec_permi(va0_1, va0_1, 2); + va0_2 = vec_permi(va0_2, va0_2, 2); + va0_3 = vec_permi(va0_3, va0_3, 2); + + vy_0 += va2*vx2_r; + vy_1 += va2_1*vx2_r; + vy_2 += va2_2*vx2_r; + vy_3 += va2_3*vx2_r; + + va1 = vec_permi(va1, va1, 2); + va1_1 = vec_permi(va1_1, va1_1, 2); + va1_2 = vec_permi(va1_2, va1_2, 2); + va1_3 = vec_permi(va1_3, va1_3, 2); + + vy_0 += va3*vx3_r; + vy_1 += va3_1*vx3_r; + vy_2 += va3_2*vx3_r; + vy_3 += va3_3*vx3_r; + + va2 = vec_permi(va2, va2, 2); + va2_1 = vec_permi(va2_1, va2_1, 2); + va2_2 = vec_permi(va2_2, va2_2, 2); + va2_3 = vec_permi(va2_3, va2_3, 2); + + vy_0 += va0*vx0_i; + vy_1 += va0_1*vx0_i; + vy_2 += va0_2*vx0_i; + vy_3 += va0_3*vx0_i; + + va3 = vec_permi(va3, va3, 2); + va3_1 = vec_permi(va3_1, va3_1, 2); + va3_2 = vec_permi(va3_2, va3_2, 2); + va3_3 = vec_permi(va3_3, va3_3, 2); + + vy_0 += va1*vx1_i; + vy_1 += va1_1*vx1_i; + vy_2 += va1_2*vx1_i; + vy_3 += va1_3*vx1_i; + + vy_0 += va2*vx2_i; + vy_1 += va2_1*vx2_i; + vy_2 += va2_2*vx2_i; + vy_3 += va2_3*vx2_i; + + vy_0 += va3*vx3_i; + vy_1 += va3_1*vx3_i; + vy_2 += va3_2*vx3_i; + vy_3 += va3_3*vx3_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; + + } +} +#else + +static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + + for (i = 0; i < 2 * n; i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] - a1[i + 1] * x[3]; + y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2]; + y[i] += a2[i] * x[4] - a2[i + 1] * x[5]; + y[i + 1] += a2[i] * x[5] + a2[i + 1] * x[4]; + y[i] += a3[i] * x[6] - a3[i + 1] * x[7]; + y[i + 1] += a3[i] * x[7] + a3[i + 1] * x[6]; +#else + y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] + a1[i + 1] * x[3]; + y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2]; + y[i] += a2[i] * x[4] + a2[i + 1] * x[5]; + y[i + 1] += a2[i] * x[5] - a2[i + 1] * x[4]; + y[i] += a3[i] * x[6] + a3[i + 1] * x[7]; + y[i + 1] += a3[i] * x[7] - a3[i + 1] * x[6]; +#endif + } } -static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) { - register FLOAT *ap0 = ap[0]; - register FLOAT *ap1 = ap[1]; +#endif + +#ifdef HAVE_KERNEL_4x2_VEC + +static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + - __asm__("vl %%v16,0(%[x])\n\t" - "vl %%v17,16(%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v18,8(%[x]),0\n\t" - "wflcdb %%v18,%%v18\n\t" - "vleg %%v18,0(%[x]),1\n\t" - "vleg %%v19,24(%[x]),0\n\t" - "wflcdb %%v19,%%v19\n\t" - "vleg %%v19,16(%[x]),1\n\t" + + register __vector double vx0_r = {x[0], x[0]}; + register __vector double vx0_i = {-x[1], x[1]}; + register __vector double vx1_r = {x[2], x[2]}; + register __vector double vx1_i = {-x[3], x[3]}; + #else - "vleg %%v18,0(%[x]),1\n\t" - "vflcdb %%v18,%%v18\n\t" - "vleg %%v18,8(%[x]),0\n\t" - "vleg %%v19,16(%[x]),1\n\t" - "vflcdb %%v19,%%v19\n\t" - "vleg %%v19,24(%[x]),0\n\t" + register __vector double vx0_r = {x[0], -x[0]}; + register __vector double vx0_i = {x[1], x[1]}; + register __vector double vx1_r = {x[2], -x[2]}; + register __vector double vx1_i = {x[3], x[3]}; #endif - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vl %%v1,16(%%r1,%[y])\n\t" - "vlrepg %%v20,0(%%r1,%[ap0])\n\t" - "vlrepg %%v21,8(%%r1,%[ap0])\n\t" - "vlrepg %%v22,0(%%r1,%[ap1])\n\t" - "vlrepg %%v23,8(%%r1,%[ap1])\n\t" - "vlrepg %%v24,16(%%r1,%[ap0])\n\t" - "vlrepg %%v25,24(%%r1,%[ap0])\n\t" - "vlrepg %%v26,16(%%r1,%[ap1])\n\t" - "vlrepg %%v27,24(%%r1,%[ap1])\n\t" - "vfmadb %%v0,%%v20,%%v16,%%v0\n\t" - "vfmadb %%v1,%%v24,%%v16,%%v1\n\t" - "vfmadb %%v0,%%v21,%%v18,%%v0\n\t" - "vfmadb %%v1,%%v25,%%v18,%%v1\n\t" - "vfmadb %%v0,%%v22,%%v17,%%v0\n\t" - "vfmadb %%v1,%%v26,%%v17,%%v1\n\t" - "vfmadb %%v0,%%v23,%%v19,%%v0\n\t" - "vfmadb %%v1,%%v27,%%v19,%%v1\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "vst %%v1,16(%%r1,%[y])\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[4]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27"); + + + register __vector double *vy = (__vector double *) y; + register __vector double *vptr_a0 = (__vector double *) a0; + register __vector double *vptr_a1 = (__vector double *) a1; + + for (i = 0; i < n; i += 4) { + + register __vector double vy_0 = vy[i]; + register __vector double vy_1 = vy[i + 1]; + register __vector double vy_2 = vy[i + 2]; + register __vector double vy_3 = vy[i + 3]; + + register __vector double va0 = vptr_a0[i]; + register __vector double va0_1 = vptr_a0[i + 1]; + register __vector double va0_2 = vptr_a0[i + 2]; + register __vector double va0_3 = vptr_a0[i + 3]; + + register __vector double va1 = vptr_a1[i]; + register __vector double va1_1 = vptr_a1[i + 1]; + register __vector double va1_2 = vptr_a1[i + 2]; + register __vector double va1_3 = vptr_a1[i + 3]; + + vy_0 += va0*vx0_r; + vy_1 += va0_1*vx0_r; + vy_2 += va0_2*vx0_r; + vy_3 += va0_3*vx0_r; + + va0 = vec_permi(va0, va0, 2); + va0_1 = vec_permi(va0_1, va0_1, 2); + va0_2 = vec_permi(va0_2, va0_2, 2); + va0_3 = vec_permi(va0_3, va0_3, 2); + + vy_0 += va1*vx1_r; + vy_1 += va1_1*vx1_r; + vy_2 += va1_2*vx1_r; + vy_3 += va1_3*vx1_r; + + va1 = vec_permi(va1, va1, 2); + va1_1 = vec_permi(va1_1, va1_1, 2); + va1_2 = vec_permi(va1_2, va1_2, 2); + va1_3 = vec_permi(va1_3, va1_3, 2); + + vy_0 += va0*vx0_i; + vy_1 += va0_1*vx0_i; + vy_2 += va0_2*vx0_i; + vy_3 += va0_3*vx0_i; + + vy_0 += va1*vx1_i; + vy_1 += va1_1*vx1_i; + vy_2 += va1_2*vx1_i; + vy_3 += va1_3*vx1_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; + + } } +#else + +static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + + for (i = 0; i < 2 * n; i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] - a1[i + 1] * x[3]; + y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2]; +#else + y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] + a1[i + 1] * x[3]; + y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2]; +#endif + } +} + +#endif + +#ifdef HAVE_KERNEL_4x1_VEC static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { - __asm__("vl %%v16,0(%[x])\n\t" + BLASLONG i; + FLOAT *a0; + a0 = ap; + + #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v17,8(%[x]),0\n\t" - "wflcdb %%v17,%%v17\n\t" - "vleg %%v17,0(%[x]),1\n\t" + + register __vector double vx0_r = {x[0], x[0]}; + register __vector double vx0_i = {-x[1], x[1]}; + +#else + register __vector double vx0_r = {x[0], -x[0]}; + register __vector double vx0_i = {x[1], x[1]}; +#endif + + + register __vector double *vy = (__vector double *) y; + register __vector double *vptr_a0 = (__vector double *) a0; + + for (i = 0; i < n; i += 4) { + + register __vector double vy_0 = vy[i]; + register __vector double vy_1 = vy[i + 1]; + register __vector double vy_2 = vy[i + 2]; + register __vector double vy_3 = vy[i + 3]; + + register __vector double va0 = vptr_a0[i]; + register __vector double va0_1 = vptr_a0[i + 1]; + register __vector double va0_2 = vptr_a0[i + 2]; + register __vector double va0_3 = vptr_a0[i + 3]; + + vy_0 += va0*vx0_r; + vy_1 += va0_1*vx0_r; + vy_2 += va0_2*vx0_r; + vy_3 += va0_3*vx0_r; + + va0 = vec_permi(va0, va0, 2); + va0_1 = vec_permi(va0_1, va0_1, 2); + va0_2 = vec_permi(va0_2, va0_2, 2); + va0_3 = vec_permi(va0_3, va0_3, 2); + + vy_0 += va0*vx0_i; + vy_1 += va0_1*vx0_i; + vy_2 += va0_2*vx0_i; + vy_3 += va0_3*vx0_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; + + } +} + #else - "vleg %%v17,0(%[x]),1\n\t" - "vflcdb %%v17,%%v17\n\t" - "vleg %%v17,8(%[x]),0\n\t" + +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0; + a0 = ap; + + for (i = 0; i < 2 * n; i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; +#else + y[i] += a0[i] * x[0] + a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0]; #endif - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap])\n\t" - "pfd 2,1024(%%r1,%[y])\n\t" - "vl %%v0,0(%%r1,%[y])\n\t" - "vl %%v1,16(%%r1,%[y])\n\t" - "vlrepg %%v18,0(%%r1,%[ap])\n\t" - "vlrepg %%v19,8(%%r1,%[ap])\n\t" - "vlrepg %%v20,16(%%r1,%[ap])\n\t" - "vlrepg %%v21,24(%%r1,%[ap])\n\t" - "vfmadb %%v0,%%v18,%%v16,%%v0\n\t" - "vfmadb %%v1,%%v20,%%v16,%%v1\n\t" - "vfmadb %%v0,%%v19,%%v17,%%v0\n\t" - "vfmadb %%v1,%%v21,%%v17,%%v1\n\t" - "vst %%v0,0(%%r1,%[y])\n\t" - "vst %%v1,16(%%r1,%[y])\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap), - "m"(*(const struct { FLOAT x[2]; } *) x),[x] "a"(x) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21"); + + } } -static void add_y_4(BLASLONG n, FLOAT *src, FLOAT *dest, FLOAT alpha_r, - FLOAT alpha_i) { - __asm__( -#if !defined(XCONJ) - "vlrepg %%v0,%[alpha_r]\n\t" - "vleg %%v1,%[alpha_i],0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,%[alpha_i],1\n\t" +#endif + +#ifdef HAVE_KERNEL_ADDY + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + + +#if !defined(XCONJ) + + register __vector double valpha_r = {alpha_r, alpha_r}; + register __vector double valpha_i = {-alpha_i, alpha_i}; + #else - "vleg %%v0,%[alpha_r],1\n\t" - "vflcdb %%v0,%%v0\n\t" - "vleg %%v0,%[alpha_r],0\n\t" - "vlrepg %%v1,%[alpha_i]\n\t" + register __vector double valpha_r = {alpha_r, -alpha_r}; + register __vector double valpha_i = {alpha_i, alpha_i}; #endif - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],2\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[src])\n\t" - "pfd 2,1024(%%r1,%[dest])\n\t" - "vl %%v16,0(%%r1,%[src])\n\t" - "vl %%v17,16(%%r1,%[src])\n\t" - "vl %%v18,32(%%r1,%[src])\n\t" - "vl %%v19,48(%%r1,%[src])\n\t" - "vl %%v20,0(%%r1,%[dest])\n\t" - "vl %%v21,16(%%r1,%[dest])\n\t" - "vl %%v22,32(%%r1,%[dest])\n\t" - "vl %%v23,48(%%r1,%[dest])\n\t" - "vpdi %%v24,%%v16,%%v16,4\n\t" - "vpdi %%v25,%%v17,%%v17,4\n\t" - "vpdi %%v26,%%v18,%%v18,4\n\t" - "vpdi %%v27,%%v19,%%v19,4\n\t" - "vfmadb %%v28,%%v16,%%v0,%%v20\n\t" - "vfmadb %%v29,%%v17,%%v0,%%v21\n\t" - "vfmadb %%v30,%%v18,%%v0,%%v22\n\t" - "vfmadb %%v31,%%v19,%%v0,%%v23\n\t" - "vfmadb %%v28,%%v24,%%v1,%%v28\n\t" - "vfmadb %%v29,%%v25,%%v1,%%v29\n\t" - "vfmadb %%v30,%%v26,%%v1,%%v30\n\t" - "vfmadb %%v31,%%v27,%%v1,%%v31\n\t" - "vst %%v28,0(%%r1,%[dest])\n\t" - "vst %%v29,16(%%r1,%[dest])\n\t" - "vst %%v30,32(%%r1,%[dest])\n\t" - "vst %%v31,48(%%r1,%[dest])\n\t" - "agfi %%r1,64\n\t" - "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) dest),[n] "+&r"(n) - : [dest] "a"(dest), "m"(*(const struct { FLOAT x[n * 2]; } *) src), - [src] "a"(src),[alpha_r] "Q"(alpha_r),[alpha_i] "Q"(alpha_i) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); + + register __vector double *vptr_src = (__vector double *) src; + if (inc_dest != 2) { + register __vector double *vptr_y = (__vector double *) dest; + //note that inc_dest is already 2x. so we should add it to double* + register __vector double *vptr_y1 = (__vector double *) (dest + inc_dest); + register __vector double *vptr_y2 = (__vector double *) (dest + 2 * inc_dest); + register __vector double *vptr_y3 = (__vector double *) (dest + 3 * inc_dest); + BLASLONG dest_t=0; + BLASLONG add_dest=inc_dest<<1; //inc_dest is already multiplied by 2, so for vector 4 we just multiply 2 times + for (i = 0; i < n; i += 4) { + + register __vector double vy_0=vptr_y[dest_t]; + register __vector double vy_1=vptr_y1[dest_t]; + register __vector double vy_2=vptr_y2[dest_t]; + register __vector double vy_3=vptr_y3[dest_t]; + + register __vector double vsrc = vptr_src[i]; + register __vector double vsrc_1 = vptr_src[i + 1]; + register __vector double vsrc_2 = vptr_src[i + 2]; + register __vector double vsrc_3 = vptr_src[i + 3]; + + vy_0 += vsrc*valpha_r; + vy_1 += vsrc_1*valpha_r; + vy_2 += vsrc_2*valpha_r; + vy_3 += vsrc_3*valpha_r; + + vsrc = vec_permi(vsrc, vsrc, 2); + vsrc_1 = vec_permi(vsrc_1, vsrc_1, 2); + vsrc_2 = vec_permi(vsrc_2, vsrc_2, 2); + vsrc_3 = vec_permi(vsrc_3, vsrc_3, 2); + + vy_0 += vsrc*valpha_i; + vy_1 += vsrc_1*valpha_i; + vy_2 += vsrc_2*valpha_i; + vy_3 += vsrc_3*valpha_i; + + vptr_y[dest_t] = vy_0; + vptr_y1[dest_t ] = vy_1; + vptr_y2[dest_t] = vy_2; + vptr_y3[dest_t] = vy_3; + + dest_t+=add_dest; + + } + + return; + } else { + register __vector double *vptr_y = (__vector double *) dest; + for (i = 0; i < n; i += 4) { + + register __vector double vy_0=vptr_y[i]; + register __vector double vy_1=vptr_y[i+1]; + register __vector double vy_2=vptr_y[i+2]; + register __vector double vy_3=vptr_y[i+3]; + + register __vector double vsrc = vptr_src[i]; + register __vector double vsrc_1 = vptr_src[i + 1]; + register __vector double vsrc_2 = vptr_src[i + 2]; + register __vector double vsrc_3 = vptr_src[i + 3]; + + vy_0 += vsrc*valpha_r; + vy_1 += vsrc_1*valpha_r; + vy_2 += vsrc_2*valpha_r; + vy_3 += vsrc_3*valpha_r; + + vsrc = vec_permi(vsrc, vsrc, 2); + vsrc_1 = vec_permi(vsrc_1, vsrc_1, 2); + vsrc_2 = vec_permi(vsrc_2, vsrc_2, 2); + vsrc_3 = vec_permi(vsrc_3, vsrc_3, 2); + + vy_0 += vsrc*valpha_i; + vy_1 += vsrc_1*valpha_i; + vy_2 += vsrc_2*valpha_i; + vy_3 += vsrc_3*valpha_i; + + vptr_y[i] = vy_0; + vptr_y[i + 1 ] = vy_1; + vptr_y[i + 2] = vy_2; + vptr_y[i + 3] = vy_3; + + } + + return; + } + return; } -static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, - FLOAT alpha_r, FLOAT alpha_i) { - BLASLONG i; +#else + +static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; - if (inc_dest != 2) { + if (inc_dest != 2) { - FLOAT temp_r; - FLOAT temp_i; - for (i = 0; i < n; i++) { -#if !defined(XCONJ) - temp_r = alpha_r * src[0] - alpha_i * src[1]; - temp_i = alpha_r * src[1] + alpha_i * src[0]; + FLOAT temp_r; + FLOAT temp_i; + for (i = 0; i < n; i++) { +#if !defined(XCONJ) + temp_r = alpha_r * src[0] - alpha_i * src[1]; + temp_i = alpha_r * src[1] + alpha_i * src[0]; #else - temp_r = alpha_r * src[0] + alpha_i * src[1]; - temp_i = -alpha_r * src[1] + alpha_i * src[0]; + temp_r = alpha_r * src[0] + alpha_i * src[1]; + temp_i = -alpha_r * src[1] + alpha_i * src[0]; #endif - *dest += temp_r; - *(dest + 1) += temp_i; + *dest += temp_r; + *(dest + 1) += temp_i; - src += 2; - dest += inc_dest; + src += 2; + dest += inc_dest; + } + return; } - return; - } - add_y_4(n, src, dest, alpha_r, alpha_i); -} + FLOAT temp_r0; + FLOAT temp_i0; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT temp_r2; + FLOAT temp_i2; + FLOAT temp_r3; + FLOAT temp_i3; + for (i = 0; i < n; i += 4) { +#if !defined(XCONJ) + temp_r0 = alpha_r * src[0] - alpha_i * src[1]; + temp_i0 = alpha_r * src[1] + alpha_i * src[0]; + temp_r1 = alpha_r * src[2] - alpha_i * src[3]; + temp_i1 = alpha_r * src[3] + alpha_i * src[2]; + temp_r2 = alpha_r * src[4] - alpha_i * src[5]; + temp_i2 = alpha_r * src[5] + alpha_i * src[4]; + temp_r3 = alpha_r * src[6] - alpha_i * src[7]; + temp_i3 = alpha_r * src[7] + alpha_i * src[6]; +#else + temp_r0 = alpha_r * src[0] + alpha_i * src[1]; + temp_i0 = -alpha_r * src[1] + alpha_i * src[0]; + temp_r1 = alpha_r * src[2] + alpha_i * src[3]; + temp_i1 = -alpha_r * src[3] + alpha_i * src[2]; + temp_r2 = alpha_r * src[4] + alpha_i * src[5]; + temp_i2 = -alpha_r * src[5] + alpha_i * src[4]; + temp_r3 = alpha_r * src[6] + alpha_i * src[7]; + temp_i3 = -alpha_r * src[7] + alpha_i * src[6]; +#endif -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, - FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, - BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - FLOAT *ap[4]; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - BLASLONG lda4; - FLOAT xbuffer[8], *ybuffer; - - if (m < 1) - return (0); - if (n < 1) - return (0); - - ybuffer = buffer; - - inc_x *= 2; - inc_y *= 2; - lda *= 2; - lda4 = 4 * lda; - - n1 = n / 4; - n2 = n % 4; - - m3 = m % 4; - m1 = m - (m % 4); - m2 = (m % NBMAX) - (m % 4); - - y_ptr = y; - - BLASLONG NB = NBMAX; - - while (NB == NBMAX) { - - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) - break; - NB = m2; + dest[0] += temp_r0; + dest[1] += temp_i0; + dest[2] += temp_r1; + dest[3] += temp_i1; + dest[4] += temp_r2; + dest[5] += temp_i2; + dest[6] += temp_r3; + dest[7] += temp_i3; + + src += 8; + dest += 8; } + return; +} +#endif - a_ptr = a; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - x_ptr = x; - //zero_y(NB,ybuffer); - memset(ybuffer, 0, NB * 16); - - if (inc_x == 2) { - - for (i = 0; i < n1; i++) { - zgemv_kernel_4x4(NB, ap, x_ptr, ybuffer); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - x_ptr += 8; - } - - if (n2 & 2) { - zgemv_kernel_4x2(NB, ap, x_ptr, ybuffer); - x_ptr += 4; - a_ptr += 2 * lda; - - } - - if (n2 & 1) { - zgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer); - /* x_ptr += 2; - a_ptr += lda; */ - - } - } else { + int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; - for (i = 0; i < n1; i++) { - - xbuffer[0] = x_ptr[0]; - xbuffer[1] = x_ptr[1]; - x_ptr += inc_x; - xbuffer[2] = x_ptr[0]; - xbuffer[3] = x_ptr[1]; - x_ptr += inc_x; - xbuffer[4] = x_ptr[0]; - xbuffer[5] = x_ptr[1]; - x_ptr += inc_x; - xbuffer[6] = x_ptr[0]; - xbuffer[7] = x_ptr[1]; - x_ptr += inc_x; - - zgemv_kernel_4x4(NB, ap, xbuffer, ybuffer); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - } - - for (i = 0; i < n2; i++) { - xbuffer[0] = x_ptr[0]; - xbuffer[1] = x_ptr[1]; - x_ptr += inc_x; - zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer); - a_ptr += 1 * lda; - - } + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; - } + FLOAT xbuffer[8], *ybuffer; + + if (m < 1) return (0); + if (n < 1) return (0); + + ybuffer = buffer; + + inc_x *= 2; + inc_y *= 2; + lda *= 2; + + n1 = n / 4; + n2 = n % 4; + + m3 = m % 4; + m1 = m - (m % 4); + m2 = (m % NBMAX) - (m % 4); + + y_ptr = y; - add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i); - a += 2 * NB; - y_ptr += NB * inc_y; - } + BLASLONG NB = NBMAX; - if (m3 == 0) - return (0); + while (NB == NBMAX) { - if (m3 == 1) { - a_ptr = a; - x_ptr = x; - FLOAT temp_r = 0.0; - FLOAT temp_i = 0.0; + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } - if (lda == 2 && inc_x == 2) { + a_ptr = a; - for (i = 0; i < (n & -2); i += 2) { + x_ptr = x; + //zero_y(NB,ybuffer); + memset(ybuffer, 0, NB * 16); + + if (inc_x == 2) { + + for (i = 0; i < n1; i++) { + zgemv_kernel_4x4(NB, lda, a_ptr, x_ptr, ybuffer); + + a_ptr += lda << 2; + x_ptr += 8; + } + + if (n2 & 2) { + zgemv_kernel_4x2(NB, lda, a_ptr, x_ptr, ybuffer); + x_ptr += 4; + a_ptr += 2 * lda; + + } + + if (n2 & 1) { + zgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer); + x_ptr += 2; + a_ptr += lda; + + } + } else { + + for (i = 0; i < n1; i++) { + + xbuffer[0] = x_ptr[0]; + xbuffer[1] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[2] = x_ptr[0]; + xbuffer[3] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[4] = x_ptr[0]; + xbuffer[5] = x_ptr[1]; + x_ptr += inc_x; + xbuffer[6] = x_ptr[0]; + xbuffer[7] = x_ptr[1]; + x_ptr += inc_x; + + zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer); + + a_ptr += lda << 2; + } + + for (i = 0; i < n2; i++) { + xbuffer[0] = x_ptr[0]; + xbuffer[1] = x_ptr[1]; + x_ptr += inc_x; + zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer); + a_ptr += lda; + + } + + } + + add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i); + a += 2 * NB; + y_ptr += NB * inc_y; + } + + if (m3 == 0) return (0); + + if (m3 == 1) { + a_ptr = a; + x_ptr = x; + FLOAT temp_r = 0.0; + FLOAT temp_i = 0.0; + + if (lda == 2 && inc_x == 2) { + + for (i = 0; i < (n & -2); i += 2) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; - temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3]; - temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2]; + temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3]; + temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2]; #else - temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; - temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3]; - temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2]; + temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3]; + temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2]; #endif - a_ptr += 4; - x_ptr += 4; - } + a_ptr += 4; + x_ptr += 4; + } - for (; i < n; i++) { + for (; i < n; i++) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; #else - temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; #endif - a_ptr += 2; - x_ptr += 2; - } + a_ptr += 2; + x_ptr += 2; + } - } else { + } else { - for (i = 0; i < n; i++) { + for (i = 0; i < n; i++) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; #else - temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; #endif - a_ptr += lda; - x_ptr += inc_x; - } + a_ptr += lda; + x_ptr += inc_x; + } - } -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + } +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; #endif - return (0); - } + return (0); + } - if (m3 == 2) { - a_ptr = a; - x_ptr = x; - FLOAT temp_r0 = 0.0; - FLOAT temp_i0 = 0.0; - FLOAT temp_r1 = 0.0; - FLOAT temp_i1 = 0.0; + if (m3 == 2) { + a_ptr = a; + x_ptr = x; + FLOAT temp_r0 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_i1 = 0.0; - if (lda == 4 && inc_x == 2) { + if (lda == 4 && inc_x == 2) { - for (i = 0; i < (n & -2); i += 2) { + for (i = 0; i < (n & -2); i += 2) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; - - temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3]; - temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2]; - temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3]; - temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2]; + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; + temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3]; + temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2]; + temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3]; + temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2]; #else - temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; - - temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3]; - temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2]; - temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3]; - temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2]; - + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; + + temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3]; + temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2]; + temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3]; + temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2]; #endif - a_ptr += 8; - x_ptr += 4; - } + a_ptr += 8; + x_ptr += 4; + } - for (; i < n; i++) { + for (; i < n; i++) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; #else - temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; #endif - a_ptr += 4; - x_ptr += 2; - } + a_ptr += 4; + x_ptr += 2; + } - } else { + } else { - for (i = 0; i < n; i++) { + for (i = 0; i < n; i++) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; #else - temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; #endif - a_ptr += lda; - x_ptr += inc_x; - } - - } -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; + a_ptr += lda; + x_ptr += inc_x; + } + + } +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; #else - y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; #endif - return (0); - } - - if (m3 == 3) { - a_ptr = a; - x_ptr = x; - FLOAT temp_r0 = 0.0; - FLOAT temp_i0 = 0.0; - FLOAT temp_r1 = 0.0; - FLOAT temp_i1 = 0.0; - FLOAT temp_r2 = 0.0; - FLOAT temp_i2 = 0.0; - - if (lda == 6 && inc_x == 2) { - - for (i = 0; i < n; i++) { + return (0); + } + + if (m3 == 3) { + a_ptr = a; + x_ptr = x; + FLOAT temp_r0 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_i1 = 0.0; + FLOAT temp_r2 = 0.0; + FLOAT temp_i2 = 0.0; + + if (lda == 6 && inc_x == 2) { + + for (i = 0; i < n; i++) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; - temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; - temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; #else - temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; - temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; #endif - a_ptr += 6; - x_ptr += 2; - } + a_ptr += 6; + x_ptr += 2; + } - } else { + } else { - for (i = 0; i < n; i++) { + for (i = 0; i < n; i++) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; - temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; - temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; + temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; #else - temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; - temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; - temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; - temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; - temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; - temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; + temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; + temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; + temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; + temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; + temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; + temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; #endif - a_ptr += lda; - x_ptr += inc_x; - } - - } -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; - y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; - y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2; - y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2; + a_ptr += lda; + x_ptr += inc_x; + } + + } +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2; + y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2; #else - y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; - y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; - y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; - y_ptr += inc_y; - y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2; - y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2; + y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2; + y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2; #endif - return (0); - } + return (0); + } + + return (0); + } - return (0); -} diff --git a/kernel/zarch/zgemv_t_4.c b/kernel/zarch/zgemv_t_4.c index 031c31e29..8b2be8394 100644 --- a/kernel/zarch/zgemv_t_4.c +++ b/kernel/zarch/zgemv_t_4.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2019, The OpenBLAS Project +Copyright (c) 2018, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,635 +23,825 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ + *****************************************************************************/ #include "common.h" #define NBMAX 1024 +#define HAVE_KERNEL_4x4_VEC 1 +#define HAVE_KERNEL_4x2_VEC 1 +#define HAVE_KERNEL_4x1_VEC 1 + +#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) +#include +#endif + +#ifdef HAVE_KERNEL_4x4_VEC_ASM + +#elif HAVE_KERNEL_4x4_VEC + +static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + //p for positive(real*real,image*image) r for image (real*image,image*real) + register __vector double vtemp0_p = {0.0, 0.0}; + register __vector double vtemp0_r = {0.0, 0.0}; + register __vector double vtemp1_p = {0.0, 0.0}; + register __vector double vtemp1_r = {0.0, 0.0}; + register __vector double vtemp2_p = {0.0, 0.0}; + register __vector double vtemp2_r = {0.0, 0.0}; + register __vector double vtemp3_p = {0.0, 0.0}; + register __vector double vtemp3_r = {0.0, 0.0}; + i = 0; + n = n << 1; + while (i < n) { +// __builtin_prefetch(&x[i]); +// __builtin_prefetch(&a0[i]); +// __builtin_prefetch(&a1[i]); +// __builtin_prefetch(&a2[i]); +// __builtin_prefetch(&a3[i]); + register __vector double vx_0 = *(__vector double*) (&x[i]); + register __vector double vx_1 = *(__vector double*) (&x[i + 2]); + register __vector double vx_2 = *(__vector double*) (&x[i + 4]); + register __vector double vx_3 = *(__vector double*) (&x[i + 6]); + + register __vector double va0 = *(__vector double*) (&a0[i]); + register __vector double va0_1 = *(__vector double*) (&a0[i + 2]); + register __vector double va0_2 = *(__vector double*) (&a0[i + 4]); + register __vector double va0_3 = *(__vector double*) (&a0[i + 6]); + + register __vector double va1 = *(__vector double*) (&a1[i]); + register __vector double va1_1 = *(__vector double*) (&a1[i + 2]); + register __vector double va1_2 = *(__vector double*) (&a1[i + 4]); + register __vector double va1_3 = *(__vector double*) (&a1[i + 6]); + + register __vector double va2 = *(__vector double*) (&a2[i]); + register __vector double va2_1 = *(__vector double*) (&a2[i + 2]); + register __vector double va2_2 = *(__vector double*) (&a2[i + 4]); + register __vector double va2_3 = *(__vector double*) (&a2[i + 6]); + + register __vector double va3 = *(__vector double*) (&a3[i]); + register __vector double va3_1 = *(__vector double*) (&a3[i + 2]); + register __vector double va3_2 = *(__vector double*) (&a3[i + 4]); + register __vector double va3_3 = *(__vector double*) (&a3[i + 6]); + + register __vector double vxr_0 = vec_permi(vx_0, vx_0, 2); + register __vector double vxr_1 = vec_permi(vx_1, vx_1, 2); + + i += 8; + + vtemp0_p += vx_0*va0; + vtemp0_r += vxr_0*va0; + + vtemp1_p += vx_0*va1; + vtemp1_r += vxr_0*va1; + + vtemp2_p += vx_0*va2; + vtemp2_r += vxr_0*va2; + + vtemp3_p += vx_0*va3; + vtemp3_r += vxr_0*va3; + + vtemp0_p += vx_1*va0_1; + vtemp0_r += vxr_1*va0_1; + + vtemp1_p += vx_1*va1_1; + vtemp1_r += vxr_1*va1_1; + vxr_0 = vec_permi(vx_2, vx_2, 2); + vtemp2_p += vx_1*va2_1; + vtemp2_r += vxr_1*va2_1; + + vtemp3_p += vx_1*va3_1; + vtemp3_r += vxr_1*va3_1; + + vtemp0_p += vx_2*va0_2; + vtemp0_r += vxr_0*va0_2; + vxr_1 = vec_permi(vx_3, vx_3, 2); + + vtemp1_p += vx_2*va1_2; + vtemp1_r += vxr_0*va1_2; + + vtemp2_p += vx_2*va2_2; + vtemp2_r += vxr_0*va2_2; + + vtemp3_p += vx_2*va3_2; + vtemp3_r += vxr_0*va3_2; + + vtemp0_p += vx_3*va0_3; + vtemp0_r += vxr_1*va0_3; + + vtemp1_p += vx_3*va1_3; + vtemp1_r += vxr_1*va1_3; + + vtemp2_p += vx_3*va2_3; + vtemp2_r += vxr_1*va2_3; + + vtemp3_p += vx_3*va3_3; + vtemp3_r += vxr_1*va3_3; + + } -static void zgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, - FLOAT *alpha) { - register FLOAT *ap0 = ap[0]; - register FLOAT *ap1 = ap[1]; - register FLOAT *ap2 = ap[2]; - register FLOAT *ap3 = ap[3]; - - __asm__("vzero %%v16\n\t" - "vzero %%v17\n\t" - "vzero %%v18\n\t" - "vzero %%v19\n\t" - "vzero %%v20\n\t" - "vzero %%v21\n\t" - "vzero %%v22\n\t" - "vzero %%v23\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[ap2])\n\t" - "pfd 1,1024(%%r1,%[ap3])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x])\n\t" #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v1,8(%%r1,%[x]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,0(%%r1,%[x]),1\n\t" + + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1]; + + register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1]; + register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1]; + + register FLOAT temp_r2 = vtemp2_p[0] - vtemp2_p[1]; + register FLOAT temp_i2 = vtemp2_r[0] + vtemp2_r[1]; + + register FLOAT temp_r3 = vtemp3_p[0] - vtemp3_p[1]; + register FLOAT temp_i3 = vtemp3_r[0] + vtemp3_r[1]; + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1]; + + register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1]; + register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1]; + + register FLOAT temp_r2 = vtemp2_p[0] + vtemp2_p[1]; + register FLOAT temp_i2 = vtemp2_r[0] - vtemp2_r[1]; + + register FLOAT temp_r3 = vtemp3_p[0] + vtemp3_p[1]; + register FLOAT temp_i3 = vtemp3_r[0] - vtemp3_r[1]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; + y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; + #else - "vleg %%v1,0(%%r1,%[x]),1\n\t" - "vflcdb %%v1,%%v1\n\t" - "vleg %%v1,8(%%r1,%[x]),0\n\t" + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; + y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; + y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; + #endif - "vlrepg %%v24,0(%%r1,%[ap0])\n\t" - "vlrepg %%v25,8(%%r1,%[ap0])\n\t" - "vlrepg %%v26,0(%%r1,%[ap1])\n\t" - "vlrepg %%v27,8(%%r1,%[ap1])\n\t" - "vlrepg %%v28,0(%%r1,%[ap2])\n\t" - "vlrepg %%v29,8(%%r1,%[ap2])\n\t" - "vlrepg %%v30,0(%%r1,%[ap3])\n\t" - "vlrepg %%v31,8(%%r1,%[ap3])\n\t" - "vfmadb %%v16,%%v24,%%v0,%%v16\n\t" - "vfmadb %%v20,%%v25,%%v1,%%v20\n\t" - "vfmadb %%v17,%%v26,%%v0,%%v17\n\t" - "vfmadb %%v21,%%v27,%%v1,%%v21\n\t" - "vfmadb %%v18,%%v28,%%v0,%%v18\n\t" - "vfmadb %%v22,%%v29,%%v1,%%v22\n\t" - "vfmadb %%v19,%%v30,%%v0,%%v19\n\t" - "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" - "vl %%v0,16(%%r1,%[x])\n\t" +} + +#else + +static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + + FLOAT temp_r0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_r2 = 0.0; + FLOAT temp_r3 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_i1 = 0.0; + FLOAT temp_i2 = 0.0; + FLOAT temp_i3 = 0.0; + + for (i = 0; i < 2 * n; i += 2) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v1,24(%%r1,%[x]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,16(%%r1,%[x]),1\n\t" + temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1]; + temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i]; + temp_r1 += a1[i] * x[i] - a1[i + 1] * x[i + 1]; + temp_i1 += a1[i] * x[i + 1] + a1[i + 1] * x[i]; + temp_r2 += a2[i] * x[i] - a2[i + 1] * x[i + 1]; + temp_i2 += a2[i] * x[i + 1] + a2[i + 1] * x[i]; + temp_r3 += a3[i] * x[i] - a3[i + 1] * x[i + 1]; + temp_i3 += a3[i] * x[i + 1] + a3[i + 1] * x[i]; #else - "vleg %%v1,16(%%r1,%[x]),1\n\t" - "vflcdb %%v1,%%v1\n\t" - "vleg %%v1,24(%%r1,%[x]),0\n\t" + temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1]; + temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i]; + temp_r1 += a1[i] * x[i] + a1[i + 1] * x[i + 1]; + temp_i1 += a1[i] * x[i + 1] - a1[i + 1] * x[i]; + temp_r2 += a2[i] * x[i] + a2[i + 1] * x[i + 1]; + temp_i2 += a2[i] * x[i + 1] - a2[i + 1] * x[i]; + temp_r3 += a3[i] * x[i] + a3[i + 1] * x[i + 1]; + temp_i3 += a3[i] * x[i + 1] - a3[i + 1] * x[i]; #endif - "vlrepg %%v24,16(%%r1,%[ap0])\n\t" - "vlrepg %%v25,24(%%r1,%[ap0])\n\t" - "vlrepg %%v26,16(%%r1,%[ap1])\n\t" - "vlrepg %%v27,24(%%r1,%[ap1])\n\t" - "vlrepg %%v28,16(%%r1,%[ap2])\n\t" - "vlrepg %%v29,24(%%r1,%[ap2])\n\t" - "vlrepg %%v30,16(%%r1,%[ap3])\n\t" - "vlrepg %%v31,24(%%r1,%[ap3])\n\t" - "vfmadb %%v16,%%v24,%%v0,%%v16\n\t" - "vfmadb %%v20,%%v25,%%v1,%%v20\n\t" - "vfmadb %%v17,%%v26,%%v0,%%v17\n\t" - "vfmadb %%v21,%%v27,%%v1,%%v21\n\t" - "vfmadb %%v18,%%v28,%%v0,%%v18\n\t" - "vfmadb %%v22,%%v29,%%v1,%%v22\n\t" - "vfmadb %%v19,%%v30,%%v0,%%v19\n\t" - "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v16,%%v16,%%v20\n\t" - "vfadb %%v17,%%v17,%%v21\n\t" - "vfadb %%v18,%%v18,%%v22\n\t" - "vfadb %%v19,%%v19,%%v23\n\t" - "vpdi %%v20,%%v16,%%v16,4\n\t" - "vpdi %%v21,%%v17,%%v17,4\n\t" - "vpdi %%v22,%%v18,%%v18,4\n\t" - "vpdi %%v23,%%v19,%%v19,4\n\t" + } + #if !defined(XCONJ) - "vlrepg %%v24,0(%[alpha])\n\t" - "vleg %%v25,8(%[alpha]),0\n\t" - "wflcdb %%v25,%%v25\n\t" - "vleg %%v25,8(%[alpha]),1\n\t" + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[5] += alpha_r * temp_i2 + alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 - alpha_i * temp_i3; + y[7] += alpha_r * temp_i3 + alpha_i * temp_r3; + #else - "vleg %%v24,0(%[alpha]),1\n\t" - "vflcdb %%v24,%%v24\n\t" - "vleg %%v24,0(%[alpha]),0\n\t" - "vlrepg %%v25,8(%[alpha])\n\t" + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + y[4] += alpha_r * temp_r2 + alpha_i * temp_i2; + y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2; + y[6] += alpha_r * temp_r3 + alpha_i * temp_i3; + y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3; + #endif - "vl %%v26,0(%[y])\n\t" - "vl %%v27,16(%[y])\n\t" - "vl %%v28,32(%[y])\n\t" - "vl %%v29,48(%[y])\n\t" - "vfmadb %%v26,%%v16,%%v24,%%v26\n\t" - "vfmadb %%v26,%%v20,%%v25,%%v26\n\t" - "vfmadb %%v27,%%v17,%%v24,%%v27\n\t" - "vfmadb %%v27,%%v21,%%v25,%%v27\n\t" - "vfmadb %%v28,%%v18,%%v24,%%v28\n\t" - "vfmadb %%v28,%%v22,%%v25,%%v28\n\t" - "vfmadb %%v29,%%v19,%%v24,%%v29\n\t" - "vfmadb %%v29,%%v23,%%v25,%%v29\n\t" - "vst %%v26,0(%[y])\n\t" - "vst %%v27,16(%[y])\n\t" - "vst %%v28,32(%[y])\n\t" - "vst %%v29,48(%[y])" - : "+m"(*(struct { FLOAT x[8]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap2),[ap2] "a"(ap2), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap3),[ap3] "a"(ap3), - "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); } -static void zgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, - FLOAT *alpha) { - register FLOAT *ap0 = ap[0]; - register FLOAT *ap1 = ap[1]; - - __asm__("vzero %%v16\n\t" - "vzero %%v17\n\t" - "vzero %%v18\n\t" - "vzero %%v19\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap0])\n\t" - "pfd 1,1024(%%r1,%[ap1])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x])\n\t" +#endif + +#ifdef HAVE_KERNEL_4x2_VEC + +static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + //p for positive(real*real,image*image) r for image (real*image,image*real) + register __vector double vtemp0_p = {0.0, 0.0}; + register __vector double vtemp0_r = {0.0, 0.0}; + register __vector double vtemp1_p = {0.0, 0.0}; + register __vector double vtemp1_r = {0.0, 0.0}; + i = 0; + n = n << 1; + while (i < n) { + + register __vector double vx_0 = *(__vector double*) (&x[i]); + register __vector double vx_1 = *(__vector double*) (&x[i + 2]); + register __vector double vx_2 = *(__vector double*) (&x[i + 4]); + register __vector double vx_3 = *(__vector double*) (&x[i + 6]); + + register __vector double va0 = *(__vector double*) (&a0[i]); + register __vector double va0_1 = *(__vector double*) (&a0[i + 2]); + register __vector double va0_2 = *(__vector double*) (&a0[i + 4]); + register __vector double va0_3 = *(__vector double*) (&a0[i + 6]); + + register __vector double va1 = *(__vector double*) (&a1[i]); + register __vector double va1_1 = *(__vector double*) (&a1[i + 2]); + register __vector double va1_2 = *(__vector double*) (&a1[i + 4]); + register __vector double va1_3 = *(__vector double*) (&a1[i + 6]); + + register __vector double vxr_0 = vec_permi(vx_0, vx_0, 2); + register __vector double vxr_1 = vec_permi(vx_1, vx_1, 2); + + i += 8; + + vtemp0_p += vx_0*va0; + vtemp0_r += vxr_0*va0; + + vtemp1_p += vx_0*va1; + vtemp1_r += vxr_0*va1; + + vxr_0 = vec_permi(vx_2, vx_2, 2); + vtemp0_p += vx_1*va0_1; + vtemp0_r += vxr_1*va0_1; + + vtemp1_p += vx_1*va1_1; + vtemp1_r += vxr_1*va1_1; + vxr_1 = vec_permi(vx_3, vx_3, 2); + + vtemp0_p += vx_2*va0_2; + vtemp0_r += vxr_0*va0_2; + + vtemp1_p += vx_2*va1_2; + vtemp1_r += vxr_0*va1_2; + + vtemp0_p += vx_3*va0_3; + vtemp0_r += vxr_1*va0_3; + + vtemp1_p += vx_3*va1_3; + vtemp1_r += vxr_1*va1_3; + + } + #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v1,8(%%r1,%[x]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,0(%%r1,%[x]),1\n\t" + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1]; + + register FLOAT temp_r1 = vtemp1_p[0] - vtemp1_p[1]; + register FLOAT temp_i1 = vtemp1_r[0] + vtemp1_r[1]; + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1]; + + register FLOAT temp_r1 = vtemp1_p[0] + vtemp1_p[1]; + register FLOAT temp_i1 = vtemp1_r[0] - vtemp1_r[1]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + #else - "vleg %%v1,0(%%r1,%[x]),1\n\t" - "vflcdb %%v1,%%v1\n\t" - "vleg %%v1,8(%%r1,%[x]),0\n\t" + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + #endif - "vlrepg %%v20,0(%%r1,%[ap0])\n\t" - "vlrepg %%v21,8(%%r1,%[ap0])\n\t" - "vlrepg %%v22,0(%%r1,%[ap1])\n\t" - "vlrepg %%v23,8(%%r1,%[ap1])\n\t" - "vfmadb %%v16,%%v20,%%v0,%%v16\n\t" - "vfmadb %%v18,%%v21,%%v1,%%v18\n\t" - "vfmadb %%v17,%%v22,%%v0,%%v17\n\t" - "vfmadb %%v19,%%v23,%%v1,%%v19\n\t" - "vl %%v0,16(%%r1,%[x])\n\t" +} + +#else + +static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + FLOAT *a0, *a1; + a0 = ap; + a1 = ap + lda; + + FLOAT temp_r0 = 0.0; + FLOAT temp_r1 = 0.0; + FLOAT temp_i0 = 0.0; + FLOAT temp_i1 = 0.0; + + for (i = 0; i < 2 * n; i += 2) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v1,24(%%r1,%[x]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,16(%%r1,%[x]),1\n\t" + temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1]; + temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i]; + temp_r1 += a1[i] * x[i] - a1[i + 1] * x[i + 1]; + temp_i1 += a1[i] * x[i + 1] + a1[i + 1] * x[i]; #else - "vleg %%v1,16(%%r1,%[x]),1\n\t" - "vflcdb %%v1,%%v1\n\t" - "vleg %%v1,24(%%r1,%[x]),0\n\t" + temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1]; + temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i]; + temp_r1 += a1[i] * x[i] + a1[i + 1] * x[i + 1]; + temp_i1 += a1[i] * x[i + 1] - a1[i + 1] * x[i]; #endif - "vlrepg %%v20,16(%%r1,%[ap0])\n\t" - "vlrepg %%v21,24(%%r1,%[ap0])\n\t" - "vlrepg %%v22,16(%%r1,%[ap1])\n\t" - "vlrepg %%v23,24(%%r1,%[ap1])\n\t" - "vfmadb %%v16,%%v20,%%v0,%%v16\n\t" - "vfmadb %%v18,%%v21,%%v1,%%v18\n\t" - "vfmadb %%v17,%%v22,%%v0,%%v17\n\t" - "vfmadb %%v19,%%v23,%%v1,%%v19\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v16,%%v16,%%v18\n\t" - "vfadb %%v17,%%v17,%%v19\n\t" - "vpdi %%v18,%%v16,%%v16,4\n\t" - "vpdi %%v19,%%v17,%%v17,4\n\t" + } + #if !defined(XCONJ) - "vlrepg %%v20,0(%[alpha])\n\t" - "vleg %%v21,8(%[alpha]),0\n\t" - "wflcdb %%v21,%%v21\n\t" - "vleg %%v21,8(%[alpha]),1\n\t" + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 - alpha_i * temp_i1; + y[3] += alpha_r * temp_i1 + alpha_i * temp_r1; + #else - "vleg %%v20,0(%[alpha]),1\n\t" - "vflcdb %%v20,%%v20\n\t" - "vleg %%v20,0(%[alpha]),0\n\t" - "vlrepg %%v21,8(%[alpha])\n\t" + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + y[2] += alpha_r * temp_r1 + alpha_i * temp_i1; + y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1; + #endif - "vl %%v22,0(%[y])\n\t" - "vl %%v23,16(%[y])\n\t" - "vfmadb %%v22,%%v16,%%v20,%%v22\n\t" - "vfmadb %%v22,%%v18,%%v21,%%v22\n\t" - "vfmadb %%v23,%%v17,%%v20,%%v23\n\t" - "vfmadb %%v23,%%v19,%%v21,%%v23\n\t" - "vst %%v22,0(%[y])\n\t" - "vst %%v23,16(%[y])\n\t" - : "+m"(*(struct { FLOAT x[4]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap0),[ap0] "a"(ap0), - "m"(*(const struct { FLOAT x[n * 2]; } *) ap1),[ap1] "a"(ap1), - "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23"); } -static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, - FLOAT *alpha) { - __asm__("vzero %%v16\n\t" - "vzero %%v17\n\t" - "xgr %%r1,%%r1\n\t" - "srlg %[n],%[n],1\n\t" - "0:\n\t" - "pfd 1,1024(%%r1,%[ap])\n\t" - "pfd 1,1024(%%r1,%[x])\n\t" - "vl %%v0,0(%%r1,%[x])\n\t" +#endif + +#ifdef HAVE_KERNEL_4x1_VEC + +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + FLOAT *a0 ; + a0 = ap; + //p for positive(real*real,image*image) r for image (real*image,image*real) + register __vector double vtemp0_p = {0.0, 0.0}; + register __vector double vtemp0_r = {0.0, 0.0}; + i = 0; + n = n << 1; + while (i < n) { + + register __vector double vx_0 = *(__vector double*) (&x[i]); + register __vector double vx_1 = *(__vector double*) (&x[i + 2]); + register __vector double vx_2 = *(__vector double*) (&x[i + 4]); + register __vector double vx_3 = *(__vector double*) (&x[i + 6]); + + register __vector double va0 = *(__vector double*) (&a0[i]); + register __vector double va0_1 = *(__vector double*) (&a0[i + 2]); + register __vector double va0_2 = *(__vector double*) (&a0[i + 4]); + register __vector double va0_3 = *(__vector double*) (&a0[i + 6]); + + register __vector double vxr_0 = vec_permi(vx_0, vx_0, 2); + register __vector double vxr_1 = vec_permi(vx_1, vx_1, 2); + + i += 8; + + vtemp0_p += vx_0*va0; + vtemp0_r += vxr_0*va0; + + vxr_0 = vec_permi(vx_2, vx_2, 2); + vtemp0_p += vx_1*va0_1; + vtemp0_r += vxr_1*va0_1; + + vxr_1 = vec_permi(vx_3, vx_3, 2); + + vtemp0_p += vx_2*va0_2; + vtemp0_r += vxr_0*va0_2; + + vtemp0_p += vx_3*va0_3; + vtemp0_r += vxr_1*va0_3; + + } + #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v1,8(%%r1,%[x]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,0(%%r1,%[x]),1\n\t" + register FLOAT temp_r0 = vtemp0_p[0] - vtemp0_p[1]; + register FLOAT temp_i0 = vtemp0_r[0] + vtemp0_r[1]; + +#else + register FLOAT temp_r0 = vtemp0_p[0] + vtemp0_p[1]; + register FLOAT temp_i0 = vtemp0_r[0] - vtemp0_r[1]; + +#endif + +#if !defined(XCONJ) + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + #else - "vleg %%v1,0(%%r1,%[x]),1\n\t" - "vflcdb %%v1,%%v1\n\t" - "vleg %%v1,8(%%r1,%[x]),0\n\t" + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; #endif - "vlrepg %%v18,0(%%r1,%[ap])\n\t" - "vlrepg %%v19,8(%%r1,%[ap])\n\t" - "vfmadb %%v16,%%v18,%%v0,%%v16\n\t" - "vfmadb %%v17,%%v19,%%v1,%%v17\n\t" - "vl %%v0,16(%%r1,%[x])\n\t" + +} + +#else + +static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) { + BLASLONG i; + FLOAT *a0; + a0 = ap; + + FLOAT temp_r0 = 0.0; + FLOAT temp_i0 = 0.0; + + for (i = 0; i < 2 * n; i += 2) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - "vleg %%v1,24(%%r1,%[x]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,16(%%r1,%[x]),1\n\t" + temp_r0 += a0[i] * x[i] - a0[i + 1] * x[i + 1]; + temp_i0 += a0[i] * x[i + 1] + a0[i + 1] * x[i]; #else - "vleg %%v1,16(%%r1,%[x]),1\n\t" - "vflcdb %%v1,%%v1\n\t" - "vleg %%v1,24(%%r1,%[x]),0\n\t" + temp_r0 += a0[i] * x[i] + a0[i + 1] * x[i + 1]; + temp_i0 += a0[i] * x[i + 1] - a0[i + 1] * x[i]; #endif - "vlrepg %%v18,16(%%r1,%[ap])\n\t" - "vlrepg %%v19,24(%%r1,%[ap])\n\t" - "vfmadb %%v16,%%v18,%%v0,%%v16\n\t" - "vfmadb %%v17,%%v19,%%v1,%%v17\n\t" - "agfi %%r1,32\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v16,%%v16,%%v17\n\t" - "vpdi %%v17,%%v16,%%v16,4\n\t" + } + #if !defined(XCONJ) - "vlrepg %%v18,0(%[alpha])\n\t" - "vleg %%v19,8(%[alpha]),0\n\t" - "wflcdb %%v19,%%v19\n\t" - "vleg %%v19,8(%[alpha]),1\n\t" + + y[0] += alpha_r * temp_r0 - alpha_i * temp_i0; + y[1] += alpha_r * temp_i0 + alpha_i * temp_r0; + #else - "vleg %%v18,0(%[alpha]),1\n\t" - "vflcdb %%v18,%%v18\n\t" - "vleg %%v18,0(%[alpha]),0\n\t" - "vlrepg %%v19,8(%[alpha])\n\t" + + y[0] += alpha_r * temp_r0 + alpha_i * temp_i0; + y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; + #endif - "vl %%v0,0(%[y])\n\t" - "vfmadb %%v0,%%v16,%%v18,%%v0\n\t" - "vfmadb %%v0,%%v17,%%v19,%%v0\n\t" - "vst %%v0,0(%[y])\n\t" - : "+m"(*(struct { FLOAT x[2]; } *) y),[n] "+&r"(n) - : [y] "a"(y), "m"(*(const struct { FLOAT x[n * 2]; } *) ap),[ap] "a"(ap), - "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x), - "m"(*(const struct { FLOAT x[2]; } *) alpha),[alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19"); + } -static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { - BLASLONG i; - for (i = 0; i < n; i++) { - *dest = *src; - *(dest + 1) = *(src + 1); - dest += 2; - src += inc_src; - } +#endif + +static __attribute__((always_inline)) void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { + BLASLONG i; + for (i = 0; i < n; i++) { + *dest = *src; + *(dest + 1) = *(src + 1); + dest += 2; + src += inc_src; + } } -int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, - FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, - BLASLONG inc_y, FLOAT *buffer) { - BLASLONG i; - BLASLONG j; - FLOAT *a_ptr; - FLOAT *x_ptr; - FLOAT *y_ptr; - FLOAT *ap[8]; - BLASLONG n1; - BLASLONG m1; - BLASLONG m2; - BLASLONG m3; - BLASLONG n2; - BLASLONG lda4; - FLOAT ybuffer[8], *xbuffer; - FLOAT alpha[2]; - - if (m < 1) - return (0); - if (n < 1) - return (0); +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { + BLASLONG i; + BLASLONG j; + FLOAT *a_ptr; + FLOAT *x_ptr; + FLOAT *y_ptr; - inc_x <<= 1; - inc_y <<= 1; - lda <<= 1; - lda4 = lda << 2; + BLASLONG n1; + BLASLONG m1; + BLASLONG m2; + BLASLONG m3; + BLASLONG n2; - xbuffer = buffer; + FLOAT ybuffer[8], *xbuffer; - n1 = n >> 2; - n2 = n & 3; + if (m < 1) return (0); + if (n < 1) return (0); - m3 = m & 3; - m1 = m - m3; - m2 = (m & (NBMAX - 1)) - m3; + inc_x <<= 1; + inc_y <<= 1; + lda <<= 1; - alpha[0] = alpha_r; - alpha[1] = alpha_i; + xbuffer = buffer; - BLASLONG NB = NBMAX; + n1 = n >> 2; + n2 = n & 3; - while (NB == NBMAX) { + m3 = m & 3; + m1 = m - m3; + m2 = (m & (NBMAX - 1)) - m3; - m1 -= NB; - if (m1 < 0) { - if (m2 == 0) - break; - NB = m2; - } + BLASLONG NB = NBMAX; - y_ptr = y; - a_ptr = a; - x_ptr = x; - ap[0] = a_ptr; - ap[1] = a_ptr + lda; - ap[2] = ap[1] + lda; - ap[3] = ap[2] + lda; - if (inc_x != 2) - copy_x(NB, x_ptr, xbuffer, inc_x); - else - xbuffer = x_ptr; - - if (inc_y == 2) { - - for (i = 0; i < n1; i++) { - zgemv_kernel_4x4(NB, ap, xbuffer, y_ptr, alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - y_ptr += 8; - - } - - if (n2 & 2) { - zgemv_kernel_4x2(NB, ap, xbuffer, y_ptr, alpha); - a_ptr += lda * 2; - y_ptr += 4; - - } - - if (n2 & 1) { - zgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha); - /* a_ptr += lda; - y_ptr += 2; */ - - } - - } else { - - for (i = 0; i < n1; i++) { - memset(ybuffer, 0, sizeof(ybuffer)); - zgemv_kernel_4x4(NB, ap, xbuffer, ybuffer, alpha); - ap[0] += lda4; - ap[1] += lda4; - ap[2] += lda4; - ap[3] += lda4; - a_ptr += lda4; - - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[2]; - y_ptr[1] += ybuffer[3]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[4]; - y_ptr[1] += ybuffer[5]; - y_ptr += inc_y; - y_ptr[0] += ybuffer[6]; - y_ptr[1] += ybuffer[7]; - y_ptr += inc_y; - - } - - for (i = 0; i < n2; i++) { - memset(ybuffer, 0, sizeof(ybuffer)); - zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha); - a_ptr += lda; - y_ptr[0] += ybuffer[0]; - y_ptr[1] += ybuffer[1]; - y_ptr += inc_y; - - } + while (NB == NBMAX) { + + m1 -= NB; + if (m1 < 0) { + if (m2 == 0) break; + NB = m2; + } + + y_ptr = y; + a_ptr = a; + x_ptr = x; + if (inc_x != 2) + copy_x(NB, x_ptr, xbuffer, inc_x); + else + xbuffer = x_ptr; + + if (inc_y == 2) { + + for (i = 0; i < n1; i++) { + zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda << 2; + y_ptr += 8; + + } + + if (n2 & 2) { + zgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda << 1; + y_ptr += 4; + + } + + if (n2 & 1) { + zgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); + a_ptr += lda; + y_ptr += 2; + + } + + } else { + + for (i = 0; i < n1; i++) { + memset(ybuffer, 0, sizeof (ybuffer)); + zgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); + + a_ptr += lda << 2; + + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[2]; + y_ptr[1] += ybuffer[3]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[4]; + y_ptr[1] += ybuffer[5]; + y_ptr += inc_y; + y_ptr[0] += ybuffer[6]; + y_ptr[1] += ybuffer[7]; + y_ptr += inc_y; + + } + + for (i = 0; i < n2; i++) { + memset(ybuffer, 0, sizeof (ybuffer)); + zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); + a_ptr += lda; + y_ptr[0] += ybuffer[0]; + y_ptr[1] += ybuffer[1]; + y_ptr += inc_y; + + } + + } + a += 2 * NB; + x += NB * inc_x; } - a += 2 * NB; - x += NB * inc_x; - } - if (m3 == 0) - return (0); + if (m3 == 0) return (0); + + x_ptr = x; + j = 0; + a_ptr = a; + y_ptr = y; - x_ptr = x; - j = 0; - a_ptr = a; - y_ptr = y; - - if (m3 == 3) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x4 = x_ptr[0]; - FLOAT x5 = x_ptr[1]; - while (j < n) { + if (m3 == 3) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x4 = x_ptr[0]; + FLOAT x5 = x_ptr[1]; + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; - temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; + temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; #endif -#if !defined(XCONJ) - y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; - y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; #else - y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; - y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); } - return (0); - } - - if (m3 == 2) { - - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - x_ptr += inc_x; - FLOAT x2 = x_ptr[0]; - FLOAT x3 = x_ptr[1]; - FLOAT ar = alpha[0]; - FLOAT ai = alpha[1]; - - while (j < (n & -2)) { + + if (m3 == 2) { + + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; + x_ptr += inc_x; + FLOAT x2 = x_ptr[0]; + FLOAT x3 = x_ptr[1]; + + while (j < (n & -2)) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 - ai * temp_i1; - y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 + ai * temp_i1; - y_ptr[1] -= ar * temp_i1 - ai * temp_r1; + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } - while (j < n) { + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; - temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; + temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; - } + a_ptr += lda; + y_ptr += inc_y; + j++; + } - return (0); - } + return (0); + } - if (m3 == 1) { + if (m3 == 1) { - FLOAT temp_r; - FLOAT temp_i; - FLOAT temp_r1; - FLOAT temp_i1; - FLOAT x0 = x_ptr[0]; - FLOAT x1 = x_ptr[1]; - FLOAT ar = alpha[0]; - FLOAT ai = alpha[1]; + FLOAT temp_r; + FLOAT temp_i; + FLOAT temp_r1; + FLOAT temp_i1; + FLOAT x0 = x_ptr[0]; + FLOAT x1 = x_ptr[1]; - while (j < (n & -2)) { + while (j < (n & -2)) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; - a_ptr += lda; - temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + a_ptr += lda; + temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 - ai * temp_i1; - y_ptr[1] += ar * temp_i1 + ai * temp_r1; +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; + y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; - y_ptr += inc_y; - y_ptr[0] += ar * temp_r1 + ai * temp_i1; - y_ptr[1] -= ar * temp_i1 - ai * temp_r1; + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; + y_ptr += inc_y; + y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; + y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; #endif - a_ptr += lda; - y_ptr += inc_y; - j += 2; - } + a_ptr += lda; + y_ptr += inc_y; + j += 2; + } - while (j < n) { + while (j < n) { #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) - temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; #else - temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; - temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; + temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; + temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; #endif -#if !defined(XCONJ) - y_ptr[0] += ar * temp_r - ai * temp_i; - y_ptr[1] += ar * temp_i + ai * temp_r; +#if !defined(XCONJ) + y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; + y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; #else - y_ptr[0] += ar * temp_r + ai * temp_i; - y_ptr[1] -= ar * temp_i - ai * temp_r; + y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; + y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; #endif - a_ptr += lda; - y_ptr += inc_y; - j++; + a_ptr += lda; + y_ptr += inc_y; + j++; + } + return (0); } + return (0); - } - return (0); } + diff --git a/kernel/zarch/zrot.c b/kernel/zarch/zrot.c index 6284d5a47..380f0140e 100644 --- a/kernel/zarch/zrot.c +++ b/kernel/zarch/zrot.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project +Copyright (c) 2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -27,210 +27,235 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) { - __asm__("vlrepg %%v0,%[c]\n\t" - "vlrepg %%v1,%[s]\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v24, 0(%%r1,%[x])\n\t" - "vl %%v25, 16(%%r1,%[x])\n\t" - "vl %%v26, 32(%%r1,%[x])\n\t" - "vl %%v27, 48(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[y])\n\t" - "vl %%v17, 16(%%r1,%[y])\n\t" - "vl %%v18, 32(%%r1,%[y])\n\t" - "vl %%v19, 48(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 0(%%r1,%[x])\n\t" - "vst %%v29, 16(%%r1,%[x])\n\t" - "vst %%v30, 32(%%r1,%[x])\n\t" - "vst %%v31, 48(%%r1,%[x])\n\t" - "vst %%v20, 0(%%r1,%[y])\n\t" - "vst %%v21, 16(%%r1,%[y])\n\t" - "vst %%v22, 32(%%r1,%[y])\n\t" - "vst %%v23, 48(%%r1,%[y])\n\t" - "vl %%v24, 64(%%r1,%[x])\n\t" - "vl %%v25, 80(%%r1,%[x])\n\t" - "vl %%v26, 96(%%r1,%[x])\n\t" - "vl %%v27, 112(%%r1,%[x])\n\t" - "vl %%v16, 64(%%r1,%[y])\n\t" - "vl %%v17, 80(%%r1,%[y])\n\t" - "vl %%v18, 96(%%r1,%[y])\n\t" - "vl %%v19, 112(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 64(%%r1,%[x])\n\t" - "vst %%v29, 80(%%r1,%[x])\n\t" - "vst %%v30, 96(%%r1,%[x])\n\t" - "vst %%v31, 112(%%r1,%[x])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v16, 128(%%r1,%[y])\n\t" - "vl %%v17, 144(%%r1,%[y])\n\t" - "vl %%v18, 160(%%r1,%[y])\n\t" - "vl %%v19, 176(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 128(%%r1,%[x])\n\t" - "vst %%v29, 144(%%r1,%[x])\n\t" - "vst %%v30, 160(%%r1,%[x])\n\t" - "vst %%v31, 176(%%r1,%[x])\n\t" - "vst %%v20, 128(%%r1,%[y])\n\t" - "vst %%v21, 144(%%r1,%[y])\n\t" - "vst %%v22, 160(%%r1,%[y])\n\t" - "vst %%v23, 176(%%r1,%[y])\n\t" - "vl %%v24, 192(%%r1,%[x])\n\t" - "vl %%v25, 208(%%r1,%[x])\n\t" - "vl %%v26, 224(%%r1,%[x])\n\t" - "vl %%v27, 240(%%r1,%[x])\n\t" - "vl %%v16, 192(%%r1,%[y])\n\t" - "vl %%v17, 208(%%r1,%[y])\n\t" - "vl %%v18, 224(%%r1,%[y])\n\t" - "vl %%v19, 240(%%r1,%[y])\n\t" - "vfmdb %%v28,%%v24,%%v0\n\t" - "vfmdb %%v29,%%v25,%%v0\n\t" - "vfmdb %%v20,%%v24,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v21,%%v25,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v30,%%v26,%%v0\n\t" - "vfmdb %%v22,%%v26,%%v1\n\t" /* yn=x*s */ - "vfmdb %%v31,%%v27,%%v0\n\t" - "vfmdb %%v23,%%v27,%%v1\n\t" /* yn=x*s */ - /* 2nd parts */ - "vfmadb %%v28,%%v16,%%v1,%%v28\n\t" - "vfmsdb %%v20,%%v16,%%v0,%%v20\n\t" /* yn=y*c-yn */ - "vfmadb %%v29,%%v17,%%v1,%%v29\n\t" - "vfmsdb %%v21,%%v17,%%v0,%%v21\n\t" /* yn=y*c-yn */ - "vfmadb %%v30,%%v18,%%v1,%%v30\n\t" - "vfmsdb %%v22,%%v18,%%v0,%%v22\n\t" /* yn=y*c-yn */ - "vfmadb %%v31,%%v19,%%v1,%%v31\n\t" - "vfmsdb %%v23,%%v19,%%v0,%%v23\n\t" /* yn=y*c-yn */ - "vst %%v28, 192(%%r1,%[x])\n\t" - "vst %%v29, 208(%%r1,%[x])\n\t" - "vst %%v30, 224(%%r1,%[x])\n\t" - "vst %%v31, 240(%%r1,%[x])\n\t" - "vst %%v20, 192(%%r1,%[y])\n\t" - "vst %%v21, 208(%%r1,%[y])\n\t" - "vst %%v22, 224(%%r1,%[y])\n\t" - "vst %%v23, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x), - "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y),[c] "Q"(*c),[s] "Q"(*s) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); +static void zrot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT cosA, FLOAT sinA) +{ + __asm__ ( + "pfd 2, 0(%[ptr_x]) \n\t" + "pfd 2, 0(%[ptr_y]) \n\t" + "lgdr %%r1,%[cos] \n\t" + "vlvgp %%v0,%%r1,%%r1 \n\t" + "lgdr %%r1,%[sin] \n\t" + "vlvgp %%v1,%%r1,%%r1 \n\t" + "sllg %[tmp],%[tmp],4 \n\t" + "xgr %%r1,%%r1 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 2, 256(%%r1,%[ptr_x]) \n\t" + "pfd 2, 256(%%r1,%[ptr_y]) \n\t" + "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" + "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" + "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" + "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" + "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" + "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" + "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" + "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 0(%%r1,%[ptr_x]) \n\t" + "vst %%v29, 16(%%r1,%[ptr_x]) \n\t" + "vst %%v30, 32(%%r1,%[ptr_x]) \n\t" + "vst %%v31, 48(%%r1,%[ptr_x]) \n\t" + "vst %%v20, 0(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 16(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 32(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 48(%%r1,%[ptr_y]) \n\t" + + "vl %%v24, 64(%%r1,%[ptr_x]) \n\t" + "vl %%v25, 80(%%r1,%[ptr_x]) \n\t" + "vl %%v26, 96(%%r1,%[ptr_x]) \n\t" + "vl %%v27,112(%%r1,%[ptr_x]) \n\t" + "vl %%v16, 64(%%r1,%[ptr_y]) \n\t" + "vl %%v17, 80(%%r1,%[ptr_y]) \n\t" + "vl %%v18, 96(%%r1,%[ptr_y]) \n\t" + "vl %%v19,112(%%r1,%[ptr_y]) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 64(%%r1,%[ptr_x]) \n\t" + "vst %%v29, 80(%%r1,%[ptr_x]) \n\t" + "vst %%v30, 96(%%r1,%[ptr_x]) \n\t" + "vst %%v31, 112(%%r1,%[ptr_x]) \n\t" + "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" + + "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" + "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" + "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" + "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" + "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" + "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" + "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" + "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 128(%%r1,%[ptr_x]) \n\t" + "vst %%v29, 144(%%r1,%[ptr_x]) \n\t" + "vst %%v30, 160(%%r1,%[ptr_x]) \n\t" + "vst %%v31, 176(%%r1,%[ptr_x]) \n\t" + "vst %%v20, 128(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 144(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 160(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 176(%%r1,%[ptr_y]) \n\t" + + "vl %%v24, 192(%%r1,%[ptr_x]) \n\t" + "vl %%v25, 208(%%r1,%[ptr_x]) \n\t" + "vl %%v26, 224(%%r1,%[ptr_x]) \n\t" + "vl %%v27, 240(%%r1,%[ptr_x]) \n\t" + "vl %%v16, 192(%%r1,%[ptr_y]) \n\t" + "vl %%v17, 208(%%r1,%[ptr_y]) \n\t" + "vl %%v18, 224(%%r1,%[ptr_y]) \n\t" + "vl %%v19, 240(%%r1,%[ptr_y]) \n\t" + + "vfmdb %%v28,%%v24,%%v0 \n\t" + "vfmdb %%v29,%%v25,%%v0 \n\t" + "vfmdb %%v20,%%v24,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v21,%%v25,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v30,%%v26,%%v0 \n\t" + "vfmdb %%v22,%%v26,%%v1 \n\t" /* yn=x*s */ + "vfmdb %%v31,%%v27,%%v0 \n\t" + "vfmdb %%v23,%%v27,%%v1 \n\t" /* yn=x*s */ + /* 2nd parts*/ + "vfmadb %%v28,%%v16,%%v1,%%v28 \n\t" + "vfmsdb %%v20,%%v16,%%v0,%%v20 \n\t" /* yn=y*c-yn */ + "vfmadb %%v29,%%v17,%%v1,%%v29 \n\t" + "vfmsdb %%v21,%%v17,%%v0,%%v21 \n\t" /* yn=y*c-yn */ + "vfmadb %%v30,%%v18,%%v1,%%v30 \n\t" + "vfmsdb %%v22,%%v18,%%v0,%%v22 \n\t" /* yn=y*c-yn */ + "vfmadb %%v31,%%v19,%%v1,%%v31 \n\t" + "vfmsdb %%v23,%%v19,%%v0,%%v23 \n\t" /* yn=y*c-yn */ + + "vst %%v28, 192(%%r1,%[ptr_x]) \n\t" + "vst %%v29, 208(%%r1,%[ptr_x]) \n\t" + "vst %%v30, 224(%%r1,%[ptr_x]) \n\t" + "vst %%v31, 240(%%r1,%[ptr_x]) \n\t" + "vst %%v20, 192(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 208(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 224(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 240(%%r1,%[ptr_y]) \n\t" + + "la %%r1,256(%%r1) \n\t" + "clgrjl %%r1,%[tmp],1b \n\t" + : [mem_x] "+m" (*(double (*)[2*n])x), + [mem_y] "+m" (*(double (*)[2*n])y), + [tmp] "+&r"(n) + : [ptr_x] "a"(x), [ptr_y] "a"(y),[cos] "f"(cosA),[sin] "f"(sinA) + : "cc","r1" ,"v0","v1","v16", + "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + return; + } -int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, - FLOAT c, FLOAT s) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - FLOAT temp[2]; - BLASLONG inc_x2; - BLASLONG inc_y2; - - if (n <= 0) - return (0); - - if ((inc_x == 1) && (inc_y == 1)) { - - BLASLONG n1 = n & -16; - if (n1 > 0) { - FLOAT cosa, sina; - cosa = c; - sina = s; - zrot_kernel_16(n1, x, y, &cosa, &sina); - i = n1; - ix = 2 * n1; - } +int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; - while (i < n) { - temp[0] = c * x[ix] + s * y[ix]; - temp[1] = c * x[ix + 1] + s * y[ix + 1]; - y[ix] = c * y[ix] - s * x[ix]; - y[ix + 1] = c * y[ix + 1] - s * x[ix + 1]; - x[ix] = temp[0]; - x[ix + 1] = temp[1]; + if ( n <= 0 ) return(0); - ix += 2; - i++; + if ( (inc_x == 1) && (inc_y == 1) ) + { - } + BLASLONG n1 = n & -16; + if ( n1 > 0 ) + { + zrot_kernel_16(n1, x, y, c, s); + i=n1; + ix=2*n1; + } + + while(i < n) + { + temp[0] = c*x[ix] + s*y[ix] ; + temp[1] = c*x[ix+1] + s*y[ix+1] ; + y[ix] = c*y[ix] - s*x[ix] ; + y[ix+1] = c*y[ix+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; + + ix += 2 ; + i++ ; - } else { - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; - while (i < n) { - temp[0] = c * x[ix] + s * y[iy]; - temp[1] = c * x[ix + 1] + s * y[iy + 1]; - y[iy] = c * y[iy] - s * x[ix]; - y[iy + 1] = c * y[iy + 1] - s * x[ix + 1]; - x[ix] = temp[0]; - x[ix + 1] = temp[1]; - - ix += inc_x2; - iy += inc_y2; - i++; + } } + else + { + inc_x2 = 2 * inc_x ; + inc_y2 = 2 * inc_y ; + while(i < n) + { + temp[0] = c*x[ix] + s*y[iy] ; + temp[1] = c*x[ix+1] + s*y[iy+1] ; + y[iy] = c*y[iy] - s*x[ix] ; + y[iy+1] = c*y[iy+1] - s*x[ix+1] ; + x[ix] = temp[0] ; + x[ix+1] = temp[1] ; - } - return (0); + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + } + + } + return(0); + } + diff --git a/kernel/zarch/zscal.c b/kernel/zarch/zscal.c index e497a6d7b..4764c0a52 100644 --- a/kernel/zarch/zscal.c +++ b/kernel/zarch/zscal.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project +Copyright (c) 2013 - 2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -23,403 +23,490 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ + *****************************************************************************/ #include "common.h" -static void zscal_kernel_8(BLASLONG n, FLOAT *alpha, FLOAT *x) { - __asm__("vlrepg %%v0,0(%[alpha])\n\t" - "vleg %%v1,8(%[alpha]),0\n\t" - "wflcdb %%v1,%%v1\n\t" - "vleg %%v1,8(%[alpha]),1\n\t" - "srlg %[n],%[n],3\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vpdi %%v24,%%v16,%%v16,4\n\t" - "vpdi %%v25,%%v17,%%v17,4\n\t" - "vpdi %%v26,%%v18,%%v18,4\n\t" - "vpdi %%v27,%%v19,%%v19,4\n\t" - "vpdi %%v28,%%v20,%%v20,4\n\t" - "vpdi %%v29,%%v21,%%v21,4\n\t" - "vpdi %%v30,%%v22,%%v22,4\n\t" - "vpdi %%v31,%%v23,%%v23,4\n\t" - "vfmdb %%v16,%%v16,%%v0\n\t" - "vfmdb %%v17,%%v17,%%v0\n\t" - "vfmdb %%v18,%%v18,%%v0\n\t" - "vfmdb %%v19,%%v19,%%v0\n\t" - "vfmdb %%v20,%%v20,%%v0\n\t" - "vfmdb %%v21,%%v21,%%v0\n\t" - "vfmdb %%v22,%%v22,%%v0\n\t" - "vfmdb %%v23,%%v23,%%v0\n\t" - "vfmadb %%v16,%%v24,%%v1,%%v16\n\t" - "vfmadb %%v17,%%v25,%%v1,%%v17\n\t" - "vfmadb %%v18,%%v26,%%v1,%%v18\n\t" - "vfmadb %%v19,%%v27,%%v1,%%v19\n\t" - "vfmadb %%v20,%%v28,%%v1,%%v20\n\t" - "vfmadb %%v21,%%v29,%%v1,%%v21\n\t" - "vfmadb %%v22,%%v30,%%v1,%%v22\n\t" - "vfmadb %%v23,%%v31,%%v1,%%v23\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), - [alpha] "a"(alpha) - : "cc", "r1", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", - "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", - "v31"); -} + + +static void zscal_kernel_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x) { + BLASLONG tempR1 ; + __asm__ ( + "pfd 2, 0(%[x_tmp]) \n\t" +#if !defined(CONJ) + "lgdr %[t1],%[alpha_r] \n\t" + "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint + "lgdr %[t1],%[alpha_i] \n\t" + "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint + "vflcdb %%v29,%%v29 \n\t" //complement both + "vlvgg %%v29,%[t1],1 \n\t" //restore 2nd so that {-alpha_i, alpha_i} + +#else + "lgdr %[t1],%[alpha_i] \n\t" + "vlvgp %%v29,%[t1],%[t1] \n\t" //load both from disjoint + "lgdr %[t1],%[alpha_r] \n\t" + "vlvgp %%v28,%[t1],%[t1] \n\t" //load both from disjoint + "vflcdb %%v28,%%v28 \n\t" //complement both + "vlvgg %%v28,%[t1],0 \n\t" //restore 1st so that {alpha_r,-alpha_r} +#endif + + "xgr %[t1],%[t1] \n\t" + "sllg %[tmp],%[tmp],4 \n\t" + "vl %%v20 , 0(%[t1],%[x_tmp]) \n\t" + "vl %%v21 , 16(%[t1],%[x_tmp]) \n\t" + "vl %%v22 , 32(%[t1],%[x_tmp]) \n\t" + "vl %%v23 , 48(%[t1],%[x_tmp]) \n\t" + + "lay %[tmp],-64 (%[tmp]) \n\t" //tmp-=64 so that t1+64 can break tmp condition + "j 2f \n\t" + ".align 16 \n\t" + "1: \n\t" + + "vpdi %%v24 , %%v20, %%v20, 4 \n\t" + "vpdi %%v25 , %%v21, %%v21, 4 \n\t" + "vpdi %%v26 , %%v22, %%v22, 4 \n\t" + "vpdi %%v27 , %%v23, %%v23, 4 \n\t" + "vfmdb %%v16, %%v20, %%v28 \n\t" + "vfmdb %%v17, %%v21, %%v28 \n\t" + "vfmdb %%v18, %%v22, %%v28 \n\t" + "vfmdb %%v19, %%v23, %%v28 \n\t" + "vl %%v20, 64(%[t1],%[x_tmp]) \n\t" + "vl %%v21, 80(%[t1],%[x_tmp]) \n\t" + "vl %%v22, 96(%[t1],%[x_tmp]) \n\t" + "vl %%v23, 112(%[t1],%[x_tmp]) \n\t" + "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" + "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" + "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" + "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" + + + "vst %%v16 , 0(%[t1],%[x_tmp]) \n\t" + "vst %%v17 , 16(%[t1],%[x_tmp]) \n\t" + "vst %%v18 , 32(%[t1],%[x_tmp]) \n\t" + "vst %%v19 , 48(%[t1],%[x_tmp]) \n\t" + + "la %[t1],64(%[t1] ) \n\t" + "2: \n\t" + "pfd 2, 256(%[t1],%[x_tmp]) \n\t" + "vpdi %%v24 , %%v20, %%v20, 4 \n\t" + "vpdi %%v25 , %%v21, %%v21, 4 \n\t" + "vpdi %%v26 , %%v22, %%v22, 4 \n\t" + "vpdi %%v27 , %%v23, %%v23, 4 \n\t" + + "vfmdb %%v30, %%v20, %%v28 \n\t" + "vfmdb %%v31, %%v21, %%v28 \n\t" + "vfmdb %%v6, %%v22, %%v28 \n\t" + "vfmdb %%v7, %%v23, %%v28 \n\t" + + "vl %%v20 , 64(%[t1],%[x_tmp]) \n\t" + "vl %%v21 , 80(%[t1],%[x_tmp]) \n\t" + "vl %%v22 , 96(%[t1],%[x_tmp]) \n\t" + "vl %%v23 ,112(%[t1],%[x_tmp]) \n\t" + + "vfmadb %%v30, %%v24, %%v29, %%v30 \n\t" + "vfmadb %%v31, %%v25, %%v29, %%v31 \n\t" + "vfmadb %%v6, %%v26, %%v29, %%v6 \n\t" + "vfmadb %%v7, %%v27, %%v29, %%v7 \n\t" + + + "vst %%v30 , 0(%[t1],%[x_tmp]) \n\t" + "vst %%v31 , 16(%[t1],%[x_tmp]) \n\t" + "vst %%v6 , 32(%[t1],%[x_tmp]) \n\t" + "vst %%v7 , 48(%[t1],%[x_tmp]) \n\t" + + "la %[t1],64(%[t1] ) \n\t" + + + "clgrjl %[t1],%[tmp],1b \n\t" +//---------------------------------------------------------------------- + "vfmdb %%v16, %%v20, %%v28 \n\t" + "vfmdb %%v17, %%v21, %%v28 \n\t" + "vfmdb %%v18, %%v22, %%v28 \n\t" + "vfmdb %%v19, %%v23, %%v28 \n\t" + "vpdi %%v24 , %%v20, %%v20, 4 \n\t" + "vpdi %%v25 , %%v21, %%v21, 4 \n\t" + "vpdi %%v26 , %%v22, %%v22, 4 \n\t" + "vpdi %%v27 , %%v23, %%v23, 4 \n\t" + "vfmadb %%v16, %%v24, %%v29, %%v16 \n\t" + "vfmadb %%v17, %%v25, %%v29, %%v17 \n\t" + "vfmadb %%v18, %%v26, %%v29, %%v18 \n\t" + "vfmadb %%v19, %%v27, %%v29, %%v19 \n\t" + + "vst %%v16 , 0(%[t1],%[x_tmp]) \n\t" + "vst %%v17 , 16(%[t1],%[x_tmp]) \n\t" + "vst %%v18 , 32(%[t1],%[x_tmp]) \n\t" + "vst %%v19 , 48(%[t1],%[x_tmp]) \n\t" + + : [mem_x] "+m" (*(double (*)[2*n])x),[tmp]"+&r"(n) , [t1] "=&a" (tempR1) + : [x_tmp] "a"(x), [alpha_r] "f"(da_r),[alpha_i] "f"(da_i) + : "cc", "v6","v7", "v16", + "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + + -static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT *alpha, FLOAT *x) { - __asm__("vleg %%v0,8(%[alpha]),0\n\t" - "wflcdb %%v0,%%v0\n\t" - "vleg %%v0,8(%[alpha]),1\n\t" - "srlg %[n],%[n],3\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vpdi %%v16,%%v16,%%v16,4\n\t" - "vpdi %%v17,%%v17,%%v17,4\n\t" - "vpdi %%v18,%%v18,%%v18,4\n\t" - "vpdi %%v19,%%v19,%%v19,4\n\t" - "vpdi %%v20,%%v20,%%v20,4\n\t" - "vpdi %%v21,%%v21,%%v21,4\n\t" - "vpdi %%v22,%%v22,%%v22,4\n\t" - "vpdi %%v23,%%v23,%%v23,4\n\t" - "vfmdb %%v16,%%v16,%%v0\n\t" - "vfmdb %%v17,%%v17,%%v0\n\t" - "vfmdb %%v18,%%v18,%%v0\n\t" - "vfmdb %%v19,%%v19,%%v0\n\t" - "vfmdb %%v20,%%v20,%%v0\n\t" - "vfmdb %%v21,%%v21,%%v0\n\t" - "vfmdb %%v22,%%v22,%%v0\n\t" - "vfmdb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), - [alpha] "a"(alpha) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23"); } + +static void zscal_kernel_8_zero_r(BLASLONG n, FLOAT da_i, FLOAT *x) { + + __asm__ ( "pfd 2, 0(%1) \n\t" + "lgdr %%r0,%[alpha] \n\t" + "vlvgp %%v16,%%r0,%%r0 \n\t" //load both from disjoint + "vflcdb %%v16,%%v16 \n\t" //complement both + "vlvgg %%v16,%%r0,0 \n\t" //restore 1st + "vlr %%v17 ,%%v16 \n\t" + "sllg %%r0,%[n],4 \n\t" + "agr %%r0,%[x_ptr] \n\t" + ".align 16 \n\t" + "1: \n\t" + "vl %%v24, 0(%[x_ptr]) \n\t" + "vfmdb %%v24,%%v24,%%v16 \n\t" + "vsteg %%v24, 0(%[x_ptr]),1 \n\t" + "vsteg %%v24, 8(%[x_ptr]),0 \n\t" + "vl %%v25, 16(%[x_ptr]) \n\t" + "vfmdb %%v25,%%v25,%%v17 \n\t" + "vsteg %%v25, 16(%[x_ptr]),1 \n\t" + "vsteg %%v25, 24(%[x_ptr]),0 \n\t" + "vl %%v26, 32(%[x_ptr]) \n\t" + "vfmdb %%v26,%%v26,%%v16 \n\t" + "vsteg %%v26, 32(%[x_ptr]),1 \n\t" + "vsteg %%v26, 40(%[x_ptr]),0 \n\t" + "vl %%v27, 48(%[x_ptr]) \n\t" + "vfmdb %%v27,%%v27,%%v17 \n\t" + "vsteg %%v27, 48(%[x_ptr]),1 \n\t" + "vsteg %%v27, 56(%[x_ptr]),0 \n\t" + "vl %%v28, 64(%[x_ptr]) \n\t" + "vfmdb %%v28,%%v28,%%v16 \n\t" + "vsteg %%v28, 64(%[x_ptr]),1 \n\t" + "vsteg %%v28, 72(%[x_ptr]),0 \n\t" + "vl %%v29, 80(%[x_ptr]) \n\t" + "vfmdb %%v29,%%v29,%%v17 \n\t" + "vsteg %%v29, 80(%[x_ptr]),1 \n\t" + "vsteg %%v29, 88(%[x_ptr]),0 \n\t" + "vl %%v30, 96(%[x_ptr]) \n\t" + "vfmdb %%v30,%%v30,%%v16 \n\t" + "vsteg %%v30, 96(%[x_ptr]),1 \n\t" + "vsteg %%v30, 104(%[x_ptr]),0 \n\t" + "vl %%v31, 112(%[x_ptr]) \n\t" + "vfmdb %%v31,%%v31,%%v17 \n\t" + "vsteg %%v31, 112(%[x_ptr]),1 \n\t" + "vsteg %%v31, 120(%[x_ptr]),0 \n\t" + "la %[x_ptr],128(%[x_ptr]) \n\t" + "clgrjl %[x_ptr],%%r0,1b \n\t" + : [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x) + : [n] "r"(n),[alpha] "f"(da_i) + :"cc", "r0","f0", "f1","v16","v17" ,"v24","v25","v26","v27","v28","v29","v30","v31" + ); + -static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT *alpha, FLOAT *x) { - __asm__("vlrepg %%v0,0(%[alpha])\n\t" - "srlg %[n],%[n],3\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vl %%v16,0(%%r1,%[x])\n\t" - "vl %%v17,16(%%r1,%[x])\n\t" - "vl %%v18,32(%%r1,%[x])\n\t" - "vl %%v19,48(%%r1,%[x])\n\t" - "vl %%v20,64(%%r1,%[x])\n\t" - "vl %%v21,80(%%r1,%[x])\n\t" - "vl %%v22,96(%%r1,%[x])\n\t" - "vl %%v23,112(%%r1,%[x])\n\t" - "vfmdb %%v16,%%v16,%%v0\n\t" - "vfmdb %%v17,%%v17,%%v0\n\t" - "vfmdb %%v18,%%v18,%%v0\n\t" - "vfmdb %%v19,%%v19,%%v0\n\t" - "vfmdb %%v20,%%v20,%%v0\n\t" - "vfmdb %%v21,%%v21,%%v0\n\t" - "vfmdb %%v22,%%v22,%%v0\n\t" - "vfmdb %%v23,%%v23,%%v0\n\t" - "vst %%v16,0(%%r1,%[x])\n\t" - "vst %%v17,16(%%r1,%[x])\n\t" - "vst %%v18,32(%%r1,%[x])\n\t" - "vst %%v19,48(%%r1,%[x])\n\t" - "vst %%v20,64(%%r1,%[x])\n\t" - "vst %%v21,80(%%r1,%[x])\n\t" - "vst %%v22,96(%%r1,%[x])\n\t" - "vst %%v23,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) - : [x] "a"(x), "m"(*(const struct { FLOAT x[2]; } *) alpha), - [alpha] "a"(alpha) - : "cc", "r1", "v0", "v16", "v17", "v18", "v19", "v20", "v21", "v22", - "v23"); } -static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { - __asm__("vzero %%v0\n\t" - "srlg %[n],%[n],3\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "vst %%v0,0(%%r1,%[x])\n\t" - "vst %%v0,16(%%r1,%[x])\n\t" - "vst %%v0,32(%%r1,%[x])\n\t" - "vst %%v0,48(%%r1,%[x])\n\t" - "vst %%v0,64(%%r1,%[x])\n\t" - "vst %%v0,80(%%r1,%[x])\n\t" - "vst %%v0,96(%%r1,%[x])\n\t" - "vst %%v0,112(%%r1,%[x])\n\t" - "agfi %%r1,128\n\t" - "brctg %[n],0b" - : "=m"(*(struct { FLOAT x[n * 2]; } *) x),[n] "+&r"(n) - : [x] "a"(x) - : "cc", "r1", "v0"); +static void zscal_kernel_8_zero_i(BLASLONG n, FLOAT da_r, FLOAT *x) { + __asm__ ("pfd 2, 0(%[x_ptr]) \n\t" + "lgdr %%r0,%[alpha] \n\t" + "vlvgp %%v18,%%r0,%%r0 \n\t" + "vlr %%v19,%%v18 \n\t" + "vlr %%v16,%%v18 \n\t" + "vlr %%v17,%%v18 \n\t" + "sllg %%r0,%[n],4 \n\t" + "agr %%r0,%[x_ptr] \n\t" + ".align 16 \n\t" + "1: \n\t" + "vl %%v24, 0(%[x_ptr]) \n\t" + "vfmdb %%v24,%%v24,%%v18 \n\t" + "vst %%v24, 0(%[x_ptr]) \n\t" + "vl %%v25, 16(%[x_ptr]) \n\t" + "vfmdb %%v25,%%v25,%%v19 \n\t" + "vst %%v25, 16(%[x_ptr]) \n\t" + "vl %%v26, 32(%[x_ptr]) \n\t" + "vfmdb %%v26,%%v26,%%v16 \n\t" + "vst %%v26, 32(%[x_ptr]) \n\t" + "vl %%v27, 48(%[x_ptr]) \n\t" + "vfmdb %%v27,%%v27,%%v17 \n\t" + "vst %%v27, 48(%[x_ptr]) \n\t" + "vl %%v28, 64(%[x_ptr]) \n\t" + "vfmdb %%v28,%%v28,%%v18 \n\t" + "vst %%v28, 64(%[x_ptr]) \n\t" + "vl %%v29, 80(%[x_ptr]) \n\t" + "vfmdb %%v29,%%v29,%%v19 \n\t" + "vst %%v29, 80(%[x_ptr]) \n\t" + "vl %%v30, 96(%[x_ptr]) \n\t" + "vfmdb %%v30,%%v30,%%v16 \n\t" + "vst %%v30, 96(%[x_ptr]) \n\t" + "vl %%v31,112(%[x_ptr]) \n\t" + "vfmdb %%v31,%%v31,%%v17 \n\t" + "vst %%v31,112(%[x_ptr]) \n\t" + "la %[x_ptr],128(%[x_ptr]) \n\t" + "clgrjl %[x_ptr],%%r0,1b \n\t" + : [mem] "+m" (*(double (*)[2*n])x) ,[x_ptr] "+&a"(x) + : [n] "r"(n),[alpha] "f"(da_r) + : "cc", "r0","v16", "v17","v18","v19","v24","v25","v26","v27","v28","v29","v30","v31" + ); + } -static void zscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, - BLASLONG inc_x) { - BLASLONG i; - BLASLONG inc_x2 = 2 * inc_x; - BLASLONG inc_x3 = inc_x2 + inc_x; - FLOAT t0, t1, t2, t3; - FLOAT da_r = alpha[0]; - FLOAT da_i = alpha[1]; - - for (i = 0; i < n; i += 4) { - t0 = da_r * x[0] - da_i * x[1]; - t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; - t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; - t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; - - x[1] = da_i * x[0] + da_r * x[1]; - x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; - x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; - x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; - - x[0] = t0; - x[inc_x] = t1; - x[inc_x2] = t2; - x[inc_x3] = t3; - - x += 4 * inc_x; - } +static void zscal_kernel_8_zero(BLASLONG n, FLOAT *x) { + + __asm__ ( "pfd 2, 0(%[x_ptr]) \n\t" + "vzero %%v24 \n\t" + "vzero %%v25 \n\t" + "vzero %%v26 \n\t" + "vzero %%v27 \n\t" + "sllg %%r0,%[n],4 \n\t" + "agr %%r0,%[x_ptr] \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 2, 256( %[x_ptr]) \n\t" + "vst %%v24, 0( %[x_ptr]) \n\t" + "vst %%v25, 16( %[x_ptr]) \n\t" + "vst %%v26, 32( %[x_ptr]) \n\t" + "vst %%v27, 48( %[x_ptr]) \n\t" + "vst %%v24, 64( %[x_ptr]) \n\t" + "vst %%v25, 80( %[x_ptr]) \n\t" + "vst %%v26, 96( %[x_ptr]) \n\t" + "vst %%v27,112( %[x_ptr]) \n\t" + + "la %[x_ptr],128(%[x_ptr]) \n\t" + "clgrjl %[x_ptr],%%r0,1b \n\t" + : [mem] "+m" (*(double (*)[2*n])x),[x_ptr] "+&a"(x) + : [n] "r"(n) + :"cc" ,"r0","v24","v25","v26","v27" + ); + } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, - FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, - BLASLONG dummy2) { - BLASLONG i = 0, j = 0; - FLOAT temp0; - FLOAT temp1; - FLOAT alpha[2] __attribute__ ((aligned(16))); - if (inc_x != 1) { - inc_x <<= 1; - if (da_r == 0.0) { - BLASLONG n1 = n & -2; - if (da_i == 0.0) { +static void zscal_kernel_inc_8(BLASLONG n, FLOAT da_r,FLOAT da_i, FLOAT *x, BLASLONG inc_x) { - while (j < n1) { + BLASLONG i; + BLASLONG inc_x2 = 2 * inc_x; + BLASLONG inc_x3 = inc_x2 + inc_x; + FLOAT t0, t1, t2, t3; - x[i] = 0.0; - x[i + 1] = 0.0; - x[i + inc_x] = 0.0; - x[i + 1 + inc_x] = 0.0; - i += 2 * inc_x; - j += 2; + for (i = 0; i < n; i += 4) { + t0 = da_r * x[0] - da_i * x[1]; + t1 = da_r * x[inc_x] - da_i * x[inc_x + 1]; + t2 = da_r * x[inc_x2] - da_i * x[inc_x2 + 1]; + t3 = da_r * x[inc_x3] - da_i * x[inc_x3 + 1]; - } + x[1] = da_i * x[0] + da_r * x[1]; + x[inc_x + 1] = da_i * x[inc_x] + da_r * x[inc_x + 1]; + x[inc_x2 + 1] = da_i * x[inc_x2] + da_r * x[inc_x2 + 1]; + x[inc_x3 + 1] = da_i * x[inc_x3] + da_r * x[inc_x3 + 1]; - while (j < n) { + x[0] = t0; + x[inc_x] = t1; + x[inc_x2] = t2; + x[inc_x3] = t3; - x[i] = 0.0; - x[i + 1] = 0.0; - i += inc_x; - j++; + x += 4 * inc_x; - } + } - } else { - while (j < n1) { +} - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - temp1 = -da_i * x[i + 1 + inc_x]; - x[i + 1 + inc_x] = da_i * x[i + inc_x]; - x[i + inc_x] = temp1; - i += 2 * inc_x; - j += 2; +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { + BLASLONG i = 0, j = 0; + FLOAT temp0; + FLOAT temp1; - } - while (j < n) { + if (inc_x != 1) { + inc_x <<= 1; - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - i += inc_x; - j++; + if (da_r == 0.0) { - } + BLASLONG n1 = n & -2; - } + if (da_i == 0.0) { - } else { + while (j < n1) { - if (da_i == 0.0) { - BLASLONG n1 = n & -2; + x[i] = 0.0; + x[i + 1] = 0.0; + x[i + inc_x] = 0.0; + x[i + 1 + inc_x] = 0.0; + i += 2 * inc_x; + j += 2; - while (j < n1) { + } - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - temp1 = da_r * x[i + inc_x]; - x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; - x[i + inc_x] = temp1; - i += 2 * inc_x; - j += 2; + while (j < n) { - } + x[i] = 0.0; + x[i + 1] = 0.0; + i += inc_x; + j++; - while (j < n) { + } - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - i += inc_x; - j++; + } else { - } + while (j < n1) { - } else { + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + temp1 = -da_i * x[i + 1 + inc_x]; + x[i + 1 + inc_x] = da_i * x[i + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; - BLASLONG n1 = n & -8; - if (n1 > 0) { - alpha[0] = da_r; - alpha[1] = da_i; - zscal_kernel_inc_8(n1, alpha, x, inc_x); - j = n1; - i = n1 * inc_x; - } + } - while (j < n) { + while (j < n) { - temp0 = da_r * x[i] - da_i * x[i + 1]; - x[i + 1] = da_r * x[i + 1] + da_i * x[i]; - x[i] = temp0; - i += inc_x; - j++; + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; - } + } - } - } - return (0); - } + } - BLASLONG n1 = n & -8; - if (n1 > 0) { + } else { - alpha[0] = da_r; - alpha[1] = da_i; - if (da_r == 0.0) - if (da_i == 0) - zscal_kernel_8_zero(n1, x); - else - zscal_kernel_8_zero_r(n1, alpha, x); - else if (da_i == 0) - zscal_kernel_8_zero_i(n1, alpha, x); - else - zscal_kernel_8(n1, alpha, x); + if (da_i == 0.0) { + BLASLONG n1 = n & -2; - i = n1 << 1; - j = n1; - } + while (j < n1) { - if (da_r == 0.0) { + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + temp1 = da_r * x[i + inc_x]; + x[i + 1 + inc_x] = da_r * x[i + 1 + inc_x]; + x[i + inc_x] = temp1; + i += 2 * inc_x; + j += 2; - if (da_i == 0.0) { + } - while (j < n) { + while (j < n) { - x[i] = 0.0; - x[i + 1] = 0.0; - i += 2; - j++; + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += inc_x; + j++; - } + } - } else { + } else { + + BLASLONG n1 = n & -8; + if (n1 > 0) { + zscal_kernel_inc_8(n1, da_r,da_i, x, inc_x); + j = n1; + i = n1 * inc_x; + } + + while (j < n) { - while (j < n) { + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += inc_x; + j++; - temp0 = -da_i * x[i + 1]; - x[i + 1] = da_i * x[i]; - x[i] = temp0; - i += 2; - j++; + } - } + } + } + + return (0); } - } else { - if (da_i == 0.0) { + BLASLONG n1 = n & -8; + if (n1 > 0) { - while (j < n) { - temp0 = da_r * x[i]; - x[i + 1] = da_r * x[i + 1]; - x[i] = temp0; - i += 2; - j++; + if (da_r == 0.0) + if (da_i == 0) + zscal_kernel_8_zero(n1, x); + else + zscal_kernel_8_zero_r(n1, da_i, x); + else + if (da_i == 0) + zscal_kernel_8_zero_i(n1, da_r, x); + else + zscal_kernel_8(n1, da_r,da_i, x); - } + i = n1 << 1; + j = n1; + } + + + if (da_r == 0.0) { + + if (da_i == 0.0) { + + while (j < n) { + + x[i] = 0.0; + x[i + 1] = 0.0; + i += 2; + j++; + + } + + } else { + + while (j < n) { + + temp0 = -da_i * x[i + 1]; + x[i + 1] = da_i * x[i]; + x[i] = temp0; + i += 2; + j++; + + } + + } } else { - while (j < n) { + if (da_i == 0.0) { - temp0 = da_r * x[i] - da_i * x[i + 1]; - x[i + 1] = da_r * x[i + 1] + da_i * x[i]; - x[i] = temp0; - i += 2; - j++; + while (j < n) { - } + temp0 = da_r * x[i]; + x[i + 1] = da_r * x[i + 1]; + x[i] = temp0; + i += 2; + j++; - } + } + + } else { + + while (j < n) { + + temp0 = da_r * x[i] - da_i * x[i + 1]; + x[i + 1] = da_r * x[i + 1] + da_i * x[i]; + x[i] = temp0; + i += 2; + j++; + + } - } + } + + } - return (0); + return (0); } + + diff --git a/kernel/zarch/zsum.c b/kernel/zarch/zsum.c deleted file mode 100644 index 7cfc1f17f..000000000 --- a/kernel/zarch/zsum.c +++ /dev/null @@ -1,136 +0,0 @@ -/*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - - -static FLOAT zsum_kernel_16(BLASLONG n, FLOAT *x) { - FLOAT sum; - - __asm__("vzero %%v24\n\t" - "vzero %%v25\n\t" - "vzero %%v26\n\t" - "vzero %%v27\n\t" - "vzero %%v28\n\t" - "vzero %%v29\n\t" - "vzero %%v30\n\t" - "vzero %%v31\n\t" - "srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 1, 1024(%%r1,%[x])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vfadb %%v24,%%v24,%%v16\n\t" - "vfadb %%v25,%%v25,%%v17\n\t" - "vfadb %%v26,%%v26,%%v18\n\t" - "vfadb %%v27,%%v27,%%v19\n\t" - "vfadb %%v28,%%v28,%%v20\n\t" - "vfadb %%v29,%%v29,%%v21\n\t" - "vfadb %%v30,%%v30,%%v22\n\t" - "vfadb %%v31,%%v31,%%v23\n\t" - "vl %%v16, 128(%%r1,%[x])\n\t" - "vl %%v17, 144(%%r1,%[x])\n\t" - "vl %%v18, 160(%%r1,%[x])\n\t" - "vl %%v19, 176(%%r1,%[x])\n\t" - "vl %%v20, 192(%%r1,%[x])\n\t" - "vl %%v21, 208(%%r1,%[x])\n\t" - "vl %%v22, 224(%%r1,%[x])\n\t" - "vl %%v23, 240(%%r1,%[x])\n\t" - "vfadb %%v24,%%v24,%%v16\n\t" - "vfadb %%v25,%%v25,%%v17\n\t" - "vfadb %%v26,%%v26,%%v18\n\t" - "vfadb %%v27,%%v27,%%v19\n\t" - "vfadb %%v28,%%v28,%%v20\n\t" - "vfadb %%v29,%%v29,%%v21\n\t" - "vfadb %%v30,%%v30,%%v22\n\t" - "vfadb %%v31,%%v31,%%v23\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vfadb %%v24,%%v24,%%v26\n\t" - "vfadb %%v24,%%v24,%%v27\n\t" - "vfadb %%v24,%%v24,%%v28\n\t" - "vfadb %%v24,%%v24,%%v29\n\t" - "vfadb %%v24,%%v24,%%v30\n\t" - "vfadb %%v24,%%v24,%%v31\n\t" - "vrepg %%v25,%%v24,1\n\t" - "vfadb %%v24,%%v24,%%v25\n\t" - "vsteg %%v24,%[asum],0" - : [sum] "=Q"(sum),[n] "+&r"(n) - : "m"(*(const struct { FLOAT x[n * 2]; } *) x),[x] "a"(x) - : "cc", "r1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", - "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"); - - return sum; -} - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i = 0; - BLASLONG ip = 0; - FLOAT sumf = 0.0; - BLASLONG n1; - BLASLONG inc_x2; - - if (n <= 0 || inc_x <= 0) - return (sumf); - - if (inc_x == 1) { - - n1 = n & -16; - if (n1 > 0) { - - sumf = zsum_kernel_16(n1, x); - i = n1; - ip = 2 * n1; - } - - while (i < n) { - sumf += x[ip] + x[ip + 1]; - i++; - ip += 2; - } - - } else { - inc_x2 = 2 * inc_x; - - while (i < n) { - sumf += x[ip] + x[ip + 1]; - ip += inc_x2; - i++; - } - - } - return (sumf); -} diff --git a/kernel/zarch/zswap.c b/kernel/zarch/zswap.c index bc466866c..062079002 100644 --- a/kernel/zarch/zswap.c +++ b/kernel/zarch/zswap.c @@ -1,5 +1,5 @@ /*************************************************************************** -Copyright (c) 2013-2019, The OpenBLAS Project +Copyright (c) 2013-2017, The OpenBLAS Project All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are @@ -25,145 +25,286 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ + #include "common.h" -static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) { - __asm__("srlg %[n],%[n],4\n\t" - "xgr %%r1,%%r1\n\t" - "0:\n\t" - "pfd 2, 1024(%%r1,%[x])\n\t" - "pfd 2, 1024(%%r1,%[y])\n\t" - "vl %%v16, 0(%%r1,%[x])\n\t" - "vl %%v17, 16(%%r1,%[x])\n\t" - "vl %%v18, 32(%%r1,%[x])\n\t" - "vl %%v19, 48(%%r1,%[x])\n\t" - "vl %%v20, 64(%%r1,%[x])\n\t" - "vl %%v21, 80(%%r1,%[x])\n\t" - "vl %%v22, 96(%%r1,%[x])\n\t" - "vl %%v23, 112(%%r1,%[x])\n\t" - "vl %%v24, 128(%%r1,%[x])\n\t" - "vl %%v25, 144(%%r1,%[x])\n\t" - "vl %%v26, 160(%%r1,%[x])\n\t" - "vl %%v27, 176(%%r1,%[x])\n\t" - "vl %%v28, 192(%%r1,%[x])\n\t" - "vl %%v29, 208(%%r1,%[x])\n\t" - "vl %%v30, 224(%%r1,%[x])\n\t" - "vl %%v31, 240(%%r1,%[x])\n\t" - "vl %%v0, 0(%%r1,%[y])\n\t" - "vl %%v1, 16(%%r1,%[y])\n\t" - "vl %%v2, 32(%%r1,%[y])\n\t" - "vl %%v3, 48(%%r1,%[y])\n\t" - "vl %%v4, 64(%%r1,%[y])\n\t" - "vl %%v5, 80(%%r1,%[y])\n\t" - "vl %%v6, 96(%%r1,%[y])\n\t" - "vl %%v7, 112(%%r1,%[y])\n\t" - "vst %%v0, 0(%%r1,%[x])\n\t" - "vst %%v1, 16(%%r1,%[x])\n\t" - "vst %%v2, 32(%%r1,%[x])\n\t" - "vst %%v3, 48(%%r1,%[x])\n\t" - "vst %%v4, 64(%%r1,%[x])\n\t" - "vst %%v5, 80(%%r1,%[x])\n\t" - "vst %%v6, 96(%%r1,%[x])\n\t" - "vst %%v7, 112(%%r1,%[x])\n\t" - "vl %%v0, 128(%%r1,%[y])\n\t" - "vl %%v1, 144(%%r1,%[y])\n\t" - "vl %%v2, 160(%%r1,%[y])\n\t" - "vl %%v3, 176(%%r1,%[y])\n\t" - "vl %%v4, 192(%%r1,%[y])\n\t" - "vl %%v5, 208(%%r1,%[y])\n\t" - "vl %%v6, 224(%%r1,%[y])\n\t" - "vl %%v7, 240(%%r1,%[y])\n\t" - "vst %%v0, 128(%%r1,%[x])\n\t" - "vst %%v1, 144(%%r1,%[x])\n\t" - "vst %%v2, 160(%%r1,%[x])\n\t" - "vst %%v3, 176(%%r1,%[x])\n\t" - "vst %%v4, 192(%%r1,%[x])\n\t" - "vst %%v5, 208(%%r1,%[x])\n\t" - "vst %%v6, 224(%%r1,%[x])\n\t" - "vst %%v7, 240(%%r1,%[x])\n\t" - "vst %%v16, 0(%%r1,%[y])\n\t" - "vst %%v17, 16(%%r1,%[y])\n\t" - "vst %%v18, 32(%%r1,%[y])\n\t" - "vst %%v19, 48(%%r1,%[y])\n\t" - "vst %%v20, 64(%%r1,%[y])\n\t" - "vst %%v21, 80(%%r1,%[y])\n\t" - "vst %%v22, 96(%%r1,%[y])\n\t" - "vst %%v23, 112(%%r1,%[y])\n\t" - "vst %%v24, 128(%%r1,%[y])\n\t" - "vst %%v25, 144(%%r1,%[y])\n\t" - "vst %%v26, 160(%%r1,%[y])\n\t" - "vst %%v27, 176(%%r1,%[y])\n\t" - "vst %%v28, 192(%%r1,%[y])\n\t" - "vst %%v29, 208(%%r1,%[y])\n\t" - "vst %%v30, 224(%%r1,%[y])\n\t" - "vst %%v31, 240(%%r1,%[y])\n\t" - "agfi %%r1,256\n\t" - "brctg %[n],0b" - : "+m"(*(struct { FLOAT x[n * 2]; } *) x), - "+m"(*(struct { FLOAT x[n * 2]; } *) y),[n] "+&r"(n) - : [x] "a"(x),[y] "a"(y) - : "cc", "r1", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", - "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28", "v29", "v30", "v31"); + +#if defined(Z13_SWAP_A) +static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile( + "pfd 1, 0(%[ptr_x]) \n\t" + "pfd 2, 0(%[ptr_y]) \n\t" + "srlg %[n_tmp],%[n_tmp],4 \n\t" + "xgr %%r1,%%r1 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 2, 256(%%r1,%[ptr_x]) \n\t" + "pfd 2, 256(%%r1,%[ptr_y]) \n\t" + + "vl %%v24, 0(%%r1,%[ptr_x]) \n\t" + "vl %%v16, 0(%%r1,%[ptr_y]) \n\t" + "vst %%v24, 0(%%r1,%[ptr_y]) \n\t" + "vst %%v16, 0(%%r1,%[ptr_x]) \n\t" + + "vl %%v25, 16(%%r1,%[ptr_x]) \n\t" + "vl %%v17, 16(%%r1,%[ptr_y]) \n\t" + "vst %%v25, 16(%%r1,%[ptr_y]) \n\t" + "vst %%v17, 16(%%r1,%[ptr_x]) \n\t" + + "vl %%v26, 32(%%r1,%[ptr_x]) \n\t" + "vl %%v18, 32(%%r1,%[ptr_y]) \n\t" + "vst %%v26, 32(%%r1,%[ptr_y]) \n\t" + "vst %%v18, 32(%%r1,%[ptr_x]) \n\t" + + "vl %%v27, 48(%%r1,%[ptr_x]) \n\t" + "vl %%v19, 48(%%r1,%[ptr_y]) \n\t" + "vst %%v27, 48(%%r1,%[ptr_y]) \n\t" + "vst %%v19, 48(%%r1,%[ptr_x]) \n\t" + + "vl %%v28, 64(%%r1,%[ptr_x]) \n\t" + "vl %%v20, 64(%%r1,%[ptr_y]) \n\t" + "vst %%v28, 64(%%r1,%[ptr_y]) \n\t" + "vst %%v20, 64(%%r1,%[ptr_x]) \n\t" + + "vl %%v29, 80(%%r1,%[ptr_x]) \n\t" + "vl %%v21, 80(%%r1,%[ptr_y]) \n\t" + "vst %%v29, 80(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 80(%%r1,%[ptr_x]) \n\t" + + "vl %%v30, 96(%%r1,%[ptr_x]) \n\t" + "vl %%v22, 96(%%r1,%[ptr_y]) \n\t" + "vst %%v30, 96(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 96(%%r1,%[ptr_x]) \n\t" + + "vl %%v31, 112(%%r1,%[ptr_x]) \n\t" + "vl %%v23, 112(%%r1,%[ptr_y]) \n\t" + "vst %%v31, 112(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 112(%%r1,%[ptr_x]) \n\t" + + "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" + "vl %%v16, 128(%%r1,%[ptr_y]) \n\t" + "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" + "vst %%v16, 128(%%r1,%[ptr_x]) \n\t" + + "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" + "vl %%v17, 144(%%r1,%[ptr_y]) \n\t" + "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" + "vst %%v17, 144(%%r1,%[ptr_x]) \n\t" + + "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" + "vl %%v18, 160(%%r1,%[ptr_y]) \n\t" + "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" + "vst %%v18, 160(%%r1,%[ptr_x]) \n\t" + + "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" + "vl %%v19, 176(%%r1,%[ptr_y]) \n\t" + "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" + "vst %%v19, 176(%%r1,%[ptr_x]) \n\t" + + "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" + "vl %%v20, 192(%%r1,%[ptr_y]) \n\t" + "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" + "vst %%v20, 192(%%r1,%[ptr_x]) \n\t" + + "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" + "vl %%v21, 208(%%r1,%[ptr_y]) \n\t" + "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 208(%%r1,%[ptr_x]) \n\t" + + "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" + "vl %%v22, 224(%%r1,%[ptr_y]) \n\t" + "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 224(%%r1,%[ptr_x]) \n\t" + + "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" + "vl %%v23, 240(%%r1,%[ptr_y]) \n\t" + "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 240(%%r1,%[ptr_x]) \n\t" + + "la %%r1,256(%%r1) \n\t" + "brctg %[n_tmp],1b" + : [mem_x] "+m" (*(double (*)[2*n])x), + [mem_y] "+m" (*(double (*)[2*n])y), + [n_tmp] "+&r"(n) + : [ptr_x] "a"(x), [ptr_y] "a"(y) + : "cc", "r1", "v16","v17","v18","v19","v20","v21","v22","v23" + ,"v24","v25","v26","v27","v28","v29","v30","v31" + ); + return; + } -int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, - FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, - FLOAT *dummy, BLASLONG dummy2) { - BLASLONG i = 0; - BLASLONG ix = 0, iy = 0; - FLOAT temp[2]; - BLASLONG inc_x2, inc_y2; - - if (n <= 0) - return (0); - - if ((inc_x == 1) && (inc_y == 1)) { - - BLASLONG n1 = n & -16; - if (n1 > 0) { - zswap_kernel_16(n1, x, y); - i = n1; - ix = 2 * n1; - iy = 2 * n1; - } +#else - while (i < n) { +static void zswap_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y) +{ + __asm__ volatile( + "pfd 2, 0(%[ptr_x]) \n\t" + "pfd 2, 0(%[ptr_y]) \n\t" + "srlg %[n_tmp],%[n_tmp],4 \n\t" + "xgr %%r1,%%r1 \n\t" + ".align 16 \n\t" + "1: \n\t" + "pfd 2, 256(%%r1,%[ptr_x]) \n\t" + "pfd 2, 256(%%r1,%[ptr_y]) \n\t" + + "vl %%v16, 0(%%r1,%[ptr_x]) \n\t" + "vl %%v17, 16(%%r1,%[ptr_x]) \n\t" + "vl %%v18, 32(%%r1,%[ptr_x]) \n\t" + "vl %%v19, 48(%%r1,%[ptr_x]) \n\t" + "vl %%v20, 64(%%r1,%[ptr_x]) \n\t" + "vl %%v21, 80(%%r1,%[ptr_x]) \n\t" + "vl %%v22, 96(%%r1,%[ptr_x]) \n\t" + "vl %%v23, 112(%%r1,%[ptr_x]) \n\t" + "vl %%v24, 128(%%r1,%[ptr_x]) \n\t" + "vl %%v25, 144(%%r1,%[ptr_x]) \n\t" + "vl %%v26, 160(%%r1,%[ptr_x]) \n\t" + "vl %%v27, 176(%%r1,%[ptr_x]) \n\t" + "vl %%v28, 192(%%r1,%[ptr_x]) \n\t" + "vl %%v29, 208(%%r1,%[ptr_x]) \n\t" + "vl %%v30, 224(%%r1,%[ptr_x]) \n\t" + "vl %%v31, 240(%%r1,%[ptr_x]) \n\t" - temp[0] = x[ix]; - temp[1] = x[ix + 1]; - x[ix] = y[iy]; - x[ix + 1] = y[iy + 1]; - y[iy] = temp[0]; - y[iy + 1] = temp[1]; - ix += 2; - iy += 2; - i++; + "vl %%v0, 0(%%r1,%[ptr_y]) \n\t" + "vl %%v1, 16(%%r1,%[ptr_y]) \n\t" + "vl %%v2, 32(%%r1,%[ptr_y]) \n\t" + "vl %%v3, 48(%%r1,%[ptr_y]) \n\t" + "vl %%v4, 64(%%r1,%[ptr_y]) \n\t" + "vl %%v5, 80(%%r1,%[ptr_y]) \n\t" + "vl %%v6, 96(%%r1,%[ptr_y]) \n\t" + "vl %%v7, 112(%%r1,%[ptr_y]) \n\t" + "vst %%v0, 0(%%r1,%[ptr_x]) \n\t" + "vst %%v1, 16(%%r1,%[ptr_x]) \n\t" + "vst %%v2, 32(%%r1,%[ptr_x]) \n\t" + "vst %%v3, 48(%%r1,%[ptr_x]) \n\t" + "vst %%v4, 64(%%r1,%[ptr_x]) \n\t" + "vst %%v5, 80(%%r1,%[ptr_x]) \n\t" + "vst %%v6, 96(%%r1,%[ptr_x]) \n\t" + "vst %%v7, 112(%%r1,%[ptr_x]) \n\t" - } + "vl %%v0, 128(%%r1,%[ptr_y]) \n\t" + "vl %%v1, 144(%%r1,%[ptr_y]) \n\t" + "vl %%v2, 160(%%r1,%[ptr_y]) \n\t" + "vl %%v3, 176(%%r1,%[ptr_y]) \n\t" + "vl %%v4, 192(%%r1,%[ptr_y]) \n\t" + "vl %%v5, 208(%%r1,%[ptr_y]) \n\t" + "vl %%v6, 224(%%r1,%[ptr_y]) \n\t" + "vl %%v7, 240(%%r1,%[ptr_y]) \n\t" + "vst %%v0, 128(%%r1,%[ptr_x]) \n\t" + "vst %%v1, 144(%%r1,%[ptr_x]) \n\t" + "vst %%v2, 160(%%r1,%[ptr_x]) \n\t" + "vst %%v3, 176(%%r1,%[ptr_x]) \n\t" + "vst %%v4, 192(%%r1,%[ptr_x]) \n\t" + "vst %%v5, 208(%%r1,%[ptr_x]) \n\t" + "vst %%v6, 224(%%r1,%[ptr_x]) \n\t" + "vst %%v7, 240(%%r1,%[ptr_x]) \n\t" + + "vst %%v16, 0(%%r1,%[ptr_y]) \n\t" + "vst %%v17, 16(%%r1,%[ptr_y]) \n\t" + "vst %%v18, 32(%%r1,%[ptr_y]) \n\t" + "vst %%v19, 48(%%r1,%[ptr_y]) \n\t" + "vst %%v20, 64(%%r1,%[ptr_y]) \n\t" + "vst %%v21, 80(%%r1,%[ptr_y]) \n\t" + "vst %%v22, 96(%%r1,%[ptr_y]) \n\t" + "vst %%v23, 112(%%r1,%[ptr_y]) \n\t" + "vst %%v24, 128(%%r1,%[ptr_y]) \n\t" + "vst %%v25, 144(%%r1,%[ptr_y]) \n\t" + "vst %%v26, 160(%%r1,%[ptr_y]) \n\t" + "vst %%v27, 176(%%r1,%[ptr_y]) \n\t" + "vst %%v28, 192(%%r1,%[ptr_y]) \n\t" + "vst %%v29, 208(%%r1,%[ptr_y]) \n\t" + "vst %%v30, 224(%%r1,%[ptr_y]) \n\t" + "vst %%v31, 240(%%r1,%[ptr_y]) \n\t" + + + "la %%r1,256(%%r1) \n\t" + "brctg %[n_tmp],1b" + : [mem_x] "+m" (*(double (*)[2*n])x), + [mem_y] "+m" (*(double (*)[2*n])y), + [n_tmp] "+&r"(n) + : [ptr_x] "a"(x), [ptr_y] "a"(y) + : "cc", "r1", "v0","v1","v2","v3","v4","v5","v6","v7","v16", + "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31" + ); + return; - } else { +} + +#endif + + + + + + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2, inc_y2; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1 )) + { + + BLASLONG n1 = n & -16; + if ( n1 > 0 ) + { + zswap_kernel_16(n1, x, y); + i=n1; + ix = 2* n1; + iy = 2* n1; + } - inc_x2 = 2 * inc_x; - inc_y2 = 2 * inc_y; + while(i < n) + { - while (i < n) { + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; - temp[0] = x[ix]; - temp[1] = x[ix + 1]; - x[ix] = y[iy]; - x[ix + 1] = y[iy + 1]; - y[iy] = temp[0]; - y[iy + 1] = temp[1]; + ix += 2 ; + iy += 2 ; + i++ ; + + + } - ix += inc_x2; - iy += inc_y2; - i++; } + else + { + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; - } - return (0); + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + + } + return(0); + } + + diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h index c5ea465e0..6ded78c8b 100644 --- a/lapack-netlib/LAPACKE/include/lapacke.h +++ b/lapack-netlib/LAPACKE/include/lapacke.h @@ -70,11 +70,7 @@ /* Complex type (single precision) */ #ifndef lapack_complex_float -#ifndef __cplusplus #include -#else -#include -#endif #define lapack_complex_float float _Complex #endif @@ -90,11 +86,7 @@ lapack_complex_float lapack_make_complex_float( float re, float im ); /* Complex type (double precision) */ #ifndef lapack_complex_double -#ifndef __cplusplus #include -#else -#include -#endif #define lapack_complex_double double _Complex #endif diff --git a/lapack-netlib/TESTING/EIG/chet21.f b/lapack-netlib/TESTING/EIG/chet21.f index 5aff64904..8dbdb521e 100644 --- a/lapack-netlib/TESTING/EIG/chet21.f +++ b/lapack-netlib/TESTING/EIG/chet21.f @@ -304,8 +304,7 @@ 10 CONTINUE * IF( N.GT.1 .AND. KBAND.EQ.1 ) THEN -CMK DO 20 J = 1, N - 1 - DO 20 J = 2, N - 1 + DO 20 J = 1, N - 1 CALL CHER2( CUPLO, N, -CMPLX( E( J ) ), U( 1, J ), 1, $ U( 1, J-1 ), 1, WORK, N ) 20 CONTINUE diff --git a/lapack-netlib/TESTING/EIG/chpt21.f b/lapack-netlib/TESTING/EIG/chpt21.f index e151a8bd8..4b9279470 100644 --- a/lapack-netlib/TESTING/EIG/chpt21.f +++ b/lapack-netlib/TESTING/EIG/chpt21.f @@ -323,7 +323,7 @@ 10 CONTINUE * IF( N.GT.1 .AND. KBAND.EQ.1 ) THEN - DO 20 J = 2, N - 1 + DO 20 J = 1, N - 1 CALL CHPR2( CUPLO, N, -CMPLX( E( J ) ), U( 1, J ), 1, $ U( 1, J-1 ), 1, WORK ) 20 CONTINUE diff --git a/lapack-netlib/TESTING/EIG/zhet21.f b/lapack-netlib/TESTING/EIG/zhet21.f index f6cb2d70a..32a09741e 100644 --- a/lapack-netlib/TESTING/EIG/zhet21.f +++ b/lapack-netlib/TESTING/EIG/zhet21.f @@ -304,8 +304,7 @@ 10 CONTINUE * IF( N.GT.1 .AND. KBAND.EQ.1 ) THEN -CMK DO 20 J = 1, N - 1 - DO 20 J = 2, N - 1 + DO 20 J = 1, N - 1 CALL ZHER2( CUPLO, N, -DCMPLX( E( J ) ), U( 1, J ), 1, $ U( 1, J-1 ), 1, WORK, N ) 20 CONTINUE diff --git a/lapack-netlib/TESTING/EIG/zhpt21.f b/lapack-netlib/TESTING/EIG/zhpt21.f index ef9e4418d..f9268661a 100644 --- a/lapack-netlib/TESTING/EIG/zhpt21.f +++ b/lapack-netlib/TESTING/EIG/zhpt21.f @@ -323,8 +323,7 @@ 10 CONTINUE * IF( N.GT.1 .AND. KBAND.EQ.1 ) THEN -CMK DO 20 J = 1, N - 1 - DO 20 J = 2, N - 1 + DO 20 J = 1, N - 1 CALL ZHPR2( CUPLO, N, -DCMPLX( E( J ) ), U( 1, J ), 1, $ U( 1, J-1 ), 1, WORK ) 20 CONTINUE diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index d48a270ab..c0a7543ca 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -63,6 +63,7 @@ if (USE_THREAD) # these do not have 'z' versions set(PARALLEL_SOURCES + ${GETRF_SRC} lauum/lauum_U_parallel.c lauum/lauum_L_parallel.c potrf/potrf_U_parallel.c @@ -80,10 +81,6 @@ if (USE_THREAD) trtri/trtri_L_parallel.c ) - foreach (float_type ${FLOAT_TYPES}) - GenerateNamedObjects("${GETRF_SRC}" "" "getrf_parallel" false "" "" false ${float_type}) - endforeach() - GenerateNamedObjects("${PARALLEL_SOURCES}") endif () diff --git a/lapack/getrf/getrf_parallel.c b/lapack/getrf/getrf_parallel.c index c82defcab..591ce4a99 100644 --- a/lapack/getrf/getrf_parallel.c +++ b/lapack/getrf/getrf_parallel.c @@ -279,6 +279,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * for (i = 0; i < args -> nthreads; i++) #if 1 { + LOCK_COMMAND(&getrf_lock); + jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside]; + UNLOCK_COMMAND(&getrf_lock); do { LOCK_COMMAND(&getrf_lock); jw = job[mypos].working[i][CACHE_LINE_SIZE * bufferside]; @@ -365,6 +368,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * if ((current != mypos) && (!is)) { #if 1 + LOCK_COMMAND(&getrf_lock); + jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside]; + UNLOCK_COMMAND(&getrf_lock); do { LOCK_COMMAND(&getrf_lock); jw = job[current].working[mypos][CACHE_LINE_SIZE * bufferside]; @@ -396,6 +402,9 @@ static int inner_advanced_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG * for (i = 0; i < args -> nthreads; i++) { for (xxx = 0; xxx < DIVIDE_RATE; xxx++) { #if 1 + LOCK_COMMAND(&getrf_lock); + jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx]; + UNLOCK_COMMAND(&getrf_lock); do { LOCK_COMMAND(&getrf_lock); jw = job[mypos].working[i][CACHE_LINE_SIZE *xxx]; diff --git a/param.h b/param.h index 5fbdbcdcd..15ea663a8 100644 --- a/param.h +++ b/param.h @@ -1999,7 +1999,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ZGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_N 2 -#if defined(OS_LINUX) || defined(OS_DARWIN) || defined(OS_FREEBSD) +#ifdef OS_LINUX #if L2_SIZE == 1024976 #define SGEMM_DEFAULT_P 320 #define DGEMM_DEFAULT_P 256 @@ -2230,37 +2230,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(POWER9) - -#define SNUMOPT 16 -#define DNUMOPT 8 - -#define GEMM_DEFAULT_OFFSET_A 0 -#define GEMM_DEFAULT_OFFSET_B 65536 -#define GEMM_DEFAULT_ALIGN 0x0ffffUL - -#define SGEMM_DEFAULT_UNROLL_M 16 -#define SGEMM_DEFAULT_UNROLL_N 8 -#define DGEMM_DEFAULT_UNROLL_M 16 -#define DGEMM_DEFAULT_UNROLL_N 4 -#define CGEMM_DEFAULT_UNROLL_M 8 -#define CGEMM_DEFAULT_UNROLL_N 4 -#define ZGEMM_DEFAULT_UNROLL_M 8 -#define ZGEMM_DEFAULT_UNROLL_N 2 - -#define SGEMM_DEFAULT_P 832 -#define DGEMM_DEFAULT_P 128 -#define CGEMM_DEFAULT_P 512 -#define ZGEMM_DEFAULT_P 256 - -#define SGEMM_DEFAULT_Q 1026 -#define DGEMM_DEFAULT_Q 384 -#define CGEMM_DEFAULT_Q 1026 -#define ZGEMM_DEFAULT_Q 1026 - -#define SYMV_P 8 - -#endif #if defined(SPARC) && defined(V7) @@ -2622,7 +2591,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(CORTEXA53) || defined(CORTEXA57) || \ defined(CORTEXA72) || defined(CORTEXA73) || \ - defined(FALKOR) || defined(TSV110) + defined(FALKOR) #define SGEMM_DEFAULT_UNROLL_M 16 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2946,46 +2915,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(Z14) -#define SNUMOPT 2 -#define DNUMOPT 2 - -#define GEMM_DEFAULT_OFFSET_A 0 -#define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL - -#define SGEMM_DEFAULT_UNROLL_M 8 -#define SGEMM_DEFAULT_UNROLL_N 4 - -#define DGEMM_DEFAULT_UNROLL_M 8 -#define DGEMM_DEFAULT_UNROLL_N 4 - -#define CGEMM_DEFAULT_UNROLL_M 4 -#define CGEMM_DEFAULT_UNROLL_N 4 - -#define ZGEMM_DEFAULT_UNROLL_M 4 -#define ZGEMM_DEFAULT_UNROLL_N 4 - -#define SGEMM_DEFAULT_P 456 -#define DGEMM_DEFAULT_P 320 -#define CGEMM_DEFAULT_P 480 -#define ZGEMM_DEFAULT_P 224 - -#define SGEMM_DEFAULT_Q 488 -#define DGEMM_DEFAULT_Q 384 -#define CGEMM_DEFAULT_Q 128 -#define ZGEMM_DEFAULT_Q 352 - -#define SGEMM_DEFAULT_R 8192 -#define DGEMM_DEFAULT_R 4096 -#define CGEMM_DEFAULT_R 4096 -#define ZGEMM_DEFAULT_R 2048 - - -#define SYMV_P 16 -#endif - - #ifdef GENERIC diff --git a/relapack/config.h b/relapack/config.h index e4fab0a12..9113a712d 100644 --- a/relapack/config.h +++ b/relapack/config.h @@ -36,8 +36,8 @@ // allow malloc in xsygst for improved performance #define XSYGST_ALLOW_MALLOC ALLOW_MALLOC // allow malloc in xsytrf if the passed work buffer is too small -//#define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC -#define XSYTRF_ALLOW_MALLOC 0 +#define XSYTRF_ALLOW_MALLOC ALLOW_MALLOC + //////////////////////////////// // LAPACK routine replacement // diff --git a/relapack/inc/relapack.h b/relapack/inc/relapack.h index 7f283e04d..e421f352b 100644 --- a/relapack/inc/relapack.h +++ b/relapack/inc/relapack.h @@ -1,79 +1,67 @@ #ifndef RELAPACK_H #define RELAPACK_H -#ifdef USE64BITINT - typedef BLASLONG blasint; - #if defined(OS_WINDOWS) && defined(__64BIT__) - #define blasabs(x) llabs(x) - #else - #define blasabs(x) labs(x) - #endif -#else - typedef int blasint; - #define blasabs(x) abs(x) -#endif +void RELAPACK_slauum(const char *, const int *, float *, const int *, int *); +void RELAPACK_dlauum(const char *, const int *, double *, const int *, int *); +void RELAPACK_clauum(const char *, const int *, float *, const int *, int *); +void RELAPACK_zlauum(const char *, const int *, double *, const int *, int *); -void RELAPACK_slauum(const char *, const blasint *, float *, const blasint *, blasint *); -void RELAPACK_dlauum(const char *, const blasint *, double *, const blasint *, blasint *); -void RELAPACK_clauum(const char *, const blasint *, float *, const blasint *, blasint *); -void RELAPACK_zlauum(const char *, const blasint *, double *, const blasint *, blasint *); +void RELAPACK_strtri(const char *, const char *, const int *, float *, const int *, int *); +void RELAPACK_dtrtri(const char *, const char *, const int *, double *, const int *, int *); +void RELAPACK_ctrtri(const char *, const char *, const int *, float *, const int *, int *); +void RELAPACK_ztrtri(const char *, const char *, const int *, double *, const int *, int *); -void RELAPACK_strtri(const char *, const char *, const blasint *, float *, const blasint *, blasint *); -void RELAPACK_dtrtri(const char *, const char *, const blasint *, double *, const blasint *, blasint *); -void RELAPACK_ctrtri(const char *, const char *, const blasint *, float *, const blasint *, blasint *); -void RELAPACK_ztrtri(const char *, const char *, const blasint *, double *, const blasint *, blasint *); +void RELAPACK_spotrf(const char *, const int *, float *, const int *, int *); +void RELAPACK_dpotrf(const char *, const int *, double *, const int *, int *); +void RELAPACK_cpotrf(const char *, const int *, float *, const int *, int *); +void RELAPACK_zpotrf(const char *, const int *, double *, const int *, int *); -void RELAPACK_spotrf(const char *, const blasint *, float *, const blasint *, blasint *); -void RELAPACK_dpotrf(const char *, const blasint *, double *, const blasint *, blasint *); -void RELAPACK_cpotrf(const char *, const blasint *, float *, const blasint *, blasint *); -void RELAPACK_zpotrf(const char *, const blasint *, double *, const blasint *, blasint *); +void RELAPACK_spbtrf(const char *, const int *, const int *, float *, const int *, int *); +void RELAPACK_dpbtrf(const char *, const int *, const int *, double *, const int *, int *); +void RELAPACK_cpbtrf(const char *, const int *, const int *, float *, const int *, int *); +void RELAPACK_zpbtrf(const char *, const int *, const int *, double *, const int *, int *); -void RELAPACK_spbtrf(const char *, const blasint *, const blasint *, float *, const blasint *, blasint *); -void RELAPACK_dpbtrf(const char *, const blasint *, const blasint *, double *, const blasint *, blasint *); -void RELAPACK_cpbtrf(const char *, const blasint *, const blasint *, float *, const blasint *, blasint *); -void RELAPACK_zpbtrf(const char *, const blasint *, const blasint *, double *, const blasint *, blasint *); +void RELAPACK_ssytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_dsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_csytrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_chetrf(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_zsytrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_zhetrf(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_ssytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_dsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_csytrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_chetrf_rook(const char *, const int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_zsytrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_zhetrf_rook(const char *, const int *, double *, const int *, int *, double *, const int *, int *); -void RELAPACK_ssytrf(const char *, const blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); -void RELAPACK_dsytrf(const char *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); -void RELAPACK_csytrf(const char *, const blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); -void RELAPACK_chetrf(const char *, const blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); -void RELAPACK_zsytrf(const char *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); -void RELAPACK_zhetrf(const char *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); -void RELAPACK_ssytrf_rook(const char *, const blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); -void RELAPACK_dsytrf_rook(const char *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); -void RELAPACK_csytrf_rook(const char *, const blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); -void RELAPACK_chetrf_rook(const char *, const blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); -void RELAPACK_zsytrf_rook(const char *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); -void RELAPACK_zhetrf_rook(const char *, const blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_sgetrf(const int *, const int *, float *, const int *, int *, int *); +void RELAPACK_dgetrf(const int *, const int *, double *, const int *, int *, int *); +void RELAPACK_cgetrf(const int *, const int *, float *, const int *, int *, int *); +void RELAPACK_zgetrf(const int *, const int *, double *, const int *, int *, int *); -void RELAPACK_sgetrf(const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); -void RELAPACK_dgetrf(const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); -void RELAPACK_cgetrf(const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); -void RELAPACK_zgetrf(const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); +void RELAPACK_sgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); +void RELAPACK_dgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); +void RELAPACK_cgbtrf(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); +void RELAPACK_zgbtrf(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); -void RELAPACK_sgbtrf(const blasint *, const blasint *, const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); -void RELAPACK_dgbtrf(const blasint *, const blasint *, const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); -void RELAPACK_cgbtrf(const blasint *, const blasint *, const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); -void RELAPACK_zgbtrf(const blasint *, const blasint *, const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); +void RELAPACK_ssygst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); +void RELAPACK_dsygst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); +void RELAPACK_chegst(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); +void RELAPACK_zhegst(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); -void RELAPACK_ssygst(const blasint *, const char *, const blasint *, float *, const blasint *, const float *, const blasint *, blasint *); -void RELAPACK_dsygst(const blasint *, const char *, const blasint *, double *, const blasint *, const double *, const blasint *, blasint *); -void RELAPACK_chegst(const blasint *, const char *, const blasint *, float *, const blasint *, const float *, const blasint *, blasint *); -void RELAPACK_zhegst(const blasint *, const char *, const blasint *, double *, const blasint *, const double *, const blasint *, blasint *); +void RELAPACK_strsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); +void RELAPACK_dtrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); +void RELAPACK_ctrsyl(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); +void RELAPACK_ztrsyl(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); -void RELAPACK_strsyl(const char *, const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, blasint *); -void RELAPACK_dtrsyl(const char *, const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, blasint *); -void RELAPACK_ctrsyl(const char *, const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, blasint *); -void RELAPACK_ztrsyl(const char *, const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, blasint *); +void RELAPACK_stgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *); +void RELAPACK_dtgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *); +void RELAPACK_ctgsyl(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, const int *, int *, int *); +void RELAPACK_ztgsyl(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, const int *, int *, int *); -void RELAPACK_stgsyl(const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, float *, float *, const blasint *, blasint *, blasint *); -void RELAPACK_dtgsyl(const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, double *, double *, const blasint *, blasint *, blasint *); -void RELAPACK_ctgsyl(const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, float *, float *, const blasint *, blasint *, blasint *); -void RELAPACK_ztgsyl(const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, double *, double *, const blasint *, blasint *, blasint *); - -void RELAPACK_sgemmt(const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, float *, const blasint *); -void RELAPACK_dgemmt(const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, double *, const blasint *); -void RELAPACK_cgemmt(const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, float *, const blasint *); -void RELAPACK_zgemmt(const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, double *, const blasint *); +void RELAPACK_sgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +void RELAPACK_dgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +void RELAPACK_cgemmt(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +void RELAPACK_zgemmt(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); #endif /* RELAPACK_H */ diff --git a/relapack/src/blas.h b/relapack/src/blas.h index 6d9f1a42a..7441c1033 100644 --- a/relapack/src/blas.h +++ b/relapack/src/blas.h @@ -1,61 +1,61 @@ #ifndef BLAS_H #define BLAS_H -extern void BLAS(sswap)(const blasint *, float *, const blasint *, float *, const blasint *); -extern void BLAS(dswap)(const blasint *, double *, const blasint *, double *, const blasint *); -extern void BLAS(cswap)(const blasint *, float *, const blasint *, float *, const blasint *); -extern void BLAS(zswap)(const blasint *, double *, const blasint *, double *, const blasint *); - -extern void BLAS(sscal)(const blasint *, const float *, float *, const blasint *); -extern void BLAS(dscal)(const blasint *, const double *, double *, const blasint *); -extern void BLAS(cscal)(const blasint *, const float *, float *, const blasint *); -extern void BLAS(zscal)(const blasint *, const double *, double *, const blasint *); - -extern void BLAS(saxpy)(const blasint *, const float *, const float *, const blasint *, float *, const blasint *); -extern void BLAS(daxpy)(const blasint *, const double *, const double *, const blasint *, double *, const blasint *); -extern void BLAS(caxpy)(const blasint *, const float *, const float *, const blasint *, float *, const blasint *); -extern void BLAS(zaxpy)(const blasint *, const double *, const double *, const blasint *, double *, const blasint *); - -extern void BLAS(sgemv)(const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, const float *, const blasint*); -extern void BLAS(dgemv)(const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, const double *, const blasint*); -extern void BLAS(cgemv)(const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, const float *, const blasint*); -extern void BLAS(zgemv)(const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, const double *, const blasint*); - -extern void BLAS(sgemm)(const char *, const char *, const blasint *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, const float *, const blasint*); -extern void BLAS(dgemm)(const char *, const char *, const blasint *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, const double *, const blasint*); -extern void BLAS(cgemm)(const char *, const char *, const blasint *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, const float *, const blasint*); -extern void BLAS(zgemm)(const char *, const char *, const blasint *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, const double *, const blasint*); - -extern void BLAS(strsm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, float *, const blasint *); -extern void BLAS(dtrsm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, double *, const blasint *); -extern void BLAS(ctrsm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, float *, const blasint *); -extern void BLAS(ztrsm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, double *, const blasint *); - -extern void BLAS(strmm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, float *, const blasint *); -extern void BLAS(dtrmm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, double *, const blasint *); -extern void BLAS(ctrmm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, float *, const blasint *); -extern void BLAS(ztrmm)(const char *, const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, double *, const blasint *); - -extern void BLAS(ssyrk)(const char *, const char *, const blasint *, const blasint *, const float *, float *, const blasint *, const float *, float *, const blasint *); -extern void BLAS(dsyrk)(const char *, const char *, const blasint *, const blasint *, const double *, double *, const blasint *, const double *, double *, const blasint *); -extern void BLAS(cherk)(const char *, const char *, const blasint *, const blasint *, const float *, float *, const blasint *, const float *, float *, const blasint *); -extern void BLAS(zherk)(const char *, const char *, const blasint *, const blasint *, const double *, double *, const blasint *, const double *, double *, const blasint *); - -extern void BLAS(ssymm)(const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, float *, const blasint *); -extern void BLAS(dsymm)(const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, double *, const blasint *); -extern void BLAS(chemm)(const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, float *, const blasint *); -extern void BLAS(zhemm)(const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, double *, const blasint *); - -extern void BLAS(ssyr2k)(const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, float *, const blasint *); -extern void BLAS(dsyr2k)(const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, double *, const blasint *); -extern void BLAS(cher2k)(const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, float *, const blasint *); -extern void BLAS(zher2k)(const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, double *, const blasint *); +extern void BLAS(sswap)(const int *, float *, const int *, float *, const int *); +extern void BLAS(dswap)(const int *, double *, const int *, double *, const int *); +extern void BLAS(cswap)(const int *, float *, const int *, float *, const int *); +extern void BLAS(zswap)(const int *, double *, const int *, double *, const int *); + +extern void BLAS(sscal)(const int *, const float *, float *, const int *); +extern void BLAS(dscal)(const int *, const double *, double *, const int *); +extern void BLAS(cscal)(const int *, const float *, float *, const int *); +extern void BLAS(zscal)(const int *, const double *, double *, const int *); + +extern void BLAS(saxpy)(const int *, const float *, const float *, const int *, float *, const int *); +extern void BLAS(daxpy)(const int *, const double *, const double *, const int *, double *, const int *); +extern void BLAS(caxpy)(const int *, const float *, const float *, const int *, float *, const int *); +extern void BLAS(zaxpy)(const int *, const double *, const double *, const int *, double *, const int *); + +extern void BLAS(sgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); +extern void BLAS(dgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); +extern void BLAS(cgemv)(const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); +extern void BLAS(zgemv)(const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); + +extern void BLAS(sgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); +extern void BLAS(dgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); +extern void BLAS(cgemm)(const char *, const char *, const int *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); +extern void BLAS(zgemm)(const char *, const char *, const int *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); + +extern void BLAS(strsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); +extern void BLAS(dtrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); +extern void BLAS(ctrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); +extern void BLAS(ztrsm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); + +extern void BLAS(strmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); +extern void BLAS(dtrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); +extern void BLAS(ctrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, float *, const int *); +extern void BLAS(ztrmm)(const char *, const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, double *, const int *); + +extern void BLAS(ssyrk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *); +extern void BLAS(dsyrk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *); +extern void BLAS(cherk)(const char *, const char *, const int *, const int *, const float *, float *, const int *, const float *, float *, const int *); +extern void BLAS(zherk)(const char *, const char *, const int *, const int *, const double *, double *, const int *, const double *, double *, const int *); + +extern void BLAS(ssymm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +extern void BLAS(dsymm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +extern void BLAS(chemm)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +extern void BLAS(zhemm)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); + +extern void BLAS(ssyr2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +extern void BLAS(dsyr2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); +extern void BLAS(cher2k)(const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, float *, const int *); +extern void BLAS(zher2k)(const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, double *, const int *); #if HAVE_XGEMMT -extern void BLAS(sgemmt)(const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, const float *, const blasint*); -extern void BLAS(dgemmt)(const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, const double *, const blasint*); -extern void BLAS(cgemmt)(const char *, const char *, const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const float *, const blasint *, const float *, const float *, const blasint*); -extern void BLAS(zgemmt)(const char *, const char *, const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const double *, const blasint *, const double *, const double *, const blasint*); +extern void BLAS(sgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); +extern void BLAS(dgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); +extern void BLAS(cgemmt)(const char *, const char *, const char *, const int *, const int *, const float *, const float *, const int *, const float *, const int *, const float *, const float *, const int*); +extern void BLAS(zgemmt)(const char *, const char *, const char *, const int *, const int *, const double *, const double *, const int *, const double *, const int *, const double *, const double *, const int*); #endif #endif /* BLAS_H */ diff --git a/relapack/src/cgbtrf.c b/relapack/src/cgbtrf.c index 61332c6a6..90b2c8789 100644 --- a/relapack/src/cgbtrf.c +++ b/relapack/src/cgbtrf.c @@ -1,9 +1,9 @@ #include "relapack.h" #include "stdlib.h" -static void RELAPACK_cgbtrf_rec(const blasint *, const blasint *, const blasint *, - const blasint *, float *, const blasint *, blasint *, float *, const blasint *, float *, - const blasint *, blasint *); +static void RELAPACK_cgbtrf_rec(const int *, const int *, const int *, + const int *, float *, const int *, int *, float *, const int *, float *, + const int *, int *); /** CGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges. @@ -13,9 +13,9 @@ static void RELAPACK_cgbtrf_rec(const blasint *, const blasint *, const blasint * http://www.netlib.org/lapack/explore-html/d0/d3a/cgbtrf_8f.html * */ void RELAPACK_cgbtrf( - const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, - float *Ab, const blasint *ldAb, blasint *ipiv, - blasint *info + const int *m, const int *n, const int *kl, const int *ku, + float *Ab, const int *ldAb, int *ipiv, + int *info ) { // Check arguments @@ -31,8 +31,8 @@ void RELAPACK_cgbtrf( else if (*ldAb < 2 * *kl + *ku + 1) *info = -6; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("CGBTRF", &minfo, strlen("CGBTRF")); + const int minfo = -*info; + LAPACK(xerbla)("CGBTRF", &minfo); return; } @@ -40,14 +40,14 @@ void RELAPACK_cgbtrf( const float ZERO[] = { 0., 0. }; // Result upper band width - const blasint kv = *ku + *kl; + const int kv = *ku + *kl; // Unskew A - const blasint ldA[] = { *ldAb - 1 }; + const int ldA[] = { *ldAb - 1 }; float *const A = Ab + 2 * kv; // Zero upper diagonal fill-in elements - blasint i, j; + int i, j; for (j = 0; j < *n; j++) { float *const A_j = A + 2 * *ldA * j; for (i = MAX(0, j - kv); i < j - *ku; i++) @@ -55,11 +55,11 @@ void RELAPACK_cgbtrf( } // Allocate work space - const blasint n1 = CREC_SPLIT(*n); - const blasint mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; - const blasint nWorkl = (kv > n1) ? n1 : kv; - const blasint mWorku = (*kl > n1) ? n1 : *kl; - const blasint nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; + const int n1 = CREC_SPLIT(*n); + const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; + const int nWorkl = (kv > n1) ? n1 : kv; + const int mWorku = (*kl > n1) ? n1 : *kl; + const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; float *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(float)); float *Worku = malloc(mWorku * nWorku * 2 * sizeof(float)); LAPACK(claset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); @@ -76,10 +76,10 @@ void RELAPACK_cgbtrf( /** cgbtrf's recursive compute kernel */ static void RELAPACK_cgbtrf_rec( - const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, - float *Ab, const blasint *ldAb, blasint *ipiv, - float *Workl, const blasint *ldWorkl, float *Worku, const blasint *ldWorku, - blasint *info + const int *m, const int *n, const int *kl, const int *ku, + float *Ab, const int *ldAb, int *ipiv, + float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku, + int *info ) { if (*n <= MAX(CROSSOVER_CGBTRF, 1)) { @@ -91,25 +91,25 @@ static void RELAPACK_cgbtrf_rec( // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; // Loop iterators - blasint i, j; + int i, j; // Output upper band width - const blasint kv = *ku + *kl; + const int kv = *ku + *kl; // Unskew A - const blasint ldA[] = { *ldAb - 1 }; + const int ldA[] = { *ldAb - 1 }; float *const A = Ab + 2 * kv; // Splitting - const blasint n1 = MIN(CREC_SPLIT(*n), *kl); - const blasint n2 = *n - n1; - const blasint m1 = MIN(n1, *m); - const blasint m2 = *m - m1; - const blasint mn1 = MIN(m1, n1); - const blasint mn2 = MIN(m2, n2); + const int n1 = MIN(CREC_SPLIT(*n), *kl); + const int n2 = *n - n1; + const int m1 = MIN(n1, *m); + const int m2 = *m - m1; + const int mn1 = MIN(m1, n1); + const int mn2 = MIN(m2, n2); // Ab_L * // Ab_BR @@ -129,14 +129,14 @@ static void RELAPACK_cgbtrf_rec( // ipiv_T // ipiv_B - blasint *const ipiv_T = ipiv; - blasint *const ipiv_B = ipiv + n1; + int *const ipiv_T = ipiv; + int *const ipiv_B = ipiv + n1; // Banded splitting - const blasint n21 = MIN(n2, kv - n1); - const blasint n22 = MIN(n2 - n21, n1); - const blasint m21 = MIN(m2, *kl - m1); - const blasint m22 = MIN(m2 - m21, m1); + const int n21 = MIN(n2, kv - n1); + const int n22 = MIN(n2 - n21, n1); + const int m21 = MIN(m2, *kl - m1); + const int m22 = MIN(m2 - m21, m1); // n1 n21 n22 // m * A_Rl ARr @@ -164,7 +164,7 @@ static void RELAPACK_cgbtrf_rec( // partially redo swaps in A_L for (i = 0; i < mn1; i++) { - const blasint ip = ipiv_T[i] - 1; + const int ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); @@ -180,7 +180,7 @@ static void RELAPACK_cgbtrf_rec( for (j = 0; j < n22; j++) { float *const A_Rrj = A_Rr + 2 * *ldA * j; for (i = j; i < mn1; i++) { - const blasint ip = ipiv_T[i] - 1; + const int ip = ipiv_T[i] - 1; if (ip != i) { const float tmpr = A_Rrj[2 * i]; const float tmpc = A_Rrj[2 * i + 1]; @@ -211,7 +211,7 @@ static void RELAPACK_cgbtrf_rec( // partially undo swaps in A_L for (i = mn1 - 1; i >= 0; i--) { - const blasint ip = ipiv_T[i] - 1; + const int ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(cswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); @@ -221,9 +221,7 @@ static void RELAPACK_cgbtrf_rec( } // recursion(Ab_BR, ipiv_B) - //RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); - LAPACK(cgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); - + RELAPACK_cgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); if (*info) *info += n1; // shift pivots diff --git a/relapack/src/cgemmt.c b/relapack/src/cgemmt.c index 3af4d790f..28e2b00b0 100644 --- a/relapack/src/cgemmt.c +++ b/relapack/src/cgemmt.c @@ -1,12 +1,12 @@ #include "relapack.h" static void RELAPACK_cgemmt_rec(const char *, const char *, const char *, - const blasint *, const blasint *, const float *, const float *, const blasint *, - const float *, const blasint *, const float *, float *, const blasint *); + const int *, const int *, const float *, const float *, const int *, + const float *, const int *, const float *, float *, const int *); static void RELAPACK_cgemmt_rec2(const char *, const char *, const char *, - const blasint *, const blasint *, const float *, const float *, const blasint *, - const float *, const blasint *, const float *, float *, const blasint *); + const int *, const int *, const float *, const float *, const int *, + const float *, const int *, const float *, float *, const int *); /** CGEMMT computes a matrix-matrix product with general matrices but updates @@ -20,10 +20,10 @@ static void RELAPACK_cgemmt_rec2(const char *, const char *, const char *, * */ void RELAPACK_cgemmt( const char *uplo, const char *transA, const char *transB, - const blasint *n, const blasint *k, - const float *alpha, const float *A, const blasint *ldA, - const float *B, const blasint *ldB, - const float *beta, float *C, const blasint *ldC + const int *n, const int *k, + const float *alpha, const float *A, const int *ldA, + const float *B, const int *ldB, + const float *beta, float *C, const int *ldC ) { #if HAVE_XGEMMT @@ -32,15 +32,15 @@ void RELAPACK_cgemmt( #else // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); - const blasint notransA = LAPACK(lsame)(transA, "N"); - const blasint tranA = LAPACK(lsame)(transA, "T"); - const blasint ctransA = LAPACK(lsame)(transA, "C"); - const blasint notransB = LAPACK(lsame)(transB, "N"); - const blasint tranB = LAPACK(lsame)(transB, "T"); - const blasint ctransB = LAPACK(lsame)(transB, "C"); - blasint info = 0; + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + const int notransA = LAPACK(lsame)(transA, "N"); + const int tranA = LAPACK(lsame)(transA, "T"); + const int ctransA = LAPACK(lsame)(transA, "C"); + const int notransB = LAPACK(lsame)(transB, "N"); + const int tranB = LAPACK(lsame)(transB, "T"); + const int ctransB = LAPACK(lsame)(transB, "C"); + int info = 0; if (!lower && !upper) info = 1; else if (!tranA && !ctransA && !notransA) @@ -58,7 +58,7 @@ void RELAPACK_cgemmt( else if (*ldC < MAX(1, *n)) info = 13; if (info) { - LAPACK(xerbla)("CGEMMT", &info, strlen("CGEMMT")); + LAPACK(xerbla)("CGEMMT", &info); return; } @@ -76,10 +76,10 @@ void RELAPACK_cgemmt( /** cgemmt's recursive compute kernel */ static void RELAPACK_cgemmt_rec( const char *uplo, const char *transA, const char *transB, - const blasint *n, const blasint *k, - const float *alpha, const float *A, const blasint *ldA, - const float *B, const blasint *ldB, - const float *beta, float *C, const blasint *ldC + const int *n, const int *k, + const float *alpha, const float *A, const int *ldA, + const float *B, const int *ldB, + const float *beta, float *C, const int *ldC ) { if (*n <= MAX(CROSSOVER_CGEMMT, 1)) { @@ -89,8 +89,8 @@ static void RELAPACK_cgemmt_rec( } // Splitting - const blasint n1 = CREC_SPLIT(*n); - const blasint n2 = *n - n1; + const int n1 = CREC_SPLIT(*n); + const int n2 = *n - n1; // A_T // A_B @@ -126,16 +126,16 @@ static void RELAPACK_cgemmt_rec( /** cgemmt's unblocked compute kernel */ static void RELAPACK_cgemmt_rec2( const char *uplo, const char *transA, const char *transB, - const blasint *n, const blasint *k, - const float *alpha, const float *A, const blasint *ldA, - const float *B, const blasint *ldB, - const float *beta, float *C, const blasint *ldC + const int *n, const int *k, + const float *alpha, const float *A, const int *ldA, + const float *B, const int *ldB, + const float *beta, float *C, const int *ldC ) { - const blasint incB = (*transB == 'N') ? 1 : *ldB; - const blasint incC = 1; + const int incB = (*transB == 'N') ? 1 : *ldB; + const int incC = 1; - blasint i; + int i; for (i = 0; i < *n; i++) { // A_0 // A_i @@ -151,13 +151,13 @@ static void RELAPACK_cgemmt_rec2( float *const C_ii = C + 2 * *ldC * i + 2 * i; if (*uplo == 'L') { - const blasint nmi = *n - i; + const int nmi = *n - i; if (*transA == 'N') BLAS(cgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); else BLAS(cgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); } else { - const blasint ip1 = i + 1; + const int ip1 = i + 1; if (*transA == 'N') BLAS(cgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); else diff --git a/relapack/src/cgetrf.c b/relapack/src/cgetrf.c index 878c9ec15..b31a711d0 100644 --- a/relapack/src/cgetrf.c +++ b/relapack/src/cgetrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_cgetrf_rec(const blasint *, const blasint *, float *, - const blasint *, blasint *, blasint *); +static void RELAPACK_cgetrf_rec(const int *, const int *, float *, + const int *, int *, int *); /** CGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. @@ -11,9 +11,9 @@ static void RELAPACK_cgetrf_rec(const blasint *, const blasint *, float *, * http://www.netlib.org/lapack/explore-html/d9/dfb/cgetrf_8f.html */ void RELAPACK_cgetrf( - const blasint *m, const blasint *n, - float *A, const blasint *ldA, blasint *ipiv, - blasint *info + const int *m, const int *n, + float *A, const int *ldA, int *ipiv, + int *info ) { // Check arguments @@ -22,15 +22,15 @@ void RELAPACK_cgetrf( *info = -1; else if (*n < 0) *info = -2; - else if (*ldA < MAX(1, *m)) + else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("CGETRF", &minfo, strlen("CGETRF")); + const int minfo = -*info; + LAPACK(xerbla)("CGETRF", &minfo); return; } - const blasint sn = MIN(*m, *n); + const int sn = MIN(*m, *n); RELAPACK_cgetrf_rec(m, &sn, A, ldA, ipiv, info); @@ -38,10 +38,10 @@ void RELAPACK_cgetrf( if (*m < *n) { // Constants const float ONE[] = { 1., 0. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; // Splitting - const blasint rn = *n - *m; + const int rn = *n - *m; // A_L A_R const float *const A_L = A; @@ -57,9 +57,9 @@ void RELAPACK_cgetrf( /** cgetrf's recursive compute kernel */ static void RELAPACK_cgetrf_rec( - const blasint *m, const blasint *n, - float *A, const blasint *ldA, blasint *ipiv, - blasint *info + const int *m, const int *n, + float *A, const int *ldA, int *ipiv, + int *info ) { if (*n <= MAX(CROSSOVER_CGETRF, 1)) { @@ -71,12 +71,12 @@ static void RELAPACK_cgetrf_rec( // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; // Splitting - const blasint n1 = CREC_SPLIT(*n); - const blasint n2 = *n - n1; - const blasint m2 = *m - n1; + const int n1 = CREC_SPLIT(*n); + const int n2 = *n - n1; + const int m2 = *m - n1; // A_L A_R float *const A_L = A; @@ -91,8 +91,8 @@ static void RELAPACK_cgetrf_rec( // ipiv_T // ipiv_B - blasint *const ipiv_T = ipiv; - blasint *const ipiv_B = ipiv + n1; + int *const ipiv_T = ipiv; + int *const ipiv_B = ipiv + n1; // recursion(A_L, ipiv_T) RELAPACK_cgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); @@ -111,7 +111,7 @@ static void RELAPACK_cgetrf_rec( // apply pivots to A_BL LAPACK(claswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); // shift pivots - blasint i; + int i; for (i = 0; i < n2; i++) ipiv_B[i] += n1; } diff --git a/relapack/src/chegst.c b/relapack/src/chegst.c index fe77b03ea..dff875017 100644 --- a/relapack/src/chegst.c +++ b/relapack/src/chegst.c @@ -3,9 +3,9 @@ #include "stdlib.h" #endif -static void RELAPACK_chegst_rec(const blasint *, const char *, const blasint *, - float *, const blasint *, const float *, const blasint *, - float *, const blasint *, blasint *); +static void RELAPACK_chegst_rec(const int *, const char *, const int *, + float *, const int *, const float *, const int *, + float *, const int *, int *); /** CHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form. @@ -15,14 +15,14 @@ static void RELAPACK_chegst_rec(const blasint *, const char *, const blasint *, * http://www.netlib.org/lapack/explore-html/d7/d2a/chegst_8f.html * */ void RELAPACK_chegst( - const blasint *itype, const char *uplo, const blasint *n, - float *A, const blasint *ldA, const float *B, const blasint *ldB, - blasint *info + const int *itype, const char *uplo, const int *n, + float *A, const int *ldA, const float *B, const int *ldB, + int *info ) { // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (*itype < 1 || *itype > 3) *info = -1; @@ -35,8 +35,8 @@ void RELAPACK_chegst( else if (*ldB < MAX(1, *n)) *info = -7; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("CHEGST", &minfo, strlen("CHEGST")); + const int minfo = -*info; + LAPACK(xerbla)("CHEGST", &minfo); return; } @@ -45,9 +45,9 @@ void RELAPACK_chegst( // Allocate work space float *Work = NULL; - blasint lWork = 0; + int lWork = 0; #if XSYGST_ALLOW_MALLOC - const blasint n1 = CREC_SPLIT(*n); + const int n1 = CREC_SPLIT(*n); lWork = n1 * (*n - n1); Work = malloc(lWork * 2 * sizeof(float)); if (!Work) @@ -67,9 +67,9 @@ void RELAPACK_chegst( /** chegst's recursive compute kernel */ static void RELAPACK_chegst_rec( - const blasint *itype, const char *uplo, const blasint *n, - float *A, const blasint *ldA, const float *B, const blasint *ldB, - float *Work, const blasint *lWork, blasint *info + const int *itype, const char *uplo, const int *n, + float *A, const int *ldA, const float *B, const int *ldB, + float *Work, const int *lWork, int *info ) { if (*n <= MAX(CROSSOVER_CHEGST, 1)) { @@ -84,14 +84,14 @@ static void RELAPACK_chegst_rec( const float MONE[] = { -1., 0. }; const float HALF[] = { .5, 0. }; const float MHALF[] = { -.5, 0. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; // Loop iterator - blasint i; + int i; // Splitting - const blasint n1 = CREC_SPLIT(*n); - const blasint n2 = *n - n1; + const int n1 = CREC_SPLIT(*n); + const int n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/chetrf.c b/relapack/src/chetrf.c index 8cd3c0774..2928235e4 100644 --- a/relapack/src/chetrf.c +++ b/relapack/src/chetrf.c @@ -3,8 +3,8 @@ #include #endif -static void RELAPACK_chetrf_rec(const char *, const blasint *, const blasint *, blasint *, - float *, const blasint *, blasint *, float *, const blasint *, blasint *); +static void RELAPACK_chetrf_rec(const char *, const int *, const int *, int *, + float *, const int *, int *, float *, const int *, int *); /** CHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_chetrf_rec(const char *, const blasint *, const blasint *, * http://www.netlib.org/lapack/explore-html/da/dc1/chetrf_8f.html * */ void RELAPACK_chetrf( - const char *uplo, const blasint *n, - float *A, const blasint *ldA, blasint *ipiv, - float *Work, const blasint *lWork, blasint *info + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info ) { // Required work size - const blasint cleanlWork = *n * (*n / 2); - blasint minlWork = cleanlWork; + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_chetrf( #endif if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("CHETRF", &minfo, strlen("CHETRF")); + const int minfo = -*info; + LAPACK(xerbla)("CHETRF", &minfo); return; } @@ -64,7 +64,7 @@ void RELAPACK_chetrf( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - blasint nout; + int nout; // Recursive kernel RELAPACK_chetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_chetrf( /** chetrf's recursive compute kernel */ static void RELAPACK_chetrf_rec( - const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, - float *A, const blasint *ldA, blasint *ipiv, - float *Work, const blasint *ldWork, blasint *info + const char *uplo, const int *n_full, const int *n, int *n_out, + float *A, const int *ldA, int *ipiv, + float *Work, const int *ldWork, int *info ) { // top recursion level? - const blasint top = *n_full == *n; + const int top = *n_full == *n; if (*n <= MAX(CROSSOVER_CHETRF, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_chetrf_rec( return; } - blasint info1, info2; + int info1, info2; // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; - const blasint n_rest = *n_full - *n; + const int n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - blasint n1 = CREC_SPLIT(*n); - blasint n2 = *n - n1; + int n1 = CREC_SPLIT(*n); + int n2 = *n - n1; // Work_L * float *const Work_L = Work; // recursion(A_L) - blasint n1_out; + int n1_out; RELAPACK_chetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const blasint n_full2 = *n_full - n1; + const int n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_chetrf_rec( // (top recursion level: use Work as Work_BR) float *const Work_BL = Work + 2 * n1; float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const blasint ldWork_BR = top ? n2 : *ldWork; + const int ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - blasint *const ipiv_B = ipiv + n1; + int *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - blasint n2_out; + int n2_out; RELAPACK_chetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const blasint n_restp1 = n_rest + 1; + const int n_restp1 = n_rest + 1; // last column of A_BR float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_chetrf_rec( n2 = n2_out; // shift pivots - blasint i; + int i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_chetrf_rec( *n_out = n1 + n2; } else { // Splitting (setup) - blasint n2 = CREC_SPLIT(*n); - blasint n1 = *n - n2; + int n2 = CREC_SPLIT(*n); + int n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - blasint n2_out; + int n2_out; RELAPACK_chetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const blasint n2_diff = n2 - n2_out; + const int n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const blasint n_full1 = *n_full - n2; + const int n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_chetrf_rec( // (top recursion level: Work_R was Work) float *const Work_L = Work; float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const blasint ldWork_L = top ? n1 : *ldWork; + const int ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - blasint n1_out; + int n1_out; RELAPACK_chetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const blasint n_restp1 = n_rest + 1; + const int n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/chetrf_rec2.c b/relapack/src/chetrf_rec2.c index 412f64cf7..b5c8341b6 100644 --- a/relapack/src/chetrf_rec2.c +++ b/relapack/src/chetrf_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static complex c_b1 = {1.f,0.f}; -static blasint c__1 = 1; +static int c__1 = 1; /** CHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kau fman diagonal pivoting method * @@ -24,12 +24,12 @@ static blasint c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. * */ -/* Subroutine */ void RELAPACK_chetrf_rec2(char *uplo, blasint *n, blasint * - nb, blasint *kb, complex *a, blasint *lda, blasint *ipiv, complex *w, - int *ldw, blasint *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_chetrf_rec2(char *uplo, int *n, int * + nb, int *kb, complex *a, int *lda, int *ipiv, complex *w, + int *ldw, int *info, ftnlen uplo_len) { /* System generated locals */ - blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; float r__1, r__2, r__3, r__4; complex q__1, q__2, q__3, q__4; @@ -38,22 +38,22 @@ static blasint c__1 = 1; void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *); /* Local variables */ - static blasint j, k; + static int j, k; static float t, r1; static complex d11, d21, d22; - static blasint jj, kk, jp, kp, kw, kkw, imax, jmax; + static int jj, kk, jp, kp, kw, kkw, imax, jmax; static float alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ blasint cgemv_(char *, blasint *, blasint *, complex * - , complex *, blasint *, complex *, blasint *, complex *, complex * - , blasint *, ftnlen), ccopy_(int *, complex *, blasint *, - complex *, blasint *), cswap_(int *, complex *, blasint *, - complex *, blasint *); - static blasint kstep; + extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * + , complex *, int *, complex *, int *, complex *, complex * + , int *, ftnlen), ccopy_(int *, complex *, int *, + complex *, int *), cswap_(int *, complex *, int *, + complex *, int *); + static int kstep; static float absakk; - extern /* Subroutine */ blasint clacgv_(int *, complex *, blasint *); - extern blasint icamax_(int *, complex *, blasint *); - extern /* Subroutine */ blasint csscal_(int *, float *, complex *, int + extern /* Subroutine */ int clacgv_(int *, complex *, int *); + extern int icamax_(int *, complex *, int *); + extern /* Subroutine */ int csscal_(int *, float *, complex *, int *); static float colmax, rowmax; diff --git a/relapack/src/chetrf_rook.c b/relapack/src/chetrf_rook.c index 3d2fa3216..086393d57 100644 --- a/relapack/src/chetrf_rook.c +++ b/relapack/src/chetrf_rook.c @@ -3,8 +3,8 @@ #include #endif -static void RELAPACK_chetrf_rook_rec(const char *, const blasint *, const blasint *, blasint *, - float *, const blasint *, blasint *, float *, const blasint *, blasint *); +static void RELAPACK_chetrf_rook_rec(const char *, const int *, const int *, int *, + float *, const int *, int *, float *, const int *, int *); /** CHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_chetrf_rook_rec(const char *, const blasint *, const blasin * http://www.netlib.org/lapack/explore-html/d0/d5e/chetrf__rook_8f.html * */ void RELAPACK_chetrf_rook( - const char *uplo, const blasint *n, - float *A, const blasint *ldA, blasint *ipiv, - float *Work, const blasint *lWork, blasint *info + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info ) { // Required work size - const blasint cleanlWork = *n * (*n / 2); - blasint minlWork = cleanlWork; + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_chetrf_rook( #endif if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("CHETRF", &minfo, strlen("CHETRF")); + const int minfo = -*info; + LAPACK(xerbla)("CHETRF", &minfo); return; } @@ -64,7 +64,7 @@ void RELAPACK_chetrf_rook( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - blasint nout; + int nout; // Recursive kernel RELAPACK_chetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_chetrf_rook( /** chetrf_rook's recursive compute kernel */ static void RELAPACK_chetrf_rook_rec( - const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, - float *A, const blasint *ldA, blasint *ipiv, - float *Work, const blasint *ldWork, blasint *info + const char *uplo, const int *n_full, const int *n, int *n_out, + float *A, const int *ldA, int *ipiv, + float *Work, const int *ldWork, int *info ) { // top recursion level? - const blasint top = *n_full == *n; + const int top = *n_full == *n; if (*n <= MAX(CROSSOVER_CHETRF, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_chetrf_rook_rec( return; } - blasint info1, info2; + int info1, info2; // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; - const blasint n_rest = *n_full - *n; + const int n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - blasint n1 = CREC_SPLIT(*n); - blasint n2 = *n - n1; + int n1 = CREC_SPLIT(*n); + int n2 = *n - n1; // Work_L * float *const Work_L = Work; // recursion(A_L) - blasint n1_out; + int n1_out; RELAPACK_chetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const blasint n_full2 = *n_full - n1; + const int n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_chetrf_rook_rec( // (top recursion level: use Work as Work_BR) float *const Work_BL = Work + 2 * n1; float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const blasint ldWork_BR = top ? n2 : *ldWork; + const int ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - blasint *const ipiv_B = ipiv + n1; + int *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - blasint n2_out; + int n2_out; RELAPACK_chetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const blasint n_restp1 = n_rest + 1; + const int n_restp1 = n_rest + 1; // last column of A_BR float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_chetrf_rook_rec( n2 = n2_out; // shift pivots - blasint i; + int i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_chetrf_rook_rec( *n_out = n1 + n2; } else { // Splitting (setup) - blasint n2 = CREC_SPLIT(*n); - blasint n1 = *n - n2; + int n2 = CREC_SPLIT(*n); + int n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - blasint n2_out; + int n2_out; RELAPACK_chetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const blasint n2_diff = n2 - n2_out; + const int n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const blasint n_full1 = *n_full - n2; + const int n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_chetrf_rook_rec( // (top recursion level: Work_R was Work) float *const Work_L = Work; float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const blasint ldWork_L = top ? n1 : *ldWork; + const int ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - blasint n1_out; + int n1_out; RELAPACK_chetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const blasint n_restp1 = n_rest + 1; + const int n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/chetrf_rook_rec2.c b/relapack/src/chetrf_rook_rec2.c index e0b2ff962..a42cbfd44 100644 --- a/relapack/src/chetrf_rook_rec2.c +++ b/relapack/src/chetrf_rook_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static complex c_b1 = {1.f,0.f}; -static blasint c__1 = 1; +static int c__1 = 1; /** CHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the boun ded Bunch-Kaufman ("rook") diagonal pivoting method * @@ -24,12 +24,12 @@ static blasint c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. * */ -/* Subroutine */ void RELAPACK_chetrf_rook_rec2(char *uplo, blasint *n, - int *nb, blasint *kb, complex *a, blasint *lda, blasint *ipiv, - complex *w, blasint *ldw, blasint *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_chetrf_rook_rec2(char *uplo, int *n, + int *nb, int *kb, complex *a, int *lda, int *ipiv, + complex *w, int *ldw, int *info, ftnlen uplo_len) { /* System generated locals */ - blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; float r__1, r__2; complex q__1, q__2, q__3, q__4, q__5; @@ -38,29 +38,29 @@ static blasint c__1 = 1; void r_cnjg(complex *, complex *), c_div(complex *, complex *, complex *); /* Local variables */ - static blasint j, k, p; + static int j, k, p; static float t, r1; static complex d11, d21, d22; - static blasint ii, jj, kk, kp, kw, jp1, jp2, kkw; + static int ii, jj, kk, kp, kw, jp1, jp2, kkw; static logical done; - static blasint imax, jmax; + static int imax, jmax; static float alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ blasint cgemv_(char *, blasint *, blasint *, complex * - , complex *, blasint *, complex *, blasint *, complex *, complex * - , blasint *, ftnlen); + extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * + , complex *, int *, complex *, int *, complex *, complex * + , int *, ftnlen); static float sfmin; - extern /* Subroutine */ blasint ccopy_(int *, complex *, blasint *, - complex *, blasint *); - static blasint itemp; - extern /* Subroutine */ blasint cswap_(int *, complex *, blasint *, - complex *, blasint *); - static blasint kstep; + extern /* Subroutine */ int ccopy_(int *, complex *, int *, + complex *, int *); + static int itemp; + extern /* Subroutine */ int cswap_(int *, complex *, int *, + complex *, int *); + static int kstep; static float stemp, absakk; - extern /* Subroutine */ blasint clacgv_(int *, complex *, blasint *); - extern blasint icamax_(int *, complex *, blasint *); + extern /* Subroutine */ int clacgv_(int *, complex *, int *); + extern int icamax_(int *, complex *, int *); extern double slamch_(char *, ftnlen); - extern /* Subroutine */ blasint csscal_(int *, float *, complex *, int + extern /* Subroutine */ int csscal_(int *, float *, complex *, int *); static float colmax, rowmax; diff --git a/relapack/src/clauum.c b/relapack/src/clauum.c index 2bc93f182..36d6297cf 100644 --- a/relapack/src/clauum.c +++ b/relapack/src/clauum.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_clauum_rec(const char *, const blasint *, float *, - const blasint *, blasint *); +static void RELAPACK_clauum_rec(const char *, const int *, float *, + const int *, int *); /** CLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. @@ -11,14 +11,14 @@ static void RELAPACK_clauum_rec(const char *, const blasint *, float *, * http://www.netlib.org/lapack/explore-html/d2/d36/clauum_8f.html * */ void RELAPACK_clauum( - const char *uplo, const blasint *n, - float *A, const blasint *ldA, - blasint *info + const char *uplo, const int *n, + float *A, const int *ldA, + int *info ) { // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_clauum( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("CLAUUM", &minfo, strlen("CLAUUM")); + const int minfo = -*info; + LAPACK(xerbla)("CLAUUM", &minfo); return; } @@ -42,9 +42,9 @@ void RELAPACK_clauum( /** clauum's recursive compute kernel */ static void RELAPACK_clauum_rec( - const char *uplo, const blasint *n, - float *A, const blasint *ldA, - blasint *info + const char *uplo, const int *n, + float *A, const int *ldA, + int *info ) { if (*n <= MAX(CROSSOVER_CLAUUM, 1)) { @@ -57,8 +57,8 @@ static void RELAPACK_clauum_rec( const float ONE[] = { 1., 0. }; // Splitting - const blasint n1 = CREC_SPLIT(*n); - const blasint n2 = *n - n1; + const int n1 = CREC_SPLIT(*n); + const int n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/cpbtrf.c b/relapack/src/cpbtrf.c index 971e547c6..e0ea7b944 100644 --- a/relapack/src/cpbtrf.c +++ b/relapack/src/cpbtrf.c @@ -1,8 +1,8 @@ #include "relapack.h" #include "stdlib.h" -static void RELAPACK_cpbtrf_rec(const char *, const blasint *, const blasint *, - float *, const blasint *, float *, const blasint *, blasint *); +static void RELAPACK_cpbtrf_rec(const char *, const int *, const int *, + float *, const int *, float *, const int *, int *); /** CPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A. @@ -12,14 +12,14 @@ static void RELAPACK_cpbtrf_rec(const char *, const blasint *, const blasint *, * http://www.netlib.org/lapack/explore-html/de/d2d/cpbtrf_8f.html * */ void RELAPACK_cpbtrf( - const char *uplo, const blasint *n, const blasint *kd, - float *Ab, const blasint *ldAb, - blasint *info + const char *uplo, const int *n, const int *kd, + float *Ab, const int *ldAb, + int *info ) { // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -30,8 +30,8 @@ void RELAPACK_cpbtrf( else if (*ldAb < *kd + 1) *info = -5; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("CPBTRF", &minfo, strlen("CPBTRF")); + const int minfo = -*info; + LAPACK(xerbla)("CPBTRF", &minfo); return; } @@ -42,9 +42,9 @@ void RELAPACK_cpbtrf( const float ZERO[] = { 0., 0. }; // Allocate work space - const blasint n1 = CREC_SPLIT(*n); - const blasint mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; - const blasint nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + const int n1 = CREC_SPLIT(*n); + const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; float *Work = malloc(mWork * nWork * 2 * sizeof(float)); LAPACK(claset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); @@ -58,10 +58,10 @@ void RELAPACK_cpbtrf( /** cpbtrf's recursive compute kernel */ static void RELAPACK_cpbtrf_rec( - const char *uplo, const blasint *n, const blasint *kd, - float *Ab, const blasint *ldAb, - float *Work, const blasint *ldWork, - blasint *info + const char *uplo, const int *n, const int *kd, + float *Ab, const int *ldAb, + float *Work, const int *ldWork, + int *info ){ if (*n <= MAX(CROSSOVER_CPBTRF, 1)) { @@ -75,12 +75,12 @@ static void RELAPACK_cpbtrf_rec( const float MONE[] = { -1., 0. }; // Unskew A - const blasint ldA[] = { *ldAb - 1 }; + const int ldA[] = { *ldAb - 1 }; float *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd); // Splitting - const blasint n1 = MIN(CREC_SPLIT(*n), *kd); - const blasint n2 = *n - n1; + const int n1 = MIN(CREC_SPLIT(*n), *kd); + const int n2 = *n - n1; // * * // * Ab_BR @@ -99,8 +99,8 @@ static void RELAPACK_cpbtrf_rec( return; // Banded splitting - const blasint n21 = MIN(n2, *kd - n1); - const blasint n22 = MIN(n2 - n21, *kd); + const int n21 = MIN(n2, *kd - n1); + const int n22 = MIN(n2 - n21, *kd); // n1 n21 n22 // n1 * A_TRl A_TRr diff --git a/relapack/src/cpotrf.c b/relapack/src/cpotrf.c index 0f8e7ebb0..e35caa7fa 100644 --- a/relapack/src/cpotrf.c +++ b/relapack/src/cpotrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_cpotrf_rec(const char *, const blasint *, float *, - const blasint *, blasint *); +static void RELAPACK_cpotrf_rec(const char *, const int *, float *, + const int *, int *); /** CPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A. @@ -11,14 +11,14 @@ static void RELAPACK_cpotrf_rec(const char *, const blasint *, float *, * http://www.netlib.org/lapack/explore-html/dd/dce/cpotrf_8f.html * */ void RELAPACK_cpotrf( - const char *uplo, const blasint *n, - float *A, const blasint *ldA, - blasint *info + const char *uplo, const int *n, + float *A, const int *ldA, + int *info ) { // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_cpotrf( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("CPOTRF", &minfo, strlen("CPOTRF")); + const int minfo = -*info; + LAPACK(xerbla)("CPOTRF", &minfo); return; } @@ -42,9 +42,9 @@ void RELAPACK_cpotrf( /** cpotrf's recursive compute kernel */ static void RELAPACK_cpotrf_rec( - const char *uplo, const blasint *n, - float *A, const blasint *ldA, - blasint *info + const char *uplo, const int *n, + float *A, const int *ldA, + int *info ){ if (*n <= MAX(CROSSOVER_CPOTRF, 1)) { @@ -58,8 +58,8 @@ static void RELAPACK_cpotrf_rec( const float MONE[] = { -1., 0. }; // Splitting - const blasint n1 = CREC_SPLIT(*n); - const blasint n2 = *n - n1; + const int n1 = CREC_SPLIT(*n); + const int n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/csytrf.c b/relapack/src/csytrf.c index 2ebc31001..01c161d1a 100644 --- a/relapack/src/csytrf.c +++ b/relapack/src/csytrf.c @@ -3,8 +3,8 @@ #include #endif -static void RELAPACK_csytrf_rec(const char *, const blasint *, const blasint *, blasint *, - float *, const blasint *, blasint *, float *, const blasint *, blasint *); +static void RELAPACK_csytrf_rec(const char *, const int *, const int *, int *, + float *, const int *, int *, float *, const int *, int *); /** CSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_csytrf_rec(const char *, const blasint *, const blasint *, * http://www.netlib.org/lapack/explore-html/d5/d21/csytrf_8f.html * */ void RELAPACK_csytrf( - const char *uplo, const blasint *n, - float *A, const blasint *ldA, blasint *ipiv, - float *Work, const blasint *lWork, blasint *info + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info ) { // Required work size - const blasint cleanlWork = *n * (*n / 2); - blasint minlWork = cleanlWork; + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_csytrf( #endif if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("CSYTRF", &minfo, strlen("CSYTRF")); + const int minfo = -*info; + LAPACK(xerbla)("CSYTRF", &minfo); return; } @@ -64,7 +64,7 @@ void RELAPACK_csytrf( const char cleanuplo = lower ? 'L' : 'U'; // Dummy arguments - blasint nout; + int nout; // Recursive kernel RELAPACK_csytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_csytrf( /** csytrf's recursive compute kernel */ static void RELAPACK_csytrf_rec( - const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, - float *A, const blasint *ldA, blasint *ipiv, - float *Work, const blasint *ldWork, blasint *info + const char *uplo, const int *n_full, const int *n, int *n_out, + float *A, const int *ldA, int *ipiv, + float *Work, const int *ldWork, int *info ) { // top recursion level? - const blasint top = *n_full == *n; + const int top = *n_full == *n; if (*n <= MAX(CROSSOVER_CSYTRF, 3)) { // Unblocked @@ -96,34 +96,34 @@ static void RELAPACK_csytrf_rec( return; } - blasint info1, info2; + int info1, info2; // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; // Loop iterator - blasint i; + int i; - const blasint n_rest = *n_full - *n; + const int n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - blasint n1 = CREC_SPLIT(*n); - blasint n2 = *n - n1; + int n1 = CREC_SPLIT(*n); + int n2 = *n - n1; // Work_L * float *const Work_L = Work; // recursion(A_L) - blasint n1_out; + int n1_out; RELAPACK_csytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const blasint n_full2 = *n_full - n1; + const int n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -139,23 +139,23 @@ static void RELAPACK_csytrf_rec( // (top recursion level: use Work as Work_BR) float *const Work_BL = Work + 2 * n1; float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const blasint ldWork_BR = top ? n2 : *ldWork; + const int ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - blasint *const ipiv_B = ipiv + n1; + int *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - blasint n2_out; + int n2_out; RELAPACK_csytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const blasint n_restp1 = n_rest + 1; + const int n_restp1 = n_rest + 1; // last column of A_BR float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -182,22 +182,22 @@ static void RELAPACK_csytrf_rec( *n_out = n1 + n2; } else { // Splitting (setup) - blasint n2 = CREC_SPLIT(*n); - blasint n1 = *n - n2; + int n2 = CREC_SPLIT(*n); + int n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - blasint n2_out; + int n2_out; RELAPACK_csytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const blasint n2_diff = n2 - n2_out; + const int n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const blasint n_full1 = *n_full - n2; + const int n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -213,19 +213,19 @@ static void RELAPACK_csytrf_rec( // (top recursion level: Work_R was Work) float *const Work_L = Work; float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const blasint ldWork_L = top ? n1 : *ldWork; + const int ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - blasint n1_out; + int n1_out; RELAPACK_csytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const blasint n_restp1 = n_rest + 1; + const int n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/csytrf_rec2.c b/relapack/src/csytrf_rec2.c index 216a9e248..9d6bd849d 100644 --- a/relapack/src/csytrf_rec2.c +++ b/relapack/src/csytrf_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static complex c_b1 = {1.f,0.f}; -static blasint c__1 = 1; +static int c__1 = 1; /** CSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagon al pivoting method. * @@ -24,12 +24,12 @@ static blasint c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. * */ -/* Subroutine */ void RELAPACK_csytrf_rec2(char *uplo, blasint *n, blasint * - nb, blasint *kb, complex *a, blasint *lda, blasint *ipiv, complex *w, - int *ldw, blasint *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_csytrf_rec2(char *uplo, int *n, int * + nb, int *kb, complex *a, int *lda, int *ipiv, complex *w, + int *ldw, int *info, ftnlen uplo_len) { /* System generated locals */ - blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; float r__1, r__2, r__3, r__4; complex q__1, q__2, q__3; @@ -38,21 +38,21 @@ static blasint c__1 = 1; void c_div(complex *, complex *, complex *); /* Local variables */ - static blasint j, k; + static int j, k; static complex t, r1, d11, d21, d22; - static blasint jj, kk, jp, kp, kw, kkw, imax, jmax; + static int jj, kk, jp, kp, kw, kkw, imax, jmax; static float alpha; - extern /* Subroutine */ blasint cscal_(int *, complex *, complex *, - blasint *); + extern /* Subroutine */ int cscal_(int *, complex *, complex *, + int *); extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ blasint cgemv_(char *, blasint *, blasint *, complex * - , complex *, blasint *, complex *, blasint *, complex *, complex * - , blasint *, ftnlen), ccopy_(int *, complex *, blasint *, - complex *, blasint *), cswap_(int *, complex *, blasint *, - complex *, blasint *); - static blasint kstep; + extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * + , complex *, int *, complex *, int *, complex *, complex * + , int *, ftnlen), ccopy_(int *, complex *, int *, + complex *, int *), cswap_(int *, complex *, int *, + complex *, int *); + static int kstep; static float absakk; - extern blasint icamax_(int *, complex *, blasint *); + extern int icamax_(int *, complex *, int *); static float colmax, rowmax; /* Parameter adjustments */ diff --git a/relapack/src/csytrf_rook.c b/relapack/src/csytrf_rook.c index e8a9865cc..aa7dd0e57 100644 --- a/relapack/src/csytrf_rook.c +++ b/relapack/src/csytrf_rook.c @@ -3,8 +3,8 @@ #include #endif -static void RELAPACK_csytrf_rook_rec(const char *, const blasint *, const blasint *, blasint *, - float *, const blasint *, blasint *, float *, const blasint *, blasint *); +static void RELAPACK_csytrf_rook_rec(const char *, const int *, const int *, int *, + float *, const int *, int *, float *, const int *, int *); /** CSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_csytrf_rook_rec(const char *, const blasint *, const blasin * http://www.netlib.org/lapack/explore-html/d8/dc8/csytrf__rook_8f.html * */ void RELAPACK_csytrf_rook( - const char *uplo, const blasint *n, - float *A, const blasint *ldA, blasint *ipiv, - float *Work, const blasint *lWork, blasint *info + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info ) { // Required work size - const blasint cleanlWork = *n * (*n / 2); - blasint minlWork = cleanlWork; + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_csytrf_rook( #endif if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("CSYTRF", &minfo, strlen("CSYTRF")); + const int minfo = -*info; + LAPACK(xerbla)("CSYTRF", &minfo); return; } @@ -64,7 +64,7 @@ void RELAPACK_csytrf_rook( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - blasint nout; + int nout; // Recursive kernel RELAPACK_csytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_csytrf_rook( /** csytrf_rook's recursive compute kernel */ static void RELAPACK_csytrf_rook_rec( - const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, - float *A, const blasint *ldA, blasint *ipiv, - float *Work, const blasint *ldWork, blasint *info + const char *uplo, const int *n_full, const int *n, int *n_out, + float *A, const int *ldA, int *ipiv, + float *Work, const int *ldWork, int *info ) { // top recursion level? - const blasint top = *n_full == *n; + const int top = *n_full == *n; if (*n <= MAX(CROSSOVER_CSYTRF_ROOK, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_csytrf_rook_rec( return; } - blasint info1, info2; + int info1, info2; // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; - const blasint n_rest = *n_full - *n; + const int n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - blasint n1 = CREC_SPLIT(*n); - blasint n2 = *n - n1; + int n1 = CREC_SPLIT(*n); + int n2 = *n - n1; // Work_L * float *const Work_L = Work; // recursion(A_L) - blasint n1_out; + int n1_out; RELAPACK_csytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const blasint n_full2 = *n_full - n1; + const int n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_csytrf_rook_rec( // (top recursion level: use Work as Work_BR) float *const Work_BL = Work + 2 * n1; float *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const blasint ldWork_BR = top ? n2 : *ldWork; + const int ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - blasint *const ipiv_B = ipiv + n1; + int *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_cgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(cgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - blasint n2_out; + int n2_out; RELAPACK_csytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const blasint n_restp1 = n_rest + 1; + const int n_restp1 = n_rest + 1; // last column of A_BR float *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_csytrf_rook_rec( n2 = n2_out; // shift pivots - blasint i; + int i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_csytrf_rook_rec( *n_out = n1 + n2; } else { // Splitting (setup) - blasint n2 = CREC_SPLIT(*n); - blasint n1 = *n - n2; + int n2 = CREC_SPLIT(*n); + int n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) float *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - blasint n2_out; + int n2_out; RELAPACK_csytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const blasint n2_diff = n2 - n2_out; + const int n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const blasint n_full1 = *n_full - n2; + const int n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_csytrf_rook_rec( // (top recursion level: Work_R was Work) float *const Work_L = Work; float *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const blasint ldWork_L = top ? n1 : *ldWork; + const int ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_cgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(cgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - blasint n1_out; + int n1_out; RELAPACK_csytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const blasint n_restp1 = n_rest + 1; + const int n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(cgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/csytrf_rook_rec2.c b/relapack/src/csytrf_rook_rec2.c index 2561065d7..6638338a6 100644 --- a/relapack/src/csytrf_rook_rec2.c +++ b/relapack/src/csytrf_rook_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static complex c_b1 = {1.f,0.f}; -static blasint c__1 = 1; +static int c__1 = 1; /** CSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-K aufman ("rook") diagonal pivoting method. * @@ -24,12 +24,12 @@ static blasint c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. * */ -/* Subroutine */ void RELAPACK_csytrf_rook_rec2(char *uplo, blasint *n, - int *nb, blasint *kb, complex *a, blasint *lda, blasint *ipiv, - complex *w, blasint *ldw, blasint *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_csytrf_rook_rec2(char *uplo, int *n, + int *nb, int *kb, complex *a, int *lda, int *ipiv, + complex *w, int *ldw, int *info, ftnlen uplo_len) { /* System generated locals */ - blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; float r__1, r__2; complex q__1, q__2, q__3, q__4; @@ -38,27 +38,27 @@ static blasint c__1 = 1; void c_div(complex *, complex *, complex *); /* Local variables */ - static blasint j, k, p; + static int j, k, p; static complex t, r1, d11, d12, d21, d22; - static blasint ii, jj, kk, kp, kw, jp1, jp2, kkw; + static int ii, jj, kk, kp, kw, jp1, jp2, kkw; static logical done; - static blasint imax, jmax; + static int imax, jmax; static float alpha; - extern /* Subroutine */ blasint cscal_(int *, complex *, complex *, - blasint *); + extern /* Subroutine */ int cscal_(int *, complex *, complex *, + int *); extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ blasint cgemv_(char *, blasint *, blasint *, complex * - , complex *, blasint *, complex *, blasint *, complex *, complex * - , blasint *, ftnlen); + extern /* Subroutine */ int cgemv_(char *, int *, int *, complex * + , complex *, int *, complex *, int *, complex *, complex * + , int *, ftnlen); static float sfmin; - extern /* Subroutine */ blasint ccopy_(int *, complex *, blasint *, - complex *, blasint *); - static blasint itemp; - extern /* Subroutine */ blasint cswap_(int *, complex *, blasint *, - complex *, blasint *); - static blasint kstep; + extern /* Subroutine */ int ccopy_(int *, complex *, int *, + complex *, int *); + static int itemp; + extern /* Subroutine */ int cswap_(int *, complex *, int *, + complex *, int *); + static int kstep; static float stemp, absakk; - extern blasint icamax_(int *, complex *, blasint *); + extern int icamax_(int *, complex *, int *); extern double slamch_(char *, ftnlen); static float colmax, rowmax; diff --git a/relapack/src/ctgsyl.c b/relapack/src/ctgsyl.c index 704f3ef23..15c738baf 100644 --- a/relapack/src/ctgsyl.c +++ b/relapack/src/ctgsyl.c @@ -1,10 +1,10 @@ #include "relapack.h" #include -static void RELAPACK_ctgsyl_rec(const char *, const blasint *, const blasint *, - const blasint *, const float *, const blasint *, const float *, const blasint *, - float *, const blasint *, const float *, const blasint *, const float *, - const blasint *, float *, const blasint *, float *, float *, float *, blasint *); +static void RELAPACK_ctgsyl_rec(const char *, const int *, const int *, + const int *, const float *, const int *, const float *, const int *, + float *, const int *, const float *, const int *, const float *, + const int *, float *, const int *, float *, float *, float *, int *); /** CTGSYL solves the generalized Sylvester equation. @@ -14,21 +14,21 @@ static void RELAPACK_ctgsyl_rec(const char *, const blasint *, const blasint *, * http://www.netlib.org/lapack/explore-html/d7/de7/ctgsyl_8f.html * */ void RELAPACK_ctgsyl( - const char *trans, const blasint *ijob, const blasint *m, const blasint *n, - const float *A, const blasint *ldA, const float *B, const blasint *ldB, - float *C, const blasint *ldC, - const float *D, const blasint *ldD, const float *E, const blasint *ldE, - float *F, const blasint *ldF, + const char *trans, const int *ijob, const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, + const float *D, const int *ldD, const float *E, const int *ldE, + float *F, const int *ldF, float *scale, float *dif, - float *Work, const blasint *lWork, blasint *iWork, blasint *info + float *Work, const int *lWork, int *iWork, int *info ) { // Parse arguments - const blasint notran = LAPACK(lsame)(trans, "N"); - const blasint tran = LAPACK(lsame)(trans, "C"); + const int notran = LAPACK(lsame)(trans, "N"); + const int tran = LAPACK(lsame)(trans, "C"); // Compute work buffer size - blasint lwmin = 1; + int lwmin = 1; if (notran && (*ijob == 1 || *ijob == 2)) lwmin = MAX(1, 2 * *m * *n); *info = 0; @@ -57,8 +57,8 @@ void RELAPACK_ctgsyl( else if (*lWork < lwmin && *lWork != -1) *info = -20; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("CTGSYL", &minfo, strlen("CTGSYL")); + const int minfo = -*info; + LAPACK(xerbla)("CTGSYL", &minfo); return; } @@ -74,8 +74,8 @@ void RELAPACK_ctgsyl( // Constant const float ZERO[] = { 0., 0. }; - blasint isolve = 1; - blasint ifunc = 0; + int isolve = 1; + int ifunc = 0; if (notran) { if (*ijob >= 3) { ifunc = *ijob - 2; @@ -86,7 +86,7 @@ void RELAPACK_ctgsyl( } float scale2; - blasint iround; + int iround; for (iround = 1; iround <= isolve; iround++) { *scale = 1; float dscale = 0; @@ -119,13 +119,13 @@ void RELAPACK_ctgsyl( /** ctgsyl's recursive vompute kernel */ static void RELAPACK_ctgsyl_rec( - const char *trans, const blasint *ifunc, const blasint *m, const blasint *n, - const float *A, const blasint *ldA, const float *B, const blasint *ldB, - float *C, const blasint *ldC, - const float *D, const blasint *ldD, const float *E, const blasint *ldE, - float *F, const blasint *ldF, + const char *trans, const int *ifunc, const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, + const float *D, const int *ldD, const float *E, const int *ldE, + float *F, const int *ldF, float *scale, float *dsum, float *dscale, - blasint *info + int *info ) { if (*m <= MAX(CROSSOVER_CTGSYL, 1) && *n <= MAX(CROSSOVER_CTGSYL, 1)) { @@ -137,18 +137,18 @@ static void RELAPACK_ctgsyl_rec( // Constants const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; // Outputs float scale1[] = { 1., 0. }; float scale2[] = { 1., 0. }; - blasint info1[] = { 0 }; - blasint info2[] = { 0 }; + int info1[] = { 0 }; + int info2[] = { 0 }; if (*m > *n) { // Splitting - const blasint m1 = CREC_SPLIT(*m); - const blasint m2 = *m - m1; + const int m1 = CREC_SPLIT(*m); + const int m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -206,8 +206,8 @@ static void RELAPACK_ctgsyl_rec( } } else { // Splitting - const blasint n1 = CREC_SPLIT(*n); - const blasint n2 = *n - n1; + const int n1 = CREC_SPLIT(*n); + const int n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/ctrsyl.c b/relapack/src/ctrsyl.c index fed6e847e..b548d5354 100644 --- a/relapack/src/ctrsyl.c +++ b/relapack/src/ctrsyl.c @@ -1,8 +1,8 @@ #include "relapack.h" -static void RELAPACK_ctrsyl_rec(const char *, const char *, const blasint *, - const blasint *, const blasint *, const float *, const blasint *, const float *, - const blasint *, float *, const blasint *, float *, blasint *); +static void RELAPACK_ctrsyl_rec(const char *, const char *, const int *, + const int *, const int *, const float *, const int *, const float *, + const int *, float *, const int *, float *, int *); /** CTRSYL solves the complex Sylvester matrix equation. @@ -12,18 +12,18 @@ static void RELAPACK_ctrsyl_rec(const char *, const char *, const blasint *, * http://www.netlib.org/lapack/explore-html/d8/df4/ctrsyl_8f.html * */ void RELAPACK_ctrsyl( - const char *tranA, const char *tranB, const blasint *isgn, - const blasint *m, const blasint *n, - const float *A, const blasint *ldA, const float *B, const blasint *ldB, - float *C, const blasint *ldC, float *scale, - blasint *info + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, float *scale, + int *info ) { // Check arguments - const blasint notransA = LAPACK(lsame)(tranA, "N"); - const blasint ctransA = LAPACK(lsame)(tranA, "C"); - const blasint notransB = LAPACK(lsame)(tranB, "N"); - const blasint ctransB = LAPACK(lsame)(tranB, "C"); + const int notransA = LAPACK(lsame)(tranA, "N"); + const int ctransA = LAPACK(lsame)(tranA, "C"); + const int notransB = LAPACK(lsame)(tranB, "N"); + const int ctransB = LAPACK(lsame)(tranB, "C"); *info = 0; if (!ctransA && !notransA) *info = -1; @@ -42,8 +42,8 @@ void RELAPACK_ctrsyl( else if (*ldC < MAX(1, *m)) *info = -11; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("CTRSYL", &minfo, strlen("CTRSYL")); + const int minfo = -*info; + LAPACK(xerbla)("CTRSYL", &minfo); return; } @@ -58,11 +58,11 @@ void RELAPACK_ctrsyl( /** ctrsyl's recursive compute kernel */ static void RELAPACK_ctrsyl_rec( - const char *tranA, const char *tranB, const blasint *isgn, - const blasint *m, const blasint *n, - const float *A, const blasint *ldA, const float *B, const blasint *ldB, - float *C, const blasint *ldC, float *scale, - blasint *info + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, float *scale, + int *info ) { if (*m <= MAX(CROSSOVER_CTRSYL, 1) && *n <= MAX(CROSSOVER_CTRSYL, 1)) { @@ -75,18 +75,18 @@ static void RELAPACK_ctrsyl_rec( const float ONE[] = { 1., 0. }; const float MONE[] = { -1., 0. }; const float MSGN[] = { -*isgn, 0. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; // Outputs float scale1[] = { 1., 0. }; float scale2[] = { 1., 0. }; - blasint info1[] = { 0 }; - blasint info2[] = { 0 }; + int info1[] = { 0 }; + int info2[] = { 0 }; if (*m > *n) { // Splitting - const blasint m1 = CREC_SPLIT(*m); - const blasint m2 = *m - m1; + const int m1 = CREC_SPLIT(*m); + const int m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -122,8 +122,8 @@ static void RELAPACK_ctrsyl_rec( } } else { // Splitting - const blasint n1 = CREC_SPLIT(*n); - const blasint n2 = *n - n1; + const int n1 = CREC_SPLIT(*n); + const int n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/ctrsyl_rec2.c b/relapack/src/ctrsyl_rec2.c index 556491c7a..518574868 100644 --- a/relapack/src/ctrsyl_rec2.c +++ b/relapack/src/ctrsyl_rec2.c @@ -14,16 +14,16 @@ #include "f2c.h" #if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES -complex cdotu_fun(int *n, complex *x, blasint *incx, complex *y, blasint *incy) { - extern void cdotu_(complex *, blasint *, complex *, blasint *, complex *, blasint *); +complex cdotu_fun(int *n, complex *x, int *incx, complex *y, int *incy) { + extern void cdotu_(complex *, int *, complex *, int *, complex *, int *); complex result; cdotu_(&result, n, x, incx, y, incy); return result; } #define cdotu_ cdotu_fun -complex cdotc_fun(int *n, complex *x, blasint *incx, complex *y, blasint *incy) { - extern void cdotc_(complex *, blasint *, complex *, blasint *, complex *, blasint *); +complex cdotc_fun(int *n, complex *x, int *incx, complex *y, int *incy) { + extern void cdotc_(complex *, int *, complex *, int *, complex *, int *); complex result; cdotc_(&result, n, x, incx, y, incy); return result; @@ -43,7 +43,7 @@ complex cladiv_fun(complex *a, complex *b) { /* Table of constant values */ -static blasint c__1 = 1; +static int c__1 = 1; /** RELAPACK_CTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm) * @@ -51,12 +51,12 @@ static blasint c__1 = 1; * It serves as an unblocked kernel in the recursive algorithms. * */ /* Subroutine */ void RELAPACK_ctrsyl_rec2(char *trana, char *tranb, int - *isgn, blasint *m, blasint *n, complex *a, blasint *lda, complex *b, - int *ldb, complex *c__, blasint *ldc, float *scale, blasint *info, + *isgn, int *m, int *n, complex *a, int *lda, complex *b, + int *ldb, complex *c__, int *ldc, float *scale, int *info, ftnlen trana_len, ftnlen tranb_len) { /* System generated locals */ - blasint a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4; float r__1, r__2; complex q__1, q__2, q__3, q__4; @@ -66,7 +66,7 @@ static blasint c__1 = 1; void r_cnjg(complex *, complex *); /* Local variables */ - static blasint j, k, l; + static int j, k, l; static complex a11; static float db; static complex x11; @@ -75,20 +75,20 @@ static blasint c__1 = 1; static float dum[1], eps, sgn, smin; static complex suml, sumr; /* Complex */ complex cdotc_(int *, complex *, int - *, complex *, blasint *); - extern blasint lsame_(char *, char *, ftnlen, ftnlen); + *, complex *, int *); + extern int lsame_(char *, char *, ftnlen, ftnlen); /* Complex */ complex cdotu_(int *, complex *, int - *, complex *, blasint *); - extern /* Subroutine */ blasint slabad_(float *, float *); - extern float clange_(char *, blasint *, blasint *, complex *, - blasint *, float *, ftnlen); + *, complex *, int *); + extern /* Subroutine */ int slabad_(float *, float *); + extern float clange_(char *, int *, int *, complex *, + int *, float *, ftnlen); /* Complex */ complex cladiv_(complex *, complex *); static float scaloc; extern float slamch_(char *, ftnlen); - extern /* Subroutine */ blasint csscal_(int *, float *, complex *, int - *), xerbla_(char *, blasint *, ftnlen); + extern /* Subroutine */ int csscal_(int *, float *, complex *, int + *), xerbla_(char *, int *, ftnlen); static float bignum; - static blasint notrna, notrnb; + static int notrna, notrnb; static float smlnum; /* Parameter adjustments */ diff --git a/relapack/src/ctrtri.c b/relapack/src/ctrtri.c index 5201a24c7..0262cb59d 100644 --- a/relapack/src/ctrtri.c +++ b/relapack/src/ctrtri.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_ctrtri_rec(const char *, const char *, const blasint *, - float *, const blasint *, blasint *); +static void RELAPACK_ctrtri_rec(const char *, const char *, const int *, + float *, const int *, int *); /** CTRTRI computes the inverse of a complex upper or lower triangular matrix A. @@ -11,16 +11,16 @@ static void RELAPACK_ctrtri_rec(const char *, const char *, const blasint *, * http://www.netlib.org/lapack/explore-html/df/df8/ctrtri_8f.html * */ void RELAPACK_ctrtri( - const char *uplo, const char *diag, const blasint *n, - float *A, const blasint *ldA, - blasint *info + const char *uplo, const char *diag, const int *n, + float *A, const int *ldA, + int *info ) { // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); - const blasint nounit = LAPACK(lsame)(diag, "N"); - const blasint unit = LAPACK(lsame)(diag, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + const int nounit = LAPACK(lsame)(diag, "N"); + const int unit = LAPACK(lsame)(diag, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -31,8 +31,8 @@ void RELAPACK_ctrtri( else if (*ldA < MAX(1, *n)) *info = -5; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("CTRTRI", &minfo, strlen("CTRTRI")); + const int minfo = -*info; + LAPACK(xerbla)("CTRTRI", &minfo); return; } @@ -42,7 +42,7 @@ void RELAPACK_ctrtri( // check for singularity if (nounit) { - blasint i; + int i; for (i = 0; i < *n; i++) if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) { *info = i; @@ -57,9 +57,9 @@ void RELAPACK_ctrtri( /** ctrtri's recursive compute kernel */ static void RELAPACK_ctrtri_rec( - const char *uplo, const char *diag, const blasint *n, - float *A, const blasint *ldA, - blasint *info + const char *uplo, const char *diag, const int *n, + float *A, const int *ldA, + int *info ){ if (*n <= MAX(CROSSOVER_CTRTRI, 1)) { @@ -73,8 +73,8 @@ static void RELAPACK_ctrtri_rec( const float MONE[] = { -1., 0. }; // Splitting - const blasint n1 = CREC_SPLIT(*n); - const blasint n2 = *n - n1; + const int n1 = CREC_SPLIT(*n); + const int n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/dgbtrf.c b/relapack/src/dgbtrf.c index cdf06ad5b..1a1757d31 100644 --- a/relapack/src/dgbtrf.c +++ b/relapack/src/dgbtrf.c @@ -1,9 +1,9 @@ #include "relapack.h" -#include -#include -static void RELAPACK_dgbtrf_rec(const blasint *, const blasint *, const blasint *, - const blasint *, double *, const blasint *, blasint *, double *, const blasint *, double *, - const blasint *, blasint *); +#include "stdlib.h" + +static void RELAPACK_dgbtrf_rec(const int *, const int *, const int *, + const int *, double *, const int *, int *, double *, const int *, double *, + const int *, int *); /** DGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges. @@ -13,9 +13,9 @@ static void RELAPACK_dgbtrf_rec(const blasint *, const blasint *, const blasint * http://www.netlib.org/lapack/explore-html/da/d87/dgbtrf_8f.html * */ void RELAPACK_dgbtrf( - const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, - double *Ab, const blasint *ldAb, blasint *ipiv, - blasint *info + const int *m, const int *n, const int *kl, const int *ku, + double *Ab, const int *ldAb, int *ipiv, + int *info ) { // Check arguments @@ -31,8 +31,8 @@ void RELAPACK_dgbtrf( else if (*ldAb < 2 * *kl + *ku + 1) *info = -6; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("DGBTRF", &minfo, strlen("DGBTRF")); + const int minfo = -*info; + LAPACK(xerbla)("DGBTRF", &minfo); return; } @@ -40,14 +40,14 @@ void RELAPACK_dgbtrf( const double ZERO[] = { 0. }; // Result upper band width - const blasint kv = *ku + *kl; + const int kv = *ku + *kl; // Unskew A - const blasint ldA[] = { *ldAb - 1 }; + const int ldA[] = { *ldAb - 1 }; double *const A = Ab + kv; // Zero upper diagonal fill-in elements - blasint i, j; + int i, j; for (j = 0; j < *n; j++) { double *const A_j = A + *ldA * j; for (i = MAX(0, j - kv); i < j - *ku; i++) @@ -55,12 +55,11 @@ void RELAPACK_dgbtrf( } // Allocate work space - const blasint n1 = DREC_SPLIT(*n); - const blasint mWorkl = abs( (kv > n1) ? MAX(1, *m - *kl) : kv); - const blasint nWorkl = abs( (kv > n1) ? n1 : kv); - const blasint mWorku = abs( (*kl > n1) ? n1 : *kl); -// const blasint nWorku = abs( (*kl > n1) ? MAX(0, *n - *kl) : *kl); - const blasint nWorku = abs( (*kl > n1) ? MAX(1, *n - *kl) : *kl); + const int n1 = DREC_SPLIT(*n); + const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; + const int nWorkl = (kv > n1) ? n1 : kv; + const int mWorku = (*kl > n1) ? n1 : *kl; + const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; double *Workl = malloc(mWorkl * nWorkl * sizeof(double)); double *Worku = malloc(mWorku * nWorku * sizeof(double)); LAPACK(dlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); @@ -77,10 +76,10 @@ void RELAPACK_dgbtrf( /** dgbtrf's recursive compute kernel */ static void RELAPACK_dgbtrf_rec( - const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, - double *Ab, const blasint *ldAb, blasint *ipiv, - double *Workl, const blasint *ldWorkl, double *Worku, const blasint *ldWorku, - blasint *info + const int *m, const int *n, const int *kl, const int *ku, + double *Ab, const int *ldAb, int *ipiv, + double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku, + int *info ) { if (*n <= MAX(CROSSOVER_DGBTRF, 1)) { @@ -92,25 +91,25 @@ static void RELAPACK_dgbtrf_rec( // Constants const double ONE[] = { 1. }; const double MONE[] = { -1. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; // Loop iterators - blasint i, j; + int i, j; // Output upper band width - const blasint kv = *ku + *kl; + const int kv = *ku + *kl; // Unskew A - const blasint ldA[] = { *ldAb - 1 }; + const int ldA[] = { *ldAb - 1 }; double *const A = Ab + kv; // Splitting - const blasint n1 = MIN(DREC_SPLIT(*n), *kl); - const blasint n2 = *n - n1; - const blasint m1 = MIN(n1, *m); - const blasint m2 = *m - m1; - const blasint mn1 = MIN(m1, n1); - const blasint mn2 = MIN(m2, n2); + const int n1 = MIN(DREC_SPLIT(*n), *kl); + const int n2 = *n - n1; + const int m1 = MIN(n1, *m); + const int m2 = *m - m1; + const int mn1 = MIN(m1, n1); + const int mn2 = MIN(m2, n2); // Ab_L * // Ab_BR @@ -130,14 +129,14 @@ static void RELAPACK_dgbtrf_rec( // ipiv_T // ipiv_B - blasint *const ipiv_T = ipiv; - blasint *const ipiv_B = ipiv + n1; + int *const ipiv_T = ipiv; + int *const ipiv_B = ipiv + n1; // Banded splitting - const blasint n21 = MIN(n2, kv - n1); - const blasint n22 = MIN(n2 - n21, n1); - const blasint m21 = MIN(m2, *kl - m1); - const blasint m22 = MIN(m2 - m21, m1); + const int n21 = MIN(n2, kv - n1); + const int n22 = MIN(n2 - n21, n1); + const int m21 = MIN(m2, *kl - m1); + const int m22 = MIN(m2 - m21, m1); // n1 n21 n22 // m * A_Rl ARr @@ -165,7 +164,7 @@ static void RELAPACK_dgbtrf_rec( // partially redo swaps in A_L for (i = 0; i < mn1; i++) { - const blasint ip = ipiv_T[i] - 1; + const int ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA); @@ -181,7 +180,7 @@ static void RELAPACK_dgbtrf_rec( for (j = 0; j < n22; j++) { double *const A_Rrj = A_Rr + *ldA * j; for (i = j; i < mn1; i++) { - const blasint ip = ipiv_T[i] - 1; + const int ip = ipiv_T[i] - 1; if (ip != i) { const double tmp = A_Rrj[i]; A_Rrj[i] = A_Rr[ip]; @@ -209,7 +208,7 @@ static void RELAPACK_dgbtrf_rec( // partially undo swaps in A_L for (i = mn1 - 1; i >= 0; i--) { - const blasint ip = ipiv_T[i] - 1; + const int ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(dswap)(&i, A_L + i, ldA, A_L + ip, ldA); @@ -219,8 +218,7 @@ static void RELAPACK_dgbtrf_rec( } // recursion(Ab_BR, ipiv_B) -// RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); - LAPACK(dgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); + RELAPACK_dgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); if (*info) *info += n1; // shift pivots diff --git a/relapack/src/dgemmt.c b/relapack/src/dgemmt.c index 1ceab6c37..9c925b586 100644 --- a/relapack/src/dgemmt.c +++ b/relapack/src/dgemmt.c @@ -1,12 +1,12 @@ #include "relapack.h" static void RELAPACK_dgemmt_rec(const char *, const char *, const char *, - const blasint *, const blasint *, const double *, const double *, const blasint *, - const double *, const blasint *, const double *, double *, const blasint *); + const int *, const int *, const double *, const double *, const int *, + const double *, const int *, const double *, double *, const int *); static void RELAPACK_dgemmt_rec2(const char *, const char *, const char *, - const blasint *, const blasint *, const double *, const double *, const blasint *, - const double *, const blasint *, const double *, double *, const blasint *); + const int *, const int *, const double *, const double *, const int *, + const double *, const int *, const double *, double *, const int *); /** DGEMMT computes a matrix-matrix product with general matrices but updates @@ -20,10 +20,10 @@ static void RELAPACK_dgemmt_rec2(const char *, const char *, const char *, * */ void RELAPACK_dgemmt( const char *uplo, const char *transA, const char *transB, - const blasint *n, const blasint *k, - const double *alpha, const double *A, const blasint *ldA, - const double *B, const blasint *ldB, - const double *beta, double *C, const blasint *ldC + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC ) { #if HAVE_XGEMMT @@ -32,13 +32,13 @@ void RELAPACK_dgemmt( #else // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); - const blasint notransA = LAPACK(lsame)(transA, "N"); - const blasint tranA = LAPACK(lsame)(transA, "T"); - const blasint notransB = LAPACK(lsame)(transB, "N"); - const blasint tranB = LAPACK(lsame)(transB, "T"); - blasint info = 0; + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + const int notransA = LAPACK(lsame)(transA, "N"); + const int tranA = LAPACK(lsame)(transA, "T"); + const int notransB = LAPACK(lsame)(transB, "N"); + const int tranB = LAPACK(lsame)(transB, "T"); + int info = 0; if (!lower && !upper) info = 1; else if (!tranA && !notransA) @@ -56,7 +56,7 @@ void RELAPACK_dgemmt( else if (*ldC < MAX(1, *n)) info = 13; if (info) { - LAPACK(xerbla)("DGEMMT", &info, strlen("DGEMMT")); + LAPACK(xerbla)("DGEMMT", &info); return; } @@ -74,10 +74,10 @@ void RELAPACK_dgemmt( /** dgemmt's recursive compute kernel */ static void RELAPACK_dgemmt_rec( const char *uplo, const char *transA, const char *transB, - const blasint *n, const blasint *k, - const double *alpha, const double *A, const blasint *ldA, - const double *B, const blasint *ldB, - const double *beta, double *C, const blasint *ldC + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC ) { if (*n <= MAX(CROSSOVER_DGEMMT, 1)) { @@ -87,8 +87,8 @@ static void RELAPACK_dgemmt_rec( } // Splitting - const blasint n1 = DREC_SPLIT(*n); - const blasint n2 = *n - n1; + const int n1 = DREC_SPLIT(*n); + const int n2 = *n - n1; // A_T // A_B @@ -124,16 +124,16 @@ static void RELAPACK_dgemmt_rec( /** dgemmt's unblocked compute kernel */ static void RELAPACK_dgemmt_rec2( const char *uplo, const char *transA, const char *transB, - const blasint *n, const blasint *k, - const double *alpha, const double *A, const blasint *ldA, - const double *B, const blasint *ldB, - const double *beta, double *C, const blasint *ldC + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC ) { - const blasint incB = (*transB == 'N') ? 1 : *ldB; - const blasint incC = 1; + const int incB = (*transB == 'N') ? 1 : *ldB; + const int incC = 1; - blasint i; + int i; for (i = 0; i < *n; i++) { // A_0 // A_i @@ -149,13 +149,13 @@ static void RELAPACK_dgemmt_rec2( double *const C_ii = C + *ldC * i + i; if (*uplo == 'L') { - const blasint nmi = *n - i; + const int nmi = *n - i; if (*transA == 'N') BLAS(dgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); else BLAS(dgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); } else { - const blasint ip1 = i + 1; + const int ip1 = i + 1; if (*transA == 'N') BLAS(dgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); else diff --git a/relapack/src/dgetrf.c b/relapack/src/dgetrf.c index be960fde9..07f5472fd 100644 --- a/relapack/src/dgetrf.c +++ b/relapack/src/dgetrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_dgetrf_rec(const blasint *, const blasint *, double *, - const blasint *, blasint *, blasint *); +static void RELAPACK_dgetrf_rec(const int *, const int *, double *, + const int *, int *, int *); /** DGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. @@ -11,25 +11,26 @@ static void RELAPACK_dgetrf_rec(const blasint *, const blasint *, double *, * http://www.netlib.org/lapack/explore-html/d3/d6a/dgetrf_8f.html * */ void RELAPACK_dgetrf( - const blasint *m, const blasint *n, - double *A, const blasint *ldA, blasint *ipiv, - blasint *info + const int *m, const int *n, + double *A, const int *ldA, int *ipiv, + int *info ) { + // Check arguments *info = 0; if (*m < 0) *info = -1; else if (*n < 0) *info = -2; - else if (*ldA < MAX(1, *m)) + else if (*ldA < MAX(1, *n)) *info = -4; - if (*info!=0) { - const blasint minfo = -*info; - LAPACK(xerbla)("DGETRF", &minfo, strlen("DGETRF")); + if (*info) { + const int minfo = -*info; + LAPACK(xerbla)("DGETRF", &minfo); return; } - const blasint sn = MIN(*m, *n); + const int sn = MIN(*m, *n); RELAPACK_dgetrf_rec(m, &sn, A, ldA, ipiv, info); @@ -37,10 +38,10 @@ void RELAPACK_dgetrf( if (*m < *n) { // Constants const double ONE[] = { 1. }; - const blasint iONE[] = { 1. }; + const int iONE[] = { 1. }; // Splitting - const blasint rn = *n - *m; + const int rn = *n - *m; // A_L A_R const double *const A_L = A; @@ -56,9 +57,9 @@ void RELAPACK_dgetrf( /** dgetrf's recursive compute kernel */ static void RELAPACK_dgetrf_rec( - const blasint *m, const blasint *n, - double *A, const blasint *ldA, blasint *ipiv, - blasint *info + const int *m, const int *n, + double *A, const int *ldA, int *ipiv, + int *info ) { if (*n <= MAX(CROSSOVER_DGETRF, 1)) { @@ -70,12 +71,12 @@ static void RELAPACK_dgetrf_rec( // Constants const double ONE[] = { 1. }; const double MONE[] = { -1. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; // Splitting - const blasint n1 = DREC_SPLIT(*n); - const blasint n2 = *n - n1; - const blasint m2 = *m - n1; + const int n1 = DREC_SPLIT(*n); + const int n2 = *n - n1; + const int m2 = *m - n1; // A_L A_R double *const A_L = A; @@ -90,8 +91,8 @@ static void RELAPACK_dgetrf_rec( // ipiv_T // ipiv_B - blasint *const ipiv_T = ipiv; - blasint *const ipiv_B = ipiv + n1; + int *const ipiv_T = ipiv; + int *const ipiv_B = ipiv + n1; // recursion(A_L, ipiv_T) RELAPACK_dgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); @@ -110,7 +111,7 @@ static void RELAPACK_dgetrf_rec( // apply pivots to A_BL LAPACK(dlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); // shift pivots - blasint i; + int i; for (i = 0; i < n2; i++) ipiv_B[i] += n1; } diff --git a/relapack/src/dlauum.c b/relapack/src/dlauum.c index 6c7dcccb3..d722ea809 100644 --- a/relapack/src/dlauum.c +++ b/relapack/src/dlauum.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_dlauum_rec(const char *, const blasint *, double *, - const blasint *, blasint *); +static void RELAPACK_dlauum_rec(const char *, const int *, double *, + const int *, int *); /** DLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. @@ -11,14 +11,14 @@ static void RELAPACK_dlauum_rec(const char *, const blasint *, double *, * http://www.netlib.org/lapack/explore-html/d0/dc2/dlauum_8f.html * */ void RELAPACK_dlauum( - const char *uplo, const blasint *n, - double *A, const blasint *ldA, - blasint *info + const char *uplo, const int *n, + double *A, const int *ldA, + int *info ) { // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_dlauum( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("DLAUUM", &minfo, strlen("DLAUUM")); + const int minfo = -*info; + LAPACK(xerbla)("DLAUUM", &minfo); return; } @@ -42,9 +42,9 @@ void RELAPACK_dlauum( /** dlauum's recursive compute kernel */ static void RELAPACK_dlauum_rec( - const char *uplo, const blasint *n, - double *A, const blasint *ldA, - blasint *info + const char *uplo, const int *n, + double *A, const int *ldA, + int *info ) { if (*n <= MAX(CROSSOVER_DLAUUM, 1)) { @@ -57,8 +57,8 @@ static void RELAPACK_dlauum_rec( const double ONE[] = { 1. }; // Splitting - const blasint n1 = DREC_SPLIT(*n); - const blasint n2 = *n - n1; + const int n1 = DREC_SPLIT(*n); + const int n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/dpbtrf.c b/relapack/src/dpbtrf.c index 9380b28ad..6fd0ebe48 100644 --- a/relapack/src/dpbtrf.c +++ b/relapack/src/dpbtrf.c @@ -1,8 +1,8 @@ #include "relapack.h" #include "stdlib.h" -static void RELAPACK_dpbtrf_rec(const char *, const blasint *, const blasint *, - double *, const blasint *, double *, const blasint *, blasint *); +static void RELAPACK_dpbtrf_rec(const char *, const int *, const int *, + double *, const int *, double *, const int *, int *); /** DPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A. @@ -12,14 +12,14 @@ static void RELAPACK_dpbtrf_rec(const char *, const blasint *, const blasint *, * http://www.netlib.org/lapack/explore-html/df/da9/dpbtrf_8f.html * */ void RELAPACK_dpbtrf( - const char *uplo, const blasint *n, const blasint *kd, - double *Ab, const blasint *ldAb, - blasint *info + const char *uplo, const int *n, const int *kd, + double *Ab, const int *ldAb, + int *info ) { // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -30,8 +30,8 @@ void RELAPACK_dpbtrf( else if (*ldAb < *kd + 1) *info = -5; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("DPBTRF", &minfo, strlen("DPBTRF")); + const int minfo = -*info; + LAPACK(xerbla)("DPBTRF", &minfo); return; } @@ -42,9 +42,9 @@ void RELAPACK_dpbtrf( const double ZERO[] = { 0. }; // Allocate work space - const blasint n1 = DREC_SPLIT(*n); - const blasint mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; - const blasint nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + const int n1 = DREC_SPLIT(*n); + const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; double *Work = malloc(mWork * nWork * sizeof(double)); LAPACK(dlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); @@ -58,10 +58,10 @@ void RELAPACK_dpbtrf( /** dpbtrf's recursive compute kernel */ static void RELAPACK_dpbtrf_rec( - const char *uplo, const blasint *n, const blasint *kd, - double *Ab, const blasint *ldAb, - double *Work, const blasint *ldWork, - blasint *info + const char *uplo, const int *n, const int *kd, + double *Ab, const int *ldAb, + double *Work, const int *ldWork, + int *info ){ if (*n <= MAX(CROSSOVER_DPBTRF, 1)) { @@ -75,12 +75,12 @@ static void RELAPACK_dpbtrf_rec( const double MONE[] = { -1. }; // Unskew A - const blasint ldA[] = { *ldAb - 1 }; + const int ldA[] = { *ldAb - 1 }; double *const A = Ab + ((*uplo == 'L') ? 0 : *kd); // Splitting - const blasint n1 = MIN(DREC_SPLIT(*n), *kd); - const blasint n2 = *n - n1; + const int n1 = MIN(DREC_SPLIT(*n), *kd); + const int n2 = *n - n1; // * * // * Ab_BR @@ -99,8 +99,8 @@ static void RELAPACK_dpbtrf_rec( return; // Banded splitting - const blasint n21 = MIN(n2, *kd - n1); - const blasint n22 = MIN(n2 - n21, n1); + const int n21 = MIN(n2, *kd - n1); + const int n22 = MIN(n2 - n21, n1); // n1 n21 n22 // n1 * A_TRl A_TRr diff --git a/relapack/src/dpotrf.c b/relapack/src/dpotrf.c index cf326b18f..c14fb3d71 100644 --- a/relapack/src/dpotrf.c +++ b/relapack/src/dpotrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_dpotrf_rec(const char *, const blasint *, double *, - const blasint *, blasint *); +static void RELAPACK_dpotrf_rec(const char *, const int *, double *, + const int *, int *); /** DPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A. @@ -11,14 +11,14 @@ static void RELAPACK_dpotrf_rec(const char *, const blasint *, double *, * http://www.netlib.org/lapack/explore-html/d0/d8a/dpotrf_8f.html * */ void RELAPACK_dpotrf( - const char *uplo, const blasint *n, - double *A, const blasint *ldA, - blasint *info + const char *uplo, const int *n, + double *A, const int *ldA, + int *info ) { // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_dpotrf( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("DPOTRF", &minfo, strlen("DPOTRF")); + const int minfo = -*info; + LAPACK(xerbla)("DPOTRF", &minfo); return; } @@ -42,9 +42,9 @@ void RELAPACK_dpotrf( /** dpotrf's recursive compute kernel */ static void RELAPACK_dpotrf_rec( - const char *uplo, const blasint *n, - double *A, const blasint *ldA, - blasint *info + const char *uplo, const int *n, + double *A, const int *ldA, + int *info ){ if (*n <= MAX(CROSSOVER_DPOTRF, 1)) { @@ -58,8 +58,8 @@ static void RELAPACK_dpotrf_rec( const double MONE[] = { -1. }; // Splitting - const blasint n1 = DREC_SPLIT(*n); - const blasint n2 = *n - n1; + const int n1 = DREC_SPLIT(*n); + const int n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/dsygst.c b/relapack/src/dsygst.c index f68241e3a..0228068ce 100644 --- a/relapack/src/dsygst.c +++ b/relapack/src/dsygst.c @@ -3,9 +3,9 @@ #include "stdlib.h" #endif -static void RELAPACK_dsygst_rec(const blasint *, const char *, const blasint *, - double *, const blasint *, const double *, const blasint *, - double *, const blasint *, blasint *); +static void RELAPACK_dsygst_rec(const int *, const char *, const int *, + double *, const int *, const double *, const int *, + double *, const int *, int *); /** DSYGST reduces a real symmetric-definite generalized eigenproblem to standard form. @@ -15,14 +15,14 @@ static void RELAPACK_dsygst_rec(const blasint *, const char *, const blasint *, * http://www.netlib.org/lapack/explore-html/dc/d04/dsygst_8f.html * */ void RELAPACK_dsygst( - const blasint *itype, const char *uplo, const blasint *n, - double *A, const blasint *ldA, const double *B, const blasint *ldB, - blasint *info + const int *itype, const char *uplo, const int *n, + double *A, const int *ldA, const double *B, const int *ldB, + int *info ) { // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (*itype < 1 || *itype > 3) *info = -1; @@ -35,8 +35,8 @@ void RELAPACK_dsygst( else if (*ldB < MAX(1, *n)) *info = -7; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("DSYGST", &minfo, strlen("DSYGST")); + const int minfo = -*info; + LAPACK(xerbla)("DSYGST", &minfo); return; } @@ -45,10 +45,10 @@ void RELAPACK_dsygst( // Allocate work space double *Work = NULL; - blasint lWork = 0; + int lWork = 0; #if XSYGST_ALLOW_MALLOC - const blasint n1 = DREC_SPLIT(*n); - lWork = abs( n1 * (*n - n1) ); + const int n1 = DREC_SPLIT(*n); + lWork = n1 * (*n - n1); Work = malloc(lWork * sizeof(double)); if (!Work) lWork = 0; @@ -67,9 +67,9 @@ void RELAPACK_dsygst( /** dsygst's recursive compute kernel */ static void RELAPACK_dsygst_rec( - const blasint *itype, const char *uplo, const blasint *n, - double *A, const blasint *ldA, const double *B, const blasint *ldB, - double *Work, const blasint *lWork, blasint *info + const int *itype, const char *uplo, const int *n, + double *A, const int *ldA, const double *B, const int *ldB, + double *Work, const int *lWork, int *info ) { if (*n <= MAX(CROSSOVER_SSYGST, 1)) { @@ -84,14 +84,14 @@ static void RELAPACK_dsygst_rec( const double MONE[] = { -1. }; const double HALF[] = { .5 }; const double MHALF[] = { -.5 }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; // Loop iterator - blasint i; + int i; // Splitting - const blasint n1 = DREC_SPLIT(*n); - const blasint n2 = *n - n1; + const int n1 = DREC_SPLIT(*n); + const int n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/dsytrf.c b/relapack/src/dsytrf.c index 43d28f94e..80b119336 100644 --- a/relapack/src/dsytrf.c +++ b/relapack/src/dsytrf.c @@ -3,8 +3,8 @@ #include #endif -static void RELAPACK_dsytrf_rec(const char *, const blasint *, const blasint *, blasint *, - double *, const blasint *, blasint *, double *, const blasint *, blasint *); +static void RELAPACK_dsytrf_rec(const char *, const int *, const int *, int *, + double *, const int *, int *, double *, const int *, int *); /** DSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_dsytrf_rec(const char *, const blasint *, const blasint *, * http://www.netlib.org/lapack/explore-html/dd/df4/dsytrf_8f.html * */ void RELAPACK_dsytrf( - const char *uplo, const blasint *n, - double *A, const blasint *ldA, blasint *ipiv, - double *Work, const blasint *lWork, blasint *info + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info ) { // Required work size - const blasint cleanlWork = *n * (*n / 2); - blasint minlWork = cleanlWork; + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_dsytrf( #endif if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("DSYTRF", &minfo, strlen("DSYTRF")); + const int minfo = -*info; + LAPACK(xerbla)("DSYTRF", &minfo); return; } @@ -64,7 +64,7 @@ void RELAPACK_dsytrf( const char cleanuplo = lower ? 'L' : 'U'; // Dummy arguments - blasint nout; + int nout; // Recursive kernel RELAPACK_dsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_dsytrf( /** dsytrf's recursive compute kernel */ static void RELAPACK_dsytrf_rec( - const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, - double *A, const blasint *ldA, blasint *ipiv, - double *Work, const blasint *ldWork, blasint *info + const char *uplo, const int *n_full, const int *n, int *n_out, + double *A, const int *ldA, int *ipiv, + double *Work, const int *ldWork, int *info ) { // top recursion level? - const blasint top = *n_full == *n; + const int top = *n_full == *n; if (*n <= MAX(CROSSOVER_DSYTRF, 3)) { // Unblocked @@ -96,34 +96,34 @@ static void RELAPACK_dsytrf_rec( return; } - blasint info1, info2; + int info1, info2; // Constants const double ONE[] = { 1. }; const double MONE[] = { -1. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; // Loop iterator - blasint i; + int i; - const blasint n_rest = *n_full - *n; + const int n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - blasint n1 = DREC_SPLIT(*n); - blasint n2 = *n - n1; + int n1 = DREC_SPLIT(*n); + int n2 = *n - n1; // Work_L * double *const Work_L = Work; // recursion(A_L) - blasint n1_out; + int n1_out; RELAPACK_dsytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const blasint n_full2 = *n_full - n1; + const int n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -139,23 +139,23 @@ static void RELAPACK_dsytrf_rec( // (top recursion level: use Work as Work_BR) double *const Work_BL = Work + n1; double *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; - const blasint ldWork_BR = top ? n2 : *ldWork; + const int ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - blasint *const ipiv_B = ipiv + n1; + int *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_dgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(dgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - blasint n2_out; + int n2_out; RELAPACK_dsytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const blasint n_restp1 = n_rest + 1; + const int n_restp1 = n_rest + 1; // last column of A_BR double *const A_BR_r = A_BR + *ldA * n2_out + n2_out; @@ -182,22 +182,22 @@ static void RELAPACK_dsytrf_rec( *n_out = n1 + n2; } else { // Splitting (setup) - blasint n2 = DREC_SPLIT(*n); - blasint n1 = *n - n2; + int n2 = DREC_SPLIT(*n); + int n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) double *const Work_R = top ? Work : Work + *ldWork * n1; // recursion(A_R) - blasint n2_out; + int n2_out; RELAPACK_dsytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const blasint n2_diff = n2 - n2_out; + const int n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const blasint n_full1 = *n_full - n2; + const int n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -213,19 +213,19 @@ static void RELAPACK_dsytrf_rec( // (top recursion level: Work_R was Work) double *const Work_L = Work; double *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; - const blasint ldWork_L = top ? n1 : *ldWork; + const int ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_dgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(dgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - blasint n1_out; + int n1_out; RELAPACK_dsytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const blasint n_restp1 = n_rest + 1; + const int n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(dgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/dsytrf_rec2.c b/relapack/src/dsytrf_rec2.c index 6ed1a47a2..72ef827b1 100644 --- a/relapack/src/dsytrf_rec2.c +++ b/relapack/src/dsytrf_rec2.c @@ -14,7 +14,7 @@ /* Table of constant values */ -static blasint c__1 = 1; +static int c__1 = 1; static double c_b8 = -1.; static double c_b9 = 1.; @@ -25,33 +25,33 @@ static double c_b9 = 1.; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. * */ -/* Subroutine */ void RELAPACK_dsytrf_rec2(char *uplo, blasint *n, blasint * - nb, blasint *kb, double *a, blasint *lda, blasint *ipiv, - double *w, blasint *ldw, blasint *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_dsytrf_rec2(char *uplo, int *n, int * + nb, int *kb, double *a, int *lda, int *ipiv, + double *w, int *ldw, int *info, ftnlen uplo_len) { /* System generated locals */ - blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; double d__1, d__2, d__3; /* Builtin functions */ double sqrt(double); /* Local variables */ - static blasint j, k; + static int j, k; static double t, r1, d11, d21, d22; - static blasint jj, kk, jp, kp, kw, kkw, imax, jmax; + static int jj, kk, jp, kp, kw, kkw, imax, jmax; static double alpha; - extern /* Subroutine */ blasint dscal_(int *, double *, double *, - blasint *); + extern /* Subroutine */ int dscal_(int *, double *, double *, + int *); extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ blasint dgemv_(char *, blasint *, blasint *, - double *, double *, blasint *, double *, blasint *, - double *, double *, blasint *, ftnlen), dcopy_(int *, - double *, blasint *, double *, blasint *), dswap_(int - *, double *, blasint *, double *, blasint *); - static blasint kstep; + extern /* Subroutine */ int dgemv_(char *, int *, int *, + double *, double *, int *, double *, int *, + double *, double *, int *, ftnlen), dcopy_(int *, + double *, int *, double *, int *), dswap_(int + *, double *, int *, double *, int *); + static int kstep; static double absakk; - extern blasint idamax_(int *, double *, blasint *); + extern int idamax_(int *, double *, int *); static double colmax, rowmax; /* Parameter adjustments */ diff --git a/relapack/src/dsytrf_rook.c b/relapack/src/dsytrf_rook.c index 78fa652ab..19a875c7a 100644 --- a/relapack/src/dsytrf_rook.c +++ b/relapack/src/dsytrf_rook.c @@ -3,8 +3,8 @@ #include #endif -static void RELAPACK_dsytrf_rook_rec(const char *, const blasint *, const blasint *, blasint *, - double *, const blasint *, blasint *, double *, const blasint *, blasint *); +static void RELAPACK_dsytrf_rook_rec(const char *, const int *, const int *, int *, + double *, const int *, int *, double *, const int *, int *); /** DSYTRF_ROOK computes the factorization of a real symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_dsytrf_rook_rec(const char *, const blasint *, const blasin * http://www.netlib.org/lapack/explore-html/db/df4/dsytrf__rook_8f.html * */ void RELAPACK_dsytrf_rook( - const char *uplo, const blasint *n, - double *A, const blasint *ldA, blasint *ipiv, - double *Work, const blasint *lWork, blasint *info + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info ) { // Required work size - const blasint cleanlWork = *n * (*n / 2); - blasint minlWork = cleanlWork; + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_dsytrf_rook( #endif if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("DSYTRF", &minfo, strlen("DSYTRF")); + const int minfo = -*info; + LAPACK(xerbla)("DSYTRF", &minfo); return; } @@ -64,7 +64,7 @@ void RELAPACK_dsytrf_rook( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - blasint nout; + int nout; // Recursive kernel RELAPACK_dsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_dsytrf_rook( /** dsytrf_rook's recursive compute kernel */ static void RELAPACK_dsytrf_rook_rec( - const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, - double *A, const blasint *ldA, blasint *ipiv, - double *Work, const blasint *ldWork, blasint *info + const char *uplo, const int *n_full, const int *n, int *n_out, + double *A, const int *ldA, int *ipiv, + double *Work, const int *ldWork, int *info ) { // top recursion level? - const blasint top = *n_full == *n; + const int top = *n_full == *n; if (*n <= MAX(CROSSOVER_DSYTRF_ROOK, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_dsytrf_rook_rec( return; } - blasint info1, info2; + int info1, info2; // Constants const double ONE[] = { 1. }; const double MONE[] = { -1. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; - const blasint n_rest = *n_full - *n; + const int n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - blasint n1 = DREC_SPLIT(*n); - blasint n2 = *n - n1; + int n1 = DREC_SPLIT(*n); + int n2 = *n - n1; // Work_L * double *const Work_L = Work; // recursion(A_L) - blasint n1_out; + int n1_out; RELAPACK_dsytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const blasint n_full2 = *n_full - n1; + const int n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_dsytrf_rook_rec( // (top recursion level: use Work as Work_BR) double *const Work_BL = Work + n1; double *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; - const blasint ldWork_BR = top ? n2 : *ldWork; + const int ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - blasint *const ipiv_B = ipiv + n1; + int *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_dgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(dgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - blasint n2_out; + int n2_out; RELAPACK_dsytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const blasint n_restp1 = n_rest + 1; + const int n_restp1 = n_rest + 1; // last column of A_BR double *const A_BR_r = A_BR + *ldA * n2_out + n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_dsytrf_rook_rec( n2 = n2_out; // shift pivots - blasint i; + int i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_dsytrf_rook_rec( *n_out = n1 + n2; } else { // Splitting (setup) - blasint n2 = DREC_SPLIT(*n); - blasint n1 = *n - n2; + int n2 = DREC_SPLIT(*n); + int n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) double *const Work_R = top ? Work : Work + *ldWork * n1; // recursion(A_R) - blasint n2_out; + int n2_out; RELAPACK_dsytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const blasint n2_diff = n2 - n2_out; + const int n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const blasint n_full1 = *n_full - n2; + const int n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_dsytrf_rook_rec( // (top recursion level: Work_R was Work) double *const Work_L = Work; double *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; - const blasint ldWork_L = top ? n1 : *ldWork; + const int ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_dgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(dgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - blasint n1_out; + int n1_out; RELAPACK_dsytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const blasint n_restp1 = n_rest + 1; + const int n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(dgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/dsytrf_rook_rec2.c b/relapack/src/dsytrf_rook_rec2.c index bdb5c6e29..105ef5ed3 100644 --- a/relapack/src/dsytrf_rook_rec2.c +++ b/relapack/src/dsytrf_rook_rec2.c @@ -14,7 +14,7 @@ /* Table of constant values */ -static blasint c__1 = 1; +static int c__1 = 1; static double c_b9 = -1.; static double c_b10 = 1.; @@ -25,39 +25,39 @@ static double c_b10 = 1.; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. * */ -/* Subroutine */ void RELAPACK_dsytrf_rook_rec2(char *uplo, blasint *n, - int *nb, blasint *kb, double *a, blasint *lda, blasint *ipiv, - double *w, blasint *ldw, blasint *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_dsytrf_rook_rec2(char *uplo, int *n, + int *nb, int *kb, double *a, int *lda, int *ipiv, + double *w, int *ldw, int *info, ftnlen uplo_len) { /* System generated locals */ - blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; double d__1; /* Builtin functions */ double sqrt(double); /* Local variables */ - static blasint j, k, p; + static int j, k, p; static double t, r1, d11, d12, d21, d22; - static blasint ii, jj, kk, kp, kw, jp1, jp2, kkw; + static int ii, jj, kk, kp, kw, jp1, jp2, kkw; static logical done; - static blasint imax, jmax; + static int imax, jmax; static double alpha; - extern /* Subroutine */ blasint dscal_(int *, double *, double *, - blasint *); + extern /* Subroutine */ int dscal_(int *, double *, double *, + int *); extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ blasint dgemv_(char *, blasint *, blasint *, - double *, double *, blasint *, double *, blasint *, - double *, double *, blasint *, ftnlen); + extern /* Subroutine */ int dgemv_(char *, int *, int *, + double *, double *, int *, double *, int *, + double *, double *, int *, ftnlen); static double dtemp, sfmin; - static blasint itemp; - extern /* Subroutine */ blasint dcopy_(int *, double *, blasint *, - double *, blasint *), dswap_(int *, double *, int - *, double *, blasint *); - static blasint kstep; + static int itemp; + extern /* Subroutine */ int dcopy_(int *, double *, int *, + double *, int *), dswap_(int *, double *, int + *, double *, int *); + static int kstep; extern double dlamch_(char *, ftnlen); static double absakk; - extern blasint idamax_(int *, double *, blasint *); + extern int idamax_(int *, double *, int *); static double colmax, rowmax; /* Parameter adjustments */ diff --git a/relapack/src/dtgsyl.c b/relapack/src/dtgsyl.c index 9bbc987e7..c506926af 100644 --- a/relapack/src/dtgsyl.c +++ b/relapack/src/dtgsyl.c @@ -1,11 +1,11 @@ #include "relapack.h" #include -static void RELAPACK_dtgsyl_rec(const char *, const blasint *, const blasint *, - const blasint *, const double *, const blasint *, const double *, const blasint *, - double *, const blasint *, const double *, const blasint *, const double *, - const blasint *, double *, const blasint *, double *, double *, double *, blasint *, - blasint *, blasint *); +static void RELAPACK_dtgsyl_rec(const char *, const int *, const int *, + const int *, const double *, const int *, const double *, const int *, + double *, const int *, const double *, const int *, const double *, + const int *, double *, const int *, double *, double *, double *, int *, + int *, int *); /** DTGSYL solves the generalized Sylvester equation. @@ -15,21 +15,21 @@ static void RELAPACK_dtgsyl_rec(const char *, const blasint *, const blasint *, * http://www.netlib.org/lapack/explore-html/db/d88/dtgsyl_8f.html * */ void RELAPACK_dtgsyl( - const char *trans, const blasint *ijob, const blasint *m, const blasint *n, - const double *A, const blasint *ldA, const double *B, const blasint *ldB, - double *C, const blasint *ldC, - const double *D, const blasint *ldD, const double *E, const blasint *ldE, - double *F, const blasint *ldF, + const char *trans, const int *ijob, const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, + const double *D, const int *ldD, const double *E, const int *ldE, + double *F, const int *ldF, double *scale, double *dif, - double *Work, const blasint *lWork, blasint *iWork, blasint *info + double *Work, const int *lWork, int *iWork, int *info ) { // Parse arguments - const blasint notran = LAPACK(lsame)(trans, "N"); - const blasint tran = LAPACK(lsame)(trans, "T"); + const int notran = LAPACK(lsame)(trans, "N"); + const int tran = LAPACK(lsame)(trans, "T"); // Compute work buffer size - blasint lwmin = 1; + int lwmin = 1; if (notran && (*ijob == 1 || *ijob == 2)) lwmin = MAX(1, 2 * *m * *n); *info = 0; @@ -58,8 +58,8 @@ void RELAPACK_dtgsyl( else if (*lWork < lwmin && *lWork != -1) *info = -20; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("DTGSYL", &minfo, strlen("DTGSYL")); + const int minfo = -*info; + LAPACK(xerbla)("DTGSYL", &minfo); return; } @@ -75,8 +75,8 @@ void RELAPACK_dtgsyl( // Constant const double ZERO[] = { 0. }; - blasint isolve = 1; - blasint ifunc = 0; + int isolve = 1; + int ifunc = 0; if (notran) { if (*ijob >= 3) { ifunc = *ijob - 2; @@ -87,12 +87,12 @@ void RELAPACK_dtgsyl( } double scale2; - blasint iround; + int iround; for (iround = 1; iround <= isolve; iround++) { *scale = 1; double dscale = 0; double dsum = 1; - blasint pq; + int pq; RELAPACK_dtgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, iWork, &pq, info); if (dscale != 0) { if (*ijob == 1 || *ijob == 3) @@ -121,13 +121,13 @@ void RELAPACK_dtgsyl( /** dtgsyl's recursive vompute kernel */ static void RELAPACK_dtgsyl_rec( - const char *trans, const blasint *ifunc, const blasint *m, const blasint *n, - const double *A, const blasint *ldA, const double *B, const blasint *ldB, - double *C, const blasint *ldC, - const double *D, const blasint *ldD, const double *E, const blasint *ldE, - double *F, const blasint *ldF, + const char *trans, const int *ifunc, const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, + const double *D, const int *ldD, const double *E, const int *ldE, + double *F, const int *ldF, double *scale, double *dsum, double *dscale, - blasint *iWork, blasint *pq, blasint *info + int *iWork, int *pq, int *info ) { if (*m <= MAX(CROSSOVER_DTGSYL, 1) && *n <= MAX(CROSSOVER_DTGSYL, 1)) { @@ -139,20 +139,20 @@ static void RELAPACK_dtgsyl_rec( // Constants const double ONE[] = { 1. }; const double MONE[] = { -1. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; // Outputs double scale1[] = { 1. }; double scale2[] = { 1. }; - blasint info1[] = { 0 }; - blasint info2[] = { 0 }; + int info1[] = { 0 }; + int info2[] = { 0 }; if (*m > *n) { // Splitting - blasint m1 = DREC_SPLIT(*m); + int m1 = DREC_SPLIT(*m); if (A[m1 + *ldA * (m1 - 1)]) m1++; - const blasint m2 = *m - m1; + const int m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -210,10 +210,10 @@ static void RELAPACK_dtgsyl_rec( } } else { // Splitting - blasint n1 = DREC_SPLIT(*n); + int n1 = DREC_SPLIT(*n); if (B[n1 + *ldB * (n1 - 1)]) n1++; - const blasint n2 = *n - n1; + const int n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/dtrsyl.c b/relapack/src/dtrsyl.c index 766377300..c87b53ae5 100644 --- a/relapack/src/dtrsyl.c +++ b/relapack/src/dtrsyl.c @@ -1,8 +1,8 @@ #include "relapack.h" -static void RELAPACK_dtrsyl_rec(const char *, const char *, const blasint *, - const blasint *, const blasint *, const double *, const blasint *, const double *, - const blasint *, double *, const blasint *, double *, blasint *); +static void RELAPACK_dtrsyl_rec(const char *, const char *, const int *, + const int *, const int *, const double *, const int *, const double *, + const int *, double *, const int *, double *, int *); /** DTRSYL solves the real Sylvester matrix equation. @@ -12,20 +12,20 @@ static void RELAPACK_dtrsyl_rec(const char *, const char *, const blasint *, * http://www.netlib.org/lapack/explore-html/d6/d43/dtrsyl_8f.html * */ void RELAPACK_dtrsyl( - const char *tranA, const char *tranB, const blasint *isgn, - const blasint *m, const blasint *n, - const double *A, const blasint *ldA, const double *B, const blasint *ldB, - double *C, const blasint *ldC, double *scale, - blasint *info + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, double *scale, + int *info ) { // Check arguments - const blasint notransA = LAPACK(lsame)(tranA, "N"); - const blasint transA = LAPACK(lsame)(tranA, "T"); - const blasint ctransA = LAPACK(lsame)(tranA, "C"); - const blasint notransB = LAPACK(lsame)(tranB, "N"); - const blasint transB = LAPACK(lsame)(tranB, "T"); - const blasint ctransB = LAPACK(lsame)(tranB, "C"); + const int notransA = LAPACK(lsame)(tranA, "N"); + const int transA = LAPACK(lsame)(tranA, "T"); + const int ctransA = LAPACK(lsame)(tranA, "C"); + const int notransB = LAPACK(lsame)(tranB, "N"); + const int transB = LAPACK(lsame)(tranB, "T"); + const int ctransB = LAPACK(lsame)(tranB, "C"); *info = 0; if (!transA && !ctransA && !notransA) *info = -1; @@ -44,8 +44,8 @@ void RELAPACK_dtrsyl( else if (*ldC < MAX(1, *m)) *info = -11; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("DTRSYL", &minfo, strlen("DTRSYL")); + const int minfo = -*info; + LAPACK(xerbla)("DTRSYL", &minfo); return; } @@ -60,11 +60,11 @@ void RELAPACK_dtrsyl( /** dtrsyl's recursive compute kernel */ static void RELAPACK_dtrsyl_rec( - const char *tranA, const char *tranB, const blasint *isgn, - const blasint *m, const blasint *n, - const double *A, const blasint *ldA, const double *B, const blasint *ldB, - double *C, const blasint *ldC, double *scale, - blasint *info + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, double *scale, + int *info ) { if (*m <= MAX(CROSSOVER_DTRSYL, 1) && *n <= MAX(CROSSOVER_DTRSYL, 1)) { @@ -77,20 +77,20 @@ static void RELAPACK_dtrsyl_rec( const double ONE[] = { 1. }; const double MONE[] = { -1. }; const double MSGN[] = { -*isgn }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; // Outputs double scale1[] = { 1. }; double scale2[] = { 1. }; - blasint info1[] = { 0 }; - blasint info2[] = { 0 }; + int info1[] = { 0 }; + int info2[] = { 0 }; if (*m > *n) { // Splitting - blasint m1 = DREC_SPLIT(*m); + int m1 = DREC_SPLIT(*m); if (A[m1 + *ldA * (m1 - 1)]) m1++; - const blasint m2 = *m - m1; + const int m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -126,10 +126,10 @@ static void RELAPACK_dtrsyl_rec( } } else { // Splitting - blasint n1 = DREC_SPLIT(*n); + int n1 = DREC_SPLIT(*n); if (B[n1 + *ldB * (n1 - 1)]) n1++; - const blasint n2 = *n - n1; + const int n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/dtrsyl_rec2.c b/relapack/src/dtrsyl_rec2.c index 50dabf76d..479c7f340 100644 --- a/relapack/src/dtrsyl_rec2.c +++ b/relapack/src/dtrsyl_rec2.c @@ -14,52 +14,52 @@ /* Table of constant values */ -static blasint c__1 = 1; -static blasint c_false = FALSE_; -static blasint c__2 = 2; +static int c__1 = 1; +static int c_false = FALSE_; +static int c__2 = 2; static double c_b26 = 1.; static double c_b30 = 0.; -static blasint c_true = TRUE_; +static int c_true = TRUE_; -int RELAPACK_dtrsyl_rec2(char *trana, char *tranb, blasint *isgn, int - *m, blasint *n, double *a, blasint *lda, double *b, blasint * - ldb, double *c__, blasint *ldc, double *scale, blasint *info, +int RELAPACK_dtrsyl_rec2(char *trana, char *tranb, int *isgn, int + *m, int *n, double *a, int *lda, double *b, int * + ldb, double *c__, int *ldc, double *scale, int *info, ftnlen trana_len, ftnlen tranb_len) { /* System generated locals */ - blasint a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4; double d__1, d__2; /* Local variables */ - static blasint j, k, l; + static int j, k, l; static double x[4] /* was [2][2] */; - static blasint k1, k2, l1, l2; + static int k1, k2, l1, l2; static double a11, db, da11, vec[4] /* was [2][2] */, dum[1], eps, sgn; - extern double ddot_(int *, double *, blasint *, double *, - blasint *); - static blasint ierr; + extern double ddot_(int *, double *, int *, double *, + int *); + static int ierr; static double smin, suml, sumr; - extern /* Subroutine */ blasint dscal_(int *, double *, double *, - blasint *); - extern blasint lsame_(char *, char *, ftnlen, ftnlen); - static blasint knext, lnext; + extern /* Subroutine */ int dscal_(int *, double *, double *, + int *); + extern int lsame_(char *, char *, ftnlen, ftnlen); + static int knext, lnext; static double xnorm; - extern /* Subroutine */ blasint dlaln2_(int *, blasint *, blasint *, - double *, double *, double *, blasint *, double *, - double *, double *, blasint *, double *, double * - , double *, blasint *, double *, double *, blasint *), - dlasy2_(int *, blasint *, blasint *, blasint *, blasint *, - double *, blasint *, double *, blasint *, double *, - blasint *, double *, double *, blasint *, double *, - blasint *), dlabad_(double *, double *); - extern double dlamch_(char *, ftnlen), dlange_(char *, blasint *, - blasint *, double *, blasint *, double *, ftnlen); + extern /* Subroutine */ int dlaln2_(int *, int *, int *, + double *, double *, double *, int *, double *, + double *, double *, int *, double *, double * + , double *, int *, double *, double *, int *), + dlasy2_(int *, int *, int *, int *, int *, + double *, int *, double *, int *, double *, + int *, double *, double *, int *, double *, + int *), dlabad_(double *, double *); + extern double dlamch_(char *, ftnlen), dlange_(char *, int *, + int *, double *, int *, double *, ftnlen); static double scaloc; - extern /* Subroutine */ blasint xerbla_(char *, blasint *, ftnlen); + extern /* Subroutine */ int xerbla_(char *, int *, ftnlen); static double bignum; - static blasint notrna, notrnb; + static int notrna, notrnb; static double smlnum; /* Parameter adjustments */ diff --git a/relapack/src/dtrtri.c b/relapack/src/dtrtri.c index 72777e7e4..0462609e9 100644 --- a/relapack/src/dtrtri.c +++ b/relapack/src/dtrtri.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_dtrtri_rec(const char *, const char *, const blasint *, - double *, const blasint *, blasint *); +static void RELAPACK_dtrtri_rec(const char *, const char *, const int *, + double *, const int *, int *); /** DTRTRI computes the inverse of a real upper or lower triangular matrix A. @@ -11,16 +11,16 @@ static void RELAPACK_dtrtri_rec(const char *, const char *, const blasint *, * http://www.netlib.org/lapack/explore-html/d5/dba/dtrtri_8f.html * */ void RELAPACK_dtrtri( - const char *uplo, const char *diag, const blasint *n, - double *A, const blasint *ldA, - blasint *info + const char *uplo, const char *diag, const int *n, + double *A, const int *ldA, + int *info ) { // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); - const blasint nounit = LAPACK(lsame)(diag, "N"); - const blasint unit = LAPACK(lsame)(diag, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + const int nounit = LAPACK(lsame)(diag, "N"); + const int unit = LAPACK(lsame)(diag, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -31,8 +31,8 @@ void RELAPACK_dtrtri( else if (*ldA < MAX(1, *n)) *info = -5; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("DTRTRI", &minfo, strlen("DTRTRI")); + const int minfo = -*info; + LAPACK(xerbla)("DTRTRI", &minfo); return; } @@ -42,7 +42,7 @@ void RELAPACK_dtrtri( // check for singularity if (nounit) { - blasint i; + int i; for (i = 0; i < *n; i++) if (A[i + *ldA * i] == 0) { *info = i; @@ -57,9 +57,9 @@ void RELAPACK_dtrtri( /** dtrtri's recursive compute kernel */ static void RELAPACK_dtrtri_rec( - const char *uplo, const char *diag, const blasint *n, - double *A, const blasint *ldA, - blasint *info + const char *uplo, const char *diag, const int *n, + double *A, const int *ldA, + int *info ){ if (*n <= MAX(CROSSOVER_DTRTRI, 1)) { @@ -73,8 +73,8 @@ static void RELAPACK_dtrtri_rec( const double MONE[] = { -1. }; // Splitting - const blasint n1 = DREC_SPLIT(*n); - const blasint n2 = *n - n1; + const int n1 = DREC_SPLIT(*n); + const int n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/f2c.c b/relapack/src/f2c.c index 48539c4b9..5a3452419 100644 --- a/relapack/src/f2c.c +++ b/relapack/src/f2c.c @@ -9,7 +9,7 @@ #endif #endif -void sig_die(const char *s, blasint kill) { +void sig_die(const char *s, int kill) { /* print error message, then clear buffers */ fprintf(stderr, "%s\n", s); diff --git a/relapack/src/f2c.h b/relapack/src/f2c.h index 85337becf..b94ee7c8e 100644 --- a/relapack/src/f2c.h +++ b/relapack/src/f2c.h @@ -7,19 +7,6 @@ #ifndef F2C_INCLUDE #define F2C_INCLUDE -#ifdef USE64BITINT -typedef BLASLONG blasint; -#if defined(OS_WINDOWS) && defined(__64BIT__) -#define blasabs(x) llabs(x) -#else -#define blasabs(x) labs(x) -#endif -#else -typedef int blasint; -#define blasabs(x) abs(x) -#endif - - typedef long int integer; typedef unsigned long int uinteger; typedef char *address; diff --git a/relapack/src/lapack.h b/relapack/src/lapack.h index 776b0589f..064276b7e 100644 --- a/relapack/src/lapack.h +++ b/relapack/src/lapack.h @@ -1,80 +1,80 @@ #ifndef LAPACK_H #define LAPACK_H -extern blasint LAPACK(lsame)(const char *, const char *); -extern blasint LAPACK(xerbla)(const char *, const blasint *, int); +extern int LAPACK(lsame)(const char *, const char *); +extern int LAPACK(xerbla)(const char *, const int *); -extern void LAPACK(slaswp)(const blasint *, float *, const blasint *, const blasint *, const blasint *, const blasint *, const blasint *); -extern void LAPACK(dlaswp)(const blasint *, double *, const blasint *, const blasint *, const blasint *, const blasint *, const blasint *); -extern void LAPACK(claswp)(const blasint *, float *, const blasint *, const blasint *, const blasint *, const blasint *, const blasint *); -extern void LAPACK(zlaswp)(const blasint *, double *, const blasint *, const blasint *, const blasint *, const blasint *, const blasint *); +extern void LAPACK(slaswp)(const int *, float *, const int *, const int *, const int *, const int *, const int *); +extern void LAPACK(dlaswp)(const int *, double *, const int *, const int *, const int *, const int *, const int *); +extern void LAPACK(claswp)(const int *, float *, const int *, const int *, const int *, const int *, const int *); +extern void LAPACK(zlaswp)(const int *, double *, const int *, const int *, const int *, const int *, const int *); -extern void LAPACK(slaset)(const char *, const blasint *, const blasint *, const float *, const float *, float *, const blasint *); -extern void LAPACK(dlaset)(const char *, const blasint *, const blasint *, const double *, const double *, double *, const blasint *); -extern void LAPACK(claset)(const char *, const blasint *, const blasint *, const float *, const float *, float *, const blasint *); -extern void LAPACK(zlaset)(const char *, const blasint *, const blasint *, const double *, const double *, double *, const blasint *); +extern void LAPACK(slaset)(const char *, const int *, const int *, const float *, const float *, float *, const int *); +extern void LAPACK(dlaset)(const char *, const int *, const int *, const double *, const double *, double *, const int *); +extern void LAPACK(claset)(const char *, const int *, const int *, const float *, const float *, float *, const int *); +extern void LAPACK(zlaset)(const char *, const int *, const int *, const double *, const double *, double *, const int *); -extern void LAPACK(slacpy)(const char *, const blasint *, const blasint *, const float *, const blasint *, float *, const blasint *); -extern void LAPACK(dlacpy)(const char *, const blasint *, const blasint *, const double *, const blasint *, double *, const blasint *); -extern void LAPACK(clacpy)(const char *, const blasint *, const blasint *, const float *, const blasint *, float *, const blasint *); -extern void LAPACK(zlacpy)(const char *, const blasint *, const blasint *, const double *, const blasint *, double *, const blasint *); +extern void LAPACK(slacpy)(const char *, const int *, const int *, const float *, const int *, float *, const int *); +extern void LAPACK(dlacpy)(const char *, const int *, const int *, const double *, const int *, double *, const int *); +extern void LAPACK(clacpy)(const char *, const int *, const int *, const float *, const int *, float *, const int *); +extern void LAPACK(zlacpy)(const char *, const int *, const int *, const double *, const int *, double *, const int *); -extern void LAPACK(slascl)(const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const blasint *, float *, const blasint *, blasint *); -extern void LAPACK(dlascl)(const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const blasint *, double *, const blasint *, blasint *); -extern void LAPACK(clascl)(const char *, const blasint *, const blasint *, const float *, const float *, const blasint *, const blasint *, float *, const blasint *, blasint *); -extern void LAPACK(zlascl)(const char *, const blasint *, const blasint *, const double *, const double *, const blasint *, const blasint *, double *, const blasint *, blasint *); +extern void LAPACK(slascl)(const char *, const int *, const int *, const float *, const float *, const int *, const int *, float *, const int *, int *); +extern void LAPACK(dlascl)(const char *, const int *, const int *, const double *, const double *, const int *, const int *, double *, const int *, int *); +extern void LAPACK(clascl)(const char *, const int *, const int *, const float *, const float *, const int *, const int *, float *, const int *, int *); +extern void LAPACK(zlascl)(const char *, const int *, const int *, const double *, const double *, const int *, const int *, double *, const int *, int *); -extern void LAPACK(slauu2)(const char *, const blasint *, float *, const blasint *, blasint *); -extern void LAPACK(dlauu2)(const char *, const blasint *, double *, const blasint *, blasint *); -extern void LAPACK(clauu2)(const char *, const blasint *, float *, const blasint *, blasint *); -extern void LAPACK(zlauu2)(const char *, const blasint *, double *, const blasint *, blasint *); +extern void LAPACK(slauu2)(const char *, const int *, float *, const int *, int *); +extern void LAPACK(dlauu2)(const char *, const int *, double *, const int *, int *); +extern void LAPACK(clauu2)(const char *, const int *, float *, const int *, int *); +extern void LAPACK(zlauu2)(const char *, const int *, double *, const int *, int *); -extern void LAPACK(ssygs2)(const blasint *, const char *, const blasint *, float *, const blasint *, const float *, const blasint *, blasint *); -extern void LAPACK(dsygs2)(const blasint *, const char *, const blasint *, double *, const blasint *, const double *, const blasint *, blasint *); -extern void LAPACK(chegs2)(const blasint *, const char *, const blasint *, float *, const blasint *, const float *, const blasint *, blasint *); -extern void LAPACK(zhegs2)(const blasint *, const char *, const blasint *, double *, const blasint *, const double *, const blasint *, blasint *); +extern void LAPACK(ssygs2)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); +extern void LAPACK(dsygs2)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); +extern void LAPACK(chegs2)(const int *, const char *, const int *, float *, const int *, const float *, const int *, int *); +extern void LAPACK(zhegs2)(const int *, const char *, const int *, double *, const int *, const double *, const int *, int *); -extern void LAPACK(strti2)(const char *, const char *, const blasint *, float *, const blasint *, blasint *); -extern void LAPACK(dtrti2)(const char *, const char *, const blasint *, double *, const blasint *, blasint *); -extern void LAPACK(ctrti2)(const char *, const char *, const blasint *, float *, const blasint *, blasint *); -extern void LAPACK(ztrti2)(const char *, const char *, const blasint *, double *, const blasint *, blasint *); +extern void LAPACK(strti2)(const char *, const char *, const int *, float *, const int *, int *); +extern void LAPACK(dtrti2)(const char *, const char *, const int *, double *, const int *, int *); +extern void LAPACK(ctrti2)(const char *, const char *, const int *, float *, const int *, int *); +extern void LAPACK(ztrti2)(const char *, const char *, const int *, double *, const int *, int *); -extern void LAPACK(spotf2)(const char *, const blasint *, float *, const blasint *, blasint *); -extern void LAPACK(dpotf2)(const char *, const blasint *, double *, const blasint *, blasint *); -extern void LAPACK(cpotf2)(const char *, const blasint *, float *, const blasint *, blasint *); -extern void LAPACK(zpotf2)(const char *, const blasint *, double *, const blasint *, blasint *); +extern void LAPACK(spotf2)(const char *, const int *, float *, const int *, int *); +extern void LAPACK(dpotf2)(const char *, const int *, double *, const int *, int *); +extern void LAPACK(cpotf2)(const char *, const int *, float *, const int *, int *); +extern void LAPACK(zpotf2)(const char *, const int *, double *, const int *, int *); -extern void LAPACK(spbtf2)(const char *, const blasint *, const blasint *, float *, const blasint *, blasint *); -extern void LAPACK(dpbtf2)(const char *, const blasint *, const blasint *, double *, const blasint *, blasint *); -extern void LAPACK(cpbtf2)(const char *, const blasint *, const blasint *, float *, const blasint *, blasint *); -extern void LAPACK(zpbtf2)(const char *, const blasint *, const blasint *, double *, const blasint *, blasint *); +extern void LAPACK(spbtf2)(const char *, const int *, const int *, float *, const int *, int *); +extern void LAPACK(dpbtf2)(const char *, const int *, const int *, double *, const int *, int *); +extern void LAPACK(cpbtf2)(const char *, const int *, const int *, float *, const int *, int *); +extern void LAPACK(zpbtf2)(const char *, const int *, const int *, double *, const int *, int *); -extern void LAPACK(ssytf2)(const char *, const blasint *, float *, const blasint *, blasint *, blasint *); -extern void LAPACK(dsytf2)(const char *, const blasint *, double *, const blasint *, blasint *, blasint *); -extern void LAPACK(csytf2)(const char *, const blasint *, float *, const blasint *, blasint *, blasint *); -extern void LAPACK(chetf2)(const char *, const blasint *, float *, const blasint *, blasint *, blasint *); -extern void LAPACK(zsytf2)(const char *, const blasint *, double *, const blasint *, blasint *, blasint *); -extern void LAPACK(zhetf2)(const char *, const blasint *, double *, const blasint *, blasint *, blasint *); -extern void LAPACK(ssytf2_rook)(const char *, const blasint *, float *, const blasint *, blasint *, blasint *); -extern void LAPACK(dsytf2_rook)(const char *, const blasint *, double *, const blasint *, blasint *, blasint *); -extern void LAPACK(csytf2_rook)(const char *, const blasint *, float *, const blasint *, blasint *, blasint *); -extern void LAPACK(chetf2_rook)(const char *, const blasint *, float *, const blasint *, blasint *, blasint *); -extern void LAPACK(zsytf2_rook)(const char *, const blasint *, double *, const blasint *, blasint *, blasint *); -extern void LAPACK(zhetf2_rook)(const char *, const blasint *, double *, const blasint *, blasint *, blasint *); +extern void LAPACK(ssytf2)(const char *, const int *, float *, const int *, int *, int *); +extern void LAPACK(dsytf2)(const char *, const int *, double *, const int *, int *, int *); +extern void LAPACK(csytf2)(const char *, const int *, float *, const int *, int *, int *); +extern void LAPACK(chetf2)(const char *, const int *, float *, const int *, int *, int *); +extern void LAPACK(zsytf2)(const char *, const int *, double *, const int *, int *, int *); +extern void LAPACK(zhetf2)(const char *, const int *, double *, const int *, int *, int *); +extern void LAPACK(ssytf2_rook)(const char *, const int *, float *, const int *, int *, int *); +extern void LAPACK(dsytf2_rook)(const char *, const int *, double *, const int *, int *, int *); +extern void LAPACK(csytf2_rook)(const char *, const int *, float *, const int *, int *, int *); +extern void LAPACK(chetf2_rook)(const char *, const int *, float *, const int *, int *, int *); +extern void LAPACK(zsytf2_rook)(const char *, const int *, double *, const int *, int *, int *); +extern void LAPACK(zhetf2_rook)(const char *, const int *, double *, const int *, int *, int *); -extern void LAPACK(sgetf2)(const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); -extern void LAPACK(dgetf2)(const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); -extern void LAPACK(cgetf2)(const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); -extern void LAPACK(zgetf2)(const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); +extern void LAPACK(sgetf2)(const int *, const int *, float *, const int *, int *, int *); +extern void LAPACK(dgetf2)(const int *, const int *, double *, const int *, int *, int *); +extern void LAPACK(cgetf2)(const int *, const int *, float *, const int *, int *, int *); +extern void LAPACK(zgetf2)(const int *, const int *, double *, const int *, int *, int *); -extern void LAPACK(sgbtf2)(const blasint *, const blasint *, const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); -extern void LAPACK(dgbtf2)(const blasint *, const blasint *, const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); -extern void LAPACK(cgbtf2)(const blasint *, const blasint *, const blasint *, const blasint *, float *, const blasint *, blasint *, blasint *); -extern void LAPACK(zgbtf2)(const blasint *, const blasint *, const blasint *, const blasint *, double *, const blasint *, blasint *, blasint *); +extern void LAPACK(sgbtf2)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); +extern void LAPACK(dgbtf2)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); +extern void LAPACK(cgbtf2)(const int *, const int *, const int *, const int *, float *, const int *, int *, int *); +extern void LAPACK(zgbtf2)(const int *, const int *, const int *, const int *, double *, const int *, int *, int *); -extern void LAPACK(stgsy2)(const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, float *, float *, blasint *, blasint *, blasint *); -extern void LAPACK(dtgsy2)(const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, double *, double *, blasint *, blasint *, blasint *); -extern void LAPACK(ctgsy2)(const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, float *, float *, blasint *); -extern void LAPACK(ztgsy2)(const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, double *, double *, blasint *); +extern void LAPACK(stgsy2)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, int *, int *, int *); +extern void LAPACK(dtgsy2)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, int *, int *, int *); +extern void LAPACK(ctgsy2)(const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, float *, float *, int *); +extern void LAPACK(ztgsy2)(const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, double *, double *, int *); #endif /* LAPACK_H */ diff --git a/relapack/src/lapack_wrappers.c b/relapack/src/lapack_wrappers.c index 0252f3d92..488547260 100644 --- a/relapack/src/lapack_wrappers.c +++ b/relapack/src/lapack_wrappers.c @@ -6,9 +6,9 @@ #if INCLUDE_SLAUUM void LAPACK(slauum)( - const char *uplo, const blasint *n, - float *A, const blasint *ldA, - blasint *info + const char *uplo, const int *n, + float *A, const int *ldA, + int *info ) { RELAPACK_slauum(uplo, n, A, ldA, info); } @@ -16,9 +16,9 @@ void LAPACK(slauum)( #if INCLUDE_DLAUUM void LAPACK(dlauum)( - const char *uplo, const blasint *n, - double *A, const blasint *ldA, - blasint *info + const char *uplo, const int *n, + double *A, const int *ldA, + int *info ) { RELAPACK_dlauum(uplo, n, A, ldA, info); } @@ -26,9 +26,9 @@ void LAPACK(dlauum)( #if INCLUDE_CLAUUM void LAPACK(clauum)( - const char *uplo, const blasint *n, - float *A, const blasint *ldA, - blasint *info + const char *uplo, const int *n, + float *A, const int *ldA, + int *info ) { RELAPACK_clauum(uplo, n, A, ldA, info); } @@ -36,9 +36,9 @@ void LAPACK(clauum)( #if INCLUDE_ZLAUUM void LAPACK(zlauum)( - const char *uplo, const blasint *n, - double *A, const blasint *ldA, - blasint *info + const char *uplo, const int *n, + double *A, const int *ldA, + int *info ) { RELAPACK_zlauum(uplo, n, A, ldA, info); } @@ -51,9 +51,9 @@ void LAPACK(zlauum)( #if INCLUDE_SSYGST void LAPACK(ssygst)( - const blasint *itype, const char *uplo, const blasint *n, - float *A, const blasint *ldA, const float *B, const blasint *ldB, - blasint *info + const int *itype, const char *uplo, const int *n, + float *A, const int *ldA, const float *B, const int *ldB, + int *info ) { RELAPACK_ssygst(itype, uplo, n, A, ldA, B, ldB, info); } @@ -61,9 +61,9 @@ void LAPACK(ssygst)( #if INCLUDE_DSYGST void LAPACK(dsygst)( - const blasint *itype, const char *uplo, const blasint *n, - double *A, const blasint *ldA, const double *B, const blasint *ldB, - blasint *info + const int *itype, const char *uplo, const int *n, + double *A, const int *ldA, const double *B, const int *ldB, + int *info ) { RELAPACK_dsygst(itype, uplo, n, A, ldA, B, ldB, info); } @@ -71,9 +71,9 @@ void LAPACK(dsygst)( #if INCLUDE_CHEGST void LAPACK(chegst)( - const blasint *itype, const char *uplo, const blasint *n, - float *A, const blasint *ldA, const float *B, const blasint *ldB, - blasint *info + const int *itype, const char *uplo, const int *n, + float *A, const int *ldA, const float *B, const int *ldB, + int *info ) { RELAPACK_chegst(itype, uplo, n, A, ldA, B, ldB, info); } @@ -81,9 +81,9 @@ void LAPACK(chegst)( #if INCLUDE_ZHEGST void LAPACK(zhegst)( - const blasint *itype, const char *uplo, const blasint *n, - double *A, const blasint *ldA, const double *B, const blasint *ldB, - blasint *info + const int *itype, const char *uplo, const int *n, + double *A, const int *ldA, const double *B, const int *ldB, + int *info ) { RELAPACK_zhegst(itype, uplo, n, A, ldA, B, ldB, info); } @@ -96,9 +96,9 @@ void LAPACK(zhegst)( #if INCLUDE_STRTRI void LAPACK(strtri)( - const char *uplo, const char *diag, const blasint *n, - float *A, const blasint *ldA, - blasint *info + const char *uplo, const char *diag, const int *n, + float *A, const int *ldA, + int *info ) { RELAPACK_strtri(uplo, diag, n, A, ldA, info); } @@ -106,9 +106,9 @@ void LAPACK(strtri)( #if INCLUDE_DTRTRI void LAPACK(dtrtri)( - const char *uplo, const char *diag, const blasint *n, - double *A, const blasint *ldA, - blasint *info + const char *uplo, const char *diag, const int *n, + double *A, const int *ldA, + int *info ) { RELAPACK_dtrtri(uplo, diag, n, A, ldA, info); } @@ -116,9 +116,9 @@ void LAPACK(dtrtri)( #if INCLUDE_CTRTRI void LAPACK(ctrtri)( - const char *uplo, const char *diag, const blasint *n, - float *A, const blasint *ldA, - blasint *info + const char *uplo, const char *diag, const int *n, + float *A, const int *ldA, + int *info ) { RELAPACK_ctrtri(uplo, diag, n, A, ldA, info); } @@ -126,9 +126,9 @@ void LAPACK(ctrtri)( #if INCLUDE_ZTRTRI void LAPACK(ztrtri)( - const char *uplo, const char *diag, const blasint *n, - double *A, const blasint *ldA, - blasint *info + const char *uplo, const char *diag, const int *n, + double *A, const int *ldA, + int *info ) { RELAPACK_ztrtri(uplo, diag, n, A, ldA, info); } @@ -141,9 +141,9 @@ void LAPACK(ztrtri)( #if INCLUDE_SPOTRF void LAPACK(spotrf)( - const char *uplo, const blasint *n, - float *A, const blasint *ldA, - blasint *info + const char *uplo, const int *n, + float *A, const int *ldA, + int *info ) { RELAPACK_spotrf(uplo, n, A, ldA, info); } @@ -151,9 +151,9 @@ void LAPACK(spotrf)( #if INCLUDE_DPOTRF void LAPACK(dpotrf)( - const char *uplo, const blasint *n, - double *A, const blasint *ldA, - blasint *info + const char *uplo, const int *n, + double *A, const int *ldA, + int *info ) { RELAPACK_dpotrf(uplo, n, A, ldA, info); } @@ -161,9 +161,9 @@ void LAPACK(dpotrf)( #if INCLUDE_CPOTRF void LAPACK(cpotrf)( - const char *uplo, const blasint *n, - float *A, const blasint *ldA, - blasint *info + const char *uplo, const int *n, + float *A, const int *ldA, + int *info ) { RELAPACK_cpotrf(uplo, n, A, ldA, info); } @@ -171,9 +171,9 @@ void LAPACK(cpotrf)( #if INCLUDE_ZPOTRF void LAPACK(zpotrf)( - const char *uplo, const blasint *n, - double *A, const blasint *ldA, - blasint *info + const char *uplo, const int *n, + double *A, const int *ldA, + int *info ) { RELAPACK_zpotrf(uplo, n, A, ldA, info); } @@ -186,9 +186,9 @@ void LAPACK(zpotrf)( #if INCLUDE_SPBTRF void LAPACK(spbtrf)( - const char *uplo, const blasint *n, const blasint *kd, - float *Ab, const blasint *ldAb, - blasint *info + const char *uplo, const int *n, const int *kd, + float *Ab, const int *ldAb, + int *info ) { RELAPACK_spbtrf(uplo, n, kd, Ab, ldAb, info); } @@ -196,9 +196,9 @@ void LAPACK(spbtrf)( #if INCLUDE_DPBTRF void LAPACK(dpbtrf)( - const char *uplo, const blasint *n, const blasint *kd, - double *Ab, const blasint *ldAb, - blasint *info + const char *uplo, const int *n, const int *kd, + double *Ab, const int *ldAb, + int *info ) { RELAPACK_dpbtrf(uplo, n, kd, Ab, ldAb, info); } @@ -206,9 +206,9 @@ void LAPACK(dpbtrf)( #if INCLUDE_CPBTRF void LAPACK(cpbtrf)( - const char *uplo, const blasint *n, const blasint *kd, - float *Ab, const blasint *ldAb, - blasint *info + const char *uplo, const int *n, const int *kd, + float *Ab, const int *ldAb, + int *info ) { RELAPACK_cpbtrf(uplo, n, kd, Ab, ldAb, info); } @@ -216,9 +216,9 @@ void LAPACK(cpbtrf)( #if INCLUDE_ZPBTRF void LAPACK(zpbtrf)( - const char *uplo, const blasint *n, const blasint *kd, - double *Ab, const blasint *ldAb, - blasint *info + const char *uplo, const int *n, const int *kd, + double *Ab, const int *ldAb, + int *info ) { RELAPACK_zpbtrf(uplo, n, kd, Ab, ldAb, info); } @@ -231,9 +231,9 @@ void LAPACK(zpbtrf)( #if INCLUDE_SSYTRF void LAPACK(ssytrf)( - const char *uplo, const blasint *n, - float *A, const blasint *ldA, blasint *ipiv, - float *Work, const blasint *lWork, blasint *info + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info ) { RELAPACK_ssytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -241,9 +241,9 @@ void LAPACK(ssytrf)( #if INCLUDE_DSYTRF void LAPACK(dsytrf)( - const char *uplo, const blasint *n, - double *A, const blasint *ldA, blasint *ipiv, - double *Work, const blasint *lWork, blasint *info + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info ) { RELAPACK_dsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -251,9 +251,9 @@ void LAPACK(dsytrf)( #if INCLUDE_CSYTRF void LAPACK(csytrf)( - const char *uplo, const blasint *n, - float *A, const blasint *ldA, blasint *ipiv, - float *Work, const blasint *lWork, blasint *info + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info ) { RELAPACK_csytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -261,9 +261,9 @@ void LAPACK(csytrf)( #if INCLUDE_ZSYTRF void LAPACK(zsytrf)( - const char *uplo, const blasint *n, - double *A, const blasint *ldA, blasint *ipiv, - double *Work, const blasint *lWork, blasint *info + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info ) { RELAPACK_zsytrf(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -271,9 +271,9 @@ void LAPACK(zsytrf)( #if INCLUDE_CHETRF void LAPACK(chetrf)( - const char *uplo, const blasint *n, - float *A, const blasint *ldA, blasint *ipiv, - float *Work, const blasint *lWork, blasint *info + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info ) { RELAPACK_chetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -281,9 +281,9 @@ void LAPACK(chetrf)( #if INCLUDE_ZHETRF void LAPACK(zhetrf)( - const char *uplo, const blasint *n, - double *A, const blasint *ldA, blasint *ipiv, - double *Work, const blasint *lWork, blasint *info + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info ) { RELAPACK_zhetrf(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -291,9 +291,9 @@ void LAPACK(zhetrf)( #if INCLUDE_SSYTRF_ROOK void LAPACK(ssytrf_rook)( - const char *uplo, const blasint *n, - float *A, const blasint *ldA, blasint *ipiv, - float *Work, const blasint *lWork, blasint *info + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info ) { RELAPACK_ssytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -301,9 +301,9 @@ void LAPACK(ssytrf_rook)( #if INCLUDE_DSYTRF_ROOK void LAPACK(dsytrf_rook)( - const char *uplo, const blasint *n, - double *A, const blasint *ldA, blasint *ipiv, - double *Work, const blasint *lWork, blasint *info + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info ) { RELAPACK_dsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -311,9 +311,9 @@ void LAPACK(dsytrf_rook)( #if INCLUDE_CSYTRF_ROOK void LAPACK(csytrf_rook)( - const char *uplo, const blasint *n, - float *A, const blasint *ldA, blasint *ipiv, - float *Work, const blasint *lWork, blasint *info + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info ) { RELAPACK_csytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -321,9 +321,9 @@ void LAPACK(csytrf_rook)( #if INCLUDE_ZSYTRF_ROOK void LAPACK(zsytrf_rook)( - const char *uplo, const blasint *n, - double *A, const blasint *ldA, blasint *ipiv, - double *Work, const blasint *lWork, blasint *info + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info ) { RELAPACK_zsytrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -331,9 +331,9 @@ void LAPACK(zsytrf_rook)( #if INCLUDE_CHETRF_ROOK void LAPACK(chetrf_rook)( - const char *uplo, const blasint *n, - float *A, const blasint *ldA, blasint *ipiv, - float *Work, const blasint *lWork, blasint *info + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info ) { RELAPACK_chetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -341,9 +341,9 @@ void LAPACK(chetrf_rook)( #if INCLUDE_ZHETRF_ROOK void LAPACK(zhetrf_rook)( - const char *uplo, const blasint *n, - double *A, const blasint *ldA, blasint *ipiv, - double *Work, const blasint *lWork, blasint *info + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info ) { RELAPACK_zhetrf_rook(uplo, n, A, ldA, ipiv, Work, lWork, info); } @@ -356,9 +356,9 @@ void LAPACK(zhetrf_rook)( #if INCLUDE_SGETRF void LAPACK(sgetrf)( - const blasint *m, const blasint *n, - float *A, const blasint *ldA, blasint *ipiv, - blasint *info + const int *m, const int *n, + float *A, const int *ldA, int *ipiv, + int *info ) { RELAPACK_sgetrf(m, n, A, ldA, ipiv, info); } @@ -366,9 +366,9 @@ void LAPACK(sgetrf)( #if INCLUDE_DGETRF void LAPACK(dgetrf)( - const blasint *m, const blasint *n, - double *A, const blasint *ldA, blasint *ipiv, - blasint *info + const int *m, const int *n, + double *A, const int *ldA, int *ipiv, + int *info ) { RELAPACK_dgetrf(m, n, A, ldA, ipiv, info); } @@ -376,9 +376,9 @@ void LAPACK(dgetrf)( #if INCLUDE_CGETRF void LAPACK(cgetrf)( - const blasint *m, const blasint *n, - float *A, const blasint *ldA, blasint *ipiv, - blasint *info + const int *m, const int *n, + float *A, const int *ldA, int *ipiv, + int *info ) { RELAPACK_cgetrf(m, n, A, ldA, ipiv, info); } @@ -386,9 +386,9 @@ void LAPACK(cgetrf)( #if INCLUDE_ZGETRF void LAPACK(zgetrf)( - const blasint *m, const blasint *n, - double *A, const blasint *ldA, blasint *ipiv, - blasint *info + const int *m, const int *n, + double *A, const int *ldA, int *ipiv, + int *info ) { RELAPACK_zgetrf(m, n, A, ldA, ipiv, info); } @@ -401,9 +401,9 @@ void LAPACK(zgetrf)( #if INCLUDE_SGBTRF void LAPACK(sgbtrf)( - const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, - float *Ab, const blasint *ldAb, blasint *ipiv, - blasint *info + const int *m, const int *n, const int *kl, const int *ku, + float *Ab, const int *ldAb, int *ipiv, + int *info ) { RELAPACK_sgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); } @@ -411,9 +411,9 @@ void LAPACK(sgbtrf)( #if INCLUDE_DGBTRF void LAPACK(dgbtrf)( - const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, - double *Ab, const blasint *ldAb, blasint *ipiv, - blasint *info + const int *m, const int *n, const int *kl, const int *ku, + double *Ab, const int *ldAb, int *ipiv, + int *info ) { RELAPACK_dgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); } @@ -421,9 +421,9 @@ void LAPACK(dgbtrf)( #if INCLUDE_CGBTRF void LAPACK(cgbtrf)( - const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, - float *Ab, const blasint *ldAb, blasint *ipiv, - blasint *info + const int *m, const int *n, const int *kl, const int *ku, + float *Ab, const int *ldAb, int *ipiv, + int *info ) { RELAPACK_cgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); } @@ -431,9 +431,9 @@ void LAPACK(cgbtrf)( #if INCLUDE_ZGBTRF void LAPACK(zgbtrf)( - const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, - double *Ab, const blasint *ldAb, blasint *ipiv, - blasint *info + const int *m, const int *n, const int *kl, const int *ku, + double *Ab, const int *ldAb, int *ipiv, + int *info ) { RELAPACK_zgbtrf(m, n, kl, ku, Ab, ldAb, ipiv, info); } @@ -446,11 +446,11 @@ void LAPACK(zgbtrf)( #if INCLUDE_STRSYL void LAPACK(strsyl)( - const char *tranA, const char *tranB, const blasint *isgn, - const blasint *m, const blasint *n, - const float *A, const blasint *ldA, const float *B, const blasint *ldB, - float *C, const blasint *ldC, float *scale, - blasint *info + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, float *scale, + int *info ) { RELAPACK_strsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); } @@ -458,11 +458,11 @@ void LAPACK(strsyl)( #if INCLUDE_DTRSYL void LAPACK(dtrsyl)( - const char *tranA, const char *tranB, const blasint *isgn, - const blasint *m, const blasint *n, - const double *A, const blasint *ldA, const double *B, const blasint *ldB, - double *C, const blasint *ldC, double *scale, - blasint *info + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, double *scale, + int *info ) { RELAPACK_dtrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); } @@ -470,11 +470,11 @@ void LAPACK(dtrsyl)( #if INCLUDE_CTRSYL void LAPACK(ctrsyl)( - const char *tranA, const char *tranB, const blasint *isgn, - const blasint *m, const blasint *n, - const float *A, const blasint *ldA, const float *B, const blasint *ldB, - float *C, const blasint *ldC, float *scale, - blasint *info + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, float *scale, + int *info ) { RELAPACK_ctrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); } @@ -482,11 +482,11 @@ void LAPACK(ctrsyl)( #if INCLUDE_ZTRSYL void LAPACK(ztrsyl)( - const char *tranA, const char *tranB, const blasint *isgn, - const blasint *m, const blasint *n, - const double *A, const blasint *ldA, const double *B, const blasint *ldB, - double *C, const blasint *ldC, double *scale, - blasint *info + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, double *scale, + int *info ) { RELAPACK_ztrsyl(tranA, tranB, isgn, m, n, A, ldA, B, ldB, C, ldC, scale, info); } @@ -499,13 +499,13 @@ void LAPACK(ztrsyl)( #if INCLUDE_STGSYL void LAPACK(stgsyl)( - const char *trans, const blasint *ijob, const blasint *m, const blasint *n, - const float *A, const blasint *ldA, const float *B, const blasint *ldB, - float *C, const blasint *ldC, - const float *D, const blasint *ldD, const float *E, const blasint *ldE, - float *F, const blasint *ldF, + const char *trans, const int *ijob, const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, + const float *D, const int *ldD, const float *E, const int *ldE, + float *F, const int *ldF, float *scale, float *dif, - float *Work, const blasint *lWork, blasint *iWork, blasint *info + float *Work, const int *lWork, int *iWork, int *info ) { RELAPACK_stgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); } @@ -513,13 +513,13 @@ void LAPACK(stgsyl)( #if INCLUDE_DTGSYL void LAPACK(dtgsyl)( - const char *trans, const blasint *ijob, const blasint *m, const blasint *n, - const double *A, const blasint *ldA, const double *B, const blasint *ldB, - double *C, const blasint *ldC, - const double *D, const blasint *ldD, const double *E, const blasint *ldE, - double *F, const blasint *ldF, + const char *trans, const int *ijob, const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, + const double *D, const int *ldD, const double *E, const int *ldE, + double *F, const int *ldF, double *scale, double *dif, - double *Work, const blasint *lWork, blasint *iWork, blasint *info + double *Work, const int *lWork, int *iWork, int *info ) { RELAPACK_dtgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); } @@ -527,13 +527,13 @@ void LAPACK(dtgsyl)( #if INCLUDE_CTGSYL void LAPACK(ctgsyl)( - const char *trans, const blasint *ijob, const blasint *m, const blasint *n, - const float *A, const blasint *ldA, const float *B, const blasint *ldB, - float *C, const blasint *ldC, - const float *D, const blasint *ldD, const float *E, const blasint *ldE, - float *F, const blasint *ldF, + const char *trans, const int *ijob, const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, + const float *D, const int *ldD, const float *E, const int *ldE, + float *F, const int *ldF, float *scale, float *dif, - float *Work, const blasint *lWork, blasint *iWork, blasint *info + float *Work, const int *lWork, int *iWork, int *info ) { RELAPACK_ctgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); } @@ -541,13 +541,13 @@ void LAPACK(ctgsyl)( #if INCLUDE_ZTGSYL void LAPACK(ztgsyl)( - const char *trans, const blasint *ijob, const blasint *m, const blasint *n, - const double *A, const blasint *ldA, const double *B, const blasint *ldB, - double *C, const blasint *ldC, - const double *D, const blasint *ldD, const double *E, const blasint *ldE, - double *F, const blasint *ldF, + const char *trans, const int *ijob, const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, + const double *D, const int *ldD, const double *E, const int *ldE, + double *F, const int *ldF, double *scale, double *dif, - double *Work, const blasint *lWork, blasint *iWork, blasint *info + double *Work, const int *lWork, int *iWork, int *info ) { RELAPACK_ztgsyl(trans, ijob, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, dif, Work, lWork, iWork, info); } @@ -561,10 +561,10 @@ void LAPACK(ztgsyl)( #if INCLUDE_SGEMMT void LAPACK(sgemmt)( const char *uplo, const char *transA, const char *transB, - const blasint *n, const blasint *k, - const float *alpha, const float *A, const blasint *ldA, - const float *B, const blasint *ldB, - const float *beta, float *C, const blasint *ldC + const int *n, const int *k, + const float *alpha, const float *A, const int *ldA, + const float *B, const int *ldB, + const float *beta, float *C, const int *ldC ) { RELAPACK_sgemmt(uplo, n, A, ldA, info); } @@ -573,10 +573,10 @@ void LAPACK(sgemmt)( #if INCLUDE_DGEMMT void LAPACK(dgemmt)( const char *uplo, const char *transA, const char *transB, - const blasint *n, const blasint *k, - const double *alpha, const double *A, const blasint *ldA, - const double *B, const blasint *ldB, - const double *beta, double *C, const blasint *ldC + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC ) { RELAPACK_dgemmt(uplo, n, A, ldA, info); } @@ -585,10 +585,10 @@ void LAPACK(dgemmt)( #if INCLUDE_CGEMMT void LAPACK(cgemmt)( const char *uplo, const char *transA, const char *transB, - const blasint *n, const blasint *k, - const float *alpha, const float *A, const blasint *ldA, - const float *B, const blasint *ldB, - const float *beta, float *C, const blasint *ldC + const int *n, const int *k, + const float *alpha, const float *A, const int *ldA, + const float *B, const int *ldB, + const float *beta, float *C, const int *ldC ) { RELAPACK_cgemmt(uplo, n, A, ldA, info); } @@ -597,10 +597,10 @@ void LAPACK(cgemmt)( #if INCLUDE_ZGEMMT void LAPACK(zgemmt)( const char *uplo, const char *transA, const char *transB, - const blasint *n, const blasint *k, - const double *alpha, const double *A, const blasint *ldA, - const double *B, const blasint *ldB, - const double *beta, double *C, const blasint *ldC + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC ) { RELAPACK_zgemmt(uplo, n, A, ldA, info); } diff --git a/relapack/src/relapack.h b/relapack/src/relapack.h index 38c5c30d0..2cb061c32 100644 --- a/relapack/src/relapack.h +++ b/relapack/src/relapack.h @@ -1,14 +1,6 @@ #ifndef RELAPACK_INT_H #define RELAPACK_INT_H -#include -#include "../../config.h" -#if defined(OS_WINDOWS) && defined(__64BIT__) -typedef long long BLASLONG; -typedef unsigned long long BLASULONG; -#else -typedef long BLASLONG; -typedef unsigned long BLASULONG; -#endif + #include "../config.h" #include "../inc/relapack.h" @@ -46,23 +38,23 @@ typedef unsigned long BLASULONG; #include "blas.h" // sytrf helper routines -void RELAPACK_ssytrf_rec2(const char *, const blasint *, const blasint *, blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); -void RELAPACK_dsytrf_rec2(const char *, const blasint *, const blasint *, blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); -void RELAPACK_csytrf_rec2(const char *, const blasint *, const blasint *, blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); -void RELAPACK_chetrf_rec2(const char *, const blasint *, const blasint *, blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); -void RELAPACK_zsytrf_rec2(const char *, const blasint *, const blasint *, blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); -void RELAPACK_zhetrf_rec2(const char *, const blasint *, const blasint *, blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); -void RELAPACK_ssytrf_rook_rec2(const char *, const blasint *, const blasint *, blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); -void RELAPACK_dsytrf_rook_rec2(const char *, const blasint *, const blasint *, blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); -void RELAPACK_csytrf_rook_rec2(const char *, const blasint *, const blasint *, blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); -void RELAPACK_chetrf_rook_rec2(const char *, const blasint *, const blasint *, blasint *, float *, const blasint *, blasint *, float *, const blasint *, blasint *); -void RELAPACK_zsytrf_rook_rec2(const char *, const blasint *, const blasint *, blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); -void RELAPACK_zhetrf_rook_rec2(const char *, const blasint *, const blasint *, blasint *, double *, const blasint *, blasint *, double *, const blasint *, blasint *); +void RELAPACK_ssytrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_dsytrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_csytrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_chetrf_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_zsytrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_zhetrf_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_ssytrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_dsytrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_csytrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_chetrf_rook_rec2(const char *, const int *, const int *, int *, float *, const int *, int *, float *, const int *, int *); +void RELAPACK_zsytrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); +void RELAPACK_zhetrf_rook_rec2(const char *, const int *, const int *, int *, double *, const int *, int *, double *, const int *, int *); // trsyl helper routines -void RELAPACK_strsyl_rec2(const char *, const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, blasint *); -void RELAPACK_dtrsyl_rec2(const char *, const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, blasint *); -void RELAPACK_ctrsyl_rec2(const char *, const char *, const blasint *, const blasint *, const blasint *, const float *, const blasint *, const float *, const blasint *, float *, const blasint *, float *, blasint *); -void RELAPACK_ztrsyl_rec2(const char *, const char *, const blasint *, const blasint *, const blasint *, const double *, const blasint *, const double *, const blasint *, double *, const blasint *, double *, blasint *); +void RELAPACK_strsyl_rec2(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); +void RELAPACK_dtrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); +void RELAPACK_ctrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const float *, const int *, const float *, const int *, float *, const int *, float *, int *); +void RELAPACK_ztrsyl_rec2(const char *, const char *, const int *, const int *, const int *, const double *, const int *, const double *, const int *, double *, const int *, double *, int *); #endif /* RELAPACK_INT_H */ diff --git a/relapack/src/sgbtrf.c b/relapack/src/sgbtrf.c index 3e3fdf455..bc20e744b 100644 --- a/relapack/src/sgbtrf.c +++ b/relapack/src/sgbtrf.c @@ -1,9 +1,9 @@ #include "relapack.h" #include "stdlib.h" -static void RELAPACK_sgbtrf_rec(const blasint *, const blasint *, const blasint *, - const blasint *, float *, const blasint *, blasint *, float *, const blasint *, float *, - const blasint *, blasint *); +static void RELAPACK_sgbtrf_rec(const int *, const int *, const int *, + const int *, float *, const int *, int *, float *, const int *, float *, + const int *, int *); /** SGBTRF computes an LU factorization of a real m-by-n band matrix A using partial pivoting with row interchanges. @@ -13,10 +13,11 @@ static void RELAPACK_sgbtrf_rec(const blasint *, const blasint *, const blasint * http://www.netlib.org/lapack/explore-html/d5/d72/sgbtrf_8f.html * */ void RELAPACK_sgbtrf( - const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, - float *Ab, const blasint *ldAb, blasint *ipiv, - blasint *info + const int *m, const int *n, const int *kl, const int *ku, + float *Ab, const int *ldAb, int *ipiv, + int *info ) { + // Check arguments *info = 0; if (*m < 0) @@ -27,11 +28,11 @@ void RELAPACK_sgbtrf( *info = -3; else if (*ku < 0) *info = -4; - else if (*ldAb < 2 * *kl + *ku + 1) + else if (*ldAb < 2 * *kl + *ku + 1) *info = -6; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("SGBTRF", &minfo, strlen("SGBTRF")); + const int minfo = -*info; + LAPACK(xerbla)("SGBTRF", &minfo); return; } @@ -39,14 +40,14 @@ void RELAPACK_sgbtrf( const float ZERO[] = { 0. }; // Result upper band width - const blasint kv = *ku + *kl; + const int kv = *ku + *kl; // Unskewg A - const blasint ldA[] = { *ldAb - 1 }; + const int ldA[] = { *ldAb - 1 }; float *const A = Ab + kv; // Zero upper diagonal fill-in elements - blasint i, j; + int i, j; for (j = 0; j < *n; j++) { float *const A_j = A + *ldA * j; for (i = MAX(0, j - kv); i < j - *ku; i++) @@ -54,17 +55,16 @@ void RELAPACK_sgbtrf( } // Allocate work space - const blasint n1 = SREC_SPLIT(*n); - const blasint mWorkl = abs( (kv > n1) ? MAX(1, *m - *kl) : kv ); - const blasint nWorkl = abs( (kv > n1) ? n1 : kv ); - const blasint mWorku = abs( (*kl > n1) ? n1 : *kl ); - const blasint nWorku = abs( (*kl > n1) ? MAX(0, *n - *kl) : *kl ); + const int n1 = SREC_SPLIT(*n); + const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; + const int nWorkl = (kv > n1) ? n1 : kv; + const int mWorku = (*kl > n1) ? n1 : *kl; + const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; float *Workl = malloc(mWorkl * nWorkl * sizeof(float)); float *Worku = malloc(mWorku * nWorku * sizeof(float)); LAPACK(slaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); LAPACK(slaset)("U", &mWorku, &nWorku, ZERO, ZERO, Worku, &mWorku); - // Recursive kernel RELAPACK_sgbtrf_rec(m, n, kl, ku, Ab, ldAb, ipiv, Workl, &mWorkl, Worku, &mWorku, info); @@ -76,13 +76,12 @@ void RELAPACK_sgbtrf( /** sgbtrf's recursive compute kernel */ static void RELAPACK_sgbtrf_rec( - const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, - float *Ab, const blasint *ldAb, blasint *ipiv, - float *Workl, const blasint *ldWorkl, float *Worku, const blasint *ldWorku, - blasint *info + const int *m, const int *n, const int *kl, const int *ku, + float *Ab, const int *ldAb, int *ipiv, + float *Workl, const int *ldWorkl, float *Worku, const int *ldWorku, + int *info ) { - if (*n <= MAX(CROSSOVER_SGBTRF, 1)) { // Unblocked LAPACK(sgbtf2)(m, n, kl, ku, Ab, ldAb, ipiv, info); @@ -92,25 +91,25 @@ static void RELAPACK_sgbtrf_rec( // Constants const float ONE[] = { 1. }; const float MONE[] = { -1. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; // Loop iterators - blasint i, j; + int i, j; // Output upper band width - const blasint kv = *ku + *kl; + const int kv = *ku + *kl; // Unskew A - const blasint ldA[] = { *ldAb - 1 }; + const int ldA[] = { *ldAb - 1 }; float *const A = Ab + kv; // Splitting - const blasint n1 = MIN(SREC_SPLIT(*n), *kl); - const blasint n2 = *n - n1; - const blasint m1 = MIN(n1, *m); - const blasint m2 = *m - m1; - const blasint mn1 = MIN(m1, n1); - const blasint mn2 = MIN(m2, n2); + const int n1 = MIN(SREC_SPLIT(*n), *kl); + const int n2 = *n - n1; + const int m1 = MIN(n1, *m); + const int m2 = *m - m1; + const int mn1 = MIN(m1, n1); + const int mn2 = MIN(m2, n2); // Ab_L * // Ab_BR @@ -129,15 +128,15 @@ static void RELAPACK_sgbtrf_rec( float *const A_BR = A + *ldA * n1 + m1; // ipiv_T - // ipiv_B - blasint *const ipiv_T = ipiv; - blasint *const ipiv_B = ipiv + n1; + // ipiv_B + int *const ipiv_T = ipiv; + int *const ipiv_B = ipiv + n1; // Banded splitting - const blasint n21 = MIN(n2, kv - n1); - const blasint n22 = MIN(n2 - n21, n1); - const blasint m21 = MIN(m2, *kl - m1); - const blasint m22 = MIN(m2 - m21, m1); + const int n21 = MIN(n2, kv - n1); + const int n22 = MIN(n2 - n21, n1); + const int m21 = MIN(m2, *kl - m1); + const int m22 = MIN(m2 - m21, m1); // n1 n21 n22 // m * A_Rl ARr @@ -157,7 +156,6 @@ static void RELAPACK_sgbtrf_rec( float *const A_BRbl = A_BR + m21; float *const A_BRbr = A_BR + *ldA * n21 + m21; - // recursion(Ab_L, ipiv_T) RELAPACK_sgbtrf_rec(m, &n1, kl, ku, Ab_L, ldAb, ipiv_T, Workl, ldWorkl, Worku, ldWorku, info); @@ -166,7 +164,7 @@ static void RELAPACK_sgbtrf_rec( // partially redo swaps in A_L for (i = 0; i < mn1; i++) { - const blasint ip = ipiv_T[i] - 1; + const int ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(sswap)(&i, A_L + i, ldA, A_L + ip, ldA); @@ -182,7 +180,7 @@ static void RELAPACK_sgbtrf_rec( for (j = 0; j < n22; j++) { float *const A_Rrj = A_Rr + *ldA * j; for (i = j; i < mn1; i++) { - const blasint ip = ipiv_T[i] - 1; + const int ip = ipiv_T[i] - 1; if (ip != i) { const float tmp = A_Rrj[i]; A_Rrj[i] = A_Rr[ip]; @@ -210,7 +208,7 @@ static void RELAPACK_sgbtrf_rec( // partially undo swaps in A_L for (i = mn1 - 1; i >= 0; i--) { - const blasint ip = ipiv_T[i] - 1; + const int ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(sswap)(&i, A_L + i, ldA, A_L + ip, ldA); @@ -219,11 +217,8 @@ static void RELAPACK_sgbtrf_rec( } } - // recursion(Ab_BR, ipiv_B) -//cause of infinite recursion here ? -// RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); - LAPACK(sgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); + RELAPACK_sgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); if (*info) *info += n1; // shift pivots diff --git a/relapack/src/sgemmt.c b/relapack/src/sgemmt.c index 93438858c..75f78fabd 100644 --- a/relapack/src/sgemmt.c +++ b/relapack/src/sgemmt.c @@ -1,12 +1,12 @@ #include "relapack.h" static void RELAPACK_sgemmt_rec(const char *, const char *, const char *, - const blasint *, const blasint *, const float *, const float *, const blasint *, - const float *, const blasint *, const float *, float *, const blasint *); + const int *, const int *, const float *, const float *, const int *, + const float *, const int *, const float *, float *, const int *); static void RELAPACK_sgemmt_rec2(const char *, const char *, const char *, - const blasint *, const blasint *, const float *, const float *, const blasint *, - const float *, const blasint *, const float *, float *, const blasint *); + const int *, const int *, const float *, const float *, const int *, + const float *, const int *, const float *, float *, const int *); /** SGEMMT computes a matrix-matrix product with general matrices but updates @@ -20,10 +20,10 @@ static void RELAPACK_sgemmt_rec2(const char *, const char *, const char *, * */ void RELAPACK_sgemmt( const char *uplo, const char *transA, const char *transB, - const blasint *n, const blasint *k, - const float *alpha, const float *A, const blasint *ldA, - const float *B, const blasint *ldB, - const float *beta, float *C, const blasint *ldC + const int *n, const int *k, + const float *alpha, const float *A, const int *ldA, + const float *B, const int *ldB, + const float *beta, float *C, const int *ldC ) { #if HAVE_XGEMMT @@ -32,13 +32,13 @@ void RELAPACK_sgemmt( #else // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); - const blasint notransA = LAPACK(lsame)(transA, "N"); - const blasint tranA = LAPACK(lsame)(transA, "T"); - const blasint notransB = LAPACK(lsame)(transB, "N"); - const blasint tranB = LAPACK(lsame)(transB, "T"); - blasint info = 0; + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + const int notransA = LAPACK(lsame)(transA, "N"); + const int tranA = LAPACK(lsame)(transA, "T"); + const int notransB = LAPACK(lsame)(transB, "N"); + const int tranB = LAPACK(lsame)(transB, "T"); + int info = 0; if (!lower && !upper) info = 1; else if (!tranA && !notransA) @@ -56,7 +56,7 @@ void RELAPACK_sgemmt( else if (*ldC < MAX(1, *n)) info = 13; if (info) { - LAPACK(xerbla)("SGEMMT", &info, strlen("SGEMMT")); + LAPACK(xerbla)("SGEMMT", &info); return; } @@ -74,10 +74,10 @@ void RELAPACK_sgemmt( /** sgemmt's recursive compute kernel */ static void RELAPACK_sgemmt_rec( const char *uplo, const char *transA, const char *transB, - const blasint *n, const blasint *k, - const float *alpha, const float *A, const blasint *ldA, - const float *B, const blasint *ldB, - const float *beta, float *C, const blasint *ldC + const int *n, const int *k, + const float *alpha, const float *A, const int *ldA, + const float *B, const int *ldB, + const float *beta, float *C, const int *ldC ) { if (*n <= MAX(CROSSOVER_SGEMMT, 1)) { @@ -87,8 +87,8 @@ static void RELAPACK_sgemmt_rec( } // Splitting - const blasint n1 = SREC_SPLIT(*n); - const blasint n2 = *n - n1; + const int n1 = SREC_SPLIT(*n); + const int n2 = *n - n1; // A_T // A_B @@ -124,16 +124,16 @@ static void RELAPACK_sgemmt_rec( /** sgemmt's unblocked compute kernel */ static void RELAPACK_sgemmt_rec2( const char *uplo, const char *transA, const char *transB, - const blasint *n, const blasint *k, - const float *alpha, const float *A, const blasint *ldA, - const float *B, const blasint *ldB, - const float *beta, float *C, const blasint *ldC + const int *n, const int *k, + const float *alpha, const float *A, const int *ldA, + const float *B, const int *ldB, + const float *beta, float *C, const int *ldC ) { - const blasint incB = (*transB == 'N') ? 1 : *ldB; - const blasint incC = 1; + const int incB = (*transB == 'N') ? 1 : *ldB; + const int incC = 1; - blasint i; + int i; for (i = 0; i < *n; i++) { // A_0 // A_i @@ -149,13 +149,13 @@ static void RELAPACK_sgemmt_rec2( float *const C_ii = C + *ldC * i + i; if (*uplo == 'L') { - const blasint nmi = *n - i; + const int nmi = *n - i; if (*transA == 'N') BLAS(sgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); else BLAS(sgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); } else { - const blasint ip1 = i + 1; + const int ip1 = i + 1; if (*transA == 'N') BLAS(sgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); else diff --git a/relapack/src/sgetrf.c b/relapack/src/sgetrf.c index 0231cc166..284f8cff6 100644 --- a/relapack/src/sgetrf.c +++ b/relapack/src/sgetrf.c @@ -1,6 +1,7 @@ #include "relapack.h" -static void RELAPACK_sgetrf_rec(const blasint *, const blasint *, float *, const blasint *, - blasint *, blasint *); + +static void RELAPACK_sgetrf_rec(const int *, const int *, float *, const int *, + int *, int *); /** SGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. @@ -10,9 +11,9 @@ static void RELAPACK_sgetrf_rec(const blasint *, const blasint *, float *, const * http://www.netlib.org/lapack/explore-html/de/de2/sgetrf_8f.html * */ void RELAPACK_sgetrf( - const blasint *m, const blasint *n, - float *A, const blasint *ldA, blasint *ipiv, - blasint *info + const int *m, const int *n, + float *A, const int *ldA, int *ipiv, + int *info ) { // Check arguments @@ -21,24 +22,26 @@ void RELAPACK_sgetrf( *info = -1; else if (*n < 0) *info = -2; - else if (*ldA < MAX(1, *m)) + else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("SGETRF", &minfo, strlen("SGETRF")); + const int minfo = -*info; + LAPACK(xerbla)("SGETRF", &minfo); return; } - const blasint sn = MIN(*m, *n); + + const int sn = MIN(*m, *n); + RELAPACK_sgetrf_rec(m, &sn, A, ldA, ipiv, info); // Right remainder if (*m < *n) { // Constants const float ONE[] = { 1. }; - const blasint iONE[] = { 1. }; + const int iONE[] = { 1. }; // Splitting - const blasint rn = *n - *m; + const int rn = *n - *m; // A_L A_R const float *const A_L = A; @@ -54,10 +57,11 @@ void RELAPACK_sgetrf( /** sgetrf's recursive compute kernel */ static void RELAPACK_sgetrf_rec( - const blasint *m, const blasint *n, - float *A, const blasint *ldA, blasint *ipiv, - blasint *info + const int *m, const int *n, + float *A, const int *ldA, int *ipiv, + int *info ) { + if (*n <= MAX(CROSSOVER_SGETRF, 1)) { // Unblocked LAPACK(sgetf2)(m, n, A, ldA, ipiv, info); @@ -67,12 +71,13 @@ static void RELAPACK_sgetrf_rec( // Constants const float ONE[] = { 1. }; const float MONE[] = { -1. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; // Splitting - const blasint n1 = SREC_SPLIT(*n); - const blasint n2 = *n - n1; - const blasint m2 = *m - n1; + const int n1 = SREC_SPLIT(*n); + const int n2 = *n - n1; + const int m2 = *m - n1; + // A_L A_R float *const A_L = A; float *const A_R = A + *ldA * n1; @@ -86,8 +91,8 @@ static void RELAPACK_sgetrf_rec( // ipiv_T // ipiv_B - blasint *const ipiv_T = ipiv; - blasint *const ipiv_B = ipiv + n1; + int *const ipiv_T = ipiv; + int *const ipiv_B = ipiv + n1; // recursion(A_L, ipiv_T) RELAPACK_sgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); @@ -106,7 +111,7 @@ static void RELAPACK_sgetrf_rec( // apply pivots to A_BL LAPACK(slaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); // shift pivots - blasint i; + int i; for (i = 0; i < n2; i++) ipiv_B[i] += n1; } diff --git a/relapack/src/slauum.c b/relapack/src/slauum.c index 79212817f..280f141b3 100644 --- a/relapack/src/slauum.c +++ b/relapack/src/slauum.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_slauum_rec(const char *, const blasint *, float *, - const blasint *, blasint *); +static void RELAPACK_slauum_rec(const char *, const int *, float *, + const int *, int *); /** SLAUUM computes the product U * U**T or L**T * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. @@ -11,14 +11,14 @@ static void RELAPACK_slauum_rec(const char *, const blasint *, float *, * http://www.netlib.org/lapack/explore-html/dd/d5a/slauum_8f.html * */ void RELAPACK_slauum( - const char *uplo, const blasint *n, - float *A, const blasint *ldA, - blasint *info + const char *uplo, const int *n, + float *A, const int *ldA, + int *info ) { // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_slauum( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("SLAUUM", &minfo, strlen("SLAUUM")); + const int minfo = -*info; + LAPACK(xerbla)("SLAUUM", &minfo); return; } @@ -42,9 +42,9 @@ void RELAPACK_slauum( /** slauum's recursive compute kernel */ static void RELAPACK_slauum_rec( - const char *uplo, const blasint *n, - float *A, const blasint *ldA, - blasint *info + const char *uplo, const int *n, + float *A, const int *ldA, + int *info ) { if (*n <= MAX(CROSSOVER_SLAUUM, 1)) { @@ -57,8 +57,8 @@ static void RELAPACK_slauum_rec( const float ONE[] = { 1. }; // Splitting - const blasint n1 = SREC_SPLIT(*n); - const blasint n2 = *n - n1; + const int n1 = SREC_SPLIT(*n); + const int n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/spbtrf.c b/relapack/src/spbtrf.c index 26804dcc2..ee0a5546e 100644 --- a/relapack/src/spbtrf.c +++ b/relapack/src/spbtrf.c @@ -1,8 +1,8 @@ #include "relapack.h" #include "stdlib.h" -static void RELAPACK_spbtrf_rec(const char *, const blasint *, const blasint *, - float *, const blasint *, float *, const blasint *, blasint *); +static void RELAPACK_spbtrf_rec(const char *, const int *, const int *, + float *, const int *, float *, const int *, int *); /** SPBTRF computes the Cholesky factorization of a real symmetric positive definite band matrix A. @@ -12,14 +12,14 @@ static void RELAPACK_spbtrf_rec(const char *, const blasint *, const blasint *, * http://www.netlib.org/lapack/explore-html/d1/d22/spbtrf_8f.html * */ void RELAPACK_spbtrf( - const char *uplo, const blasint *n, const blasint *kd, - float *Ab, const blasint *ldAb, - blasint *info + const char *uplo, const int *n, const int *kd, + float *Ab, const int *ldAb, + int *info ) { // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -30,8 +30,8 @@ void RELAPACK_spbtrf( else if (*ldAb < *kd + 1) *info = -5; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("SPBTRF", &minfo, strlen("SPBTRF")); + const int minfo = -*info; + LAPACK(xerbla)("SPBTRF", &minfo); return; } @@ -42,9 +42,9 @@ void RELAPACK_spbtrf( const float ZERO[] = { 0. }; // Allocate work space - const blasint n1 = SREC_SPLIT(*n); - const blasint mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; - const blasint nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + const int n1 = SREC_SPLIT(*n); + const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; float *Work = malloc(mWork * nWork * sizeof(float)); LAPACK(slaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); @@ -58,10 +58,10 @@ void RELAPACK_spbtrf( /** spbtrf's recursive compute kernel */ static void RELAPACK_spbtrf_rec( - const char *uplo, const blasint *n, const blasint *kd, - float *Ab, const blasint *ldAb, - float *Work, const blasint *ldWork, - blasint *info + const char *uplo, const int *n, const int *kd, + float *Ab, const int *ldAb, + float *Work, const int *ldWork, + int *info ){ if (*n <= MAX(CROSSOVER_SPBTRF, 1)) { @@ -75,12 +75,12 @@ static void RELAPACK_spbtrf_rec( const float MONE[] = { -1. }; // Unskew A - const blasint ldA[] = { *ldAb - 1 }; + const int ldA[] = { *ldAb - 1 }; float *const A = Ab + ((*uplo == 'L') ? 0 : *kd); // Splitting - const blasint n1 = MIN(SREC_SPLIT(*n), *kd); - const blasint n2 = *n - n1; + const int n1 = MIN(SREC_SPLIT(*n), *kd); + const int n2 = *n - n1; // * * // * Ab_BR @@ -99,8 +99,8 @@ static void RELAPACK_spbtrf_rec( return; // Banded splitting - const blasint n21 = MIN(n2, *kd - n1); - const blasint n22 = MIN(n2 - n21, *kd); + const int n21 = MIN(n2, *kd - n1); + const int n22 = MIN(n2 - n21, *kd); // n1 n21 n22 // n1 * A_TRl A_TRr diff --git a/relapack/src/spotrf.c b/relapack/src/spotrf.c index b22e917f7..2a609321b 100644 --- a/relapack/src/spotrf.c +++ b/relapack/src/spotrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_spotrf_rec(const char *, const blasint *, float *, - const blasint *, blasint *); +static void RELAPACK_spotrf_rec(const char *, const int *, float *, + const int *, int *); /** SPOTRF computes the Cholesky factorization of a real symmetric positive definite matrix A. @@ -11,14 +11,14 @@ static void RELAPACK_spotrf_rec(const char *, const blasint *, float *, * http://www.netlib.org/lapack/explore-html/d0/da2/spotrf_8f.html * */ void RELAPACK_spotrf( - const char *uplo, const blasint *n, - float *A, const blasint *ldA, - blasint *info + const char *uplo, const int *n, + float *A, const int *ldA, + int *info ) { // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_spotrf( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("SPOTRF", &minfo, strlen("SPOTRF")); + const int minfo = -*info; + LAPACK(xerbla)("SPOTRF", &minfo); return; } @@ -42,9 +42,9 @@ void RELAPACK_spotrf( /** spotrf's recursive compute kernel */ static void RELAPACK_spotrf_rec( - const char *uplo, const blasint *n, - float *A, const blasint *ldA, - blasint *info + const char *uplo, const int *n, + float *A, const int *ldA, + int *info ) { if (*n <= MAX(CROSSOVER_SPOTRF, 1)) { @@ -58,8 +58,8 @@ static void RELAPACK_spotrf_rec( const float MONE[] = { -1. }; // Splitting - const blasint n1 = SREC_SPLIT(*n); - const blasint n2 = *n - n1; + const int n1 = SREC_SPLIT(*n); + const int n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/ssygst.c b/relapack/src/ssygst.c index 4259f9031..7f145cdec 100644 --- a/relapack/src/ssygst.c +++ b/relapack/src/ssygst.c @@ -3,9 +3,9 @@ #include "stdlib.h" #endif -static void RELAPACK_ssygst_rec(const blasint *, const char *, const blasint *, - float *, const blasint *, const float *, const blasint *, - float *, const blasint *, blasint *); +static void RELAPACK_ssygst_rec(const int *, const char *, const int *, + float *, const int *, const float *, const int *, + float *, const int *, int *); /** SSYGST reduces a real symmetric-definite generalized eigenproblem to standard form. @@ -15,14 +15,14 @@ static void RELAPACK_ssygst_rec(const blasint *, const char *, const blasint *, * http://www.netlib.org/lapack/explore-html/d8/d78/ssygst_8f.html * */ void RELAPACK_ssygst( - const blasint *itype, const char *uplo, const blasint *n, - float *A, const blasint *ldA, const float *B, const blasint *ldB, - blasint *info + const int *itype, const char *uplo, const int *n, + float *A, const int *ldA, const float *B, const int *ldB, + int *info ) { // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (*itype < 1 || *itype > 3) *info = -1; @@ -35,8 +35,8 @@ void RELAPACK_ssygst( else if (*ldB < MAX(1, *n)) *info = -7; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("SSYGST", &minfo, strlen("SSYGST")); + const int minfo = -*info; + LAPACK(xerbla)("SSYGST", &minfo); return; } @@ -45,9 +45,9 @@ void RELAPACK_ssygst( // Allocate work space float *Work = NULL; - blasint lWork = 0; + int lWork = 0; #if XSYGST_ALLOW_MALLOC - const blasint n1 = SREC_SPLIT(*n); + const int n1 = SREC_SPLIT(*n); lWork = n1 * (*n - n1); Work = malloc(lWork * sizeof(float)); if (!Work) @@ -67,9 +67,9 @@ void RELAPACK_ssygst( /** ssygst's recursive compute kernel */ static void RELAPACK_ssygst_rec( - const blasint *itype, const char *uplo, const blasint *n, - float *A, const blasint *ldA, const float *B, const blasint *ldB, - float *Work, const blasint *lWork, blasint *info + const int *itype, const char *uplo, const int *n, + float *A, const int *ldA, const float *B, const int *ldB, + float *Work, const int *lWork, int *info ) { if (*n <= MAX(CROSSOVER_SSYGST, 1)) { @@ -84,14 +84,14 @@ static void RELAPACK_ssygst_rec( const float MONE[] = { -1. }; const float HALF[] = { .5 }; const float MHALF[] = { -.5 }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; // Loop iterator - blasint i; + int i; // Splitting - const blasint n1 = SREC_SPLIT(*n); - const blasint n2 = *n - n1; + const int n1 = SREC_SPLIT(*n); + const int n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/ssytrf.c b/relapack/src/ssytrf.c index 9fe7ce4a6..8a4fad9f2 100644 --- a/relapack/src/ssytrf.c +++ b/relapack/src/ssytrf.c @@ -2,8 +2,9 @@ #if XSYTRF_ALLOW_MALLOC #include #endif -static void RELAPACK_ssytrf_rec(const char *, const blasint *, const blasint *, blasint *, - float *, const blasint *, blasint *, float *, const blasint *, blasint *); + +static void RELAPACK_ssytrf_rec(const char *, const int *, const int *, int *, + float *, const int *, int *, float *, const int *, int *); /** SSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method. @@ -13,21 +14,21 @@ static void RELAPACK_ssytrf_rec(const char *, const blasint *, const blasint *, * http://www.netlib.org/lapack/explore-html/da/de9/ssytrf_8f.html * */ void RELAPACK_ssytrf( - const char *uplo, const blasint *n, - float *A, const blasint *ldA, blasint *ipiv, - float *Work, const blasint *lWork, blasint *info + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info ) { // Required work size - const blasint cleanlWork = *n * (*n / 2); - blasint minlWork = cleanlWork; + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -54,8 +55,8 @@ void RELAPACK_ssytrf( #endif if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("SSYTRF", &minfo, strlen("SSYTRF")); + const int minfo = -*info; + LAPACK(xerbla)("SSYTRF", &minfo); return; } @@ -63,7 +64,7 @@ void RELAPACK_ssytrf( const char cleanuplo = lower ? 'L' : 'U'; // Dummy arguments - blasint nout; + int nout; // Recursive kernel RELAPACK_ssytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -77,13 +78,13 @@ void RELAPACK_ssytrf( /** ssytrf's recursive compute kernel */ static void RELAPACK_ssytrf_rec( - const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, - float *A, const blasint *ldA, blasint *ipiv, - float *Work, const blasint *ldWork, blasint *info + const char *uplo, const int *n_full, const int *n, int *n_out, + float *A, const int *ldA, int *ipiv, + float *Work, const int *ldWork, int *info ) { // top recursion level? - const blasint top = *n_full == *n; + const int top = *n_full == *n; if (*n <= MAX(CROSSOVER_SSYTRF, 3)) { // Unblocked @@ -95,34 +96,34 @@ static void RELAPACK_ssytrf_rec( return; } - blasint info1, info2; + int info1, info2; // Constants const float ONE[] = { 1. }; const float MONE[] = { -1. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; // Loop iterator - blasint i; + int i; - const blasint n_rest = *n_full - *n; + const int n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - blasint n1 = SREC_SPLIT(*n); - blasint n2 = *n - n1; + int n1 = SREC_SPLIT(*n); + int n2 = *n - n1; // Work_L * float *const Work_L = Work; // recursion(A_L) - blasint n1_out; + int n1_out; RELAPACK_ssytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const blasint n_full2 = *n_full - n1; + const int n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -138,23 +139,23 @@ static void RELAPACK_ssytrf_rec( // (top recursion level: use Work as Work_BR) float *const Work_BL = Work + n1; float *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; - const blasint ldWork_BR = top ? n2 : *ldWork; + const int ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - blasint *const ipiv_B = ipiv + n1; + int *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_sgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(sgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - blasint n2_out; + int n2_out; RELAPACK_ssytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const blasint n_restp1 = n_rest + 1; + const int n_restp1 = n_rest + 1; // last column of A_BR float *const A_BR_r = A_BR + *ldA * n2_out + n2_out; @@ -181,22 +182,22 @@ static void RELAPACK_ssytrf_rec( *n_out = n1 + n2; } else { // Splitting (setup) - blasint n2 = SREC_SPLIT(*n); - blasint n1 = *n - n2; + int n2 = SREC_SPLIT(*n); + int n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) float *const Work_R = top ? Work : Work + *ldWork * n1; // recursion(A_R) - blasint n2_out; + int n2_out; RELAPACK_ssytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const blasint n2_diff = n2 - n2_out; + const int n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const blasint n_full1 = *n_full - n2; + const int n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -212,19 +213,19 @@ static void RELAPACK_ssytrf_rec( // (top recursion level: Work_R was Work) float *const Work_L = Work; float *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; - const blasint ldWork_L = top ? n1 : *ldWork; + const int ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_sgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(sgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - blasint n1_out; + int n1_out; RELAPACK_ssytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const blasint n_restp1 = n_rest + 1; + const int n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(sgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/ssytrf_rec2.c b/relapack/src/ssytrf_rec2.c index 13856f064..edc9269ec 100644 --- a/relapack/src/ssytrf_rec2.c +++ b/relapack/src/ssytrf_rec2.c @@ -14,7 +14,7 @@ /* Table of constant values */ -static blasint c__1 = 1; +static int c__1 = 1; static float c_b8 = -1.f; static float c_b9 = 1.f; @@ -25,32 +25,32 @@ static float c_b9 = 1.f; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. * */ -/* Subroutine */ void RELAPACK_ssytrf_rec2(char *uplo, blasint *n, blasint * - nb, blasint *kb, float *a, blasint *lda, blasint *ipiv, float *w, - int *ldw, blasint *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_ssytrf_rec2(char *uplo, int *n, int * + nb, int *kb, float *a, int *lda, int *ipiv, float *w, + int *ldw, int *info, ftnlen uplo_len) { /* System generated locals */ - blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; float r__1, r__2, r__3; /* Builtin functions */ double sqrt(double); /* Local variables */ - static blasint j, k; + static int j, k; static float t, r1, d11, d21, d22; - static blasint jj, kk, jp, kp, kw, kkw, imax, jmax; + static int jj, kk, jp, kp, kw, kkw, imax, jmax; static float alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ blasint sscal_(int *, float *, float *, blasint *), - sgemv_(char *, blasint *, blasint *, float *, float *, blasint *, - float *, blasint *, float *, float *, blasint *, ftnlen); - static blasint kstep; - extern /* Subroutine */ blasint scopy_(int *, float *, blasint *, float *, - blasint *), sswap_(int *, float *, blasint *, float *, blasint * + extern /* Subroutine */ int sscal_(int *, float *, float *, int *), + sgemv_(char *, int *, int *, float *, float *, int *, + float *, int *, float *, float *, int *, ftnlen); + static int kstep; + extern /* Subroutine */ int scopy_(int *, float *, int *, float *, + int *), sswap_(int *, float *, int *, float *, int * ); static float absakk; - extern blasint isamax_(int *, float *, blasint *); + extern int isamax_(int *, float *, int *); static float colmax, rowmax; /* Parameter adjustments */ diff --git a/relapack/src/ssytrf_rook.c b/relapack/src/ssytrf_rook.c index abcf29d1c..040df2484 100644 --- a/relapack/src/ssytrf_rook.c +++ b/relapack/src/ssytrf_rook.c @@ -3,8 +3,8 @@ #include #endif -static void RELAPACK_ssytrf_rook_rec(const char *, const blasint *, const blasint *, blasint *, - float *, const blasint *, blasint *, float *, const blasint *, blasint *); +static void RELAPACK_ssytrf_rook_rec(const char *, const int *, const int *, int *, + float *, const int *, int *, float *, const int *, int *); /** SSYTRF_ROOK computes the factorization of a real symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_ssytrf_rook_rec(const char *, const blasint *, const blasin * http://www.netlib.org/lapack/explore-html/de/da4/ssytrf__rook_8f.html * */ void RELAPACK_ssytrf_rook( - const char *uplo, const blasint *n, - float *A, const blasint *ldA, blasint *ipiv, - float *Work, const blasint *lWork, blasint *info + const char *uplo, const int *n, + float *A, const int *ldA, int *ipiv, + float *Work, const int *lWork, int *info ) { // Required work size - const blasint cleanlWork = *n * (*n / 2); - blasint minlWork = cleanlWork; + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_ssytrf_rook( #endif if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("SSYTRF", &minfo, strlen("SSYTRF")); + const int minfo = -*info; + LAPACK(xerbla)("SSYTRF", &minfo); return; } @@ -64,7 +64,7 @@ void RELAPACK_ssytrf_rook( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - blasint nout; + int nout; // Recursive kernel RELAPACK_ssytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_ssytrf_rook( /** ssytrf_rook's recursive compute kernel */ static void RELAPACK_ssytrf_rook_rec( - const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, - float *A, const blasint *ldA, blasint *ipiv, - float *Work, const blasint *ldWork, blasint *info + const char *uplo, const int *n_full, const int *n, int *n_out, + float *A, const int *ldA, int *ipiv, + float *Work, const int *ldWork, int *info ) { // top recursion level? - const blasint top = *n_full == *n; + const int top = *n_full == *n; if (*n <= MAX(CROSSOVER_SSYTRF_ROOK, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_ssytrf_rook_rec( return; } - blasint info1, info2; + int info1, info2; // Constants const float ONE[] = { 1. }; const float MONE[] = { -1. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; - const blasint n_rest = *n_full - *n; + const int n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - blasint n1 = SREC_SPLIT(*n); - blasint n2 = *n - n1; + int n1 = SREC_SPLIT(*n); + int n2 = *n - n1; // Work_L * float *const Work_L = Work; // recursion(A_L) - blasint n1_out; + int n1_out; RELAPACK_ssytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const blasint n_full2 = *n_full - n1; + const int n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_ssytrf_rook_rec( // (top recursion level: use Work as Work_BR) float *const Work_BL = Work + n1; float *const Work_BR = top ? Work : Work + *ldWork * n1 + n1; - const blasint ldWork_BR = top ? n2 : *ldWork; + const int ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - blasint *const ipiv_B = ipiv + n1; + int *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_sgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(sgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - blasint n2_out; + int n2_out; RELAPACK_ssytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const blasint n_restp1 = n_rest + 1; + const int n_restp1 = n_rest + 1; // last column of A_BR float *const A_BR_r = A_BR + *ldA * n2_out + n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_ssytrf_rook_rec( n2 = n2_out; // shift pivots - blasint i; + int i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_ssytrf_rook_rec( *n_out = n1 + n2; } else { // Splitting (setup) - blasint n2 = SREC_SPLIT(*n); - blasint n1 = *n - n2; + int n2 = SREC_SPLIT(*n); + int n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) float *const Work_R = top ? Work : Work + *ldWork * n1; // recursion(A_R) - blasint n2_out; + int n2_out; RELAPACK_ssytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const blasint n2_diff = n2 - n2_out; + const int n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const blasint n_full1 = *n_full - n2; + const int n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_ssytrf_rook_rec( // (top recursion level: Work_R was Work) float *const Work_L = Work; float *const Work_TR = Work + *ldWork * (top ? n2_diff : n1) + n_rest; - const blasint ldWork_L = top ? n1 : *ldWork; + const int ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_sgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(sgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - blasint n1_out; + int n1_out; RELAPACK_ssytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const blasint n_restp1 = n_rest + 1; + const int n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(sgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/ssytrf_rook_rec2.c b/relapack/src/ssytrf_rook_rec2.c index 41659cb3e..3308826d7 100644 --- a/relapack/src/ssytrf_rook_rec2.c +++ b/relapack/src/ssytrf_rook_rec2.c @@ -14,7 +14,7 @@ /* Table of constant values */ -static blasint c__1 = 1; +static int c__1 = 1; static float c_b9 = -1.f; static float c_b10 = 1.f; @@ -25,39 +25,39 @@ static float c_b10 = 1.f; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. * */ -/* Subroutine */ void RELAPACK_ssytrf_rook_rec2(char *uplo, blasint *n, - int *nb, blasint *kb, float *a, blasint *lda, blasint *ipiv, float * - w, blasint *ldw, blasint *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_ssytrf_rook_rec2(char *uplo, int *n, + int *nb, int *kb, float *a, int *lda, int *ipiv, float * + w, int *ldw, int *info, ftnlen uplo_len) { /* System generated locals */ - blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2; float r__1; /* Builtin functions */ double sqrt(double); /* Local variables */ - static blasint j, k, p; + static int j, k, p; static float t, r1, d11, d12, d21, d22; - static blasint ii, jj, kk, kp, kw, jp1, jp2, kkw; + static int ii, jj, kk, kp, kw, jp1, jp2, kkw; static logical done; - static blasint imax, jmax; + static int imax, jmax; static float alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ blasint sscal_(int *, float *, float *, blasint *); + extern /* Subroutine */ int sscal_(int *, float *, float *, int *); static float sfmin; - static blasint itemp; - extern /* Subroutine */ blasint sgemv_(char *, blasint *, blasint *, float *, - float *, blasint *, float *, blasint *, float *, float *, blasint *, + static int itemp; + extern /* Subroutine */ int sgemv_(char *, int *, int *, float *, + float *, int *, float *, int *, float *, float *, int *, ftnlen); - static blasint kstep; + static int kstep; static float stemp; - extern /* Subroutine */ blasint scopy_(int *, float *, blasint *, float *, - blasint *), sswap_(int *, float *, blasint *, float *, blasint * + extern /* Subroutine */ int scopy_(int *, float *, int *, float *, + int *), sswap_(int *, float *, int *, float *, int * ); static float absakk; extern double slamch_(char *, ftnlen); - extern blasint isamax_(int *, float *, blasint *); + extern int isamax_(int *, float *, int *); static float colmax, rowmax; /* Parameter adjustments */ diff --git a/relapack/src/stgsyl.c b/relapack/src/stgsyl.c index 6bace9f17..1870fb928 100644 --- a/relapack/src/stgsyl.c +++ b/relapack/src/stgsyl.c @@ -1,11 +1,11 @@ #include "relapack.h" #include -static void RELAPACK_stgsyl_rec(const char *, const blasint *, const blasint *, - const blasint *, const float *, const blasint *, const float *, const blasint *, - float *, const blasint *, const float *, const blasint *, const float *, - const blasint *, float *, const blasint *, float *, float *, float *, blasint *, blasint *, - blasint *); +static void RELAPACK_stgsyl_rec(const char *, const int *, const int *, + const int *, const float *, const int *, const float *, const int *, + float *, const int *, const float *, const int *, const float *, + const int *, float *, const int *, float *, float *, float *, int *, int *, + int *); /** STGSYL solves the generalized Sylvester equation. @@ -15,21 +15,21 @@ static void RELAPACK_stgsyl_rec(const char *, const blasint *, const blasint *, * http://www.netlib.org/lapack/explore-html/dc/d67/stgsyl_8f.html * */ void RELAPACK_stgsyl( - const char *trans, const blasint *ijob, const blasint *m, const blasint *n, - const float *A, const blasint *ldA, const float *B, const blasint *ldB, - float *C, const blasint *ldC, - const float *D, const blasint *ldD, const float *E, const blasint *ldE, - float *F, const blasint *ldF, + const char *trans, const int *ijob, const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, + const float *D, const int *ldD, const float *E, const int *ldE, + float *F, const int *ldF, float *scale, float *dif, - float *Work, const blasint *lWork, blasint *iWork, blasint *info + float *Work, const int *lWork, int *iWork, int *info ) { // Parse arguments - const blasint notran = LAPACK(lsame)(trans, "N"); - const blasint tran = LAPACK(lsame)(trans, "T"); + const int notran = LAPACK(lsame)(trans, "N"); + const int tran = LAPACK(lsame)(trans, "T"); // Compute work buffer size - blasint lwmin = 1; + int lwmin = 1; if (notran && (*ijob == 1 || *ijob == 2)) lwmin = MAX(1, 2 * *m * *n); *info = 0; @@ -58,8 +58,8 @@ void RELAPACK_stgsyl( else if (*lWork < lwmin && *lWork != -1) *info = -20; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("STGSYL", &minfo, strlen("STGSYL")); + const int minfo = -*info; + LAPACK(xerbla)("STGSYL", &minfo); return; } @@ -75,8 +75,8 @@ void RELAPACK_stgsyl( // Constant const float ZERO[] = { 0. }; - blasint isolve = 1; - blasint ifunc = 0; + int isolve = 1; + int ifunc = 0; if (notran) { if (*ijob >= 3) { ifunc = *ijob - 2; @@ -87,12 +87,12 @@ void RELAPACK_stgsyl( } float scale2; - blasint iround; + int iround; for (iround = 1; iround <= isolve; iround++) { *scale = 1; float dscale = 0; float dsum = 1; - blasint pq; + int pq; RELAPACK_stgsyl_rec(&cleantrans, &ifunc, m, n, A, ldA, B, ldB, C, ldC, D, ldD, E, ldE, F, ldF, scale, &dsum, &dscale, iWork, &pq, info); if (dscale != 0) { if (*ijob == 1 || *ijob == 3) @@ -121,13 +121,13 @@ void RELAPACK_stgsyl( /** stgsyl's recursive vompute kernel */ static void RELAPACK_stgsyl_rec( - const char *trans, const blasint *ifunc, const blasint *m, const blasint *n, - const float *A, const blasint *ldA, const float *B, const blasint *ldB, - float *C, const blasint *ldC, - const float *D, const blasint *ldD, const float *E, const blasint *ldE, - float *F, const blasint *ldF, + const char *trans, const int *ifunc, const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, + const float *D, const int *ldD, const float *E, const int *ldE, + float *F, const int *ldF, float *scale, float *dsum, float *dscale, - blasint *iWork, blasint *pq, blasint *info + int *iWork, int *pq, int *info ) { if (*m <= MAX(CROSSOVER_STGSYL, 1) && *n <= MAX(CROSSOVER_STGSYL, 1)) { @@ -139,20 +139,20 @@ static void RELAPACK_stgsyl_rec( // Constants const float ONE[] = { 1. }; const float MONE[] = { -1. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; // Outputs float scale1[] = { 1. }; float scale2[] = { 1. }; - blasint info1[] = { 0 }; - blasint info2[] = { 0 }; + int info1[] = { 0 }; + int info2[] = { 0 }; if (*m > *n) { // Splitting - blasint m1 = SREC_SPLIT(*m); + int m1 = SREC_SPLIT(*m); if (A[m1 + *ldA * (m1 - 1)]) m1++; - const blasint m2 = *m - m1; + const int m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -210,10 +210,10 @@ static void RELAPACK_stgsyl_rec( } } else { // Splitting - blasint n1 = SREC_SPLIT(*n); + int n1 = SREC_SPLIT(*n); if (B[n1 + *ldB * (n1 - 1)]) n1++; - const blasint n2 = *n - n1; + const int n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/strsyl.c b/relapack/src/strsyl.c index 012fb3548..83947ef1a 100644 --- a/relapack/src/strsyl.c +++ b/relapack/src/strsyl.c @@ -1,8 +1,8 @@ #include "relapack.h" -static void RELAPACK_strsyl_rec(const char *, const char *, const blasint *, - const blasint *, const blasint *, const float *, const blasint *, const float *, - const blasint *, float *, const blasint *, float *, blasint *); +static void RELAPACK_strsyl_rec(const char *, const char *, const int *, + const int *, const int *, const float *, const int *, const float *, + const int *, float *, const int *, float *, int *); /** STRSYL solves the real Sylvester matrix equation. @@ -12,20 +12,20 @@ static void RELAPACK_strsyl_rec(const char *, const char *, const blasint *, * http://www.netlib.org/lapack/explore-html/d4/d7d/strsyl_8f.html * */ void RELAPACK_strsyl( - const char *tranA, const char *tranB, const blasint *isgn, - const blasint *m, const blasint *n, - const float *A, const blasint *ldA, const float *B, const blasint *ldB, - float *C, const blasint *ldC, float *scale, - blasint *info + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, float *scale, + int *info ) { // Check arguments - const blasint notransA = LAPACK(lsame)(tranA, "N"); - const blasint transA = LAPACK(lsame)(tranA, "T"); - const blasint ctransA = LAPACK(lsame)(tranA, "C"); - const blasint notransB = LAPACK(lsame)(tranB, "N"); - const blasint transB = LAPACK(lsame)(tranB, "T"); - const blasint ctransB = LAPACK(lsame)(tranB, "C"); + const int notransA = LAPACK(lsame)(tranA, "N"); + const int transA = LAPACK(lsame)(tranA, "T"); + const int ctransA = LAPACK(lsame)(tranA, "C"); + const int notransB = LAPACK(lsame)(tranB, "N"); + const int transB = LAPACK(lsame)(tranB, "T"); + const int ctransB = LAPACK(lsame)(tranB, "C"); *info = 0; if (!transA && !ctransA && !notransA) *info = -1; @@ -44,8 +44,8 @@ void RELAPACK_strsyl( else if (*ldC < MAX(1, *m)) *info = -11; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("STRSYL", &minfo, strlen("STRSYL")); + const int minfo = -*info; + LAPACK(xerbla)("STRSYL", &minfo); return; } @@ -60,11 +60,11 @@ void RELAPACK_strsyl( /** strsyl's recursive compute kernel */ static void RELAPACK_strsyl_rec( - const char *tranA, const char *tranB, const blasint *isgn, - const blasint *m, const blasint *n, - const float *A, const blasint *ldA, const float *B, const blasint *ldB, - float *C, const blasint *ldC, float *scale, - blasint *info + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const float *A, const int *ldA, const float *B, const int *ldB, + float *C, const int *ldC, float *scale, + int *info ) { if (*m <= MAX(CROSSOVER_STRSYL, 1) && *n <= MAX(CROSSOVER_STRSYL, 1)) { @@ -77,20 +77,20 @@ static void RELAPACK_strsyl_rec( const float ONE[] = { 1. }; const float MONE[] = { -1. }; const float MSGN[] = { -*isgn }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; // Outputs float scale1[] = { 1. }; float scale2[] = { 1. }; - blasint info1[] = { 0 }; - blasint info2[] = { 0 }; + int info1[] = { 0 }; + int info2[] = { 0 }; if (*m > *n) { // Splitting - blasint m1 = SREC_SPLIT(*m); + int m1 = SREC_SPLIT(*m); if (A[m1 + *ldA * (m1 - 1)]) m1++; - const blasint m2 = *m - m1; + const int m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -126,10 +126,10 @@ static void RELAPACK_strsyl_rec( } } else { // Splitting - blasint n1 = SREC_SPLIT(*n); + int n1 = SREC_SPLIT(*n); if (B[n1 + *ldB * (n1 - 1)]) n1++; - const blasint n2 = *n - n1; + const int n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/strsyl_rec2.c b/relapack/src/strsyl_rec2.c index 37a24c7dc..6d40a475d 100644 --- a/relapack/src/strsyl_rec2.c +++ b/relapack/src/strsyl_rec2.c @@ -14,48 +14,48 @@ /* Table of constant values */ -static blasint c__1 = 1; -static blasint c_false = FALSE_; -static blasint c__2 = 2; +static int c__1 = 1; +static int c_false = FALSE_; +static int c__2 = 2; static float c_b26 = 1.f; static float c_b30 = 0.f; -static blasint c_true = TRUE_; +static int c_true = TRUE_; -void RELAPACK_strsyl_rec2(char *trana, char *tranb, blasint *isgn, int - *m, blasint *n, float *a, blasint *lda, float *b, blasint *ldb, float * - c__, blasint *ldc, float *scale, blasint *info, ftnlen trana_len, +void RELAPACK_strsyl_rec2(char *trana, char *tranb, int *isgn, int + *m, int *n, float *a, int *lda, float *b, int *ldb, float * + c__, int *ldc, float *scale, int *info, ftnlen trana_len, ftnlen tranb_len) { /* System generated locals */ - blasint a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4; float r__1, r__2; /* Local variables */ - static blasint j, k, l; + static int j, k, l; static float x[4] /* was [2][2] */; - static blasint k1, k2, l1, l2; + static int k1, k2, l1, l2; static float a11, db, da11, vec[4] /* was [2][2] */, dum[1], eps, sgn; - static blasint ierr; + static int ierr; static float smin; - extern float sdot_(int *, float *, blasint *, float *, blasint *); + extern float sdot_(int *, float *, int *, float *, int *); static float suml, sumr; - extern blasint lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ blasint sscal_(int *, float *, float *, blasint *); - static blasint knext, lnext; + extern int lsame_(char *, char *, ftnlen, ftnlen); + extern /* Subroutine */ int sscal_(int *, float *, float *, int *); + static int knext, lnext; static float xnorm; - extern /* Subroutine */ blasint slaln2_(int *, blasint *, blasint *, float - *, float *, float *, blasint *, float *, float *, float *, blasint *, - float *, float *, float *, blasint *, float *, float *, blasint *), - slasy2_(int *, blasint *, blasint *, blasint *, blasint *, - float *, blasint *, float *, blasint *, float *, blasint *, float *, - float *, blasint *, float *, blasint *), slabad_(float *, float *); + extern /* Subroutine */ int slaln2_(int *, int *, int *, float + *, float *, float *, int *, float *, float *, float *, int *, + float *, float *, float *, int *, float *, float *, int *), + slasy2_(int *, int *, int *, int *, int *, + float *, int *, float *, int *, float *, int *, float *, + float *, int *, float *, int *), slabad_(float *, float *); static float scaloc; - extern float slamch_(char *, ftnlen), slange_(char *, blasint *, - blasint *, float *, blasint *, float *, ftnlen); - extern /* Subroutine */ blasint xerbla_(char *, blasint *, ftnlen); + extern float slamch_(char *, ftnlen), slange_(char *, int *, + int *, float *, int *, float *, ftnlen); + extern /* Subroutine */ int xerbla_(char *, int *, ftnlen); static float bignum; - static blasint notrna, notrnb; + static int notrna, notrnb; static float smlnum; /* Parameter adjustments */ diff --git a/relapack/src/strtri.c b/relapack/src/strtri.c index 18d11f5eb..d35bbd49f 100644 --- a/relapack/src/strtri.c +++ b/relapack/src/strtri.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_strtri_rec(const char *, const char *, const blasint *, - float *, const blasint *, blasint *); +static void RELAPACK_strtri_rec(const char *, const char *, const int *, + float *, const int *, int *); /** CTRTRI computes the inverse of a real upper or lower triangular matrix A. @@ -11,16 +11,16 @@ static void RELAPACK_strtri_rec(const char *, const char *, const blasint *, * http://www.netlib.org/lapack/explore-html/de/d76/strtri_8f.html * */ void RELAPACK_strtri( - const char *uplo, const char *diag, const blasint *n, - float *A, const blasint *ldA, - blasint *info + const char *uplo, const char *diag, const int *n, + float *A, const int *ldA, + int *info ) { // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); - const blasint nounit = LAPACK(lsame)(diag, "N"); - const blasint unit = LAPACK(lsame)(diag, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + const int nounit = LAPACK(lsame)(diag, "N"); + const int unit = LAPACK(lsame)(diag, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -31,8 +31,8 @@ void RELAPACK_strtri( else if (*ldA < MAX(1, *n)) *info = -5; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("STRTRI", &minfo, strlen("STRTRI")); + const int minfo = -*info; + LAPACK(xerbla)("STRTRI", &minfo); return; } @@ -42,7 +42,7 @@ void RELAPACK_strtri( // check for singularity if (nounit) { - blasint i; + int i; for (i = 0; i < *n; i++) if (A[i + *ldA * i] == 0) { *info = i; @@ -57,9 +57,9 @@ void RELAPACK_strtri( /** strtri's recursive compute kernel */ static void RELAPACK_strtri_rec( - const char *uplo, const char *diag, const blasint *n, - float *A, const blasint *ldA, - blasint *info + const char *uplo, const char *diag, const int *n, + float *A, const int *ldA, + int *info ){ if (*n <= MAX(CROSSOVER_STRTRI, 1)) { @@ -73,8 +73,8 @@ static void RELAPACK_strtri_rec( const float MONE[] = { -1. }; // Splitting - const blasint n1 = SREC_SPLIT(*n); - const blasint n2 = *n - n1; + const int n1 = SREC_SPLIT(*n); + const int n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/zgbtrf.c b/relapack/src/zgbtrf.c index d4ba41753..3aa6bf531 100644 --- a/relapack/src/zgbtrf.c +++ b/relapack/src/zgbtrf.c @@ -1,9 +1,9 @@ #include "relapack.h" #include "stdlib.h" -static void RELAPACK_zgbtrf_rec(const blasint *, const blasint *, const blasint *, - const blasint *, double *, const blasint *, blasint *, double *, const blasint *, double *, - const blasint *, blasint *); +static void RELAPACK_zgbtrf_rec(const int *, const int *, const int *, + const int *, double *, const int *, int *, double *, const int *, double *, + const int *, int *); /** ZGBTRF computes an LU factorization of a complex m-by-n band matrix A using partial pivoting with row interchanges. @@ -13,9 +13,9 @@ static void RELAPACK_zgbtrf_rec(const blasint *, const blasint *, const blasint * http://www.netlib.org/lapack/explore-html/dc/dcb/zgbtrf_8f.html * */ void RELAPACK_zgbtrf( - const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, - double *Ab, const blasint *ldAb, blasint *ipiv, - blasint *info + const int *m, const int *n, const int *kl, const int *ku, + double *Ab, const int *ldAb, int *ipiv, + int *info ) { // Check arguments @@ -31,8 +31,8 @@ void RELAPACK_zgbtrf( else if (*ldAb < 2 * *kl + *ku + 1) *info = -6; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("ZGBTRF", &minfo, strlen("ZGBTRF")); + const int minfo = -*info; + LAPACK(xerbla)("ZGBTRF", &minfo); return; } @@ -40,14 +40,14 @@ void RELAPACK_zgbtrf( const double ZERO[] = { 0., 0. }; // Result upper band width - const blasint kv = *ku + *kl; + const int kv = *ku + *kl; // Unskew A - const blasint ldA[] = { *ldAb - 1 }; + const int ldA[] = { *ldAb - 1 }; double *const A = Ab + 2 * kv; // Zero upper diagonal fill-in elements - blasint i, j; + int i, j; for (j = 0; j < *n; j++) { double *const A_j = A + 2 * *ldA * j; for (i = MAX(0, j - kv); i < j - *ku; i++) @@ -55,11 +55,11 @@ void RELAPACK_zgbtrf( } // Allocate work space - const blasint n1 = ZREC_SPLIT(*n); - const blasint mWorkl = abs ( (kv > n1) ? MAX(1, *m - *kl) : kv); - const blasint nWorkl = abs ( (kv > n1) ? n1 : kv); - const blasint mWorku = abs ( (*kl > n1) ? n1 : *kl); - const blasint nWorku = abs ( (*kl > n1) ? MAX(0, *n - *kl) : *kl); + const int n1 = ZREC_SPLIT(*n); + const int mWorkl = (kv > n1) ? MAX(1, *m - *kl) : kv; + const int nWorkl = (kv > n1) ? n1 : kv; + const int mWorku = (*kl > n1) ? n1 : *kl; + const int nWorku = (*kl > n1) ? MAX(0, *n - *kl) : *kl; double *Workl = malloc(mWorkl * nWorkl * 2 * sizeof(double)); double *Worku = malloc(mWorku * nWorku * 2 * sizeof(double)); LAPACK(zlaset)("L", &mWorkl, &nWorkl, ZERO, ZERO, Workl, &mWorkl); @@ -76,10 +76,10 @@ void RELAPACK_zgbtrf( /** zgbtrf's recursive compute kernel */ static void RELAPACK_zgbtrf_rec( - const blasint *m, const blasint *n, const blasint *kl, const blasint *ku, - double *Ab, const blasint *ldAb, blasint *ipiv, - double *Workl, const blasint *ldWorkl, double *Worku, const blasint *ldWorku, - blasint *info + const int *m, const int *n, const int *kl, const int *ku, + double *Ab, const int *ldAb, int *ipiv, + double *Workl, const int *ldWorkl, double *Worku, const int *ldWorku, + int *info ) { if (*n <= MAX(CROSSOVER_ZGBTRF, 1)) { @@ -91,25 +91,25 @@ static void RELAPACK_zgbtrf_rec( // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; // Loop iterators - blasint i, j; + int i, j; // Output upper band width - const blasint kv = *ku + *kl; + const int kv = *ku + *kl; // Unskew A - const blasint ldA[] = { *ldAb - 1 }; + const int ldA[] = { *ldAb - 1 }; double *const A = Ab + 2 * kv; // Splitting - const blasint n1 = MIN(ZREC_SPLIT(*n), *kl); - const blasint n2 = *n - n1; - const blasint m1 = MIN(n1, *m); - const blasint m2 = *m - m1; - const blasint mn1 = MIN(m1, n1); - const blasint mn2 = MIN(m2, n2); + const int n1 = MIN(ZREC_SPLIT(*n), *kl); + const int n2 = *n - n1; + const int m1 = MIN(n1, *m); + const int m2 = *m - m1; + const int mn1 = MIN(m1, n1); + const int mn2 = MIN(m2, n2); // Ab_L * // Ab_BR @@ -129,14 +129,14 @@ static void RELAPACK_zgbtrf_rec( // ipiv_T // ipiv_B - blasint *const ipiv_T = ipiv; - blasint *const ipiv_B = ipiv + n1; + int *const ipiv_T = ipiv; + int *const ipiv_B = ipiv + n1; // Banded splitting - const blasint n21 = MIN(n2, kv - n1); - const blasint n22 = MIN(n2 - n21, n1); - const blasint m21 = MIN(m2, *kl - m1); - const blasint m22 = MIN(m2 - m21, m1); + const int n21 = MIN(n2, kv - n1); + const int n22 = MIN(n2 - n21, n1); + const int m21 = MIN(m2, *kl - m1); + const int m22 = MIN(m2 - m21, m1); // n1 n21 n22 // m * A_Rl ARr @@ -164,7 +164,7 @@ static void RELAPACK_zgbtrf_rec( // partially redo swaps in A_L for (i = 0; i < mn1; i++) { - const blasint ip = ipiv_T[i] - 1; + const int ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(zswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); @@ -180,7 +180,7 @@ static void RELAPACK_zgbtrf_rec( for (j = 0; j < n22; j++) { double *const A_Rrj = A_Rr + 2 * *ldA * j; for (i = j; i < mn1; i++) { - const blasint ip = ipiv_T[i] - 1; + const int ip = ipiv_T[i] - 1; if (ip != i) { const double tmpr = A_Rrj[2 * i]; const double tmpc = A_Rrj[2 * i + 1]; @@ -211,7 +211,7 @@ static void RELAPACK_zgbtrf_rec( // partially undo swaps in A_L for (i = mn1 - 1; i >= 0; i--) { - const blasint ip = ipiv_T[i] - 1; + const int ip = ipiv_T[i] - 1; if (ip != i) { if (ip < *kl) BLAS(zswap)(&i, A_L + 2 * i, ldA, A_L + 2 * ip, ldA); @@ -221,9 +221,7 @@ static void RELAPACK_zgbtrf_rec( } // recursion(Ab_BR, ipiv_B) - // RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); - LAPACK(zgbtf2)(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, info); - + RELAPACK_zgbtrf_rec(&m2, &n2, kl, ku, Ab_BR, ldAb, ipiv_B, Workl, ldWorkl, Worku, ldWorku, info); if (*info) *info += n1; // shift pivots diff --git a/relapack/src/zgemmt.c b/relapack/src/zgemmt.c index f53a3ca6f..aa5930238 100644 --- a/relapack/src/zgemmt.c +++ b/relapack/src/zgemmt.c @@ -1,12 +1,12 @@ #include "relapack.h" static void RELAPACK_zgemmt_rec(const char *, const char *, const char *, - const blasint *, const blasint *, const double *, const double *, const blasint *, - const double *, const blasint *, const double *, double *, const blasint *); + const int *, const int *, const double *, const double *, const int *, + const double *, const int *, const double *, double *, const int *); static void RELAPACK_zgemmt_rec2(const char *, const char *, const char *, - const blasint *, const blasint *, const double *, const double *, const blasint *, - const double *, const blasint *, const double *, double *, const blasint *); + const int *, const int *, const double *, const double *, const int *, + const double *, const int *, const double *, double *, const int *); /** ZGEMMT computes a matrix-matrix product with general matrices but updates @@ -20,10 +20,10 @@ static void RELAPACK_zgemmt_rec2(const char *, const char *, const char *, * */ void RELAPACK_zgemmt( const char *uplo, const char *transA, const char *transB, - const blasint *n, const blasint *k, - const double *alpha, const double *A, const blasint *ldA, - const double *B, const blasint *ldB, - const double *beta, double *C, const blasint *ldC + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC ) { #if HAVE_XGEMMT @@ -32,15 +32,15 @@ void RELAPACK_zgemmt( #else // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); - const blasint notransA = LAPACK(lsame)(transA, "N"); - const blasint tranA = LAPACK(lsame)(transA, "T"); - const blasint ctransA = LAPACK(lsame)(transA, "C"); - const blasint notransB = LAPACK(lsame)(transB, "N"); - const blasint tranB = LAPACK(lsame)(transB, "T"); - const blasint ctransB = LAPACK(lsame)(transB, "C"); - blasint info = 0; + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + const int notransA = LAPACK(lsame)(transA, "N"); + const int tranA = LAPACK(lsame)(transA, "T"); + const int ctransA = LAPACK(lsame)(transA, "C"); + const int notransB = LAPACK(lsame)(transB, "N"); + const int tranB = LAPACK(lsame)(transB, "T"); + const int ctransB = LAPACK(lsame)(transB, "C"); + int info = 0; if (!lower && !upper) info = 1; else if (!tranA && !ctransA && !notransA) @@ -58,7 +58,7 @@ void RELAPACK_zgemmt( else if (*ldC < MAX(1, *n)) info = 13; if (info) { - LAPACK(xerbla)("ZGEMMT", &info, strlen("ZGEMMT")); + LAPACK(xerbla)("ZGEMMT", &info); return; } @@ -76,10 +76,10 @@ void RELAPACK_zgemmt( /** zgemmt's recursive compute kernel */ static void RELAPACK_zgemmt_rec( const char *uplo, const char *transA, const char *transB, - const blasint *n, const blasint *k, - const double *alpha, const double *A, const blasint *ldA, - const double *B, const blasint *ldB, - const double *beta, double *C, const blasint *ldC + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC ) { if (*n <= MAX(CROSSOVER_ZGEMMT, 1)) { @@ -89,8 +89,8 @@ static void RELAPACK_zgemmt_rec( } // Splitting - const blasint n1 = ZREC_SPLIT(*n); - const blasint n2 = *n - n1; + const int n1 = ZREC_SPLIT(*n); + const int n2 = *n - n1; // A_T // A_B @@ -126,16 +126,16 @@ static void RELAPACK_zgemmt_rec( /** zgemmt's unblocked compute kernel */ static void RELAPACK_zgemmt_rec2( const char *uplo, const char *transA, const char *transB, - const blasint *n, const blasint *k, - const double *alpha, const double *A, const blasint *ldA, - const double *B, const blasint *ldB, - const double *beta, double *C, const blasint *ldC + const int *n, const int *k, + const double *alpha, const double *A, const int *ldA, + const double *B, const int *ldB, + const double *beta, double *C, const int *ldC ) { - const blasint incB = (*transB == 'N') ? 1 : *ldB; - const blasint incC = 1; + const int incB = (*transB == 'N') ? 1 : *ldB; + const int incC = 1; - blasint i; + int i; for (i = 0; i < *n; i++) { // A_0 // A_i @@ -151,13 +151,13 @@ static void RELAPACK_zgemmt_rec2( double *const C_ii = C + 2 * *ldC * i + 2 * i; if (*uplo == 'L') { - const blasint nmi = *n - i; + const int nmi = *n - i; if (*transA == 'N') BLAS(zgemv)(transA, &nmi, k, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); else BLAS(zgemv)(transA, k, &nmi, alpha, A_i, ldA, B_i, &incB, beta, C_ii, &incC); } else { - const blasint ip1 = i + 1; + const int ip1 = i + 1; if (*transA == 'N') BLAS(zgemv)(transA, &ip1, k, alpha, A_0, ldA, B_i, &incB, beta, C_0i, &incC); else diff --git a/relapack/src/zgetrf.c b/relapack/src/zgetrf.c index b0d14ffb1..cf8921e1f 100644 --- a/relapack/src/zgetrf.c +++ b/relapack/src/zgetrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_zgetrf_rec(const blasint *, const blasint *, double *, - const blasint *, blasint *, blasint *); +static void RELAPACK_zgetrf_rec(const int *, const int *, double *, + const int *, int *, int *); /** ZGETRF computes an LU factorization of a general M-by-N matrix A using partial pivoting with row interchanges. @@ -11,9 +11,9 @@ static void RELAPACK_zgetrf_rec(const blasint *, const blasint *, double *, * http://www.netlib.org/lapack/explore-html/dd/dd1/zgetrf_8f.html * */ void RELAPACK_zgetrf( - const blasint *m, const blasint *n, - double *A, const blasint *ldA, blasint *ipiv, - blasint *info + const int *m, const int *n, + double *A, const int *ldA, int *ipiv, + int *info ) { // Check arguments @@ -22,15 +22,15 @@ void RELAPACK_zgetrf( *info = -1; else if (*n < 0) *info = -2; - else if (*ldA < MAX(1, *m)) + else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("ZGETRF", &minfo, strlen("ZGETRF")); + const int minfo = -*info; + LAPACK(xerbla)("ZGETRF", &minfo); return; } - const blasint sn = MIN(*m, *n); + const int sn = MIN(*m, *n); RELAPACK_zgetrf_rec(m, &sn, A, ldA, ipiv, info); @@ -38,10 +38,10 @@ void RELAPACK_zgetrf( if (*m < *n) { // Constants const double ONE[] = { 1., 0. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; // Splitting - const blasint rn = *n - *m; + const int rn = *n - *m; // A_L A_R const double *const A_L = A; @@ -57,9 +57,9 @@ void RELAPACK_zgetrf( /** zgetrf's recursive compute kernel */ static void RELAPACK_zgetrf_rec( - const blasint *m, const blasint *n, - double *A, const blasint *ldA, blasint *ipiv, - blasint *info + const int *m, const int *n, + double *A, const int *ldA, int *ipiv, + int *info ) { if (*n <= MAX(CROSSOVER_ZGETRF, 1)) { @@ -71,12 +71,12 @@ static void RELAPACK_zgetrf_rec( // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. }; - const blasint iONE[] = { 1. }; + const int iONE[] = { 1. }; // Splitting - const blasint n1 = ZREC_SPLIT(*n); - const blasint n2 = *n - n1; - const blasint m2 = *m - n1; + const int n1 = ZREC_SPLIT(*n); + const int n2 = *n - n1; + const int m2 = *m - n1; // A_L A_R double *const A_L = A; @@ -91,8 +91,8 @@ static void RELAPACK_zgetrf_rec( // ipiv_T // ipiv_B - blasint *const ipiv_T = ipiv; - blasint *const ipiv_B = ipiv + n1; + int *const ipiv_T = ipiv; + int *const ipiv_B = ipiv + n1; // recursion(A_L, ipiv_T) RELAPACK_zgetrf_rec(m, &n1, A_L, ldA, ipiv_T, info); @@ -111,7 +111,7 @@ static void RELAPACK_zgetrf_rec( // apply pivots to A_BL LAPACK(zlaswp)(&n1, A_BL, ldA, iONE, &n2, ipiv_B, iONE); // shift pivots - blasint i; + int i; for (i = 0; i < n2; i++) ipiv_B[i] += n1; } diff --git a/relapack/src/zhegst.c b/relapack/src/zhegst.c index dc9b7eace..d0ece2148 100644 --- a/relapack/src/zhegst.c +++ b/relapack/src/zhegst.c @@ -3,9 +3,9 @@ #include "stdlib.h" #endif -static void RELAPACK_zhegst_rec(const blasint *, const char *, const blasint *, - double *, const blasint *, const double *, const blasint *, - double *, const blasint *, blasint *); +static void RELAPACK_zhegst_rec(const int *, const char *, const int *, + double *, const int *, const double *, const int *, + double *, const int *, int *); /** ZHEGST reduces a complex Hermitian-definite generalized eigenproblem to standard form. @@ -15,14 +15,14 @@ static void RELAPACK_zhegst_rec(const blasint *, const char *, const blasint *, * http://www.netlib.org/lapack/explore-html/dc/d68/zhegst_8f.html * */ void RELAPACK_zhegst( - const blasint *itype, const char *uplo, const blasint *n, - double *A, const blasint *ldA, const double *B, const blasint *ldB, - blasint *info + const int *itype, const char *uplo, const int *n, + double *A, const int *ldA, const double *B, const int *ldB, + int *info ) { // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (*itype < 1 || *itype > 3) *info = -1; @@ -35,8 +35,8 @@ void RELAPACK_zhegst( else if (*ldB < MAX(1, *n)) *info = -7; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("ZHEGST", &minfo, strlen("ZHEGST")); + const int minfo = -*info; + LAPACK(xerbla)("ZHEGST", &minfo); return; } @@ -45,9 +45,9 @@ void RELAPACK_zhegst( // Allocate work space double *Work = NULL; - blasint lWork = 0; + int lWork = 0; #if XSYGST_ALLOW_MALLOC - const blasint n1 = ZREC_SPLIT(*n); + const int n1 = ZREC_SPLIT(*n); lWork = n1 * (*n - n1); Work = malloc(lWork * 2 * sizeof(double)); if (!Work) @@ -67,9 +67,9 @@ void RELAPACK_zhegst( /** zhegst's recursive compute kernel */ static void RELAPACK_zhegst_rec( - const blasint *itype, const char *uplo, const blasint *n, - double *A, const blasint *ldA, const double *B, const blasint *ldB, - double *Work, const blasint *lWork, blasint *info + const int *itype, const char *uplo, const int *n, + double *A, const int *ldA, const double *B, const int *ldB, + double *Work, const int *lWork, int *info ) { if (*n <= MAX(CROSSOVER_ZHEGST, 1)) { @@ -84,14 +84,14 @@ static void RELAPACK_zhegst_rec( const double MONE[] = { -1., 0. }; const double HALF[] = { .5, 0. }; const double MHALF[] = { -.5, 0. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; // Loop iterator - blasint i; + int i; // Splitting - const blasint n1 = ZREC_SPLIT(*n); - const blasint n2 = *n - n1; + const int n1 = ZREC_SPLIT(*n); + const int n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/zhetrf.c b/relapack/src/zhetrf.c index 3d458fecf..ef4e1f5d5 100644 --- a/relapack/src/zhetrf.c +++ b/relapack/src/zhetrf.c @@ -3,8 +3,8 @@ #include #endif -static void RELAPACK_zhetrf_rec(const char *, const blasint *, const blasint *, blasint *, - double *, const blasint *, blasint *, double *, const blasint *, blasint *); +static void RELAPACK_zhetrf_rec(const char *, const int *, const int *, int *, + double *, const int *, int *, double *, const int *, int *); /** ZHETRF computes the factorization of a complex Hermitian matrix A using the Bunch-Kaufman diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_zhetrf_rec(const char *, const blasint *, const blasint *, * http://www.netlib.org/lapack/explore-html/d6/dd3/zhetrf_8f.html * */ void RELAPACK_zhetrf( - const char *uplo, const blasint *n, - double *A, const blasint *ldA, blasint *ipiv, - double *Work, const blasint *lWork, blasint *info + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info ) { // Required work size - const blasint cleanlWork = *n * (*n / 2); - blasint minlWork = cleanlWork; + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_zhetrf( #endif if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("ZHETRF", &minfo, strlen("ZHETRF")); + const int minfo = -*info; + LAPACK(xerbla)("ZHETRF", &minfo); return; } @@ -64,7 +64,7 @@ void RELAPACK_zhetrf( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - blasint nout; + int nout; // Recursive kernel RELAPACK_zhetrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_zhetrf( /** zhetrf's recursive compute kernel */ static void RELAPACK_zhetrf_rec( - const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, - double *A, const blasint *ldA, blasint *ipiv, - double *Work, const blasint *ldWork, blasint *info + const char *uplo, const int *n_full, const int *n, int *n_out, + double *A, const int *ldA, int *ipiv, + double *Work, const int *ldWork, int *info ) { // top recursion level? - const blasint top = *n_full == *n; + const int top = *n_full == *n; if (*n <= MAX(CROSSOVER_ZHETRF, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_zhetrf_rec( return; } - blasint info1, info2; + int info1, info2; // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; - const blasint n_rest = *n_full - *n; + const int n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - blasint n1 = ZREC_SPLIT(*n); - blasint n2 = *n - n1; + int n1 = ZREC_SPLIT(*n); + int n2 = *n - n1; // Work_L * double *const Work_L = Work; // recursion(A_L) - blasint n1_out; + int n1_out; RELAPACK_zhetrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const blasint n_full2 = *n_full - n1; + const int n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_zhetrf_rec( // (top recursion level: use Work as Work_BR) double *const Work_BL = Work + 2 * n1; double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const blasint ldWork_BR = top ? n2 : *ldWork; + const int ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - blasint *const ipiv_B = ipiv + n1; + int *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - blasint n2_out; + int n2_out; RELAPACK_zhetrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const blasint n_restp1 = n_rest + 1; + const int n_restp1 = n_rest + 1; // last column of A_BR double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_zhetrf_rec( n2 = n2_out; // shift pivots - blasint i; + int i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_zhetrf_rec( *n_out = n1 + n2; } else { // Splitting (setup) - blasint n2 = ZREC_SPLIT(*n); - blasint n1 = *n - n2; + int n2 = ZREC_SPLIT(*n); + int n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - blasint n2_out; + int n2_out; RELAPACK_zhetrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const blasint n2_diff = n2 - n2_out; + const int n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const blasint n_full1 = *n_full - n2; + const int n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_zhetrf_rec( // (top recursion level: Work_R was Work) double *const Work_L = Work; double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const blasint ldWork_L = top ? n1 : *ldWork; + const int ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - blasint n1_out; + int n1_out; RELAPACK_zhetrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const blasint n_restp1 = n_rest + 1; + const int n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/zhetrf_rec2.c b/relapack/src/zhetrf_rec2.c index c14cf0440..867ea64e1 100644 --- a/relapack/src/zhetrf_rec2.c +++ b/relapack/src/zhetrf_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static doublecomplex c_b1 = {1.,0.}; -static blasint c__1 = 1; +static int c__1 = 1; /** ZHETRF_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the Bunch-Kau fman diagonal pivoting method * @@ -24,12 +24,12 @@ static blasint c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. * */ -/* Subroutine */ void RELAPACK_zhetrf_rec2(char *uplo, blasint *n, blasint * - nb, blasint *kb, doublecomplex *a, blasint *lda, blasint *ipiv, - doublecomplex *w, blasint *ldw, blasint *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_zhetrf_rec2(char *uplo, int *n, int * + nb, int *kb, doublecomplex *a, int *lda, int *ipiv, + doublecomplex *w, int *ldw, int *info, ftnlen uplo_len) { /* System generated locals */ - blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; double d__1, d__2, d__3, d__4; doublecomplex z__1, z__2, z__3, z__4; @@ -39,26 +39,26 @@ static blasint c__1 = 1; doublecomplex *, doublecomplex *); /* Local variables */ - static blasint j, k; + static int j, k; static double t, r1; static doublecomplex d11, d21, d22; - static blasint jj, kk, jp, kp, kw, kkw, imax, jmax; + static int jj, kk, jp, kp, kw, kkw, imax, jmax; static double alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); - static blasint kstep; - extern /* Subroutine */ blasint zgemv_(char *, blasint *, blasint *, - doublecomplex *, doublecomplex *, blasint *, doublecomplex *, - blasint *, doublecomplex *, doublecomplex *, blasint *, ftnlen), - zcopy_(int *, doublecomplex *, blasint *, doublecomplex *, - blasint *), zswap_(int *, doublecomplex *, blasint *, - doublecomplex *, blasint *); + static int kstep; + extern /* Subroutine */ int zgemv_(char *, int *, int *, + doublecomplex *, doublecomplex *, int *, doublecomplex *, + int *, doublecomplex *, doublecomplex *, int *, ftnlen), + zcopy_(int *, doublecomplex *, int *, doublecomplex *, + int *), zswap_(int *, doublecomplex *, int *, + doublecomplex *, int *); static double absakk; - extern /* Subroutine */ blasint zdscal_(int *, double *, - doublecomplex *, blasint *); + extern /* Subroutine */ int zdscal_(int *, double *, + doublecomplex *, int *); static double colmax; - extern /* Subroutine */ blasint zlacgv_(int *, doublecomplex *, blasint *) + extern /* Subroutine */ int zlacgv_(int *, doublecomplex *, int *) ; - extern blasint izamax_(int *, doublecomplex *, blasint *); + extern int izamax_(int *, doublecomplex *, int *); static double rowmax; /* Parameter adjustments */ diff --git a/relapack/src/zhetrf_rook.c b/relapack/src/zhetrf_rook.c index 285aea96e..15ceaeae7 100644 --- a/relapack/src/zhetrf_rook.c +++ b/relapack/src/zhetrf_rook.c @@ -3,8 +3,8 @@ #include #endif -static void RELAPACK_zhetrf_rook_rec(const char *, const blasint *, const blasint *, blasint *, - double *, const blasint *, blasint *, double *, const blasint *, blasint *); +static void RELAPACK_zhetrf_rook_rec(const char *, const int *, const int *, int *, + double *, const int *, int *, double *, const int *, int *); /** ZHETRF_ROOK computes the factorization of a complex Hermitian indefinite matrix using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_zhetrf_rook_rec(const char *, const blasint *, const blasin * http://www.netlib.org/lapack/explore-html/d6/d6f/zhetrf__rook_8f.html * */ void RELAPACK_zhetrf_rook( - const char *uplo, const blasint *n, - double *A, const blasint *ldA, blasint *ipiv, - double *Work, const blasint *lWork, blasint *info + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info ) { // Required work size - const blasint cleanlWork = *n * (*n / 2); - blasint minlWork = cleanlWork; + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_zhetrf_rook( #endif if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("ZHETRF", &minfo, strlen("ZHETRF")); + const int minfo = -*info; + LAPACK(xerbla)("ZHETRF", &minfo); return; } @@ -64,7 +64,7 @@ void RELAPACK_zhetrf_rook( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - blasint nout; + int nout; // Recursive kernel RELAPACK_zhetrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_zhetrf_rook( /** zhetrf_rook's recursive compute kernel */ static void RELAPACK_zhetrf_rook_rec( - const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, - double *A, const blasint *ldA, blasint *ipiv, - double *Work, const blasint *ldWork, blasint *info + const char *uplo, const int *n_full, const int *n, int *n_out, + double *A, const int *ldA, int *ipiv, + double *Work, const int *ldWork, int *info ) { // top recursion level? - const blasint top = *n_full == *n; + const int top = *n_full == *n; if (*n <= MAX(CROSSOVER_ZHETRF_ROOK, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_zhetrf_rook_rec( return; } - blasint info1, info2; + int info1, info2; // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; - const blasint n_rest = *n_full - *n; + const int n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - blasint n1 = ZREC_SPLIT(*n); - blasint n2 = *n - n1; + int n1 = ZREC_SPLIT(*n); + int n2 = *n - n1; // Work_L * double *const Work_L = Work; // recursion(A_L) - blasint n1_out; + int n1_out; RELAPACK_zhetrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const blasint n_full2 = *n_full - n1; + const int n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_zhetrf_rook_rec( // (top recursion level: use Work as Work_BR) double *const Work_BL = Work + 2 * n1; double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const blasint ldWork_BR = top ? n2 : *ldWork; + const int ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - blasint *const ipiv_B = ipiv + n1; + int *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - blasint n2_out; + int n2_out; RELAPACK_zhetrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const blasint n_restp1 = n_rest + 1; + const int n_restp1 = n_rest + 1; // last column of A_BR double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_zhetrf_rook_rec( n2 = n2_out; // shift pivots - blasint i; + int i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_zhetrf_rook_rec( *n_out = n1 + n2; } else { // Splitting (setup) - blasint n2 = ZREC_SPLIT(*n); - blasint n1 = *n - n2; + int n2 = ZREC_SPLIT(*n); + int n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - blasint n2_out; + int n2_out; RELAPACK_zhetrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const blasint n2_diff = n2 - n2_out; + const int n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const blasint n_full1 = *n_full - n2; + const int n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_zhetrf_rook_rec( // (top recursion level: Work_R was Work) double *const Work_L = Work; double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const blasint ldWork_L = top ? n1 : *ldWork; + const int ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - blasint n1_out; + int n1_out; RELAPACK_zhetrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const blasint n_restp1 = n_rest + 1; + const int n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/zhetrf_rook_rec2.c b/relapack/src/zhetrf_rook_rec2.c index e5033ad49..a56ad710b 100644 --- a/relapack/src/zhetrf_rook_rec2.c +++ b/relapack/src/zhetrf_rook_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static doublecomplex c_b1 = {1.,0.}; -static blasint c__1 = 1; +static int c__1 = 1; /** ZHETRF_ROOK_REC2 computes a partial factorization of a complex Hermitian indefinite matrix using the boun ded Bunch-Kaufman ("rook") diagonal pivoting method * @@ -24,12 +24,12 @@ static blasint c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. * */ -/* Subroutine */ void RELAPACK_zhetrf_rook_rec2(char *uplo, blasint *n, - int *nb, blasint *kb, doublecomplex *a, blasint *lda, blasint * - ipiv, doublecomplex *w, blasint *ldw, blasint *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_zhetrf_rook_rec2(char *uplo, int *n, + int *nb, int *kb, doublecomplex *a, int *lda, int * + ipiv, doublecomplex *w, int *ldw, int *info, ftnlen uplo_len) { /* System generated locals */ - blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; double d__1, d__2; doublecomplex z__1, z__2, z__3, z__4, z__5; @@ -39,30 +39,30 @@ static blasint c__1 = 1; doublecomplex *, doublecomplex *); /* Local variables */ - static blasint j, k, p; + static int j, k, p; static double t, r1; static doublecomplex d11, d21, d22; - static blasint ii, jj, kk, kp, kw, jp1, jp2, kkw; + static int ii, jj, kk, kp, kw, jp1, jp2, kkw; static logical done; - static blasint imax, jmax; + static int imax, jmax; static double alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); static double dtemp, sfmin; - static blasint itemp, kstep; - extern /* Subroutine */ blasint zgemv_(char *, blasint *, blasint *, - doublecomplex *, doublecomplex *, blasint *, doublecomplex *, - blasint *, doublecomplex *, doublecomplex *, blasint *, ftnlen), - zcopy_(int *, doublecomplex *, blasint *, doublecomplex *, - blasint *), zswap_(int *, doublecomplex *, blasint *, - doublecomplex *, blasint *); + static int itemp, kstep; + extern /* Subroutine */ int zgemv_(char *, int *, int *, + doublecomplex *, doublecomplex *, int *, doublecomplex *, + int *, doublecomplex *, doublecomplex *, int *, ftnlen), + zcopy_(int *, doublecomplex *, int *, doublecomplex *, + int *), zswap_(int *, doublecomplex *, int *, + doublecomplex *, int *); extern double dlamch_(char *, ftnlen); static double absakk; - extern /* Subroutine */ blasint zdscal_(int *, double *, - doublecomplex *, blasint *); + extern /* Subroutine */ int zdscal_(int *, double *, + doublecomplex *, int *); static double colmax; - extern /* Subroutine */ blasint zlacgv_(int *, doublecomplex *, blasint *) + extern /* Subroutine */ int zlacgv_(int *, doublecomplex *, int *) ; - extern blasint izamax_(int *, doublecomplex *, blasint *); + extern int izamax_(int *, doublecomplex *, int *); static double rowmax; /* Parameter adjustments */ diff --git a/relapack/src/zlauum.c b/relapack/src/zlauum.c index 14fcd9213..490dcc82e 100644 --- a/relapack/src/zlauum.c +++ b/relapack/src/zlauum.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_zlauum_rec(const char *, const blasint *, double *, - const blasint *, blasint *); +static void RELAPACK_zlauum_rec(const char *, const int *, double *, + const int *, int *); /** ZLAUUM computes the product U * U**H or L**H * L, where the triangular factor U or L is stored in the upper or lower triangular part of the array A. @@ -11,14 +11,14 @@ static void RELAPACK_zlauum_rec(const char *, const blasint *, double *, * http://www.netlib.org/lapack/explore-html/d8/d45/zlauum_8f.html * */ void RELAPACK_zlauum( - const char *uplo, const blasint *n, - double *A, const blasint *ldA, - blasint *info + const char *uplo, const int *n, + double *A, const int *ldA, + int *info ) { // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_zlauum( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("ZLAUUM", &minfo, strlen("ZLAUUM")); + const int minfo = -*info; + LAPACK(xerbla)("ZLAUUM", &minfo); return; } @@ -42,9 +42,9 @@ void RELAPACK_zlauum( /** zlauum's recursive compute kernel */ static void RELAPACK_zlauum_rec( - const char *uplo, const blasint *n, - double *A, const blasint *ldA, - blasint *info + const char *uplo, const int *n, + double *A, const int *ldA, + int *info ) { if (*n <= MAX(CROSSOVER_ZLAUUM, 1)) { @@ -57,8 +57,8 @@ static void RELAPACK_zlauum_rec( const double ONE[] = { 1., 0. }; // Splitting - const blasint n1 = ZREC_SPLIT(*n); - const blasint n2 = *n - n1; + const int n1 = ZREC_SPLIT(*n); + const int n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/zpbtrf.c b/relapack/src/zpbtrf.c index fb0e1e97b..37e711c9d 100644 --- a/relapack/src/zpbtrf.c +++ b/relapack/src/zpbtrf.c @@ -1,8 +1,8 @@ #include "relapack.h" #include "stdlib.h" -static void RELAPACK_zpbtrf_rec(const char *, const blasint *, const blasint *, - double *, const blasint *, double *, const blasint *, blasint *); +static void RELAPACK_zpbtrf_rec(const char *, const int *, const int *, + double *, const int *, double *, const int *, int *); /** ZPBTRF computes the Cholesky factorization of a complex Hermitian positive definite band matrix A. @@ -12,14 +12,14 @@ static void RELAPACK_zpbtrf_rec(const char *, const blasint *, const blasint *, * http://www.netlib.org/lapack/explore-html/db/da9/zpbtrf_8f.html * */ void RELAPACK_zpbtrf( - const char *uplo, const blasint *n, const blasint *kd, - double *Ab, const blasint *ldAb, - blasint *info + const char *uplo, const int *n, const int *kd, + double *Ab, const int *ldAb, + int *info ) { // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -30,8 +30,8 @@ void RELAPACK_zpbtrf( else if (*ldAb < *kd + 1) *info = -5; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("ZPBTRF", &minfo, strlen("ZPBTRF")); + const int minfo = -*info; + LAPACK(xerbla)("ZPBTRF", &minfo); return; } @@ -42,9 +42,9 @@ void RELAPACK_zpbtrf( const double ZERO[] = { 0., 0. }; // Allocate work space - const blasint n1 = ZREC_SPLIT(*n); - const blasint mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; - const blasint nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; + const int n1 = ZREC_SPLIT(*n); + const int mWork = (*kd > n1) ? (lower ? *n - *kd : n1) : *kd; + const int nWork = (*kd > n1) ? (lower ? n1 : *n - *kd) : *kd; double *Work = malloc(mWork * nWork * 2 * sizeof(double)); LAPACK(zlaset)(uplo, &mWork, &nWork, ZERO, ZERO, Work, &mWork); @@ -58,10 +58,10 @@ void RELAPACK_zpbtrf( /** zpbtrf's recursive compute kernel */ static void RELAPACK_zpbtrf_rec( - const char *uplo, const blasint *n, const blasint *kd, - double *Ab, const blasint *ldAb, - double *Work, const blasint *ldWork, - blasint *info + const char *uplo, const int *n, const int *kd, + double *Ab, const int *ldAb, + double *Work, const int *ldWork, + int *info ){ if (*n <= MAX(CROSSOVER_ZPBTRF, 1)) { @@ -75,12 +75,12 @@ static void RELAPACK_zpbtrf_rec( const double MONE[] = { -1., 0. }; // Unskew A - const blasint ldA[] = { *ldAb - 1 }; + const int ldA[] = { *ldAb - 1 }; double *const A = Ab + 2 * ((*uplo == 'L') ? 0 : *kd); // Splitting - const blasint n1 = MIN(ZREC_SPLIT(*n), *kd); - const blasint n2 = *n - n1; + const int n1 = MIN(ZREC_SPLIT(*n), *kd); + const int n2 = *n - n1; // * * // * Ab_BR @@ -99,8 +99,8 @@ static void RELAPACK_zpbtrf_rec( return; // Banded splitting - const blasint n21 = MIN(n2, *kd - n1); - const blasint n22 = MIN(n2 - n21, *kd); + const int n21 = MIN(n2, *kd - n1); + const int n22 = MIN(n2 - n21, *kd); // n1 n21 n22 // n1 * A_TRl A_TRr diff --git a/relapack/src/zpotrf.c b/relapack/src/zpotrf.c index 9259279c1..411ac5fc0 100644 --- a/relapack/src/zpotrf.c +++ b/relapack/src/zpotrf.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_zpotrf_rec(const char *, const blasint *, double *, - const blasint *, blasint *); +static void RELAPACK_zpotrf_rec(const char *, const int *, double *, + const int *, int *); /** ZPOTRF computes the Cholesky factorization of a complex Hermitian positive definite matrix A. @@ -11,14 +11,14 @@ static void RELAPACK_zpotrf_rec(const char *, const blasint *, double *, * http://www.netlib.org/lapack/explore-html/d1/db9/zpotrf_8f.html * */ void RELAPACK_zpotrf( - const char *uplo, const blasint *n, - double *A, const blasint *ldA, - blasint *info + const char *uplo, const int *n, + double *A, const int *ldA, + int *info ) { // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -27,8 +27,8 @@ void RELAPACK_zpotrf( else if (*ldA < MAX(1, *n)) *info = -4; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("ZPOTRF", &minfo, strlen("ZPOTRF")); + const int minfo = -*info; + LAPACK(xerbla)("ZPOTRF", &minfo); return; } @@ -42,9 +42,9 @@ void RELAPACK_zpotrf( /** zpotrf's recursive compute kernel */ static void RELAPACK_zpotrf_rec( - const char *uplo, const blasint *n, - double *A, const blasint *ldA, - blasint *info + const char *uplo, const int *n, + double *A, const int *ldA, + int *info ) { if (*n <= MAX(CROSSOVER_ZPOTRF, 1)) { @@ -58,8 +58,8 @@ static void RELAPACK_zpotrf_rec( const double MONE[] = { -1., 0. }; // Splitting - const blasint n1 = ZREC_SPLIT(*n); - const blasint n2 = *n - n1; + const int n1 = ZREC_SPLIT(*n); + const int n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/relapack/src/zsytrf.c b/relapack/src/zsytrf.c index f3412ad8f..3be21563a 100644 --- a/relapack/src/zsytrf.c +++ b/relapack/src/zsytrf.c @@ -3,8 +3,8 @@ #include #endif -static void RELAPACK_zsytrf_rec(const char *, const blasint *, const blasint *, blasint *, - double *, const blasint *, blasint *, double *, const blasint *, blasint *); +static void RELAPACK_zsytrf_rec(const char *, const int *, const int *, int *, + double *, const int *, int *, double *, const int *, int *); /** ZSYTRF computes the factorization of a complex symmetric matrix A using the Bunch-Kaufman diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_zsytrf_rec(const char *, const blasint *, const blasint *, * http://www.netlib.org/lapack/explore-html/da/d94/zsytrf_8f.html * */ void RELAPACK_zsytrf( - const char *uplo, const blasint *n, - double *A, const blasint *ldA, blasint *ipiv, - double *Work, const blasint *lWork, blasint *info + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info ) { // Required work size - const blasint cleanlWork = *n * (*n / 2); - blasint minlWork = cleanlWork; + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_zsytrf( #endif if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("ZSYTRF", &minfo, strlen("ZSYTRF")); + const int minfo = -*info; + LAPACK(xerbla)("ZSYTRF", &minfo); return; } @@ -64,7 +64,7 @@ void RELAPACK_zsytrf( const char cleanuplo = lower ? 'L' : 'U'; // Dummy arguments - blasint nout; + int nout; // Recursive kernel RELAPACK_zsytrf_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_zsytrf( /** zsytrf's recursive compute kernel */ static void RELAPACK_zsytrf_rec( - const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, - double *A, const blasint *ldA, blasint *ipiv, - double *Work, const blasint *ldWork, blasint *info + const char *uplo, const int *n_full, const int *n, int *n_out, + double *A, const int *ldA, int *ipiv, + double *Work, const int *ldWork, int *info ) { // top recursion level? - const blasint top = *n_full == *n; + const int top = *n_full == *n; if (*n <= MAX(CROSSOVER_ZSYTRF, 3)) { // Unblocked @@ -96,34 +96,34 @@ static void RELAPACK_zsytrf_rec( return; } - blasint info1, info2; + int info1, info2; // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; // Loop iterator - blasint i; + int i; - const blasint n_rest = *n_full - *n; + const int n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - blasint n1 = ZREC_SPLIT(*n); - blasint n2 = *n - n1; + int n1 = ZREC_SPLIT(*n); + int n2 = *n - n1; // Work_L * double *const Work_L = Work; // recursion(A_L) - blasint n1_out; + int n1_out; RELAPACK_zsytrf_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const blasint n_full2 = *n_full - n1; + const int n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -139,23 +139,23 @@ static void RELAPACK_zsytrf_rec( // (top recursion level: use Work as Work_BR) double *const Work_BL = Work + 2 * n1; double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const blasint ldWork_BR = top ? n2 : *ldWork; + const int ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - blasint *const ipiv_B = ipiv + n1; + int *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - blasint n2_out; + int n2_out; RELAPACK_zsytrf_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const blasint n_restp1 = n_rest + 1; + const int n_restp1 = n_rest + 1; // last column of A_BR double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -182,22 +182,22 @@ static void RELAPACK_zsytrf_rec( *n_out = n1 + n2; } else { // Splitting (setup) - blasint n2 = ZREC_SPLIT(*n); - blasint n1 = *n - n2; + int n2 = ZREC_SPLIT(*n); + int n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - blasint n2_out; + int n2_out; RELAPACK_zsytrf_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const blasint n2_diff = n2 - n2_out; + const int n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const blasint n_full1 = *n_full - n2; + const int n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -213,19 +213,19 @@ static void RELAPACK_zsytrf_rec( // (top recursion level: Work_R was Work) double *const Work_L = Work; double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const blasint ldWork_L = top ? n1 : *ldWork; + const int ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - blasint n1_out; + int n1_out; RELAPACK_zsytrf_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const blasint n_restp1 = n_rest + 1; + const int n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/zsytrf_rec2.c b/relapack/src/zsytrf_rec2.c index ff17267c7..33902ee9e 100644 --- a/relapack/src/zsytrf_rec2.c +++ b/relapack/src/zsytrf_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static doublecomplex c_b1 = {1.,0.}; -static blasint c__1 = 1; +static int c__1 = 1; /** ZSYTRF_REC2 computes a partial factorization of a complex symmetric matrix using the Bunch-Kaufman diagon al pivoting method. * @@ -24,12 +24,12 @@ static blasint c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. * */ -/* Subroutine */ void RELAPACK_zsytrf_rec2(char *uplo, blasint *n, blasint * - nb, blasint *kb, doublecomplex *a, blasint *lda, blasint *ipiv, - doublecomplex *w, blasint *ldw, blasint *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_zsytrf_rec2(char *uplo, int *n, int * + nb, int *kb, doublecomplex *a, int *lda, int *ipiv, + doublecomplex *w, int *ldw, int *info, ftnlen uplo_len) { /* System generated locals */ - blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; double d__1, d__2, d__3, d__4; doublecomplex z__1, z__2, z__3; @@ -38,22 +38,22 @@ static blasint c__1 = 1; void z_div(doublecomplex *, doublecomplex *, doublecomplex *); /* Local variables */ - static blasint j, k; + static int j, k; static doublecomplex t, r1, d11, d21, d22; - static blasint jj, kk, jp, kp, kw, kkw, imax, jmax; + static int jj, kk, jp, kp, kw, kkw, imax, jmax; static double alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); - extern /* Subroutine */ blasint zscal_(int *, doublecomplex *, - doublecomplex *, blasint *); - static blasint kstep; - extern /* Subroutine */ blasint zgemv_(char *, blasint *, blasint *, - doublecomplex *, doublecomplex *, blasint *, doublecomplex *, - blasint *, doublecomplex *, doublecomplex *, blasint *, ftnlen), - zcopy_(int *, doublecomplex *, blasint *, doublecomplex *, - blasint *), zswap_(int *, doublecomplex *, blasint *, - doublecomplex *, blasint *); + extern /* Subroutine */ int zscal_(int *, doublecomplex *, + doublecomplex *, int *); + static int kstep; + extern /* Subroutine */ int zgemv_(char *, int *, int *, + doublecomplex *, doublecomplex *, int *, doublecomplex *, + int *, doublecomplex *, doublecomplex *, int *, ftnlen), + zcopy_(int *, doublecomplex *, int *, doublecomplex *, + int *), zswap_(int *, doublecomplex *, int *, + doublecomplex *, int *); static double absakk, colmax; - extern blasint izamax_(int *, doublecomplex *, blasint *); + extern int izamax_(int *, doublecomplex *, int *); static double rowmax; /* Parameter adjustments */ diff --git a/relapack/src/zsytrf_rook.c b/relapack/src/zsytrf_rook.c index fc6d73645..c598f7b1e 100644 --- a/relapack/src/zsytrf_rook.c +++ b/relapack/src/zsytrf_rook.c @@ -3,8 +3,8 @@ #include #endif -static void RELAPACK_zsytrf_rook_rec(const char *, const blasint *, const blasint *, blasint *, - double *, const blasint *, blasint *, double *, const blasint *, blasint *); +static void RELAPACK_zsytrf_rook_rec(const char *, const int *, const int *, int *, + double *, const int *, int *, double *, const int *, int *); /** ZSYTRF_ROOK computes the factorization of a complex symmetric matrix A using the bounded Bunch-Kaufman ("rook") diagonal pivoting method. @@ -14,21 +14,21 @@ static void RELAPACK_zsytrf_rook_rec(const char *, const blasint *, const blasin * http://www.netlib.org/lapack/explore-html/d6/d6e/zsytrf__rook_8f.html * */ void RELAPACK_zsytrf_rook( - const char *uplo, const blasint *n, - double *A, const blasint *ldA, blasint *ipiv, - double *Work, const blasint *lWork, blasint *info + const char *uplo, const int *n, + double *A, const int *ldA, int *ipiv, + double *Work, const int *lWork, int *info ) { // Required work size - const blasint cleanlWork = *n * (*n / 2); - blasint minlWork = cleanlWork; + const int cleanlWork = *n * (*n / 2); + int minlWork = cleanlWork; #if XSYTRF_ALLOW_MALLOC minlWork = 1; #endif // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -55,8 +55,8 @@ void RELAPACK_zsytrf_rook( #endif if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("ZSYTRF", &minfo, strlen("ZSYTRF")); + const int minfo = -*info; + LAPACK(xerbla)("ZSYTRF", &minfo); return; } @@ -64,7 +64,7 @@ void RELAPACK_zsytrf_rook( const char cleanuplo = lower ? 'L' : 'U'; // Dummy argument - blasint nout; + int nout; // Recursive kernel RELAPACK_zsytrf_rook_rec(&cleanuplo, n, n, &nout, A, ldA, ipiv, cleanWork, n, info); @@ -78,13 +78,13 @@ void RELAPACK_zsytrf_rook( /** zsytrf_rook's recursive compute kernel */ static void RELAPACK_zsytrf_rook_rec( - const char *uplo, const blasint *n_full, const blasint *n, blasint *n_out, - double *A, const blasint *ldA, blasint *ipiv, - double *Work, const blasint *ldWork, blasint *info + const char *uplo, const int *n_full, const int *n, int *n_out, + double *A, const int *ldA, int *ipiv, + double *Work, const int *ldWork, int *info ) { // top recursion level? - const blasint top = *n_full == *n; + const int top = *n_full == *n; if (*n <= MAX(CROSSOVER_ZSYTRF_ROOK, 3)) { // Unblocked @@ -96,31 +96,31 @@ static void RELAPACK_zsytrf_rook_rec( return; } - blasint info1, info2; + int info1, info2; // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; - const blasint n_rest = *n_full - *n; + const int n_rest = *n_full - *n; if (*uplo == 'L') { // Splitting (setup) - blasint n1 = ZREC_SPLIT(*n); - blasint n2 = *n - n1; + int n1 = ZREC_SPLIT(*n); + int n2 = *n - n1; // Work_L * double *const Work_L = Work; // recursion(A_L) - blasint n1_out; + int n1_out; RELAPACK_zsytrf_rook_rec(uplo, n_full, &n1, &n1_out, A, ldA, ipiv, Work_L, ldWork, &info1); n1 = n1_out; // Splitting (continued) n2 = *n - n1; - const blasint n_full2 = *n_full - n1; + const int n_full2 = *n_full - n1; // * * // A_BL A_BR @@ -136,23 +136,23 @@ static void RELAPACK_zsytrf_rook_rec( // (top recursion level: use Work as Work_BR) double *const Work_BL = Work + 2 * n1; double *const Work_BR = top ? Work : Work + 2 * *ldWork * n1 + 2 * n1; - const blasint ldWork_BR = top ? n2 : *ldWork; + const int ldWork_BR = top ? n2 : *ldWork; // ipiv_T // ipiv_B - blasint *const ipiv_B = ipiv + n1; + int *const ipiv_B = ipiv + n1; // A_BR = A_BR - A_BL Work_BL' RELAPACK_zgemmt(uplo, "N", "T", &n2, &n1, MONE, A_BL, ldA, Work_BL, ldWork, ONE, A_BR, ldA); BLAS(zgemm)("N", "T", &n_rest, &n2, &n1, MONE, A_BL_B, ldA, Work_BL, ldWork, ONE, A_BR_B, ldA); // recursion(A_BR) - blasint n2_out; + int n2_out; RELAPACK_zsytrf_rook_rec(uplo, &n_full2, &n2, &n2_out, A_BR, ldA, ipiv_B, Work_BR, &ldWork_BR, &info2); if (n2_out != n2) { // undo 1 column of updates - const blasint n_restp1 = n_rest + 1; + const int n_restp1 = n_rest + 1; // last column of A_BR double *const A_BR_r = A_BR + 2 * *ldA * n2_out + 2 * n2_out; @@ -169,7 +169,7 @@ static void RELAPACK_zsytrf_rook_rec( n2 = n2_out; // shift pivots - blasint i; + int i; for (i = 0; i < n2; i++) if (ipiv_B[i] > 0) ipiv_B[i] += n1; @@ -180,22 +180,22 @@ static void RELAPACK_zsytrf_rook_rec( *n_out = n1 + n2; } else { // Splitting (setup) - blasint n2 = ZREC_SPLIT(*n); - blasint n1 = *n - n2; + int n2 = ZREC_SPLIT(*n); + int n1 = *n - n2; // * Work_R // (top recursion level: use Work as Work_R) double *const Work_R = top ? Work : Work + 2 * *ldWork * n1; // recursion(A_R) - blasint n2_out; + int n2_out; RELAPACK_zsytrf_rook_rec(uplo, n_full, &n2, &n2_out, A, ldA, ipiv, Work_R, ldWork, &info2); - const blasint n2_diff = n2 - n2_out; + const int n2_diff = n2 - n2_out; n2 = n2_out; // Splitting (continued) n1 = *n - n2; - const blasint n_full1 = *n_full - n2; + const int n_full1 = *n_full - n2; // * A_TL_T A_TR_T // * A_TL A_TR @@ -211,19 +211,19 @@ static void RELAPACK_zsytrf_rook_rec( // (top recursion level: Work_R was Work) double *const Work_L = Work; double *const Work_TR = Work + 2 * *ldWork * (top ? n2_diff : n1) + 2 * n_rest; - const blasint ldWork_L = top ? n1 : *ldWork; + const int ldWork_L = top ? n1 : *ldWork; // A_TL = A_TL - A_TR Work_TR' RELAPACK_zgemmt(uplo, "N", "T", &n1, &n2, MONE, A_TR, ldA, Work_TR, ldWork, ONE, A_TL, ldA); BLAS(zgemm)("N", "T", &n_rest, &n1, &n2, MONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, ldA); // recursion(A_TL) - blasint n1_out; + int n1_out; RELAPACK_zsytrf_rook_rec(uplo, &n_full1, &n1, &n1_out, A, ldA, ipiv, Work_L, &ldWork_L, &info1); if (n1_out != n1) { // undo 1 column of updates - const blasint n_restp1 = n_rest + 1; + const int n_restp1 = n_rest + 1; // A_TL_T_l = A_TL_T_l + A_TR_T Work_TR_t' BLAS(zgemv)("N", &n_restp1, &n2, ONE, A_TR_T, ldA, Work_TR, ldWork, ONE, A_TL_T, iONE); diff --git a/relapack/src/zsytrf_rook_rec2.c b/relapack/src/zsytrf_rook_rec2.c index 4dbf8733a..9e111fe0c 100644 --- a/relapack/src/zsytrf_rook_rec2.c +++ b/relapack/src/zsytrf_rook_rec2.c @@ -15,7 +15,7 @@ /* Table of constant values */ static doublecomplex c_b1 = {1.,0.}; -static blasint c__1 = 1; +static int c__1 = 1; /** ZSYTRF_ROOK_REC2 computes a partial factorization of a complex symmetric matrix using the bounded Bunch-K aufman ("rook") diagonal pivoting method. * @@ -24,12 +24,12 @@ static blasint c__1 = 1; * The blocked BLAS Level 3 updates were removed and moved to the * recursive algorithm. * */ -/* Subroutine */ void RELAPACK_zsytrf_rook_rec2(char *uplo, blasint *n, - int *nb, blasint *kb, doublecomplex *a, blasint *lda, blasint * - ipiv, doublecomplex *w, blasint *ldw, blasint *info, ftnlen uplo_len) +/* Subroutine */ void RELAPACK_zsytrf_rook_rec2(char *uplo, int *n, + int *nb, int *kb, doublecomplex *a, int *lda, int * + ipiv, doublecomplex *w, int *ldw, int *info, ftnlen uplo_len) { /* System generated locals */ - blasint a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; + int a_dim1, a_offset, w_dim1, w_offset, i__1, i__2, i__3, i__4; double d__1, d__2; doublecomplex z__1, z__2, z__3, z__4; @@ -38,26 +38,26 @@ static blasint c__1 = 1; void z_div(doublecomplex *, doublecomplex *, doublecomplex *); /* Local variables */ - static blasint j, k, p; + static int j, k, p; static doublecomplex t, r1, d11, d12, d21, d22; - static blasint ii, jj, kk, kp, kw, jp1, jp2, kkw; + static int ii, jj, kk, kp, kw, jp1, jp2, kkw; static logical done; - static blasint imax, jmax; + static int imax, jmax; static double alpha; extern logical lsame_(char *, char *, ftnlen, ftnlen); static double dtemp, sfmin; - extern /* Subroutine */ blasint zscal_(int *, doublecomplex *, - doublecomplex *, blasint *); - static blasint itemp, kstep; - extern /* Subroutine */ blasint zgemv_(char *, blasint *, blasint *, - doublecomplex *, doublecomplex *, blasint *, doublecomplex *, - blasint *, doublecomplex *, doublecomplex *, blasint *, ftnlen), - zcopy_(int *, doublecomplex *, blasint *, doublecomplex *, - blasint *), zswap_(int *, doublecomplex *, blasint *, - doublecomplex *, blasint *); + extern /* Subroutine */ int zscal_(int *, doublecomplex *, + doublecomplex *, int *); + static int itemp, kstep; + extern /* Subroutine */ int zgemv_(char *, int *, int *, + doublecomplex *, doublecomplex *, int *, doublecomplex *, + int *, doublecomplex *, doublecomplex *, int *, ftnlen), + zcopy_(int *, doublecomplex *, int *, doublecomplex *, + int *), zswap_(int *, doublecomplex *, int *, + doublecomplex *, int *); extern double dlamch_(char *, ftnlen); static double absakk, colmax; - extern blasint izamax_(int *, doublecomplex *, blasint *); + extern int izamax_(int *, doublecomplex *, int *); static double rowmax; /* Parameter adjustments */ diff --git a/relapack/src/ztgsyl.c b/relapack/src/ztgsyl.c index 6a41475e8..2c8a35256 100644 --- a/relapack/src/ztgsyl.c +++ b/relapack/src/ztgsyl.c @@ -1,10 +1,10 @@ #include "relapack.h" #include -static void RELAPACK_ztgsyl_rec(const char *, const blasint *, const blasint *, - const blasint *, const double *, const blasint *, const double *, const blasint *, - double *, const blasint *, const double *, const blasint *, const double *, - const blasint *, double *, const blasint *, double *, double *, double *, blasint *); +static void RELAPACK_ztgsyl_rec(const char *, const int *, const int *, + const int *, const double *, const int *, const double *, const int *, + double *, const int *, const double *, const int *, const double *, + const int *, double *, const int *, double *, double *, double *, int *); /** ZTGSYL solves the generalized Sylvester equation. @@ -14,21 +14,21 @@ static void RELAPACK_ztgsyl_rec(const char *, const blasint *, const blasint *, * http://www.netlib.org/lapack/explore-html/db/d68/ztgsyl_8f.html * */ void RELAPACK_ztgsyl( - const char *trans, const blasint *ijob, const blasint *m, const blasint *n, - const double *A, const blasint *ldA, const double *B, const blasint *ldB, - double *C, const blasint *ldC, - const double *D, const blasint *ldD, const double *E, const blasint *ldE, - double *F, const blasint *ldF, + const char *trans, const int *ijob, const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, + const double *D, const int *ldD, const double *E, const int *ldE, + double *F, const int *ldF, double *scale, double *dif, - double *Work, const blasint *lWork, blasint *iWork, blasint *info + double *Work, const int *lWork, int *iWork, int *info ) { // Parse arguments - const blasint notran = LAPACK(lsame)(trans, "N"); - const blasint tran = LAPACK(lsame)(trans, "C"); + const int notran = LAPACK(lsame)(trans, "N"); + const int tran = LAPACK(lsame)(trans, "C"); // Compute work buffer size - blasint lwmin = 1; + int lwmin = 1; if (notran && (*ijob == 1 || *ijob == 2)) lwmin = MAX(1, 2 * *m * *n); *info = 0; @@ -57,8 +57,8 @@ void RELAPACK_ztgsyl( else if (*lWork < lwmin && *lWork != -1) *info = -20; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("ZTGSYL", &minfo, strlen("ZTGSYL")); + const int minfo = -*info; + LAPACK(xerbla)("ZTGSYL", &minfo); return; } @@ -74,8 +74,8 @@ void RELAPACK_ztgsyl( // Constant const double ZERO[] = { 0., 0. }; - blasint isolve = 1; - blasint ifunc = 0; + int isolve = 1; + int ifunc = 0; if (notran) { if (*ijob >= 3) { ifunc = *ijob - 2; @@ -86,7 +86,7 @@ void RELAPACK_ztgsyl( } double scale2; - blasint iround; + int iround; for (iround = 1; iround <= isolve; iround++) { *scale = 1; double dscale = 0; @@ -119,13 +119,13 @@ void RELAPACK_ztgsyl( /** ztgsyl's recursive vompute kernel */ static void RELAPACK_ztgsyl_rec( - const char *trans, const blasint *ifunc, const blasint *m, const blasint *n, - const double *A, const blasint *ldA, const double *B, const blasint *ldB, - double *C, const blasint *ldC, - const double *D, const blasint *ldD, const double *E, const blasint *ldE, - double *F, const blasint *ldF, + const char *trans, const int *ifunc, const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, + const double *D, const int *ldD, const double *E, const int *ldE, + double *F, const int *ldF, double *scale, double *dsum, double *dscale, - blasint *info + int *info ) { if (*m <= MAX(CROSSOVER_ZTGSYL, 1) && *n <= MAX(CROSSOVER_ZTGSYL, 1)) { @@ -137,18 +137,18 @@ static void RELAPACK_ztgsyl_rec( // Constants const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; // Outputs double scale1[] = { 1., 0. }; double scale2[] = { 1., 0. }; - blasint info1[] = { 0 }; - blasint info2[] = { 0 }; + int info1[] = { 0 }; + int info2[] = { 0 }; if (*m > *n) { // Splitting - const blasint m1 = ZREC_SPLIT(*m); - const blasint m2 = *m - m1; + const int m1 = ZREC_SPLIT(*m); + const int m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -206,8 +206,8 @@ static void RELAPACK_ztgsyl_rec( } } else { // Splitting - const blasint n1 = ZREC_SPLIT(*n); - const blasint n2 = *n - n1; + const int n1 = ZREC_SPLIT(*n); + const int n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/ztrsyl.c b/relapack/src/ztrsyl.c index 567ef115a..82b2c8803 100644 --- a/relapack/src/ztrsyl.c +++ b/relapack/src/ztrsyl.c @@ -1,8 +1,8 @@ #include "relapack.h" -static void RELAPACK_ztrsyl_rec(const char *, const char *, const blasint *, - const blasint *, const blasint *, const double *, const blasint *, const double *, - const blasint *, double *, const blasint *, double *, blasint *); +static void RELAPACK_ztrsyl_rec(const char *, const char *, const int *, + const int *, const int *, const double *, const int *, const double *, + const int *, double *, const int *, double *, int *); /** ZTRSYL solves the complex Sylvester matrix equation. @@ -12,18 +12,18 @@ static void RELAPACK_ztrsyl_rec(const char *, const char *, const blasint *, * http://www.netlib.org/lapack/explore-html/d1/d36/ztrsyl_8f.html * */ void RELAPACK_ztrsyl( - const char *tranA, const char *tranB, const blasint *isgn, - const blasint *m, const blasint *n, - const double *A, const blasint *ldA, const double *B, const blasint *ldB, - double *C, const blasint *ldC, double *scale, - blasint *info + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, double *scale, + int *info ) { // Check arguments - const blasint notransA = LAPACK(lsame)(tranA, "N"); - const blasint ctransA = LAPACK(lsame)(tranA, "C"); - const blasint notransB = LAPACK(lsame)(tranB, "N"); - const blasint ctransB = LAPACK(lsame)(tranB, "C"); + const int notransA = LAPACK(lsame)(tranA, "N"); + const int ctransA = LAPACK(lsame)(tranA, "C"); + const int notransB = LAPACK(lsame)(tranB, "N"); + const int ctransB = LAPACK(lsame)(tranB, "C"); *info = 0; if (!ctransA && !notransA) *info = -1; @@ -42,8 +42,8 @@ void RELAPACK_ztrsyl( else if (*ldC < MAX(1, *m)) *info = -11; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("ZTRSYL", &minfo, strlen("ZTRSYL")); + const int minfo = -*info; + LAPACK(xerbla)("ZTRSYL", &minfo); return; } @@ -58,11 +58,11 @@ void RELAPACK_ztrsyl( /** ztrsyl's recursive compute kernel */ static void RELAPACK_ztrsyl_rec( - const char *tranA, const char *tranB, const blasint *isgn, - const blasint *m, const blasint *n, - const double *A, const blasint *ldA, const double *B, const blasint *ldB, - double *C, const blasint *ldC, double *scale, - blasint *info + const char *tranA, const char *tranB, const int *isgn, + const int *m, const int *n, + const double *A, const int *ldA, const double *B, const int *ldB, + double *C, const int *ldC, double *scale, + int *info ) { if (*m <= MAX(CROSSOVER_ZTRSYL, 1) && *n <= MAX(CROSSOVER_ZTRSYL, 1)) { @@ -75,18 +75,18 @@ static void RELAPACK_ztrsyl_rec( const double ONE[] = { 1., 0. }; const double MONE[] = { -1., 0. }; const double MSGN[] = { -*isgn, 0. }; - const blasint iONE[] = { 1 }; + const int iONE[] = { 1 }; // Outputs double scale1[] = { 1., 0. }; double scale2[] = { 1., 0. }; - blasint info1[] = { 0 }; - blasint info2[] = { 0 }; + int info1[] = { 0 }; + int info2[] = { 0 }; if (*m > *n) { // Splitting - const blasint m1 = ZREC_SPLIT(*m); - const blasint m2 = *m - m1; + const int m1 = ZREC_SPLIT(*m); + const int m2 = *m - m1; // A_TL A_TR // 0 A_BR @@ -122,8 +122,8 @@ static void RELAPACK_ztrsyl_rec( } } else { // Splitting - const blasint n1 = ZREC_SPLIT(*n); - const blasint n2 = *n - n1; + const int n1 = ZREC_SPLIT(*n); + const int n2 = *n - n1; // B_TL B_TR // 0 B_BR diff --git a/relapack/src/ztrsyl_rec2.c b/relapack/src/ztrsyl_rec2.c index edc6ffc6b..526ab097c 100644 --- a/relapack/src/ztrsyl_rec2.c +++ b/relapack/src/ztrsyl_rec2.c @@ -14,16 +14,16 @@ #include "f2c.h" #if BLAS_COMPLEX_FUNCTIONS_AS_ROUTINES -doublecomplex zdotu_fun(int *n, doublecomplex *x, blasint *incx, doublecomplex *y, blasint *incy) { - extern void zdotu_(doublecomplex *, blasint *, doublecomplex *, blasint *, doublecomplex *, blasint *); +doublecomplex zdotu_fun(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy) { + extern void zdotu_(doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, int *); doublecomplex result; zdotu_(&result, n, x, incx, y, incy); return result; } #define zdotu_ zdotu_fun -doublecomplex zdotc_fun(int *n, doublecomplex *x, blasint *incx, doublecomplex *y, blasint *incy) { - extern void zdotc_(doublecomplex *, blasint *, doublecomplex *, blasint *, doublecomplex *, blasint *); +doublecomplex zdotc_fun(int *n, doublecomplex *x, int *incx, doublecomplex *y, int *incy) { + extern void zdotc_(doublecomplex *, int *, doublecomplex *, int *, doublecomplex *, int *); doublecomplex result; zdotc_(&result, n, x, incx, y, incy); return result; @@ -43,7 +43,7 @@ doublecomplex zladiv_fun(doublecomplex *a, doublecomplex *b) { /* Table of constant values */ -static blasint c__1 = 1; +static int c__1 = 1; /** RELAPACK_ZTRSYL_REC2 solves the complex Sylvester matrix equation (unblocked algorithm) * @@ -51,12 +51,12 @@ static blasint c__1 = 1; * It serves as an unblocked kernel in the recursive algorithms. * */ /* Subroutine */ void RELAPACK_ztrsyl_rec2(char *trana, char *tranb, int - *isgn, blasint *m, blasint *n, doublecomplex *a, blasint *lda, - doublecomplex *b, blasint *ldb, doublecomplex *c__, blasint *ldc, - double *scale, blasint *info, ftnlen trana_len, ftnlen tranb_len) + *isgn, int *m, int *n, doublecomplex *a, int *lda, + doublecomplex *b, int *ldb, doublecomplex *c__, int *ldc, + double *scale, int *info, ftnlen trana_len, ftnlen tranb_len) { /* System generated locals */ - blasint a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, + int a_dim1, a_offset, b_dim1, b_offset, c_dim1, c_offset, i__1, i__2, i__3, i__4; double d__1, d__2; doublecomplex z__1, z__2, z__3, z__4; @@ -66,7 +66,7 @@ static blasint c__1 = 1; void d_cnjg(doublecomplex *, doublecomplex *); /* Local variables */ - static blasint j, k, l; + static int j, k, l; static doublecomplex a11; static double db; static doublecomplex x11; @@ -74,23 +74,23 @@ static blasint c__1 = 1; static doublecomplex vec; static double dum[1], eps, sgn, smin; static doublecomplex suml, sumr; - extern blasint lsame_(char *, char *, ftnlen, ftnlen); + extern int lsame_(char *, char *, ftnlen, ftnlen); /* Double Complex */ doublecomplex zdotc_(int *, - doublecomplex *, blasint *, doublecomplex *, blasint *), zdotu_( - blasint *, doublecomplex *, blasint *, - doublecomplex *, blasint *); - extern /* Subroutine */ blasint dlabad_(double *, double *); + doublecomplex *, int *, doublecomplex *, int *), zdotu_( + int *, doublecomplex *, int *, + doublecomplex *, int *); + extern /* Subroutine */ int dlabad_(double *, double *); extern double dlamch_(char *, ftnlen); static double scaloc; - extern /* Subroutine */ blasint xerbla_(char *, blasint *, ftnlen); - extern double zlange_(char *, blasint *, blasint *, doublecomplex *, - blasint *, double *, ftnlen); + extern /* Subroutine */ int xerbla_(char *, int *, ftnlen); + extern double zlange_(char *, int *, int *, doublecomplex *, + int *, double *, ftnlen); static double bignum; - extern /* Subroutine */ blasint zdscal_(int *, double *, - doublecomplex *, blasint *); + extern /* Subroutine */ int zdscal_(int *, double *, + doublecomplex *, int *); /* Double Complex */ doublecomplex zladiv_(doublecomplex *, doublecomplex *); - static blasint notrna, notrnb; + static int notrna, notrnb; static double smlnum; /* Parameter adjustments */ diff --git a/relapack/src/ztrtri.c b/relapack/src/ztrtri.c index 3f6606d84..ac9fe7bd4 100644 --- a/relapack/src/ztrtri.c +++ b/relapack/src/ztrtri.c @@ -1,7 +1,7 @@ #include "relapack.h" -static void RELAPACK_ztrtri_rec(const char *, const char *, const blasint *, - double *, const blasint *, blasint *); +static void RELAPACK_ztrtri_rec(const char *, const char *, const int *, + double *, const int *, int *); /** CTRTRI computes the inverse of a complex upper or lower triangular matrix A. @@ -11,16 +11,16 @@ static void RELAPACK_ztrtri_rec(const char *, const char *, const blasint *, * http://www.netlib.org/lapack/explore-html/d1/d0e/ztrtri_8f.html * */ void RELAPACK_ztrtri( - const char *uplo, const char *diag, const blasint *n, - double *A, const blasint *ldA, - blasint *info + const char *uplo, const char *diag, const int *n, + double *A, const int *ldA, + int *info ) { // Check arguments - const blasint lower = LAPACK(lsame)(uplo, "L"); - const blasint upper = LAPACK(lsame)(uplo, "U"); - const blasint nounit = LAPACK(lsame)(diag, "N"); - const blasint unit = LAPACK(lsame)(diag, "U"); + const int lower = LAPACK(lsame)(uplo, "L"); + const int upper = LAPACK(lsame)(uplo, "U"); + const int nounit = LAPACK(lsame)(diag, "N"); + const int unit = LAPACK(lsame)(diag, "U"); *info = 0; if (!lower && !upper) *info = -1; @@ -31,8 +31,8 @@ void RELAPACK_ztrtri( else if (*ldA < MAX(1, *n)) *info = -5; if (*info) { - const blasint minfo = -*info; - LAPACK(xerbla)("ZTRTRI", &minfo, strlen("ZTRTRI")); + const int minfo = -*info; + LAPACK(xerbla)("ZTRTRI", &minfo); return; } @@ -42,7 +42,7 @@ void RELAPACK_ztrtri( // check for singularity if (nounit) { - blasint i; + int i; for (i = 0; i < *n; i++) if (A[2 * (i + *ldA * i)] == 0 && A[2 * (i + *ldA * i) + 1] == 0) { *info = i; @@ -57,9 +57,9 @@ void RELAPACK_ztrtri( /** ztrtri's recursive compute kernel */ static void RELAPACK_ztrtri_rec( - const char *uplo, const char *diag, const blasint *n, - double *A, const blasint *ldA, - blasint *info + const char *uplo, const char *diag, const int *n, + double *A, const int *ldA, + int *info ){ if (*n <= MAX(CROSSOVER_ZTRTRI, 1)) { @@ -73,8 +73,8 @@ static void RELAPACK_ztrtri_rec( const double MONE[] = { -1. }; // Splitting - const blasint n1 = ZREC_SPLIT(*n); - const blasint n2 = *n - n1; + const int n1 = ZREC_SPLIT(*n); + const int n2 = *n - n1; // A_TL A_TR // A_BL A_BR diff --git a/test/cblat1.f b/test/cblat1.f index d6b53d105..a4c996fda 100644 --- a/test/cblat1.f +++ b/test/cblat1.f @@ -576,7 +576,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/test/dblat1.f b/test/dblat1.f index 28af121cd..f3255fef4 100644 --- a/test/dblat1.f +++ b/test/dblat1.f @@ -991,7 +991,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/test/sblat1.f b/test/sblat1.f index fe05bbe87..a5c1c6af6 100644 --- a/test/sblat1.f +++ b/test/sblat1.f @@ -946,7 +946,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/test/zblat1.f b/test/zblat1.f index 8b4b8d21e..e2415e1c4 100644 --- a/test/zblat1.f +++ b/test/zblat1.f @@ -576,7 +576,7 @@ SUBROUTINE STEST1(SCOMP1,STRUE1,SSIZE,SFAC) * ************************* STEST1 ***************************** * -* THIS IS AN INTERFACE SUBROUTINE TO ACCOMMODATE THE FORTRAN +* THIS IS AN INTERFACE SUBROUTINE TO ACCOMODATE THE FORTRAN * REQUIREMENT THAT WHEN A DUMMY ARGUMENT IS AN ARRAY, THE * ACTUAL ARGUMENT MUST ALSO BE AN ARRAY OR AN ARRAY ELEMENT. * diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index 4e647cadc..1b426afe7 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -38,7 +38,6 @@ if (NOT NO_LAPACK) set(OpenBLAS_utest_src ${OpenBLAS_utest_src} test_potrs.c - test_kernel_regress.c ) endif() @@ -62,7 +61,7 @@ foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) set_target_properties( ${OpenBLAS_utest_bin} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${CMAKE_CURRENT_BINARY_DIR}) endforeach() -if (MSVC AND BUILD_SHARED_LIBS) +if (MSVC) add_custom_command(TARGET ${OpenBLAS_utest_bin} POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_BINARY_DIR}/lib/${CMAKE_CFG_INTDIR}/${OpenBLAS_LIBNAME}.dll ${CMAKE_CURRENT_BINARY_DIR}/. diff --git a/utest/Makefile b/utest/Makefile index cbe639cdb..e40b3c6db 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -13,7 +13,6 @@ OBJS=utest_main.o test_amax.o test_rotmg.o test_axpy.o test_dotu.o test_dsdot.o ifneq ($(NO_LAPACK), 1) OBJS += test_potrs.o -OBJS += test_kernel_regress.o endif #this does not work with OpenMP nor with native Windows or Android threads @@ -38,3 +37,4 @@ clean: -rm -f *.o $(UTESTBIN) libs: + diff --git a/utest/ctest.h b/utest/ctest.h index d316b1494..f297dafba 100644 --- a/utest/ctest.h +++ b/utest/ctest.h @@ -83,6 +83,10 @@ struct ctest { #undef CTEST_SEGFAULT #endif +#if _MSC_VER < 1900 +#define snprintf _snprintf +#endif + #ifndef __cplusplus #define inline __inline #endif diff --git a/utest/test_kernel_regress.c b/utest/test_kernel_regress.c deleted file mode 100644 index 93a30b30c..000000000 --- a/utest/test_kernel_regress.c +++ /dev/null @@ -1,50 +0,0 @@ -#include "openblas_utest.h" -#include -#include -#include - -#define LAPACK_ROW_MAJOR 101 -blasint LAPACKE_dgesvd( blasint matrix_layout, char jobu, char jobvt, - blasint m, blasint n, double* a, - blasint lda, double* s, double* u, blasint ldu, - double* vt, blasint ldvt, double* superb ); - - -#define DATASIZE 100 - -double s[DATASIZE]; -double u[DATASIZE*DATASIZE]; -double vt[DATASIZE*DATASIZE]; -double X[DATASIZE*DATASIZE]; -double superb[DATASIZE]; -double tmp[DATASIZE*DATASIZE]; -double m[DATASIZE*DATASIZE]; - -CTEST(kernel_regress,skx_avx) -{ - double norm; - int i, j, info; - srand(0); - for (i = 0; i < DATASIZE*DATASIZE; i++) { - m[i] = (rand()+0.0)/RAND_MAX * 10; - tmp[i] = m[i]; - } - - info = LAPACKE_dgesvd( LAPACK_ROW_MAJOR, 'A', 'A', DATASIZE, DATASIZE, m, DATASIZE, - s, u, DATASIZE, vt, DATASIZE, superb); - - for (i = 0; i < DATASIZE; i++) { - for (j = 0; j < DATASIZE; j++) { - u[i*DATASIZE+j] = u[i*DATASIZE+j]*s[j]; - } - } - cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, - DATASIZE, DATASIZE, DATASIZE, 1, u, DATASIZE, vt, DATASIZE, 0, X, DATASIZE); - - for (i = 0; i < DATASIZE*DATASIZE; i++) { - X[i] = X[i] - tmp[i]; - } - - norm = cblas_dnrm2(DATASIZE*DATASIZE, X, 1); - ASSERT_DBL_NEAR_TOL(0.0, norm, 1e-10); -}