Merge develop in preparation of 0.3.6 release (tags/v0.3.6)
| @@ -149,7 +149,7 @@ matrix: | |||
| - &test-macos | |||
| os: osx | |||
| osx_image: xcode8 | |||
| osx_image: xcode10.1 | |||
| before_script: | |||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" | |||
| - brew update | |||
| @@ -160,6 +160,7 @@ matrix: | |||
| - BTYPE="BINARY=64 INTERFACE64=1" | |||
| - <<: *test-macos | |||
| osx_image: xcode8.3 | |||
| env: | |||
| - BTYPE="BINARY=32" | |||
| @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) | |||
| project(OpenBLAS C ASM) | |||
| set(OpenBLAS_MAJOR_VERSION 0) | |||
| set(OpenBLAS_MINOR_VERSION 3) | |||
| set(OpenBLAS_PATCH_VERSION 5) | |||
| set(OpenBLAS_PATCH_VERSION 6) | |||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | |||
| # Adhere to GNU filesystem layout conventions | |||
| @@ -42,6 +42,19 @@ endif() | |||
| ####### | |||
| if(MSVC AND MSVC_STATIC_CRT) | |||
| set(CompilerFlags | |||
| CMAKE_CXX_FLAGS | |||
| CMAKE_CXX_FLAGS_DEBUG | |||
| CMAKE_CXX_FLAGS_RELEASE | |||
| CMAKE_C_FLAGS | |||
| CMAKE_C_FLAGS_DEBUG | |||
| CMAKE_C_FLAGS_RELEASE | |||
| ) | |||
| foreach(CompilerFlag ${CompilerFlags}) | |||
| string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") | |||
| endforeach() | |||
| endif() | |||
| message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") | |||
| @@ -62,10 +75,10 @@ endif () | |||
| set(SUBDIRS ${BLASDIRS}) | |||
| if (NOT NO_LAPACK) | |||
| list(APPEND SUBDIRS lapack) | |||
| if(BUILD_RELAPACK) | |||
| list(APPEND SUBDIRS relapack/src) | |||
| endif() | |||
| list(APPEND SUBDIRS lapack) | |||
| endif () | |||
| # set which float types we want to build for | |||
| @@ -134,7 +147,7 @@ endif () | |||
| # Only generate .def for dll on MSVC and always produce pdb files for debug and release | |||
| if(MSVC) | |||
| if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4) | |||
| if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4) | |||
| set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def") | |||
| endif() | |||
| set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi") | |||
| @@ -149,15 +162,9 @@ if (${DYNAMIC_ARCH}) | |||
| endforeach() | |||
| endif () | |||
| # Only build shared libs for MSVC | |||
| if (MSVC) | |||
| set(BUILD_SHARED_LIBS ON) | |||
| endif() | |||
| # add objects to the openblas lib | |||
| add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | |||
| target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>) | |||
| target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>) | |||
| # Android needs to explicitly link against libm | |||
| if(ANDROID) | |||
| @@ -166,7 +173,7 @@ endif() | |||
| # Handle MSVC exports | |||
| if(MSVC AND BUILD_SHARED_LIBS) | |||
| if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4) | |||
| if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4) | |||
| include("${PROJECT_SOURCE_DIR}/cmake/export.cmake") | |||
| else() | |||
| # Creates verbose .def file (51KB vs 18KB) | |||
| @@ -217,6 +224,14 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES | |||
| SOVERSION ${OpenBLAS_MAJOR_VERSION} | |||
| ) | |||
| if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) | |||
| if (NOT MSVC) | |||
| target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition") | |||
| else() | |||
| target_link_libraries(${OpenBLAS_LIBNAME} "/FORCE:MULTIPLE") | |||
| endif() | |||
| endif() | |||
| if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "") | |||
| if (NOT DEFINED ARCH) | |||
| set(ARCH_IN "x86_64") | |||
| @@ -314,7 +329,7 @@ install (FILES ${OPENBLAS_CONFIG_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
| if(NOT NOFORTRAN) | |||
| message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}") | |||
| set(F77BLAS_H ${CMAKE_BINARY_DIR}/f77blas.h) | |||
| set(F77BLAS_H ${CMAKE_BINARY_DIR}/generated/f77blas.h) | |||
| file(WRITE ${F77BLAS_H} "#ifndef OPENBLAS_F77BLAS_H\n") | |||
| file(APPEND ${F77BLAS_H} "#define OPENBLAS_F77BLAS_H\n") | |||
| file(APPEND ${F77BLAS_H} "#include \"openblas_config.h\"\n") | |||
| @@ -327,10 +342,11 @@ endif() | |||
| if(NOT NO_CBLAS) | |||
| message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") | |||
| set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) | |||
| file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) | |||
| string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| file(WRITE ${CMAKE_BINARY_DIR}/cblas.tmp "${CBLAS_H_CONTENTS_NEW}") | |||
| install (FILES ${CMAKE_BINARY_DIR}/cblas.tmp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} RENAME cblas.h) | |||
| file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") | |||
| install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
| endif() | |||
| if(NOT NO_LAPACKE) | |||
| @@ -1,4 +1,82 @@ | |||
| OpenBLAS ChangeLog | |||
| ==================================================================== | |||
| Version 0.3.6 | |||
| 29-Apr-2019 | |||
| common: | |||
| * the build tools now check that a given cpu TARGET is actually valid | |||
| * the build-time check of system features (c_check) has been made | |||
| less dependent on particular perl features (this should mainly | |||
| benefit building on Windows) | |||
| * several problems with the ReLAPACK integration were fixed, | |||
| including INTERFACE64 support and building a shared library | |||
| * building with CMAKE on BSD systems was improved | |||
| * a non-absolute SUM function was added based on the | |||
| existing optimized code for ASUM | |||
| * CBLAS interfaces to the IxMIN and IxMAX functions were added | |||
| * a name clash between LAPACKE and BOOST headers was resolved | |||
| * a failure of CMAKE builds with OpenMP to include the appropriate getrf_parallel | |||
| kernels was fixed | |||
| * a crash on thread (key) deletion with the USE_TLS=1 memory management | |||
| option was fixed | |||
| * restored several earlier fixes, in particular for OpenMP performance, | |||
| building on BSD, and calling fork on CYGWIN, which had inadvertently | |||
| been dropped in the 0.3.3 rewrite of the memory management code. | |||
| x86_64: | |||
| * the AVX512 DGEMM kernel has been disabled again due to unsolved problems | |||
| * building with old versions of MSVC was fixed | |||
| * it is now possible to build a static library on Windows with CMAKE | |||
| * accessing environment variables on CYGWIN at run time was fixed | |||
| * the CMAKE build system now recognizes 32bit userspace on 64bit hardware | |||
| * Intel "Denverton" atom and Hygon "Dhyana" zen CPUs are now autodetected | |||
| * building for DYNAMIC_ARCH with a DYNAMIC_LIST of targets is now supported | |||
| with CMAKE as well | |||
| * building for DYNAMIC_ARCH with GENERIC as the default target is now supported | |||
| * a buffer overflow in the SSE GEMM kernel for Intel Nano targets was fixed | |||
| * assembly bugs involving undeclared modification of input operands were fixed | |||
| in the AXPY, DOT, GEMV, GER, SCAL, SYMV and TRSM microkernels for Nehalem, | |||
| Sandybridge, Haswell, Bulldozer and Piledriver. These would typically cause | |||
| test failures or segfaults when compiled with recent versions of gcc from 8 onward. | |||
| * a similar bug was fixed in the blas_quickdivide code used to split workloads | |||
| in most functions | |||
| * a bug in the IxMIN implementation for the GENERIC target made it return the result of IxMAX | |||
| * fixed building on SkylakeX systems when either the compiler or the (emulated) operating | |||
| environment does not support AVX512 | |||
| * improved GEMM performance on ZEN targets | |||
| x86: | |||
| * build failures caused by the recently added checks for AVX512 were fixed | |||
| * an inline assembly bug involving undeclared modification of an input argument was | |||
| fixed in the blas_quickdivide code used to split workloads in most functions | |||
| * a bug in the IMIN implementation for the GENERIC target made it return the result of IMAX | |||
| MIPS32: | |||
| * a bug in the IMIN implementation made it return the result of IMAX | |||
| POWER: | |||
| * single precision BLAS1/2 functions have received optimized POWER8 kernels | |||
| * POWER9 is now a separate target, with an optimized DGEMM/DTRMM kernel | |||
| * building on PPC970 systems under OSX Leopard or Tiger is now supported | |||
| * out-of-bounds memory accesses in the gemm_beta microkernels were fixed | |||
| * building a shared library on AIX is now supported for POWER6 | |||
| * DYNAMIC_ARCH support has been added for POWER6 and newer | |||
| ARMv7: | |||
| * corrected xDOT behaviour with zero INC_X or INC_Y | |||
| * a bug in the IMIN implementation made it return the result of IMAX | |||
| ARMv8: | |||
| * added support for HiSilicon TSV110 cpus | |||
| * the CMAKE build system now recognizes 32bit userspace on 64bit hardware | |||
| * cross-compilation with CMAKE now works again | |||
| * a bug in the IMIN implementation made it return the result of IMAX | |||
| * ARMV8 builds with the BINARY=32 option are now automatically handled as ARMV7 | |||
| IBM Z: | |||
| * optimized microkernels for single precision BLAS1/2 functions have been added | |||
| for both Z13 and Z14 | |||
| ==================================================================== | |||
| Version 0.3.5 | |||
| 31-Dec-2018 | |||
| @@ -96,7 +96,7 @@ endif | |||
| @echo | |||
| shared : | |||
| ifndef NO_SHARED | |||
| ifneq ($(NO_SHARED), 1) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) | |||
| @$(MAKE) -C exports so | |||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| @@ -38,3 +38,8 @@ ifeq ($(CORE), THUNDERX2T99) | |||
| CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||
| FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||
| endif | |||
| ifeq ($(CORE), TSV110) | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | |||
| endif | |||
| @@ -58,14 +58,14 @@ ifndef NO_LAPACKE | |||
| endif | |||
| #for install static library | |||
| ifndef NO_STATIC | |||
| ifneq ($(NO_STATIC),1) | |||
| @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||
| endif | |||
| #for install shared library | |||
| ifndef NO_SHARED | |||
| ifneq ($(NO_SHARED),1) | |||
| @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) | |||
| @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @@ -106,14 +106,14 @@ ifndef NO_LAPACKE | |||
| endif | |||
| #for install static library | |||
| ifndef NO_STATIC | |||
| ifneq ($(NO_STATIC),1) | |||
| @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||
| endif | |||
| #for install shared library | |||
| ifndef NO_SHARED | |||
| ifneq ($(NO_SHARED),1) | |||
| @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| @@ -138,7 +138,7 @@ endif | |||
| @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
| @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
| ifndef NO_SHARED | |||
| ifneq ($(NO_SHARED),1) | |||
| #ifeq logical or | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) | |||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
| @@ -9,7 +9,15 @@ else | |||
| USE_OPENMP = 1 | |||
| endif | |||
| ifeq ($(CORE), POWER9) | |||
| ifeq ($(USE_OPENMP), 1) | |||
| COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| else | |||
| COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), POWER8) | |||
| ifeq ($(USE_OPENMP), 1) | |||
| @@ -3,7 +3,7 @@ | |||
| # | |||
| # This library's version | |||
| VERSION = 0.3.5 | |||
| VERSION = 0.3.6 | |||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | |||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | |||
| @@ -48,6 +48,8 @@ VERSION = 0.3.5 | |||
| # HOSTCC = gcc | |||
| # If you need 32bit binary, define BINARY=32, otherwise define BINARY=64 | |||
| # Please note that AVX is not available on 32-bit. | |||
| # Setting BINARY=32 disables AVX/AVX2/AVX-512. | |||
| # BINARY=64 | |||
| # About threaded BLAS. It will be automatically detected if you don't | |||
| @@ -57,7 +59,7 @@ VERSION = 0.3.5 | |||
| # USE_THREAD = 0 | |||
| # If you're going to use this library with OpenMP, please comment it in. | |||
| # This flag is always set for POWER8. Don't modify the flag | |||
| # This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8. | |||
| # USE_OPENMP = 1 | |||
| # The OpenMP scheduler to use - by default this is "static" and you | |||
| @@ -68,36 +70,45 @@ VERSION = 0.3.5 | |||
| # allow you to select the scheduler from the environment variable OMP_SCHEDULE | |||
| # CCOMMON_OPT += -DOMP_SCHED=dynamic | |||
| # You can define maximum number of threads. Basically it should be | |||
| # less than actual number of cores. If you don't specify one, it's | |||
| # automatically detected by the the script. | |||
| # You can define the maximum number of threads. Basically it should be less | |||
| # than or equal to the number of CPU threads. If you don't specify one, it's | |||
| # automatically detected by the build system. | |||
| # If SMT (aka. HT) is enabled on the system, it may or may not be beneficial to | |||
| # restrict NUM_THREADS to the number of physical cores. By default, the automatic | |||
| # detection includes logical CPUs, thus allowing the use of SMT. | |||
| # Users may opt at runtime to use less than NUM_THREADS threads. | |||
| # | |||
| # Note for package maintainers: you can build OpenBLAS with a large NUM_THREADS | |||
| # value (eg. 32-256) if you expect your users to use that many threads. Due to the way | |||
| # some internal structures are allocated, using a large NUM_THREADS value has a RAM | |||
| # footprint penalty, even if users reduce the actual number of threads at runtime. | |||
| # NUM_THREADS = 24 | |||
| # If you have enabled USE_OPENMP and your application would call | |||
| # OpenBLAS's calculation API from multi threads, please comment it in. | |||
| # This flag defines how many instances of OpenBLAS's calculation API can | |||
| # actually run in parallel. If more threads call OpenBLAS's calculation API, | |||
| # OpenBLAS's calculation API from multiple threads, please comment this in. | |||
| # This flag defines how many instances of OpenBLAS's calculation API can actually | |||
| # run in parallel. If more than NUM_PARALLEL threads call OpenBLAS's calculation API, | |||
| # they need to wait for the preceding API calls to finish or risk data corruption. | |||
| # NUM_PARALLEL = 2 | |||
| # if you don't need to install the static library, please comment it in. | |||
| # If you don't need to install the static library, please comment this in. | |||
| # NO_STATIC = 1 | |||
| # if you don't need generate the shared library, please comment it in. | |||
| # If you don't need to generate the shared library, please comment this in. | |||
| # NO_SHARED = 1 | |||
| # If you don't need CBLAS interface, please comment it in. | |||
| # If you don't need the CBLAS interface, please comment this in. | |||
| # NO_CBLAS = 1 | |||
| # If you only want CBLAS interface without installing Fortran compiler, | |||
| # please comment it in. | |||
| # If you only want the CBLAS interface without installing a Fortran compiler, | |||
| # please comment this in. | |||
| # ONLY_CBLAS = 1 | |||
| # If you don't need LAPACK, please comment it in. | |||
| # If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1. | |||
| # If you don't need LAPACK, please comment this in. | |||
| # If you set NO_LAPACK=1, the build system automatically sets NO_LAPACKE=1. | |||
| # NO_LAPACK = 1 | |||
| # If you don't need LAPACKE (C Interface to LAPACK), please comment it in. | |||
| # If you don't need LAPACKE (C Interface to LAPACK), please comment this in. | |||
| # NO_LAPACKE = 1 | |||
| # Build LAPACK Deprecated functions since LAPACK 3.6.0 | |||
| @@ -106,7 +117,7 @@ BUILD_LAPACK_DEPRECATED = 1 | |||
| # Build RecursiveLAPACK on top of LAPACK | |||
| # BUILD_RELAPACK = 1 | |||
| # If you want to use legacy threaded Level 3 implementation. | |||
| # If you want to use the legacy threaded Level 3 implementation. | |||
| # USE_SIMPLE_THREADED_LEVEL3 = 1 | |||
| # If you want to use the new, still somewhat experimental code that uses | |||
| @@ -116,8 +127,8 @@ BUILD_LAPACK_DEPRECATED = 1 | |||
| # USE_TLS = 1 | |||
| # If you want to drive whole 64bit region by BLAS. Not all Fortran | |||
| # compiler supports this. It's safe to keep comment it out if you | |||
| # are not sure(equivalent to "-i8" option). | |||
| # compilers support this. It's safe to keep this commented out if you | |||
| # are not sure. (This is equivalent to the "-i8" ifort option). | |||
| # INTERFACE64 = 1 | |||
| # Unfortunately most of kernel won't give us high quality buffer. | |||
| @@ -125,10 +136,18 @@ BUILD_LAPACK_DEPRECATED = 1 | |||
| # but it will consume time. If you don't like it, you can disable one. | |||
| NO_WARMUP = 1 | |||
| # If you want to disable CPU/Memory affinity on Linux. | |||
| # Comment this in if you want to disable OpenBLAS's CPU/Memory affinity handling. | |||
| # This feature is only implemented on Linux, and is always disabled on other platforms. | |||
| # Enabling affinity handling may improve performance, especially on NUMA systems, but | |||
| # it may conflict with certain applications that also try to manage affinity. | |||
| # This conflict can result in threads of the application calling OpenBLAS ending up locked | |||
| # to the same core(s) as OpenBLAS, possibly binding all threads to a single core. | |||
| # For this reason, affinity handling is disabled by default. Can be safely enabled if nothing | |||
| # else modifies affinity settings. | |||
| # Note: enabling affinity has been known to cause problems with NumPy and R | |||
| NO_AFFINITY = 1 | |||
| # if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus | |||
| # If you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus | |||
| # BIGNUMA = 1 | |||
| # Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers | |||
| @@ -180,7 +199,7 @@ NO_AFFINITY = 1 | |||
| # been reported to be optimal for certain workloads (50 is the recommended value for Julia). | |||
| # GEMM_MULTITHREAD_THRESHOLD = 4 | |||
| # If you need santy check by comparing reference BLAS. It'll be very | |||
| # If you need a sanity check by comparing results to the reference BLAS. It'll be very | |||
| # slow (Not implemented yet). | |||
| # SANITY_CHECK = 1 | |||
| @@ -65,6 +65,7 @@ endif | |||
| ifdef TARGET | |||
| GETARCH_FLAGS := -DFORCE_$(TARGET) | |||
| GETARCH_FLAGS += -DUSER_TARGET | |||
| endif | |||
| # Force fallbacks for 32bit | |||
| @@ -94,6 +95,9 @@ endif | |||
| ifeq ($(TARGET), ZEN) | |||
| GETARCH_FLAGS := -DFORCE_BARCELONA | |||
| endif | |||
| ifeq ($(TARGET), ARMV8) | |||
| GETARCH_FLAGS := -DFORCE_ARMV7 | |||
| endif | |||
| endif | |||
| @@ -151,7 +155,8 @@ GETARCH_FLAGS += -DNO_AVX | |||
| endif | |||
| ifeq ($(BINARY), 32) | |||
| GETARCH_FLAGS += -DNO_AVX | |||
| GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512 | |||
| NO_AVX512 = 1 | |||
| endif | |||
| ifeq ($(NO_AVX2), 1) | |||
| @@ -523,6 +528,12 @@ DYNAMIC_CORE += THUNDERX | |||
| DYNAMIC_CORE += THUNDERX2T99 | |||
| endif | |||
| ifeq ($(ARCH), power) | |||
| DYNAMIC_CORE = POWER6 | |||
| DYNAMIC_CORE += POWER8 | |||
| DYNAMIC_CORE += POWER9 | |||
| endif | |||
| # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty | |||
| ifndef DYNAMIC_CORE | |||
| override DYNAMIC_ARCH= | |||
| @@ -4,3 +4,7 @@ CCOMMON_OPT += -march=z13 -mzvector | |||
| FCOMMON_OPT += -march=z13 -mzvector | |||
| endif | |||
| ifeq ($(CORE), Z14) | |||
| CCOMMON_OPT += -march=z14 -mzvector | |||
| FCOMMON_OPT += -march=z14 -mzvector | |||
| endif | |||
| @@ -48,6 +48,7 @@ POWER5 | |||
| POWER6 | |||
| POWER7 | |||
| POWER8 | |||
| POWER9 | |||
| PPCG4 | |||
| PPC970 | |||
| PPC970MP | |||
| @@ -90,7 +91,9 @@ CORTEXA73 | |||
| FALKOR | |||
| THUNDERX | |||
| THUNDERX2T99 | |||
| TSV110 | |||
| 9.System Z: | |||
| ZARCH_GENERIC | |||
| Z13 | |||
| Z14 | |||
| @@ -53,9 +53,9 @@ before_build: | |||
| - ps: if (-Not (Test-Path .\build)) { mkdir build } | |||
| - cd build | |||
| - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. | |||
| - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl .. | |||
| - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. | |||
| - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. | |||
| - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON .. | |||
| - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. | |||
| build_script: | |||
| - cmake --build . | |||
| @@ -2,6 +2,8 @@ | |||
| argv <- commandArgs(trailingOnly = TRUE) | |||
| if (!is.null(options("matprod")[[1]])) options(matprod = "blas") | |||
| nfrom <- 128 | |||
| nto <- 2048 | |||
| nstep <- 128 | |||
| @@ -19,7 +21,6 @@ if (length(argv) > 0) { | |||
| loops <- as.numeric(argv[z]) | |||
| } | |||
| } | |||
| } | |||
| p <- Sys.getenv("OPENBLAS_LOOPS") | |||
| @@ -27,29 +28,21 @@ if (p != "") { | |||
| loops <- as.numeric(p) | |||
| } | |||
| cat(sprintf( | |||
| "From %.0f To %.0f Step=%.0f Loops=%.0f\n", | |||
| nfrom, | |||
| nto, | |||
| nstep, | |||
| loops | |||
| )) | |||
| cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) | |||
| cat(sprintf(" SIZE Flops Time\n")) | |||
| n <- nfrom | |||
| while (n <= nto) { | |||
| A <- matrix(rnorm(n * n), ncol = n, nrow = n) | |||
| A <- matrix(rnorm(n * n), nrow = n) | |||
| ev <- 0 | |||
| z <- system.time(for (l in 1:loops) { | |||
| ev <- eigen(A) | |||
| }) | |||
| mflops <- (26.66 * n * n * n) * loops / (z[3] * 1.0e6) | |||
| mflops <- (26.66 * n * n * n) * loops / (z[3] * 1e+06) | |||
| st <- sprintf("%.0fx%.0f :", n, n) | |||
| cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) | |||
| n <- n + nstep | |||
| } | |||
| @@ -2,6 +2,8 @@ | |||
| argv <- commandArgs(trailingOnly = TRUE) | |||
| if (!is.null(options("matprod")[[1]])) options(matprod = "blas") | |||
| nfrom <- 128 | |||
| nto <- 2048 | |||
| nstep <- 128 | |||
| @@ -19,7 +21,6 @@ if (length(argv) > 0) { | |||
| loops <- as.numeric(argv[z]) | |||
| } | |||
| } | |||
| } | |||
| p <- Sys.getenv("OPENBLAS_LOOPS") | |||
| @@ -27,26 +28,13 @@ if (p != "") { | |||
| loops <- as.numeric(p) | |||
| } | |||
| cat(sprintf( | |||
| "From %.0f To %.0f Step=%.0f Loops=%.0f\n", | |||
| nfrom, | |||
| nto, | |||
| nstep, | |||
| loops | |||
| )) | |||
| cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) | |||
| cat(sprintf(" SIZE Flops Time\n")) | |||
| n <- nfrom | |||
| while (n <= nto) { | |||
| A <- matrix(runif(n * n), | |||
| ncol = n, | |||
| nrow = n, | |||
| byrow = TRUE) | |||
| B <- matrix(runif(n * n), | |||
| ncol = n, | |||
| nrow = n, | |||
| byrow = TRUE) | |||
| A <- matrix(runif(n * n), nrow = n) | |||
| B <- matrix(runif(n * n), nrow = n) | |||
| C <- 1 | |||
| z <- system.time(for (l in 1:loops) { | |||
| @@ -54,11 +42,10 @@ while (n <= nto) { | |||
| l <- l + 1 | |||
| }) | |||
| mflops <- (2.0 * n * n * n) * loops / (z[3] * 1.0e6) | |||
| mflops <- (2.0 * n * n * n) * loops / (z[3] * 1e+06) | |||
| st <- sprintf("%.0fx%.0f :", n, n) | |||
| cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) | |||
| n <- n + nstep | |||
| } | |||
| @@ -2,6 +2,8 @@ | |||
| argv <- commandArgs(trailingOnly = TRUE) | |||
| if (!is.null(options("matprod")[[1]])) options(matprod = "blas") | |||
| nfrom <- 128 | |||
| nto <- 2048 | |||
| nstep <- 128 | |||
| @@ -19,7 +21,6 @@ if (length(argv) > 0) { | |||
| loops <- as.numeric(argv[z]) | |||
| } | |||
| } | |||
| } | |||
| p <- Sys.getenv("OPENBLAS_LOOPS") | |||
| @@ -27,31 +28,22 @@ if (p != "") { | |||
| loops <- as.numeric(p) | |||
| } | |||
| cat(sprintf( | |||
| "From %.0f To %.0f Step=%.0f Loops=%.0f\n", | |||
| nfrom, | |||
| nto, | |||
| nstep, | |||
| loops | |||
| )) | |||
| cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) | |||
| cat(sprintf(" SIZE Flops Time\n")) | |||
| n <- nfrom | |||
| while (n <= nto) { | |||
| A <- matrix(rnorm(n * n), ncol = n, nrow = n) | |||
| B <- matrix(rnorm(n * n), ncol = n, nrow = n) | |||
| A <- matrix(rnorm(n * n), nrow = n) | |||
| B <- matrix(rnorm(n * n), nrow = n) | |||
| z <- system.time(for (l in 1:loops) { | |||
| solve(A, B) | |||
| }) | |||
| mflops <- | |||
| (2.0 / 3.0 * n * n * n + 2.0 * n * n * n) * loops / (z[3] * 1.0e6) | |||
| mflops <- (8.0 / 3 * n * n * n) * loops / (z[3] * 1e+06) | |||
| st <- sprintf("%.0fx%.0f :", n, n) | |||
| cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) | |||
| n <- n + nstep | |||
| } | |||
| @@ -1,7 +1,7 @@ | |||
| #!/usr/bin/perl | |||
| use File::Basename; | |||
| use File::Temp qw(tempfile); | |||
| #use File::Basename; | |||
| # use File::Temp qw(tempfile); | |||
| # Checking cross compile | |||
| $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); | |||
| @@ -12,7 +12,7 @@ $hostarch = "arm64" if ($hostarch eq "aarch64"); | |||
| $hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/); | |||
| $hostarch = "zarch" if ($hostarch eq "s390x"); | |||
| $tmpf = new File::Temp( UNLINK => 1 ); | |||
| #$tmpf = new File::Temp( UNLINK => 1 ); | |||
| $binary = $ENV{"BINARY"}; | |||
| $makefile = shift(@ARGV); | |||
| @@ -31,12 +31,25 @@ if ($?) { | |||
| $cross_suffix = ""; | |||
| if (dirname($compiler_name) ne ".") { | |||
| $cross_suffix .= dirname($compiler_name) . "/"; | |||
| } | |||
| eval "use File::Basename"; | |||
| if ($@){ | |||
| warn "could not load PERL module File::Basename, emulating its functionality"; | |||
| my $dirnam = substr($compiler_name, 0, rindex($compiler_name, "/")-1 ); | |||
| if ($dirnam ne ".") { | |||
| $cross_suffix .= $dirnam . "/"; | |||
| } | |||
| my $basnam = substr($compiler_name, rindex($compiler_name,"/")+1, length($compiler_name)-rindex($compiler_name,"/")-1); | |||
| if ($basnam =~ /([^\s]*-)(.*)/) { | |||
| $cross_suffix .= $1; | |||
| } | |||
| } else { | |||
| if (dirname($compiler_name) ne ".") { | |||
| $cross_suffix .= dirname($compiler_name) . "/"; | |||
| } | |||
| if (basename($compiler_name) =~ /([^\s]*-)(.*)/) { | |||
| $cross_suffix .= $1; | |||
| if (basename($compiler_name) =~ /([^\s]*-)(.*)/) { | |||
| $cross_suffix .= $1; | |||
| } | |||
| } | |||
| $compiler = ""; | |||
| @@ -171,20 +184,26 @@ if ($?) { | |||
| $have_msa = 0; | |||
| if (($architecture eq "mips") || ($architecture eq "mips64")) { | |||
| $code = '"addvi.b $w0, $w1, 1"'; | |||
| $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; | |||
| print $tmpf "#include <msa.h>\n\n"; | |||
| print $tmpf "void main(void){ __asm__ volatile($code); }\n"; | |||
| $args = "$msa_flags -o $tmpf.o -x c $tmpf"; | |||
| my @cmd = ("$compiler_name $args"); | |||
| system(@cmd) == 0; | |||
| if ($? != 0) { | |||
| $have_msa = 0; | |||
| eval "use File::Temp qw(tempfile)"; | |||
| if ($@){ | |||
| warn "could not load PERL module File::Temp, so could not check MSA capatibility"; | |||
| } else { | |||
| $have_msa = 1; | |||
| $tmpf = new File::Temp( UNLINK => 1 ); | |||
| $code = '"addvi.b $w0, $w1, 1"'; | |||
| $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; | |||
| print $tmpf "#include <msa.h>\n\n"; | |||
| print $tmpf "void main(void){ __asm__ volatile($code); }\n"; | |||
| $args = "$msa_flags -o $tmpf.o -x c $tmpf"; | |||
| my @cmd = ("$compiler_name $args"); | |||
| system(@cmd) == 0; | |||
| if ($? != 0) { | |||
| $have_msa = 0; | |||
| } else { | |||
| $have_msa = 1; | |||
| } | |||
| unlink("$tmpf.o"); | |||
| } | |||
| unlink("$tmpf.o"); | |||
| } | |||
| $architecture = x86 if ($data =~ /ARCH_X86/); | |||
| @@ -204,17 +223,25 @@ $binformat = bin64 if ($data =~ /BINARY_64/); | |||
| $no_avx512= 0; | |||
| if (($architecture eq "x86") || ($architecture eq "x86_64")) { | |||
| $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; | |||
| print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n"; | |||
| $args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf"; | |||
| my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); | |||
| system(@cmd) == 0; | |||
| if ($? != 0) { | |||
| $no_avx512 = 1; | |||
| } else { | |||
| eval "use File::Temp qw(tempfile)"; | |||
| if ($@){ | |||
| warn "could not load PERL module File::Temp, so could not check compiler compatibility with AVX512"; | |||
| $no_avx512 = 0; | |||
| } else { | |||
| # $tmpf = new File::Temp( UNLINK => 1 ); | |||
| ($fh,$tmpf) = tempfile( UNLINK => 1 ); | |||
| $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; | |||
| print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n"; | |||
| $args = " -march=skylake-avx512 -c -o $tmpf.o -x c $tmpf"; | |||
| my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); | |||
| system(@cmd) == 0; | |||
| if ($? != 0) { | |||
| $no_avx512 = 1; | |||
| } else { | |||
| $no_avx512 = 0; | |||
| } | |||
| unlink("tmpf.o"); | |||
| } | |||
| unlink("tmpf.o"); | |||
| } | |||
| $data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; | |||
| @@ -73,6 +73,11 @@ double cblas_dasum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS | |||
| float cblas_scasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| double cblas_dzasum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| float cblas_ssum (OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | |||
| double cblas_dsum (OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | |||
| float cblas_scsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| double cblas_dzsum(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| float cblas_snrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST float *X, OPENBLAS_CONST blasint incX); | |||
| double cblas_dnrm2 (OPENBLAS_CONST blasint N, OPENBLAS_CONST double *X, OPENBLAS_CONST blasint incX); | |||
| float cblas_scnrm2(OPENBLAS_CONST blasint N, OPENBLAS_CONST void *X, OPENBLAS_CONST blasint incX); | |||
| @@ -88,6 +93,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE | |||
| CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_izmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_ismin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_idmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_icmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| CBLAS_INDEX cblas_izmin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx); | |||
| void cblas_saxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy); | |||
| @@ -74,6 +74,9 @@ if (DYNAMIC_ARCH) | |||
| if (NOT NO_AVX512) | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) | |||
| endif () | |||
| if (DYNAMIC_LIST) | |||
| set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST}) | |||
| endif () | |||
| endif () | |||
| if (NOT DYNAMIC_CORE) | |||
| @@ -107,6 +107,12 @@ macro(SetDefaultL1) | |||
| set(DAXPBYKERNEL ../arm/axpby.c) | |||
| set(CAXPBYKERNEL ../arm/zaxpby.c) | |||
| set(ZAXPBYKERNEL ../arm/zaxpby.c) | |||
| set(SSUMKERNEL sum.S) | |||
| set(DSUMKERNEL sum.S) | |||
| set(CSUMKERNEL zsum.S) | |||
| set(ZSUMKERNEL zsum.S) | |||
| set(QSUMKERNEL sum.S) | |||
| set(XSUMKERNEL zsum.S) | |||
| endmacro () | |||
| macro(SetDefaultL2) | |||
| @@ -162,4 +168,4 @@ macro(SetDefaultL3) | |||
| set(DGEADD_KERNEL ../generic/geadd.c) | |||
| set(CGEADD_KERNEL ../generic/zgeadd.c) | |||
| set(ZGEADD_KERNEL ../generic/zgeadd.c) | |||
| endmacro () | |||
| endmacro () | |||
| @@ -8,6 +8,11 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") | |||
| set(NO_EXPRECISION 1) | |||
| endif () | |||
| # BSD-family systems: append -lm to EXTRALIB and set NO_EXPRECISION. | |||
| # MATCHES is an unanchored regex, so any system name containing one of these substrings qualifies. | |||
| if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly") | |||
| set(EXTRALIB "${EXTRALIB} -lm") | |||
| set(NO_EXPRECISION 1) | |||
| endif () | |||
| if (${CMAKE_SYSTEM_NAME} STREQUAL "AIX") | |||
| set(EXTRALIB "${EXTRALIB} -lm") | |||
| endif () | |||
| @@ -87,13 +87,18 @@ endif () | |||
| # Cannot run getarch on target if we are cross-compiling | |||
| if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSSTORE")) | |||
| # Write to config as getarch would | |||
| # Pick the core name to write into the config: an explicitly defined TARGET_CORE wins, | |||
| # otherwise fall back to CORE. | |||
| if (DEFINED TARGET_CORE) | |||
| set(TCORE ${TARGET_CORE}) | |||
| else() | |||
| set(TCORE ${CORE}) | |||
| endif() | |||
| # TODO: Set up defines that getarch sets up based on every other target | |||
| # Perhaps this should be inside a different file as it grows larger | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define ${CORE}\n" | |||
| "#define CHAR_CORENAME \"${CORE}\"\n") | |||
| if ("${CORE}" STREQUAL "ARMV7") | |||
| "#define ${TCORE}\n" | |||
| "#define CHAR_CORENAME \"${TCORE}\"\n") | |||
| if ("${TCORE}" STREQUAL "ARMV7") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE\t65536\n" | |||
| "#define L1_DATA_LINESIZE\t32\n" | |||
| @@ -108,7 +113,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 4) | |||
| set(DGEMM_UNROLL_N 4) | |||
| elseif ("${CORE}" STREQUAL "ARMV8") | |||
| elseif ("${TCORE}" STREQUAL "ARMV8") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE\t32768\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| @@ -118,9 +123,16 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define L2_ASSOCIATIVE\t32\n" | |||
| "#define ARMV8\n") | |||
| set(SGEMM_UNROLL_M 4) | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 4) | |||
| elseif ("${CORE}" STREQUAL "CORTEXA57" OR "${CORE}" STREQUAL "CORTEXA53") | |||
| set(DGEMM_UNROLL_M 8) | |||
| set(DGEMM_UNROLL_N 4) | |||
| set(CGEMM_UNROLL_M 8) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_CODE_SIZE\t32768\n" | |||
| "#define L1_CODE_LINESIZE\t64\n" | |||
| @@ -144,9 +156,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
| set(DGEMM_UNROLL_N 4) | |||
| set(CGEMM_UNROLL_M 8) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 8) | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| elseif ("${CORE}" STREQUAL "CORTEXA72" OR "${CORE}" STREQUAL "CORTEXA73") | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "CORTEXA72" OR "${TCORE}" STREQUAL "CORTEXA73") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_CODE_SIZE\t49152\n" | |||
| "#define L1_CODE_LINESIZE\t64\n" | |||
| @@ -170,9 +183,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
| set(DGEMM_UNROLL_N 4) | |||
| set(CGEMM_UNROLL_M 8) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 8) | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| elseif ("${CORE}" STREQUAL "FALKOR") | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "FALKOR") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_CODE_SIZE\t65536\n" | |||
| "#define L1_CODE_LINESIZE\t64\n" | |||
| @@ -196,9 +210,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
| set(DGEMM_UNROLL_N 4) | |||
| set(CGEMM_UNROLL_M 8) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 8) | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| elseif ("${CORE}" STREQUAL "THUNDERX) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "THUNDERX") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_CODE_SIZE\t32768\n" | |||
| "#define L1_CODE_LINESIZE\t64\n" | |||
| @@ -224,7 +239,8 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
| set(CGEMM_UNROLL_N 2) | |||
| set(ZGEMM_UNROLL_M 2) | |||
| set(ZGEMM_UNROLL_N 2) | |||
| elseif ("${CORE}" STREQUAL "THUNDERX2T99) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "THUNDERX2T99") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_CODE_SIZE\t32768\n" | |||
| "#define L1_CODE_LINESIZE\t64\n" | |||
| @@ -240,7 +256,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
| "#define L3_ASSOCIATIVE\t32\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define VULCAN\n") | |||
| "#define ARMV8\n") | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 8) | |||
| @@ -249,6 +265,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| endif() | |||
| # Or should this actually be NUM_CORES? | |||
| @@ -39,6 +39,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) | |||
| if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") | |||
| set(TARGET "BARCELONA") | |||
| endif () | |||
| # Replace the ARMV8, CORTEXA57 and CORTEXA53 targets with ARMV7. | |||
| # NOTE(review): per the hunk header this sits inside the "BINARY EQUAL 32" branch — confirm. | |||
| if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53") | |||
| set(TARGET "ARMV7") | |||
| endif () | |||
| endif () | |||
| if (DEFINED TARGET) | |||
| @@ -184,6 +187,13 @@ if (DYNAMIC_ARCH) | |||
| endif () | |||
| endif () | |||
| # When DYNAMIC_LIST is set, add -DDYNAMIC_LIST to CCOMMON_OPT, followed by one | |||
| # -DDYN_<core> definition for every entry in the list. | |||
| if (DYNAMIC_LIST) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_LIST") | |||
| foreach(DCORE ${DYNAMIC_LIST}) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DDYN_${DCORE}") | |||
| endforeach () | |||
| endif () | |||
| if (NO_LAPACK) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACK") | |||
| #Disable LAPACK C interface | |||
| @@ -39,13 +39,21 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") | |||
| set(MIPS64 1) | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") | |||
| set(X86_64 1) | |||
| if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") | |||
| set(X86_64 1) | |||
| else() | |||
| set(X86 1) | |||
| endif() | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") | |||
| set(X86 1) | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") | |||
| set(ARM 1) | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64.*|AARCH64.*)") | |||
| set(ARM64 1) | |||
| if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") | |||
| set(ARM64 1) | |||
| else() | |||
| set(ARM 1) | |||
| endif() | |||
| endif() | |||
| if (X86_64) | |||
| @@ -78,7 +86,7 @@ endif() | |||
| if (X86_64 OR X86) | |||
| file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -c -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) | |||
| if (NO_AVX512 EQUAL 1) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") | |||
| endif() | |||
| @@ -85,6 +85,8 @@ extern "C" { | |||
| #if !defined(_MSC_VER) | |||
| #include <unistd.h> | |||
| #elif _MSC_VER < 1900 | |||
| #define snprintf _snprintf | |||
| #endif | |||
| #include <time.h> | |||
| @@ -348,6 +350,11 @@ typedef int blasint; | |||
| #endif | |||
| #endif | |||
| /* On POWER9, default YIELDING to a run of eight nop instructions unless it is already defined. */ | |||
| #ifdef POWER9 | |||
| #ifndef YIELDING | |||
| #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); | |||
| #endif | |||
| #endif | |||
| /* | |||
| #ifdef PILEDRIVER | |||
| @@ -439,7 +446,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246 | |||
| typedef char env_var_t[MAX_PATH]; | |||
| #define readenv(p, n) 0 | |||
| #else | |||
| #ifdef OS_WINDOWS | |||
| #if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) | |||
| typedef char env_var_t[MAX_PATH]; | |||
| #define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p)) | |||
| #else | |||
| @@ -19,6 +19,7 @@ | |||
| #define CDOTC_K cdotc_k | |||
| #define CNRM2_K cnrm2_k | |||
| #define CSCAL_K cscal_k | |||
| #define CSUM_K csum_k | |||
| #define CSWAP_K cswap_k | |||
| #define CROT_K csrot_k | |||
| @@ -249,6 +250,7 @@ | |||
| #define CDOTC_K gotoblas -> cdotc_k | |||
| #define CNRM2_K gotoblas -> cnrm2_k | |||
| #define CSCAL_K gotoblas -> cscal_k | |||
| #define CSUM_K gotoblas -> csum_k | |||
| #define CSWAP_K gotoblas -> cswap_k | |||
| #define CROT_K gotoblas -> csrot_k | |||
| @@ -19,6 +19,7 @@ | |||
| #define DDOTC_K ddot_k | |||
| #define DNRM2_K dnrm2_k | |||
| #define DSCAL_K dscal_k | |||
| #define DSUM_K dsum_k | |||
| #define DSWAP_K dswap_k | |||
| #define DROT_K drot_k | |||
| @@ -174,6 +175,7 @@ | |||
| #define DDOTC_K gotoblas -> ddot_k | |||
| #define DNRM2_K gotoblas -> dnrm2_k | |||
| #define DSCAL_K gotoblas -> dscal_k | |||
| #define DSUM_K gotoblas -> dsum_k | |||
| #define DSWAP_K gotoblas -> dswap_k | |||
| #define DROT_K gotoblas -> drot_k | |||
| @@ -122,6 +122,13 @@ xdouble BLASFUNC(qasum) (blasint *, xdouble *, blasint *); | |||
| double BLASFUNC(dzasum)(blasint *, double *, blasint *); | |||
| xdouble BLASFUNC(qxasum)(blasint *, xdouble *, blasint *); | |||
| FLOATRET BLASFUNC(ssum) (blasint *, float *, blasint *); | |||
| FLOATRET BLASFUNC(scsum)(blasint *, float *, blasint *); | |||
| double BLASFUNC(dsum) (blasint *, double *, blasint *); | |||
| xdouble BLASFUNC(qsum) (blasint *, xdouble *, blasint *); | |||
| double BLASFUNC(dzsum)(blasint *, double *, blasint *); | |||
| xdouble BLASFUNC(qxsum)(blasint *, xdouble *, blasint *); | |||
| blasint BLASFUNC(isamax)(blasint *, float *, blasint *); | |||
| blasint BLASFUNC(idamax)(blasint *, double *, blasint *); | |||
| blasint BLASFUNC(iqamax)(blasint *, xdouble *, blasint *); | |||
| @@ -100,6 +100,13 @@ float casum_k (BLASLONG, float *, BLASLONG); | |||
| double zasum_k (BLASLONG, double *, BLASLONG); | |||
| xdouble xasum_k (BLASLONG, xdouble *, BLASLONG); | |||
| float ssum_k (BLASLONG, float *, BLASLONG); | |||
| double dsum_k (BLASLONG, double *, BLASLONG); | |||
| xdouble qsum_k (BLASLONG, xdouble *, BLASLONG); | |||
| float csum_k (BLASLONG, float *, BLASLONG); | |||
| double zsum_k (BLASLONG, double *, BLASLONG); | |||
| xdouble xsum_k (BLASLONG, xdouble *, BLASLONG); | |||
| float samax_k (BLASLONG, float *, BLASLONG); | |||
| double damax_k (BLASLONG, double *, BLASLONG); | |||
| xdouble qamax_k (BLASLONG, xdouble *, BLASLONG); | |||
| @@ -66,6 +66,7 @@ | |||
| #define DOTC_K QDOTC_K | |||
| #define NRM2_K QNRM2_K | |||
| #define SCAL_K QSCAL_K | |||
| #define SUM_K QSUM_K | |||
| #define SWAP_K QSWAP_K | |||
| #define ROT_K QROT_K | |||
| @@ -356,6 +357,7 @@ | |||
| #define DOTC_K DDOTC_K | |||
| #define NRM2_K DNRM2_K | |||
| #define SCAL_K DSCAL_K | |||
| #define SUM_K DSUM_K | |||
| #define SWAP_K DSWAP_K | |||
| #define ROT_K DROT_K | |||
| @@ -658,6 +660,7 @@ | |||
| #define DOTC_K SDOTC_K | |||
| #define NRM2_K SNRM2_K | |||
| #define SCAL_K SSCAL_K | |||
| #define SUM_K SSUM_K | |||
| #define SWAP_K SSWAP_K | |||
| #define ROT_K SROT_K | |||
| @@ -962,6 +965,7 @@ | |||
| #define DOTC_K XDOTC_K | |||
| #define NRM2_K XNRM2_K | |||
| #define SCAL_K XSCAL_K | |||
| #define SUM_K XSUM_K | |||
| #define SWAP_K XSWAP_K | |||
| #define ROT_K XROT_K | |||
| @@ -1363,6 +1367,7 @@ | |||
| #define DOTC_K ZDOTC_K | |||
| #define NRM2_K ZNRM2_K | |||
| #define SCAL_K ZSCAL_K | |||
| #define SUM_K ZSUM_K | |||
| #define SWAP_K ZSWAP_K | |||
| #define ROT_K ZROT_K | |||
| @@ -1785,6 +1790,7 @@ | |||
| #define DOTC_K CDOTC_K | |||
| #define NRM2_K CNRM2_K | |||
| #define SCAL_K CSCAL_K | |||
| #define SUM_K CSUM_K | |||
| #define SWAP_K CSWAP_K | |||
| #define ROT_K CROT_K | |||
| @@ -63,6 +63,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); | |||
| float (*snrm2_k) (BLASLONG, float *, BLASLONG); | |||
| float (*sasum_k) (BLASLONG, float *, BLASLONG); | |||
| float (*ssum_k) (BLASLONG, float *, BLASLONG); | |||
| int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| @@ -154,6 +155,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); | |||
| double (*dnrm2_k) (BLASLONG, double *, BLASLONG); | |||
| double (*dasum_k) (BLASLONG, double *, BLASLONG); | |||
| double (*dsum_k) (BLASLONG, double *, BLASLONG); | |||
| int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
| double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
| int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); | |||
| @@ -245,6 +247,7 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); | |||
| xdouble (*qnrm2_k) (BLASLONG, xdouble *, BLASLONG); | |||
| xdouble (*qasum_k) (BLASLONG, xdouble *, BLASLONG); | |||
| xdouble (*qsum_k) (BLASLONG, xdouble *, BLASLONG); | |||
| int (*qcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
| xdouble (*qdot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
| int (*qrot_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble, xdouble); | |||
| @@ -332,6 +335,7 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); | |||
| float (*cnrm2_k) (BLASLONG, float *, BLASLONG); | |||
| float (*casum_k) (BLASLONG, float *, BLASLONG); | |||
| float (*csum_k) (BLASLONG, float *, BLASLONG); | |||
| int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| @@ -495,6 +499,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); | |||
| double (*znrm2_k) (BLASLONG, double *, BLASLONG); | |||
| double (*zasum_k) (BLASLONG, double *, BLASLONG); | |||
| double (*zsum_k) (BLASLONG, double *, BLASLONG); | |||
| int (*zcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
| openblas_complex_double (*zdotu_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
| openblas_complex_double (*zdotc_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); | |||
| @@ -660,6 +665,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); | |||
| xdouble (*xnrm2_k) (BLASLONG, xdouble *, BLASLONG); | |||
| xdouble (*xasum_k) (BLASLONG, xdouble *, BLASLONG); | |||
| xdouble (*xsum_k) (BLASLONG, xdouble *, BLASLONG); | |||
| int (*xcopy_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
| openblas_complex_xdouble (*xdotu_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
| openblas_complex_xdouble (*xdotc_k) (BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
| @@ -39,7 +39,7 @@ | |||
| #ifndef COMMON_POWER | |||
| #define COMMON_POWER | |||
| #if defined(POWER8) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #define MB __asm__ __volatile__ ("eieio":::"memory") | |||
| #define WMB __asm__ __volatile__ ("eieio":::"memory") | |||
| #else | |||
| @@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| #define HAVE_PREFETCH | |||
| #endif | |||
| #if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) | |||
| #if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && defined(OS_DARWIN) ) | |||
| #define DCBT_ARG 0 | |||
| #else | |||
| #define DCBT_ARG 8 | |||
| @@ -263,7 +263,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| #define L1_PREFETCH dcbtst | |||
| #endif | |||
| #if defined(POWER8) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #define L1_DUALFETCH | |||
| #define L1_PREFETCHSIZE (16 + 128 * 100) | |||
| #define L1_PREFETCH dcbtst | |||
| @@ -598,9 +598,14 @@ REALNAME:;\ | |||
| #ifndef __64BIT__ | |||
| #define PROLOGUE \ | |||
| .machine "any";\ | |||
| .toc;\ | |||
| .globl .REALNAME;\ | |||
| .globl REALNAME;\ | |||
| .csect REALNAME[DS],3;\ | |||
| REALNAME:;\ | |||
| .long .REALNAME, TOC[tc0], 0;\ | |||
| .csect .text[PR],5;\ | |||
| .REALNAME:; | |||
| .REALNAME: | |||
| #define EPILOGUE \ | |||
| _section_.text:;\ | |||
| @@ -611,9 +616,14 @@ _section_.text:;\ | |||
| #define PROLOGUE \ | |||
| .machine "any";\ | |||
| .toc;\ | |||
| .globl .REALNAME;\ | |||
| .globl REALNAME;\ | |||
| .csect REALNAME[DS],3;\ | |||
| REALNAME:;\ | |||
| .llong .REALNAME, TOC[tc0], 0;\ | |||
| .csect .text[PR], 5;\ | |||
| .REALNAME:; | |||
| .REALNAME: | |||
| #define EPILOGUE \ | |||
| _section_.text:;\ | |||
| @@ -802,7 +812,7 @@ Lmcount$lazy_ptr: | |||
| #define BUFFER_SIZE ( 2 << 20) | |||
| #elif defined(PPC440FP2) | |||
| #define BUFFER_SIZE ( 16 << 20) | |||
| #elif defined(POWER8) | |||
| #elif defined(POWER8) || defined(POWER9) | |||
| #define BUFFER_SIZE ( 64 << 20) | |||
| #else | |||
| #define BUFFER_SIZE ( 16 << 20) | |||
| @@ -19,6 +19,7 @@ | |||
| #define QDOTC_K qdot_k | |||
| #define QNRM2_K qnrm2_k | |||
| #define QSCAL_K qscal_k | |||
| #define QSUM_K qsum_k | |||
| #define QSWAP_K qswap_k | |||
| #define QROT_K qrot_k | |||
| @@ -161,6 +162,7 @@ | |||
| #define QDOTC_K gotoblas -> qdot_k | |||
| #define QNRM2_K gotoblas -> qnrm2_k | |||
| #define QSCAL_K gotoblas -> qscal_k | |||
| #define QSUM_K gotoblas -> qsum_k | |||
| #define QSWAP_K gotoblas -> qswap_k | |||
| #define QROT_K gotoblas -> qrot_k | |||
| @@ -12,6 +12,7 @@ | |||
| #define ISMAX_K ismax_k | |||
| #define ISMIN_K ismin_k | |||
| #define SASUM_K sasum_k | |||
| #define SSUM_K ssum_k | |||
| #define SAXPYU_K saxpy_k | |||
| #define SAXPYC_K saxpy_k | |||
| #define SCOPY_K scopy_k | |||
| @@ -170,6 +171,7 @@ | |||
| #define ISMAX_K gotoblas -> ismax_k | |||
| #define ISMIN_K gotoblas -> ismin_k | |||
| #define SASUM_K gotoblas -> sasum_k | |||
| #define SSUM_K gotoblas -> ssum_k | |||
| #define SAXPYU_K gotoblas -> saxpy_k | |||
| #define SAXPYC_K gotoblas -> saxpy_k | |||
| #define SCOPY_K gotoblas -> scopy_k | |||
| @@ -19,6 +19,7 @@ | |||
| #define XDOTC_K xdotc_k | |||
| #define XNRM2_K xnrm2_k | |||
| #define XSCAL_K xscal_k | |||
| #define XSUM_K xsum_k | |||
| #define XSWAP_K xswap_k | |||
| #define XROT_K xqrot_k | |||
| @@ -227,6 +228,7 @@ | |||
| #define XDOTC_K gotoblas -> xdotc_k | |||
| #define XNRM2_K gotoblas -> xnrm2_k | |||
| #define XSCAL_K gotoblas -> xscal_k | |||
| #define XSUM_K gotoblas -> xsum_k | |||
| #define XSWAP_K gotoblas -> xswap_k | |||
| #define XROT_K gotoblas -> xqrot_k | |||
| @@ -187,7 +187,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||
| y = blas_quick_divide_table[y]; | |||
| __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); | |||
| __asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x): "0" (y)); | |||
| return result; | |||
| #endif | |||
| @@ -134,7 +134,7 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ | |||
| "=b" (*ebx), | |||
| "=c" (*ecx), | |||
| "=d" (*edx) | |||
| : "0" (op)); | |||
| : "0" (op), "c"(0)); | |||
| #endif | |||
| } | |||
| @@ -210,7 +210,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||
| y = blas_quick_divide_table[y]; | |||
| __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); | |||
| __asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y)); | |||
| return result; | |||
| } | |||
| @@ -19,6 +19,7 @@ | |||
| #define ZDOTC_K zdotc_k | |||
| #define ZNRM2_K znrm2_k | |||
| #define ZSCAL_K zscal_k | |||
| #define ZSUM_K zsum_k | |||
| #define ZSWAP_K zswap_k | |||
| #define ZROT_K zdrot_k | |||
| @@ -249,6 +250,7 @@ | |||
| #define ZDOTC_K gotoblas -> zdotc_k | |||
| #define ZNRM2_K gotoblas -> znrm2_k | |||
| #define ZSCAL_K gotoblas -> zscal_k | |||
| #define ZSUM_K gotoblas -> zsum_k | |||
| #define ZSWAP_K gotoblas -> zswap_k | |||
| #define ZROT_K gotoblas -> zdrot_k | |||
| @@ -53,6 +53,7 @@ | |||
| #define VENDOR_SIS 8 | |||
| #define VENDOR_TRANSMETA 9 | |||
| #define VENDOR_NSC 10 | |||
| #define VENDOR_HYGON 11 | |||
| #define VENDOR_UNKNOWN 99 | |||
| #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) | |||
| @@ -116,6 +117,7 @@ | |||
| #define CORE_EXCAVATOR 26 | |||
| #define CORE_ZEN 27 | |||
| #define CORE_SKYLAKEX 28 | |||
| #define CORE_DHYANA 29 | |||
| #define HAVE_SSE (1 << 0) | |||
| #define HAVE_SSE2 (1 << 1) | |||
| @@ -139,6 +141,7 @@ | |||
| #define HAVE_FMA4 (1 << 19) | |||
| #define HAVE_FMA3 (1 << 20) | |||
| #define HAVE_AVX512VL (1 << 21) | |||
| #define HAVE_AVX2 (1 << 22) | |||
| #define CACHE_INFO_L1_I 1 | |||
| #define CACHE_INFO_L1_D 2 | |||
| @@ -214,5 +217,8 @@ typedef struct { | |||
| #define CPUTYPE_EXCAVATOR 50 | |||
| #define CPUTYPE_ZEN 51 | |||
| #define CPUTYPE_SKYLAKEX 52 | |||
| #define CPUTYPE_DHYANA 53 | |||
| #define CPUTYPE_HYGON_UNKNOWN 54 | |||
| #endif | |||
| @@ -39,6 +39,8 @@ | |||
| // Cavium | |||
| #define CPU_THUNDERX 7 | |||
| #define CPU_THUNDERX2T99 8 | |||
| //Hisilicon | |||
| #define CPU_TSV110 9 | |||
| static char *cpuname[] = { | |||
| "UNKNOWN", | |||
| @@ -49,7 +51,8 @@ static char *cpuname[] = { | |||
| "CORTEXA73", | |||
| "FALKOR", | |||
| "THUNDERX", | |||
| "THUNDERX2T99" | |||
| "THUNDERX2T99", | |||
| "TSV110" | |||
| }; | |||
| static char *cpuname_lower[] = { | |||
| @@ -61,7 +64,8 @@ static char *cpuname_lower[] = { | |||
| "cortexa73", | |||
| "falkor", | |||
| "thunderx", | |||
| "thunderx2t99" | |||
| "thunderx2t99", | |||
| "tsv110" | |||
| }; | |||
| int get_feature(char *search) | |||
| @@ -145,6 +149,9 @@ int detect(void) | |||
| return CPU_THUNDERX; | |||
| else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af")) | |||
| return CPU_THUNDERX2T99; | |||
| // HiSilicon | |||
| else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01")) | |||
| return CPU_TSV110; | |||
| } | |||
| p = (char *) NULL ; | |||
| @@ -286,6 +293,21 @@ void get_cpuconfig(void) | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| case CPU_TSV110: | |||
| // Emit the config defines for the TSV110 core: core macro, L1 code/data and L2 cache geometry, DTB settings. | |||
| printf("#define TSV110 \n"); | |||
| printf("#define L1_CODE_SIZE 65536 \n"); | |||
| printf("#define L1_CODE_LINESIZE 64 \n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 4 \n"); | |||
| printf("#define L1_DATA_SIZE 65536 \n"); | |||
| printf("#define L1_DATA_LINESIZE 64 \n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 4 \n"); | |||
| // 2^19 bytes (512 KiB). NOTE(review): corrected from 524228, which looks like a digit typo — confirm against the TSV110 documentation. | |||
| printf("#define L2_SIZE 524288 \n"); | |||
| printf("#define L2_LINESIZE 64 \n"); | |||
| printf("#define L2_ASSOCIATIVE 8 \n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| } | |||
| } | |||
| @@ -94,7 +94,7 @@ char *corename[] = { | |||
| "CELL", | |||
| "PPCG4", | |||
| "POWER8", | |||
| "POWER8" | |||
| "POWER9" | |||
| }; | |||
| int detect(void){ | |||
| @@ -124,7 +124,7 @@ int detect(void){ | |||
| if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; | |||
| if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; | |||
| if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; | |||
| if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8; | |||
| if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; | |||
| if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; | |||
| if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; | |||
| @@ -156,7 +156,7 @@ int detect(void){ | |||
| if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; | |||
| if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; | |||
| if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; | |||
| if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER8; | |||
| if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; | |||
| if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; | |||
| if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; | |||
| return CPUTYPE_POWER5; | |||
| @@ -180,7 +180,7 @@ int id; | |||
| __asm __volatile("mfpvr %0" : "=r"(id)); | |||
| switch ( id >> 16 ) { | |||
| case 0x4e: // POWER9 | |||
| return CPUTYPE_POWER8; | |||
| return CPUTYPE_POWER9; | |||
| break; | |||
| case 0x4d: | |||
| case 0x4b: // POWER8/8E | |||
| @@ -97,10 +97,10 @@ static C_INLINE void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ | |||
| ("mov %%ebx, %%edi;" | |||
| "cpuid;" | |||
| "xchgl %%ebx, %%edi;" | |||
| : "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); | |||
| : "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op), "c" (0) : "cc"); | |||
| #else | |||
| __asm__ __volatile__ | |||
| ("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) : "cc"); | |||
| ("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "a" (op) , "c" (0) : "cc"); | |||
| #endif | |||
| } | |||
| @@ -211,6 +211,44 @@ int support_avx(){ | |||
| #endif | |||
| } | |||
| /* | |||
|  * Report whether AVX2 can be used. | |||
|  * Returns 0 when built with NO_AVX2 or when support_avx() returns 0; otherwise | |||
|  * returns 1 if CPUID leaf 7 sets EBX bit 5 (the AVX2 feature flag), else 0. | |||
|  */ | |||
| int support_avx2(){ | |||
| #ifndef NO_AVX2 | |||
| int eax, ebx, ecx=0, edx; | |||
| int ret=0; | |||
| if (!support_avx()) | |||
| return 0; | |||
| cpuid(7, &eax, &ebx, &ecx, &edx); | |||
| /* AVX2 is EBX bit 5 (value 32), the same bit support_avx512() checks; bit 7 is SMEP, not AVX2. */ | |||
| if((ebx & (1<<5)) != 0) | |||
| ret=1; //CPU reports AVX2 | |||
| return ret; | |||
| #else | |||
| return 0; | |||
| #endif | |||
| } | |||
| /* | |||
|  * Report whether AVX512 can be used. | |||
|  * Returns 0 when built with NO_AVX or NO_AVX512, when support_avx() returns 0, or when | |||
|  * CPUID leaf 7 does not set EBX bit 5 (AVX2). Otherwise returns 1 only if EBX bit 31 | |||
|  * (AVX512VL) is set and xgetbv(0) reports all of bits 5-7 (mask 0xe0) enabled. | |||
|  */ | |||
| int support_avx512(){ | |||
| #if !defined(NO_AVX) && !defined(NO_AVX512) | |||
| int eax, ebx, ecx, edx; | |||
| int ret=0; | |||
| if (!support_avx()) | |||
| return 0; | |||
| cpuid(7, &eax, &ebx, &ecx, &edx); | |||
| if((ebx & 32) != 32){ | |||
| return 0; //not even AVX2 is reported, so stop here instead of falling through | |||
| } | |||
| /* Unsigned constant: shifting a signed 1 into bit 31 overflows int. */ | |||
| if((ebx & (1u<<31)) != 0){ | |||
| xgetbv(0, &eax, &edx); | |||
| if((eax & 0xe0) == 0xe0) | |||
| ret=1; //OS supports AVX512VL | |||
| } | |||
| return ret; | |||
| #else | |||
| return 0; | |||
| #endif | |||
| } | |||
| int get_vendor(void){ | |||
| int eax, ebx, ecx, edx; | |||
| @@ -233,6 +271,7 @@ int get_vendor(void){ | |||
| if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS; | |||
| if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA; | |||
| if (!strcmp(vendor, "Geode by NSC")) return VENDOR_NSC; | |||
| if (!strcmp(vendor, "HygonGenuine")) return VENDOR_HYGON; | |||
| if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; | |||
| @@ -294,6 +333,8 @@ int get_cputype(int gettype){ | |||
| if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; | |||
| #ifndef NO_AVX | |||
| if (support_avx()) feature |= HAVE_AVX; | |||
| if (support_avx2()) feature |= HAVE_AVX2; | |||
| if (support_avx512()) feature |= HAVE_AVX512VL; | |||
| if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; | |||
| #endif | |||
| @@ -1006,7 +1047,9 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ | |||
| } | |||
| } | |||
| if ((get_vendor() == VENDOR_AMD) || (get_vendor() == VENDOR_CENTAUR)) { | |||
| if ((get_vendor() == VENDOR_AMD) || | |||
| (get_vendor() == VENDOR_HYGON) || | |||
| (get_vendor() == VENDOR_CENTAUR)) { | |||
| cpuid(0x80000005, &eax, &ebx, &ecx, &edx); | |||
| LDTB.size = 4096; | |||
| @@ -1228,22 +1271,18 @@ int get_cpuname(void){ | |||
| return CPUTYPE_NEHALEM; | |||
| case 12: | |||
| case 15: | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| #else | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 13: | |||
| //Broadwell | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| #else | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| @@ -1252,33 +1291,27 @@ int get_cpuname(void){ | |||
| switch (model) { | |||
| case 5: | |||
| case 6: | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| #else | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 7: | |||
| case 15: | |||
| //Broadwell | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| #else | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 14: | |||
| //Skylake | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| #else | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 12: | |||
| @@ -1292,80 +1325,66 @@ int get_cpuname(void){ | |||
| switch (model) { | |||
| case 6: | |||
| //Broadwell | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| #else | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 5: | |||
| // Skylake X | |||
| #ifndef NO_AVX512 | |||
| return CPUTYPE_SKYLAKEX; | |||
| #else | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CPUTYPE_HASWELL; | |||
| #else | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| #endif | |||
| if(support_avx512()) | |||
| return CPUTYPE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| #endif | |||
| case 14: | |||
| // Skylake | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| #else | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 7: | |||
| // Xeon Phi Knights Landing | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| #else | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 12: | |||
| // Apollo Lake | |||
| case 15: | |||
| // Denverton | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| case 6: | |||
| switch (model) { | |||
| case 6: // Cannon Lake | |||
| #ifndef NO_AVX512 | |||
| return CPUTYPE_SKYLAKEX; | |||
| #else | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CPUTYPE_HASWELL; | |||
| #else | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| #endif | |||
| if(support_avx512()) | |||
| return CPUTYPE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| #endif | |||
| } | |||
| break; | |||
| case 9: | |||
| case 8: | |||
| case 8: | |||
| switch (model) { | |||
| case 14: // Kaby Lake | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| case 14: // Kaby Lake and refreshes | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| #else | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| @@ -1469,6 +1488,26 @@ int get_cpuname(void){ | |||
| return CPUTYPE_AMD_UNKNOWN; | |||
| } | |||
| if (vendor == VENDOR_HYGON){ | |||
| switch (family) { | |||
| case 0xf: | |||
| switch (exfamily) { | |||
| case 9: | |||
| //Hygon Dhyana | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CPUTYPE_ZEN; | |||
| #else | |||
| return CPUTYPE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator | |||
| #endif | |||
| else | |||
| return CPUTYPE_BARCELONA; | |||
| } | |||
| break; | |||
| } | |||
| return CPUTYPE_HYGON_UNKNOWN; | |||
| } | |||
| if (vendor == VENDOR_CYRIX){ | |||
| switch (family) { | |||
| case 0x4: | |||
| @@ -1590,7 +1629,8 @@ static char *cpuname[] = { | |||
| "STEAMROLLER", | |||
| "EXCAVATOR", | |||
| "ZEN", | |||
| "SKYLAKEX" | |||
| "SKYLAKEX", | |||
| "DHYANA" | |||
| }; | |||
| static char *lowercpuname[] = { | |||
| @@ -1645,7 +1685,8 @@ static char *lowercpuname[] = { | |||
| "steamroller", | |||
| "excavator", | |||
| "zen", | |||
| "skylakex" | |||
| "skylakex", | |||
| "dhyana" | |||
| }; | |||
| static char *corename[] = { | |||
| @@ -1677,7 +1718,8 @@ static char *corename[] = { | |||
| "STEAMROLLER", | |||
| "EXCAVATOR", | |||
| "ZEN", | |||
| "SKYLAKEX" | |||
| "SKYLAKEX", | |||
| "DHYANA" | |||
| }; | |||
| static char *corename_lower[] = { | |||
| @@ -1709,7 +1751,8 @@ static char *corename_lower[] = { | |||
| "steamroller", | |||
| "excavator", | |||
| "zen", | |||
| "skylakex" | |||
| "skylakex", | |||
| "dhyana" | |||
| }; | |||
| @@ -2026,6 +2069,23 @@ int get_coretype(void){ | |||
| } | |||
| } | |||
| if (vendor == VENDOR_HYGON){ | |||
| if (family == 0xf){ | |||
| if (exfamily == 9) { | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CORE_ZEN; | |||
| #else | |||
| return CORE_SANDYBRIDGE; // closer in architecture to Sandy Bridge than to Excavator | |||
| #endif | |||
| else | |||
| return CORE_BARCELONA; | |||
| } else { | |||
| return CORE_BARCELONA; | |||
| } | |||
| } | |||
| } | |||
| if (vendor == VENDOR_CENTAUR) { | |||
| switch (family) { | |||
| case 0x6: | |||
| @@ -2112,6 +2172,8 @@ void get_cpuconfig(void){ | |||
| if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); | |||
| if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); | |||
| if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); | |||
| if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n"); | |||
| if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n"); | |||
| if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); | |||
| if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); | |||
| if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); | |||
| @@ -2180,6 +2242,8 @@ void get_sse(void){ | |||
| if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); | |||
| if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); | |||
| if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); | |||
| if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n"); | |||
| if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n"); | |||
| if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); | |||
| if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); | |||
| if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); | |||
| @@ -27,9 +27,9 @@ | |||
| #include <string.h> | |||
| #define CPU_GENERIC 0 | |||
| #define CPU_Z13 1 | |||
| #define CPU_Z14 2 | |||
| #define CPU_GENERIC 0 | |||
| #define CPU_Z13 1 | |||
| #define CPU_Z14 2 | |||
| static char *cpuname[] = { | |||
| "ZARCH_GENERIC", | |||
| @@ -64,10 +64,8 @@ int detect(void) | |||
| if (strstr(p, "2964")) return CPU_Z13; | |||
| if (strstr(p, "2965")) return CPU_Z13; | |||
| /* detect z14, but fall back to z13 */ | |||
| if (strstr(p, "3906")) return CPU_Z13; | |||
| if (strstr(p, "3907")) return CPU_Z13; | |||
| if (strstr(p, "3906")) return CPU_Z14; | |||
| if (strstr(p, "3907")) return CPU_Z14; | |||
| return CPU_GENERIC; | |||
| } | |||
| @@ -116,7 +114,14 @@ void get_cpuconfig(void) | |||
| break; | |||
| case CPU_Z14: | |||
| printf("#define Z14\n"); | |||
| printf("#define L1_DATA_SIZE 131072\n"); | |||
| printf("#define L1_DATA_LINESIZE 256\n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 8\n"); | |||
| printf("#define L2_SIZE 4194304\n"); | |||
| printf("#define L2_LINESIZE 256\n"); | |||
| printf("#define L2_ASSOCIATIVE 8\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| } | |||
| } | |||
| @@ -113,7 +113,7 @@ ARCH_X86 | |||
| ARCH_X86_64 | |||
| #endif | |||
| #if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) | |||
| #if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) || defined(__POWERPC__) | |||
| ARCH_POWER | |||
| #endif | |||
| @@ -346,7 +346,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu | |||
| range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; | |||
| range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | |||
| if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||
| if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu; | |||
| queue[num_cpu].mode = mode; | |||
| queue[num_cpu].routine = trmv_kernel; | |||
| @@ -386,7 +386,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu | |||
| range_m[num_cpu + 1] = range_m[num_cpu] + width; | |||
| range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | |||
| if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||
| if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu; | |||
| queue[num_cpu].mode = mode; | |||
| queue[num_cpu].routine = trmv_kernel; | |||
| @@ -18,8 +18,12 @@ ifeq ($(DYNAMIC_ARCH), 1) | |||
| ifeq ($(ARCH),arm64) | |||
| COMMONOBJS += dynamic_arm64.$(SUFFIX) | |||
| else | |||
| ifeq ($(ARCH),power) | |||
| COMMONOBJS += dynamic_power.$(SUFFIX) | |||
| else | |||
| COMMONOBJS += dynamic.$(SUFFIX) | |||
| endif | |||
| endif | |||
| else | |||
| COMMONOBJS += parameter.$(SUFFIX) | |||
| endif | |||
| @@ -78,8 +82,12 @@ ifeq ($(DYNAMIC_ARCH), 1) | |||
| ifeq ($(ARCH),arm64) | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_arm64.$(SUFFIX) | |||
| else | |||
| ifeq ($(ARCH),power) | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_power.$(SUFFIX) | |||
| else | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) | |||
| endif | |||
| endif | |||
| else | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) | |||
| endif | |||
| @@ -461,13 +461,18 @@ int BLASFUNC(blas_thread_shutdown)(void){ | |||
| SetEvent(pool.killed); | |||
| for(i = 0; i < blas_num_threads - 1; i++){ | |||
| // Could also just use WaitForMultipleObjects | |||
| WaitForSingleObject(blas_threads[i], 5); //INFINITE); | |||
| #ifndef OS_WINDOWSSTORE | |||
| // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP | |||
| TerminateThread(blas_threads[i],0); | |||
| #endif | |||
| CloseHandle(blas_threads[i]); | |||
| } | |||
| CloseHandle(pool.filled); | |||
| CloseHandle(pool.killed); | |||
| blas_server_avail = 0; | |||
| } | |||
| @@ -274,6 +274,7 @@ extern gotoblas_t gotoblas_SKYLAKEX; | |||
| #define VENDOR_INTEL 1 | |||
| #define VENDOR_AMD 2 | |||
| #define VENDOR_CENTAUR 3 | |||
| #define VENDOR_HYGON 4 | |||
| #define VENDOR_UNKNOWN 99 | |||
| #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) | |||
| @@ -304,9 +305,49 @@ int support_avx(){ | |||
| #endif | |||
| } | |||
/*
 * Report whether AVX2 kernels may be selected at run time.
 *
 * Returns 1 when support_avx() succeeds and CPUID leaf 7 advertises the
 * AVX2 feature flag, 0 otherwise. Always returns 0 when the library was
 * built with NO_AVX2.
 */
int support_avx2(){
#ifndef NO_AVX2
  int eax, ebx, ecx=0, edx;

  /* AVX2 is never usable without the baseline AVX check passing first. */
  if (!support_avx())
    return 0;

  cpuid(7, &eax, &ebx, &ecx, &edx);

  /* CPUID.(EAX=7,ECX=0):EBX bit 5 is AVX2; bit 7 is SMEP and says nothing
     about vector instruction support. */
  return (ebx & (1<<5)) != 0;
#else
  return 0;
#endif
}
/*
 * Report whether AVX512 (SkylakeX) kernels may be selected at run time.
 *
 * Returns 1 only when all of the following hold:
 *   - support_avx() succeeds,
 *   - CPUID leaf 7 advertises AVX2 (EBX bit 5) and AVX512VL (EBX bit 31),
 *   - XCR0 has bits 5..7 set (mask 0xe0), i.e. the opmask and ZMM register
 *     state components are enabled.
 * Returns 0 otherwise, and always 0 when built with NO_AVX or NO_AVX512.
 */
int support_avx512(){
#if !defined(NO_AVX) && !defined(NO_AVX512)
  int eax, ebx, ecx=0, edx;
  unsigned int leaf7_ebx;

  if (!support_avx())
    return 0;

  cpuid(7, &eax, &ebx, &ecx, &edx);
  /* Work on an unsigned copy so that testing bit 31 does not rely on
     shifting into the sign bit of an int. */
  leaf7_ebx = (unsigned int)ebx;

  /* A CPU without AVX2 cannot run the AVX512 kernels either. */
  if ((leaf7_ebx & (1u<<5)) == 0)
    return 0;

  /* AVX512VL feature flag. */
  if ((leaf7_ebx & (1u<<31)) == 0)
    return 0;

  /* The CPU has the instructions; make sure the extended register state
     is enabled in XCR0 as well. */
  xgetbv(0, &eax, &edx);
  return (eax & 0xe0) == 0xe0;
#else
  return 0;
#endif
}
| extern void openblas_warning(int verbose, const char * msg); | |||
| #define FALLBACK_VERBOSE 1 | |||
| #define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" | |||
| #define SANDYBRIDGE_FALLBACK "OpenBLAS : Your OS does not support AVX2 instructions. OpenBLAS is using Sandybridge kernels as a fallback, which may give poorer performance.\n" | |||
| #define HASWELL_FALLBACK "OpenBLAS : Your OS does not support AVX512VL instructions. OpenBLAS is using Haswell kernels as a fallback, which may give poorer performance.\n" | |||
| #define BARCELONA_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n" | |||
| static int get_vendor(void){ | |||
| @@ -329,6 +370,7 @@ static int get_vendor(void){ | |||
| if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL; | |||
| if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD; | |||
| if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR; | |||
| if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON; | |||
| if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; | |||
| @@ -403,18 +445,24 @@ static gotoblas_t *get_coretype(void){ | |||
| } | |||
| //Intel Haswell | |||
| if (model == 12 || model == 15) { | |||
| if(support_avx()) | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| else{ | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| //Intel Broadwell | |||
| if (model == 13) { | |||
| if(support_avx()) | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| else{ | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| @@ -424,27 +472,36 @@ static gotoblas_t *get_coretype(void){ | |||
| case 4: | |||
| //Intel Haswell | |||
| if (model == 5 || model == 6) { | |||
| if(support_avx()) | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| else{ | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| //Intel Broadwell | |||
| if (model == 7 || model == 15) { | |||
| if(support_avx()) | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| else{ | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| //Intel Skylake | |||
| if (model == 14) { | |||
| if(support_avx()) | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| else{ | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| @@ -457,72 +514,86 @@ static gotoblas_t *get_coretype(void){ | |||
| case 5: | |||
| //Intel Broadwell | |||
| if (model == 6) { | |||
| if(support_avx()) | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| else{ | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| if (model == 5) { | |||
| // Intel Skylake X | |||
| #ifndef NO_AVX512 | |||
| return &gotoblas_SKYLAKEX; | |||
| #else | |||
| if(support_avx()) | |||
| if (support_avx512()) | |||
| return &gotoblas_SKYLAKEX; | |||
| if(support_avx2()){ | |||
| openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); | |||
| return &gotoblas_HASWELL; | |||
| else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| #endif | |||
| } | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| } | |||
| //Intel Skylake | |||
| if (model == 14) { | |||
| if(support_avx()) | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| else{ | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| //Intel Phi Knights Landing | |||
| if (model == 7) { | |||
| if(support_avx()) | |||
| if(support_avx2()){ | |||
| openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); | |||
| return &gotoblas_HASWELL; | |||
| else{ | |||
| } | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| //Apollo Lake | |||
| if (model == 12) { | |||
| //Apollo Lake or Denverton | |||
| if (model == 12 || model == 15) { | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| return NULL; | |||
| case 6: | |||
| if (model == 6) { | |||
| // Cannon Lake | |||
| #ifndef NO_AVX512 | |||
| return &gotoblas_SKYLAKEX; | |||
| #else | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return &gotoblas_HASWELL; | |||
| #else | |||
| return &gotoblas_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return &gotoblas_NEHALEM; | |||
| #endif | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| } | |||
| return NULL; | |||
| case 9: | |||
| case 8: | |||
| if (model == 14 ) { // Kaby Lake | |||
| if(support_avx()) | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| else{ | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| @@ -535,7 +606,7 @@ static gotoblas_t *get_coretype(void){ | |||
| } | |||
| } | |||
| if (vendor == VENDOR_AMD){ | |||
| if (vendor == VENDOR_AMD || vendor == VENDOR_HYGON){ | |||
| if (family <= 0xe) { | |||
| // Verify that CPU has 3dnow and 3dnowext before claiming it is Athlon | |||
| cpuid(0x80000000, &eax, &ebx, &ecx, &edx); | |||
| @@ -615,6 +686,13 @@ static gotoblas_t *get_coretype(void){ | |||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| } else if (exfamily == 9) { | |||
| if(support_avx()) | |||
| return &gotoblas_ZEN; | |||
| else{ | |||
| openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); | |||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| }else { | |||
| return &gotoblas_BARCELONA; | |||
| } | |||
| @@ -0,0 +1,102 @@ | |||
| #include "common.h" | |||
| extern gotoblas_t gotoblas_POWER6; | |||
| extern gotoblas_t gotoblas_POWER8; | |||
| extern gotoblas_t gotoblas_POWER9; | |||
| extern void openblas_warning(int verbose, const char *msg); | |||
/* Printable core names; index 0 is the placeholder used when no known
   POWER core is selected. The order must match the indices used by
   gotoblas_corename() and force_coretype(). */
static char *corename[] = { "unknown", "POWER6", "POWER8", "POWER9" };

/* Number of entries in corename[]. */
#define NUM_CORETYPES 4
| char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_POWER6) return corename[1]; | |||
| if (gotoblas == &gotoblas_POWER8) return corename[2]; | |||
| if (gotoblas == &gotoblas_POWER9) return corename[3]; | |||
| return corename[0]; | |||
| } | |||
| static gotoblas_t *get_coretype(void) { | |||
| if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x")) | |||
| return &gotoblas_POWER6; | |||
| if (__builtin_cpu_is("power8")) | |||
| return &gotoblas_POWER8; | |||
| if (__builtin_cpu_is("power9")) | |||
| return &gotoblas_POWER9; | |||
| return NULL; | |||
| } | |||
| static gotoblas_t *force_coretype(char * coretype) { | |||
| int i ; | |||
| int found = -1; | |||
| char message[128]; | |||
| for ( i = 0 ; i < NUM_CORETYPES; i++) | |||
| { | |||
| if (!strncasecmp(coretype, corename[i], 20)) | |||
| { | |||
| found = i; | |||
| break; | |||
| } | |||
| } | |||
| switch (found) | |||
| { | |||
| case 1: return (&gotoblas_POWER6); | |||
| case 2: return (&gotoblas_POWER8); | |||
| case 3: return (&gotoblas_POWER9); | |||
| default: return NULL; | |||
| } | |||
| snprintf(message, 128, "Core not found: %s\n", coretype); | |||
| openblas_warning(1, message); | |||
| } | |||
| void gotoblas_dynamic_init(void) { | |||
| char coremsg[128]; | |||
| char coren[22]; | |||
| char *p; | |||
| if (gotoblas) return; | |||
| p = getenv("OPENBLAS_CORETYPE"); | |||
| if ( p ) | |||
| { | |||
| gotoblas = force_coretype(p); | |||
| } | |||
| else | |||
| { | |||
| gotoblas = get_coretype(); | |||
| } | |||
| if (gotoblas == NULL) | |||
| { | |||
| snprintf(coremsg, 128, "Falling back to POWER8 core\n"); | |||
| openblas_warning(1, coremsg); | |||
| gotoblas = &gotoblas_POWER8; | |||
| } | |||
| if (gotoblas && gotoblas -> init) { | |||
| strncpy(coren,gotoblas_corename(),20); | |||
| sprintf(coremsg, "Core: %s\n",coren); | |||
| openblas_warning(2, coremsg); | |||
| gotoblas -> init(); | |||
| } else { | |||
| openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); | |||
| exit(1); | |||
| } | |||
| } | |||
| void gotoblas_dynamic_quit(void) { | |||
| gotoblas = NULL; | |||
| } | |||
| @@ -198,45 +198,68 @@ int get_num_procs(void); | |||
| #else | |||
| int get_num_procs(void) { | |||
| static int nums = 0; | |||
| cpu_set_t *cpusetp; | |||
| size_t size; | |||
| int ret; | |||
| int i,n; | |||
| cpu_set_t cpuset,*cpusetp; | |||
| size_t size; | |||
| int ret; | |||
| #if defined(__GLIBC_PREREQ) | |||
| #if !__GLIBC_PREREQ(2, 7) | |||
| int i; | |||
| #if !__GLIBC_PREREQ(2, 6) | |||
| int n; | |||
| #endif | |||
| #endif | |||
| #endif | |||
| if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); | |||
| #if !defined(OS_LINUX) | |||
| return nums; | |||
| return nums; | |||
| #endif | |||
| #if !defined(__GLIBC_PREREQ) | |||
| return nums; | |||
| return nums; | |||
| #else | |||
| #if !__GLIBC_PREREQ(2, 3) | |||
| return nums; | |||
| return nums; | |||
| #endif | |||
| #if !__GLIBC_PREREQ(2, 7) | |||
| ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); | |||
| ret = sched_getaffinity(0,sizeof(cpuset), &cpuset); | |||
| if (ret!=0) return nums; | |||
| n=0; | |||
| #if !__GLIBC_PREREQ(2, 6) | |||
| for (i=0;i<nums;i++) | |||
| if (CPU_ISSET(i,cpusetp)) n++; | |||
| if (CPU_ISSET(i,cpuset)) n++; | |||
| nums=n; | |||
| #else | |||
| nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp); | |||
| nums = CPU_COUNT(sizeof(cpuset),&cpuset); | |||
| #endif | |||
| return nums; | |||
| #else | |||
| cpusetp = CPU_ALLOC(nums); | |||
| if (cpusetp == NULL) return nums; | |||
| size = CPU_ALLOC_SIZE(nums); | |||
| ret = sched_getaffinity(0,size,cpusetp); | |||
| if (ret!=0) return nums; | |||
| ret = CPU_COUNT_S(size,cpusetp); | |||
| if (ret > 0 && ret < nums) nums = ret; | |||
| CPU_FREE(cpusetp); | |||
| return nums; | |||
| if (nums >= CPU_SETSIZE) { | |||
| cpusetp = CPU_ALLOC(nums); | |||
| if (cpusetp == NULL) { | |||
| return nums; | |||
| } | |||
| size = CPU_ALLOC_SIZE(nums); | |||
| ret = sched_getaffinity(0,size,cpusetp); | |||
| if (ret!=0) { | |||
| CPU_FREE(cpusetp); | |||
| return nums; | |||
| } | |||
| ret = CPU_COUNT_S(size,cpusetp); | |||
| if (ret > 0 && ret < nums) nums = ret; | |||
| CPU_FREE(cpusetp); | |||
| return nums; | |||
| } else { | |||
| ret = sched_getaffinity(0,sizeof(cpuset),&cpuset); | |||
| if (ret!=0) { | |||
| return nums; | |||
| } | |||
| ret = CPU_COUNT(&cpuset); | |||
| if (ret > 0 && ret < nums) nums = ret; | |||
| return nums; | |||
| } | |||
| #endif | |||
| #endif | |||
| } | |||
| @@ -1073,11 +1096,6 @@ static volatile int memory_initialized = 0; | |||
| } | |||
| free(table); | |||
| } | |||
| #if defined(OS_WINDOWS) | |||
| TlsFree(local_storage_key); | |||
| #else | |||
| pthread_key_delete(local_storage_key); | |||
| #endif | |||
| } | |||
| static void blas_memory_init(){ | |||
| @@ -1295,6 +1313,13 @@ void blas_memory_free_nolock(void * map_address) { | |||
| free(map_address); | |||
| } | |||
#ifdef SMP
/*
 * Run blas_memory_cleanup() on the table returned by get_memory_table().
 * Only compiled for SMP builds.
 */
void blas_thread_memory_cleanup(void) {
  void *table = (void*)get_memory_table();

  blas_memory_cleanup(table);
}
#endif
| void blas_shutdown(void){ | |||
| #ifdef SMP | |||
| BLASFUNC(blas_thread_shutdown)(); | |||
| @@ -1304,7 +1329,7 @@ void blas_shutdown(void){ | |||
| /* Only clean up if we were built for threading and TLS was initialized */ | |||
| if (local_storage_key) | |||
| #endif | |||
| blas_memory_cleanup((void*)get_memory_table()); | |||
| blas_thread_memory_cleanup(); | |||
| #ifdef SEEK_ADDRESS | |||
| base_address = 0UL; | |||
| @@ -1491,6 +1516,14 @@ void DESTRUCTOR gotoblas_quit(void) { | |||
| blas_shutdown(); | |||
| #if defined(SMP) | |||
| #if defined(OS_WINDOWS) | |||
| TlsFree(local_storage_key); | |||
| #else | |||
| pthread_key_delete(local_storage_key); | |||
| #endif | |||
| #endif | |||
| #ifdef PROFILE | |||
| moncontrol (0); | |||
| #endif | |||
| @@ -1526,7 +1559,7 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser | |||
| break; | |||
| case DLL_THREAD_DETACH: | |||
| #if defined(SMP) | |||
| blas_memory_cleanup((void*)get_memory_table()); | |||
| blas_thread_memory_cleanup(); | |||
| #endif | |||
| break; | |||
| case DLL_PROCESS_DETACH: | |||
| @@ -1600,9 +1633,11 @@ void gotoblas_dummy_for_PGI(void) { | |||
| #endif | |||
| #else | |||
| /* USE_TLS / COMPILE_TLS not set */ | |||
| #include <errno.h> | |||
| #ifdef OS_WINDOWS | |||
| #if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) | |||
| #define ALLOC_WINDOWS | |||
| #ifndef MEM_LARGE_PAGES | |||
| #define MEM_LARGE_PAGES 0x20000000 | |||
| @@ -1616,7 +1651,7 @@ void gotoblas_dummy_for_PGI(void) { | |||
| #include <stdio.h> | |||
| #include <fcntl.h> | |||
| #ifndef OS_WINDOWS | |||
| #if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT) | |||
| #include <sys/mman.h> | |||
| #ifndef NO_SYSV_IPC | |||
| #include <sys/shm.h> | |||
| @@ -1636,7 +1671,7 @@ void gotoblas_dummy_for_PGI(void) { | |||
| #include <sys/resource.h> | |||
| #endif | |||
| #if defined(OS_FREEBSD) || defined(OS_DARWIN) | |||
| #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) | |||
| #include <sys/sysctl.h> | |||
| #include <sys/resource.h> | |||
| #endif | |||
| @@ -1675,9 +1710,12 @@ void gotoblas_dummy_for_PGI(void) { | |||
| #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) | |||
| #define CONSTRUCTOR __attribute__ ((constructor)) | |||
| #define DESTRUCTOR __attribute__ ((destructor)) | |||
| #else | |||
| #elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900)) | |||
| #define CONSTRUCTOR __attribute__ ((constructor(101))) | |||
| #define DESTRUCTOR __attribute__ ((destructor(101))) | |||
| #else | |||
| #define CONSTRUCTOR __attribute__ ((constructor)) | |||
| #define DESTRUCTOR __attribute__ ((destructor)) | |||
| #endif | |||
| #ifdef DYNAMIC_ARCH | |||
| @@ -1701,45 +1739,70 @@ void goto_set_num_threads(int num_threads) {}; | |||
| int get_num_procs(void); | |||
| #else | |||
| int get_num_procs(void) { | |||
| static int nums = 0; | |||
| cpu_set_t *cpusetp; | |||
| size_t size; | |||
| int ret; | |||
| int i,n; | |||
| cpu_set_t cpuset,*cpusetp; | |||
| size_t size; | |||
| int ret; | |||
| #if defined(__GLIBC_PREREQ) | |||
| #if !__GLIBC_PREREQ(2, 7) | |||
| int i; | |||
| #if !__GLIBC_PREREQ(2, 6) | |||
| int n; | |||
| #endif | |||
| #endif | |||
| #endif | |||
| if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); | |||
| #if !defined(OS_LINUX) | |||
| return nums; | |||
| return nums; | |||
| #endif | |||
| #if !defined(__GLIBC_PREREQ) | |||
| return nums; | |||
| return nums; | |||
| #else | |||
| #if !__GLIBC_PREREQ(2, 3) | |||
| return nums; | |||
| return nums; | |||
| #endif | |||
| #if !__GLIBC_PREREQ(2, 7) | |||
| ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); | |||
| ret = sched_getaffinity(0,sizeof(cpuset), &cpuset); | |||
| if (ret!=0) return nums; | |||
| n=0; | |||
| #if !__GLIBC_PREREQ(2, 6) | |||
| for (i=0;i<nums;i++) | |||
| if (CPU_ISSET(i,cpusetp)) n++; | |||
| if (CPU_ISSET(i,cpuset)) n++; | |||
| nums=n; | |||
| #else | |||
| nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp); | |||
| nums = CPU_COUNT(sizeof(cpuset),&cpuset); | |||
| #endif | |||
| return nums; | |||
| #else | |||
| cpusetp = CPU_ALLOC(nums); | |||
| if (cpusetp == NULL) return nums; | |||
| size = CPU_ALLOC_SIZE(nums); | |||
| ret = sched_getaffinity(0,size,cpusetp); | |||
| if (ret!=0) return nums; | |||
| nums = CPU_COUNT_S(size,cpusetp); | |||
| CPU_FREE(cpusetp); | |||
| return nums; | |||
| if (nums >= CPU_SETSIZE) { | |||
| cpusetp = CPU_ALLOC(nums); | |||
| if (cpusetp == NULL) { | |||
| return nums; | |||
| } | |||
| size = CPU_ALLOC_SIZE(nums); | |||
| ret = sched_getaffinity(0,size,cpusetp); | |||
| if (ret!=0) { | |||
| CPU_FREE(cpusetp); | |||
| return nums; | |||
| } | |||
| ret = CPU_COUNT_S(size,cpusetp); | |||
| if (ret > 0 && ret < nums) nums = ret; | |||
| CPU_FREE(cpusetp); | |||
| return nums; | |||
| } else { | |||
| ret = sched_getaffinity(0,sizeof(cpuset),&cpuset); | |||
| if (ret!=0) { | |||
| return nums; | |||
| } | |||
| ret = CPU_COUNT(&cpuset); | |||
| if (ret > 0 && ret < nums) nums = ret; | |||
| return nums; | |||
| } | |||
| #endif | |||
| #endif | |||
| } | |||
| @@ -1753,7 +1816,7 @@ int get_num_procs(void) { | |||
| return nums; | |||
| } | |||
| #endif | |||
| #ifdef OS_HAIKU | |||
| int get_num_procs(void) { | |||
| static int nums = 0; | |||
| @@ -1790,7 +1853,7 @@ int get_num_procs(void) { | |||
| #endif | |||
| #if defined(OS_FREEBSD) | |||
| #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) | |||
| int get_num_procs(void) { | |||
| @@ -1867,7 +1930,7 @@ void openblas_fork_handler() | |||
| // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035 | |||
| // In the mean time build with USE_OPENMP=0 or link against another | |||
| // implementation of OpenMP. | |||
| #if !(defined(OS_WINDOWS) || defined(OS_ANDROID)) && defined(SMP_SERVER) | |||
| #if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER) | |||
| int err; | |||
| err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL); | |||
| if(err != 0) | |||
| @@ -1880,7 +1943,7 @@ extern int openblas_goto_num_threads_env(); | |||
| extern int openblas_omp_num_threads_env(); | |||
| int blas_get_cpu_number(void){ | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| int max_num; | |||
| #endif | |||
| int blas_goto_num = 0; | |||
| @@ -1888,11 +1951,11 @@ int blas_get_cpu_number(void){ | |||
| if (blas_num_threads) return blas_num_threads; | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| max_num = get_num_procs(); | |||
| #endif | |||
| blas_goto_num = 0; | |||
| // blas_goto_num = 0; | |||
| #ifndef USE_OPENMP | |||
| blas_goto_num=openblas_num_threads_env(); | |||
| if (blas_goto_num < 0) blas_goto_num = 0; | |||
| @@ -1904,7 +1967,7 @@ int blas_get_cpu_number(void){ | |||
| #endif | |||
| blas_omp_num = 0; | |||
| // blas_omp_num = 0; | |||
| blas_omp_num=openblas_omp_num_threads_env(); | |||
| if (blas_omp_num < 0) blas_omp_num = 0; | |||
| @@ -1912,7 +1975,7 @@ int blas_get_cpu_number(void){ | |||
| else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; | |||
| else blas_num_threads = MAX_CPU_NUMBER; | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| if (blas_num_threads > max_num) blas_num_threads = max_num; | |||
| #endif | |||
| @@ -1999,11 +2062,15 @@ static void *alloc_mmap(void *address){ | |||
| } | |||
| if (map_address != (void *)-1) { | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| LOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| release_info[release_pos].address = map_address; | |||
| release_info[release_pos].func = alloc_mmap_free; | |||
| release_pos ++; | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| } | |||
| #ifdef OS_LINUX | |||
| @@ -2145,14 +2212,18 @@ static void *alloc_mmap(void *address){ | |||
| #if defined(OS_LINUX) && !defined(NO_WARMUP) | |||
| } | |||
| #endif | |||
| LOCK_COMMAND(&alloc_lock); | |||
| if (map_address != (void *)-1) { | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| LOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| release_info[release_pos].address = map_address; | |||
| release_info[release_pos].func = alloc_mmap_free; | |||
| release_pos ++; | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| } | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| return map_address; | |||
| } | |||
| @@ -2520,7 +2591,7 @@ void *blas_memory_alloc(int procpos){ | |||
| int position; | |||
| #if defined(WHEREAMI) && !defined(USE_OPENMP) | |||
| int mypos; | |||
| int mypos = 0; | |||
| #endif | |||
| void *map_address; | |||
| @@ -2551,6 +2622,11 @@ void *blas_memory_alloc(int procpos){ | |||
| NULL, | |||
| }; | |||
| void *(**func)(void *address); | |||
| #if defined(USE_OPENMP) | |||
| if (!memory_initialized) { | |||
| #endif | |||
| LOCK_COMMAND(&alloc_lock); | |||
| if (!memory_initialized) { | |||
| @@ -2586,6 +2662,9 @@ void *blas_memory_alloc(int procpos){ | |||
| } | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| #if defined(USE_OPENMP) | |||
| } | |||
| #endif | |||
| #ifdef DEBUG | |||
| printf("Alloc Start ...\n"); | |||
| @@ -2600,13 +2679,17 @@ void *blas_memory_alloc(int procpos){ | |||
| do { | |||
| if (!memory[position].used && (memory[position].pos == mypos)) { | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| LOCK_COMMAND(&alloc_lock); | |||
| // blas_lock(&memory[position].lock); | |||
| #else | |||
| blas_lock(&memory[position].lock); | |||
| #endif | |||
| if (!memory[position].used) goto allocation; | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| // blas_unlock(&memory[position].lock); | |||
| #else | |||
| blas_unlock(&memory[position].lock); | |||
| #endif | |||
| } | |||
| position ++; | |||
| @@ -2618,21 +2701,26 @@ void *blas_memory_alloc(int procpos){ | |||
| position = 0; | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| LOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| do { | |||
| /* if (!memory[position].used) { */ | |||
| /* blas_lock(&memory[position].lock);*/ | |||
| #if defined(USE_OPENMP) | |||
| if (!memory[position].used) { | |||
| blas_lock(&memory[position].lock); | |||
| #endif | |||
| if (!memory[position].used) goto allocation; | |||
| /* blas_unlock(&memory[position].lock);*/ | |||
| /* } */ | |||
| #if defined(USE_OPENMP) | |||
| blas_unlock(&memory[position].lock); | |||
| } | |||
| #endif | |||
| position ++; | |||
| } while (position < NUM_BUFFERS); | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| goto error; | |||
| allocation : | |||
| @@ -2642,10 +2730,11 @@ void *blas_memory_alloc(int procpos){ | |||
| #endif | |||
| memory[position].used = 1; | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| /* blas_unlock(&memory[position].lock);*/ | |||
| #else | |||
| blas_unlock(&memory[position].lock); | |||
| #endif | |||
| if (!memory[position].addr) { | |||
| do { | |||
| #ifdef DEBUG | |||
| @@ -2690,9 +2779,13 @@ void *blas_memory_alloc(int procpos){ | |||
| } while ((BLASLONG)map_address == -1); | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| LOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| memory[position].addr = map_address; | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| #ifdef DEBUG | |||
| printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); | |||
| @@ -2746,8 +2839,9 @@ void blas_memory_free(void *free_area){ | |||
| #endif | |||
| position = 0; | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| LOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) | |||
| position++; | |||
| @@ -2761,7 +2855,9 @@ void blas_memory_free(void *free_area){ | |||
| WMB; | |||
| memory[position].used = 0; | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| #ifdef DEBUG | |||
| printf("Unmap Succeeded.\n\n"); | |||
| @@ -2776,8 +2872,9 @@ void blas_memory_free(void *free_area){ | |||
| for (position = 0; position < NUM_BUFFERS; position++) | |||
| printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); | |||
| #endif | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| return; | |||
| } | |||
| @@ -35,12 +35,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include <string.h> | |||
| #if defined(_WIN32) && defined(_MSC_VER) | |||
| #if _MSC_VER < 1900 | |||
| #define snprintf _snprintf | |||
| #endif | |||
| #endif | |||
| static char* openblas_config_str="" | |||
| "OpenBLAS " | |||
| VERSION | |||
| @@ -141,6 +141,14 @@ else | |||
| $(OBJCOPY) --redefine-syms objcopy.def ../$(LIBNAME) ../$(LIBNAME).renamed | |||
| ../$(LIBSONAME) : ../$(LIBNAME).renamed linktest.c | |||
| endif | |||
| ifeq ($(F_COMPILER), INTEL) | |||
| $(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | |||
| -Wl,--whole-archive $< -Wl,--no-whole-archive \ | |||
| -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) | |||
| $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | |||
| else | |||
| ifneq ($(C_COMPILER), LSB) | |||
| $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | |||
| -Wl,--whole-archive $< -Wl,--no-whole-archive \ | |||
| @@ -152,6 +160,7 @@ else | |||
| -Wl,--whole-archive $< -Wl,--no-whole-archive \ | |||
| -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) | |||
| $(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | |||
| endif | |||
| endif | |||
| rm -f linktest | |||
| @@ -40,15 +40,25 @@ | |||
| void gotoblas_init(void); | |||
| void gotoblas_quit(void); | |||
| #if defined(SMP) && defined(USE_TLS) | |||
| void blas_thread_memory_cleanup(void); | |||
| #endif | |||
| BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) { | |||
| if (reason == DLL_PROCESS_ATTACH) { | |||
| gotoblas_init(); | |||
| } | |||
| if (reason == DLL_PROCESS_DETACH) { | |||
| gotoblas_quit(); | |||
| switch(reason) { | |||
| case DLL_PROCESS_ATTACH: | |||
| gotoblas_init(); | |||
| break; | |||
| case DLL_PROCESS_DETACH: | |||
| gotoblas_quit(); | |||
| break; | |||
| case DLL_THREAD_ATTACH: | |||
| break; | |||
| case DLL_THREAD_DETACH: | |||
| #if defined(SMP) && defined(USE_TLS) | |||
| blas_thread_memory_cleanup(); | |||
| #endif | |||
| break; | |||
| } | |||
| return TRUE; | |||
| @@ -91,6 +91,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include <unistd.h> | |||
| #endif | |||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) | |||
| #else | |||
| #define NO_AVX512 | |||
| #endif | |||
| /* #define FORCE_P2 */ | |||
| /* #define FORCE_KATMAI */ | |||
| /* #define FORCE_COPPERMINE */ | |||
| @@ -327,6 +331,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #ifdef FORCE_SKYLAKEX | |||
| #ifdef NO_AVX512 | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| #define ARCHITECTURE "X86" | |||
| #define SUBARCHITECTURE "HASWELL" | |||
| #define ARCHCONFIG "-DHASWELL " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ | |||
| "-DFMA3" | |||
| #define LIBNAME "haswell" | |||
| #define CORENAME "HASWELL" | |||
| #else | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| #define ARCHITECTURE "X86" | |||
| @@ -340,6 +358,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define LIBNAME "skylakex" | |||
| #define CORENAME "SKYLAKEX" | |||
| #endif | |||
| #endif | |||
| #ifdef FORCE_ATOM | |||
| #define FORCE | |||
| @@ -618,6 +637,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "POWER8" | |||
| #endif | |||
| #if defined(FORCE_POWER9) | |||
| #define FORCE | |||
| #define ARCHITECTURE "POWER" | |||
| #define SUBARCHITECTURE "POWER9" | |||
| #define SUBDIRNAME "power" | |||
| #define ARCHCONFIG "-DPOWER9 " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ | |||
| "-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||
| #define LIBNAME "power9" | |||
| #define CORENAME "POWER9" | |||
| #endif | |||
| #ifdef FORCE_PPCG4 | |||
| #define FORCE | |||
| @@ -1046,6 +1077,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_TSV110 | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM64" | |||
| #define SUBARCHITECTURE "TSV110" | |||
| #define SUBDIRNAME "arm64" | |||
| #define ARCHCONFIG "-DTSV110 " \ | |||
| "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ | |||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ | |||
| "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
| #define LIBNAME "tsv110" | |||
| #define CORENAME "TSV110" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_ZARCH_GENERIC | |||
| #define FORCE | |||
| #define ARCHITECTURE "ZARCH" | |||
| @@ -1066,8 +1114,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "Z13" | |||
| #endif | |||
| #ifdef FORCE_Z14 | |||
| #define FORCE | |||
| #define ARCHITECTURE "ZARCH" | |||
| #define SUBARCHITECTURE "Z14" | |||
| #define ARCHCONFIG "-DZ14 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64" | |||
| #define LIBNAME "z14" | |||
| #define CORENAME "Z14" | |||
| #endif | |||
| #ifndef FORCE | |||
| #ifdef USER_TARGET | |||
| #error "The TARGET specified on the command line or in Makefile.rule is not supported. Please choose a target from TargetList.txt" | |||
| #endif | |||
| #if defined(__powerpc__) || defined(__powerpc) || defined(powerpc) || \ | |||
| defined(__PPC__) || defined(PPC) || defined(_POWER) || defined(__POWERPC__) | |||
| #ifndef POWER | |||
| @@ -12,6 +12,7 @@ set(BLAS1_REAL_ONLY_SOURCES | |||
| rotm.c rotmg.c # N.B. these do not have complex counterparts | |||
| rot.c | |||
| asum.c | |||
| sum.c | |||
| ) | |||
| # these will have 'z' prepended for the complex version | |||
| @@ -124,6 +125,7 @@ foreach (float_type ${FLOAT_TYPES}) | |||
| GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "scamin" ${CBLAS_FLAG} "" "" true "COMPLEX") | |||
| GenerateNamedObjects("max.c" "USE_ABS" "scamax" ${CBLAS_FLAG} "" "" true "COMPLEX") | |||
| GenerateNamedObjects("asum.c" "" "scasum" ${CBLAS_FLAG} "" "" true "COMPLEX") | |||
| GenerateNamedObjects("sum.c" "" "scsum" ${CBLAS_FLAG} "" "" true "COMPLEX") | |||
| endif () | |||
| if (${float_type} STREQUAL "ZCOMPLEX") | |||
| GenerateNamedObjects("zscal.c" "SSCAL" "dscal" ${CBLAS_FLAG} "" "" false "ZCOMPLEX") | |||
| @@ -132,6 +134,7 @@ foreach (float_type ${FLOAT_TYPES}) | |||
| GenerateNamedObjects("max.c" "USE_ABS;USE_MIN" "dzamin" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") | |||
| GenerateNamedObjects("max.c" "USE_ABS" "dzamax" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") | |||
| GenerateNamedObjects("asum.c" "" "dzasum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") | |||
| GenerateNamedObjects("sum.c" "" "dzsum" ${CBLAS_FLAG} "" "" true "ZCOMPLEX") | |||
| endif () | |||
| endforeach () | |||
| @@ -25,7 +25,7 @@ SBLAS1OBJS = \ | |||
| saxpy.$(SUFFIX) sswap.$(SUFFIX) \ | |||
| scopy.$(SUFFIX) sscal.$(SUFFIX) \ | |||
| sdot.$(SUFFIX) sdsdot.$(SUFFIX) dsdot.$(SUFFIX) \ | |||
| sasum.$(SUFFIX) snrm2.$(SUFFIX) \ | |||
| sasum.$(SUFFIX) ssum.$(SUFFIX) snrm2.$(SUFFIX) \ | |||
| smax.$(SUFFIX) samax.$(SUFFIX) ismax.$(SUFFIX) isamax.$(SUFFIX) \ | |||
| smin.$(SUFFIX) samin.$(SUFFIX) ismin.$(SUFFIX) isamin.$(SUFFIX) \ | |||
| srot.$(SUFFIX) srotg.$(SUFFIX) srotm.$(SUFFIX) srotmg.$(SUFFIX) \ | |||
| @@ -51,7 +51,7 @@ DBLAS1OBJS = \ | |||
| daxpy.$(SUFFIX) dswap.$(SUFFIX) \ | |||
| dcopy.$(SUFFIX) dscal.$(SUFFIX) \ | |||
| ddot.$(SUFFIX) \ | |||
| dasum.$(SUFFIX) dnrm2.$(SUFFIX) \ | |||
| dasum.$(SUFFIX) dsum.$(SUFFIX) dnrm2.$(SUFFIX) \ | |||
| dmax.$(SUFFIX) damax.$(SUFFIX) idmax.$(SUFFIX) idamax.$(SUFFIX) \ | |||
| dmin.$(SUFFIX) damin.$(SUFFIX) idmin.$(SUFFIX) idamin.$(SUFFIX) \ | |||
| drot.$(SUFFIX) drotg.$(SUFFIX) drotm.$(SUFFIX) drotmg.$(SUFFIX) \ | |||
| @@ -76,7 +76,7 @@ CBLAS1OBJS = \ | |||
| caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \ | |||
| ccopy.$(SUFFIX) cscal.$(SUFFIX) csscal.$(SUFFIX) \ | |||
| cdotc.$(SUFFIX) cdotu.$(SUFFIX) \ | |||
| scasum.$(SUFFIX) scnrm2.$(SUFFIX) \ | |||
| scasum.$(SUFFIX) scsum.$(SUFFIX) scnrm2.$(SUFFIX) \ | |||
| scamax.$(SUFFIX) icamax.$(SUFFIX) \ | |||
| scamin.$(SUFFIX) icamin.$(SUFFIX) \ | |||
| csrot.$(SUFFIX) crotg.$(SUFFIX) \ | |||
| @@ -105,7 +105,7 @@ ZBLAS1OBJS = \ | |||
| zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ | |||
| zcopy.$(SUFFIX) zscal.$(SUFFIX) zdscal.$(SUFFIX) \ | |||
| zdotc.$(SUFFIX) zdotu.$(SUFFIX) \ | |||
| dzasum.$(SUFFIX) dznrm2.$(SUFFIX) \ | |||
| dzasum.$(SUFFIX) dzsum.$(SUFFIX) dznrm2.$(SUFFIX) \ | |||
| dzamax.$(SUFFIX) izamax.$(SUFFIX) \ | |||
| dzamin.$(SUFFIX) izamin.$(SUFFIX) \ | |||
| zdrot.$(SUFFIX) zrotg.$(SUFFIX) \ | |||
| @@ -146,7 +146,7 @@ QBLAS1OBJS = \ | |||
| qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ | |||
| qcopy.$(SUFFIX) qscal.$(SUFFIX) \ | |||
| qdot.$(SUFFIX) \ | |||
| qasum.$(SUFFIX) qnrm2.$(SUFFIX) \ | |||
| qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \ | |||
| qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ | |||
| qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ | |||
| qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ | |||
| @@ -168,7 +168,7 @@ XBLAS1OBJS = \ | |||
| xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ | |||
| xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ | |||
| xdotc.$(SUFFIX) xdotu.$(SUFFIX) \ | |||
| qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \ | |||
| qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \ | |||
| qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ | |||
| qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ | |||
| xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ | |||
| @@ -203,7 +203,7 @@ ifdef QUAD_PRECISION | |||
| QBLAS1OBJS = \ | |||
| qaxpy.$(SUFFIX) qswap.$(SUFFIX) \ | |||
| qcopy.$(SUFFIX) qscal.$(SUFFIX) \ | |||
| qasum.$(SUFFIX) qnrm2.$(SUFFIX) \ | |||
| qasum.$(SUFFIX) qsum.$(SUFFIX) qnrm2.$(SUFFIX) \ | |||
| qmax.$(SUFFIX) qamax.$(SUFFIX) iqmax.$(SUFFIX) iqamax.$(SUFFIX) \ | |||
| qmin.$(SUFFIX) qamin.$(SUFFIX) iqmin.$(SUFFIX) iqamin.$(SUFFIX) \ | |||
| qrot.$(SUFFIX) qrotg.$(SUFFIX) qrotm.$(SUFFIX) qrotmg.$(SUFFIX) \ | |||
| @@ -224,7 +224,7 @@ QBLAS3OBJS = \ | |||
| XBLAS1OBJS = \ | |||
| xaxpy.$(SUFFIX) xaxpyc.$(SUFFIX) xswap.$(SUFFIX) \ | |||
| xcopy.$(SUFFIX) xscal.$(SUFFIX) xqscal.$(SUFFIX) \ | |||
| qxasum.$(SUFFIX) qxnrm2.$(SUFFIX) \ | |||
| qxasum.$(SUFFIX) qxsum.$(SUFFIX) qxnrm2.$(SUFFIX) \ | |||
| qxamax.$(SUFFIX) ixamax.$(SUFFIX) \ | |||
| qxamin.$(SUFFIX) ixamin.$(SUFFIX) \ | |||
| xqrot.$(SUFFIX) xrotg.$(SUFFIX) \ | |||
| @@ -263,7 +263,8 @@ CSBLAS1OBJS = \ | |||
| cblas_isamax.$(SUFFIX) cblas_isamin.$(SUFFIX) cblas_sasum.$(SUFFIX) cblas_saxpy.$(SUFFIX) \ | |||
| cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \ | |||
| cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \ | |||
| cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) | |||
| cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \ | |||
| cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) | |||
| CSBLAS2OBJS = \ | |||
| cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \ | |||
| @@ -280,7 +281,8 @@ CDBLAS1OBJS = \ | |||
| cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ | |||
| cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ | |||
| cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \ | |||
| cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) | |||
| cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \ | |||
| cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) | |||
| CDBLAS2OBJS = \ | |||
| cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \ | |||
| @@ -300,7 +302,8 @@ CCBLAS1OBJS = \ | |||
| cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \ | |||
| cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \ | |||
| cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \ | |||
| cblas_caxpby.$(SUFFIX) | |||
| cblas_caxpby.$(SUFFIX) \ | |||
| cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) | |||
| CCBLAS2OBJS = \ | |||
| cblas_cgemv.$(SUFFIX) cblas_cgerc.$(SUFFIX) cblas_cgeru.$(SUFFIX) \ | |||
| @@ -326,7 +329,9 @@ CZBLAS1OBJS = \ | |||
| cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \ | |||
| cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \ | |||
| cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \ | |||
| cblas_zaxpby.$(SUFFIX) | |||
| cblas_zaxpby.$(SUFFIX) \ | |||
| cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) | |||
| CZBLAS2OBJS = \ | |||
| cblas_zgemv.$(SUFFIX) cblas_zgerc.$(SUFFIX) cblas_zgeru.$(SUFFIX) \ | |||
| @@ -560,6 +565,24 @@ dzasum.$(SUFFIX) dzasum.$(PSUFFIX) : asum.c | |||
| qxasum.$(SUFFIX) qxasum.$(PSUFFIX) : asum.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| ssum.$(SUFFIX) ssum.$(PSUFFIX) : sum.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| dsum.$(SUFFIX) dsum.$(PSUFFIX) : sum.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| qsum.$(SUFFIX) qsum.$(PSUFFIX) : sum.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| scsum.$(SUFFIX) scsum.$(PSUFFIX) : sum.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| dzsum.$(SUFFIX) dzsum.$(PSUFFIX) : sum.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| qxsum.$(SUFFIX) qxsum.$(PSUFFIX) : sum.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| snrm2.$(SUFFIX) snrm2.$(PSUFFIX) : nrm2.c | |||
| $(CC) $(CFLAGS) -c $< -o $(@F) | |||
| @@ -1383,6 +1406,18 @@ cblas_ismin.$(SUFFIX) cblas_ismin.$(PSUFFIX) : imax.c | |||
| cblas_idmin.$(SUFFIX) cblas_idmin.$(PSUFFIX) : imax.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_icmax.$(SUFFIX) cblas_icmax.$(PSUFFIX) : imax.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) | |||
| cblas_izmax.$(SUFFIX) cblas_izmax.$(PSUFFIX) : imax.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -UUSE_MIN $< -o $(@F) | |||
| cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F) | |||
| cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| @@ -1395,6 +1430,18 @@ cblas_scasum.$(SUFFIX) cblas_scasum.$(PSUFFIX) : asum.c | |||
| cblas_dzasum.$(SUFFIX) cblas_dzasum.$(PSUFFIX) : asum.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| cblas_ssum.$(SUFFIX) cblas_ssum.$(PSUFFIX) : sum.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| cblas_dsum.$(SUFFIX) cblas_dsum.$(PSUFFIX) : sum.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| cblas_scsum.$(SUFFIX) cblas_scsum.$(PSUFFIX) : sum.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| cblas_dzsum.$(SUFFIX) cblas_dzsum.$(PSUFFIX) : sum.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| @@ -1402,7 +1449,7 @@ cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| cblas_ddot.$(SUFFIX) cblas_ddot.$(PSUFFIX) : dot.c | |||
| $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) | |||
| @@ -0,0 +1,97 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #ifdef FUNCTION_PROFILE | |||
| #include "functable.h" | |||
| #endif | |||
| #ifndef CBLAS | |||
| FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){ | |||
| BLASLONG n = *N; | |||
| BLASLONG incx = *INCX; | |||
| FLOATRET ret; | |||
| PRINT_DEBUG_NAME; | |||
| if (n <= 0) return 0; | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
| ret = (FLOATRET)SUM_K(n, x, incx); | |||
| FUNCTION_PROFILE_END(COMPSIZE, n, n); | |||
| IDEBUG_END; | |||
| return ret; | |||
| } | |||
| #else | |||
| #ifdef COMPLEX | |||
| FLOAT CNAME(blasint n, void *vx, blasint incx){ | |||
| FLOAT *x = (FLOAT*) vx; | |||
| #else | |||
| FLOAT CNAME(blasint n, FLOAT *x, blasint incx){ | |||
| #endif | |||
| FLOAT ret; | |||
| PRINT_DEBUG_CNAME; | |||
| if (n <= 0) return 0; | |||
| IDEBUG_START; | |||
| FUNCTION_PROFILE_START(); | |||
| ret = SUM_K(n, x, incx); | |||
| FUNCTION_PROFILE_END(COMPSIZE, n, n); | |||
| IDEBUG_END; | |||
| return ret; | |||
| } | |||
| #endif | |||
| @@ -218,11 +218,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| buffer = (FLOAT *)blas_memory_alloc(1); | |||
| #ifdef SMP | |||
| /* nthreads = num_cpu_avail(2); | |||
| nthreads = num_cpu_avail(2); | |||
| FIXME trmv_thread was found to be broken, see issue 1332 */ | |||
| nthreads = 1; | |||
| if (nthreads == 1) { | |||
| #endif | |||
| @@ -81,6 +81,12 @@ | |||
| #endif | |||
| #endif | |||
| #ifndef COMPLEX | |||
| #define SMP_FACTOR 256 | |||
| #else | |||
| #define SMP_FACTOR 128 | |||
| #endif | |||
| static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { | |||
| #ifndef TRMM | |||
| TRSM_LNUU, TRSM_LNUN, TRSM_LNLU, TRSM_LNLN, | |||
| @@ -198,7 +204,7 @@ void NAME(char *SIDE, char *UPLO, char *TRANS, char *DIAG, | |||
| if (side < 0) info = 1; | |||
| if (info != 0) { | |||
| BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); | |||
| BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)-1); | |||
| return; | |||
| } | |||
| @@ -366,11 +372,15 @@ void CNAME(enum CBLAS_ORDER order, | |||
| mode |= (trans << BLAS_TRANSA_SHIFT); | |||
| mode |= (side << BLAS_RSIDE_SHIFT); | |||
| if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD ) | |||
| /* | |||
| if ( args.m < 2 * GEMM_MULTITHREAD_THRESHOLD ) | |||
| args.nthreads = 1; | |||
| else | |||
| if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD ) | |||
| if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD ) | |||
| args.nthreads = 1; | |||
| */ | |||
| if ( args.m * args.n < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD) | |||
| args.nthreads = 1; | |||
| else | |||
| args.nthreads = num_cpu_avail(3); | |||
| @@ -239,9 +239,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| } else | |||
| nthreads = 1; | |||
| /* FIXME TRMV multithreading appears to be broken, see issue 1332*/ | |||
| nthreads = 1; | |||
| if(nthreads > 1) { | |||
| buffer_size = n > 16 ? 0 : n * 4 + 40; | |||
| } | |||
| @@ -65,6 +65,7 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}SCALKERNEL}" "" "scal_k" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}SWAPKERNEL}" "" "swap_k" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPBYKERNEL}" "" "axpby_k" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}SUMKERNEL}" "" "sum_k" false "" "" false ${float_type}) | |||
| if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}AXPYKERNEL}" "CONJ" "axpyc_k" false "" "" false ${float_type}) | |||
| @@ -340,6 +340,32 @@ ifndef XSCALKERNEL | |||
| XSCALKERNEL = zscal.S | |||
| endif | |||
| ### SUM ### | |||
| ifndef SSUMKERNEL | |||
| SSUMKERNEL = sum.S | |||
| endif | |||
| ifndef DSUMKERNEL | |||
| DSUMKERNEL = sum.S | |||
| endif | |||
| ifndef CSUMKERNEL | |||
| CSUMKERNEL = zsum.S | |||
| endif | |||
| ifndef ZSUMKERNEL | |||
| ZSUMKERNEL = zsum.S | |||
| endif | |||
| ifndef QSUMKERNEL | |||
| QSUMKERNEL = sum.S | |||
| endif | |||
| ifndef XSUMKERNEL | |||
| XSUMKERNEL = zsum.S | |||
| endif | |||
| ### SWAP ### | |||
| ifndef SSWAPKERNEL | |||
| @@ -453,7 +479,7 @@ endif | |||
| SBLASOBJS += \ | |||
| samax_k$(TSUFFIX).$(SUFFIX) samin_k$(TSUFFIX).$(SUFFIX) smax_k$(TSUFFIX).$(SUFFIX) smin_k$(TSUFFIX).$(SUFFIX) \ | |||
| isamax_k$(TSUFFIX).$(SUFFIX) isamin_k$(TSUFFIX).$(SUFFIX) ismax_k$(TSUFFIX).$(SUFFIX) ismin_k$(TSUFFIX).$(SUFFIX) \ | |||
| sasum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ | |||
| sasum_k$(TSUFFIX).$(SUFFIX) ssum_k$(TSUFFIX).$(SUFFIX) saxpy_k$(TSUFFIX).$(SUFFIX) scopy_k$(TSUFFIX).$(SUFFIX) \ | |||
| sdot_k$(TSUFFIX).$(SUFFIX) sdsdot_k$(TSUFFIX).$(SUFFIX) dsdot_k$(TSUFFIX).$(SUFFIX) \ | |||
| snrm2_k$(TSUFFIX).$(SUFFIX) srot_k$(TSUFFIX).$(SUFFIX) sscal_k$(TSUFFIX).$(SUFFIX) sswap_k$(TSUFFIX).$(SUFFIX) \ | |||
| saxpby_k$(TSUFFIX).$(SUFFIX) | |||
| @@ -463,31 +489,32 @@ DBLASOBJS += \ | |||
| idamax_k$(TSUFFIX).$(SUFFIX) idamin_k$(TSUFFIX).$(SUFFIX) idmax_k$(TSUFFIX).$(SUFFIX) idmin_k$(TSUFFIX).$(SUFFIX) \ | |||
| dasum_k$(TSUFFIX).$(SUFFIX) daxpy_k$(TSUFFIX).$(SUFFIX) dcopy_k$(TSUFFIX).$(SUFFIX) ddot_k$(TSUFFIX).$(SUFFIX) \ | |||
| dnrm2_k$(TSUFFIX).$(SUFFIX) drot_k$(TSUFFIX).$(SUFFIX) dscal_k$(TSUFFIX).$(SUFFIX) dswap_k$(TSUFFIX).$(SUFFIX) \ | |||
| daxpby_k$(TSUFFIX).$(SUFFIX) | |||
| daxpby_k$(TSUFFIX).$(SUFFIX) dsum_k$(TSUFFIX).$(SUFFIX) | |||
| QBLASOBJS += \ | |||
| qamax_k$(TSUFFIX).$(SUFFIX) qamin_k$(TSUFFIX).$(SUFFIX) qmax_k$(TSUFFIX).$(SUFFIX) qmin_k$(TSUFFIX).$(SUFFIX) \ | |||
| iqamax_k$(TSUFFIX).$(SUFFIX) iqamin_k$(TSUFFIX).$(SUFFIX) iqmax_k$(TSUFFIX).$(SUFFIX) iqmin_k$(TSUFFIX).$(SUFFIX) \ | |||
| qasum_k$(TSUFFIX).$(SUFFIX) qaxpy_k$(TSUFFIX).$(SUFFIX) qcopy_k$(TSUFFIX).$(SUFFIX) qdot_k$(TSUFFIX).$(SUFFIX) \ | |||
| qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) | |||
| qnrm2_k$(TSUFFIX).$(SUFFIX) qrot_k$(TSUFFIX).$(SUFFIX) qscal_k$(TSUFFIX).$(SUFFIX) qswap_k$(TSUFFIX).$(SUFFIX) \ | |||
| qsum_k$(TSUFFIX).$(SUFFIX) | |||
| CBLASOBJS += \ | |||
| camax_k$(TSUFFIX).$(SUFFIX) camin_k$(TSUFFIX).$(SUFFIX) icamax_k$(TSUFFIX).$(SUFFIX) icamin_k$(TSUFFIX).$(SUFFIX) \ | |||
| casum_k$(TSUFFIX).$(SUFFIX) caxpy_k$(TSUFFIX).$(SUFFIX) caxpyc_k$(TSUFFIX).$(SUFFIX) ccopy_k$(TSUFFIX).$(SUFFIX) \ | |||
| cdotc_k$(TSUFFIX).$(SUFFIX) cdotu_k$(TSUFFIX).$(SUFFIX) cnrm2_k$(TSUFFIX).$(SUFFIX) csrot_k$(TSUFFIX).$(SUFFIX) \ | |||
| cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX) | |||
| cscal_k$(TSUFFIX).$(SUFFIX) cswap_k$(TSUFFIX).$(SUFFIX) caxpby_k$(TSUFFIX).$(SUFFIX) csum_k$(TSUFFIX).$(SUFFIX) | |||
| ZBLASOBJS += \ | |||
| zamax_k$(TSUFFIX).$(SUFFIX) zamin_k$(TSUFFIX).$(SUFFIX) izamax_k$(TSUFFIX).$(SUFFIX) izamin_k$(TSUFFIX).$(SUFFIX) \ | |||
| zasum_k$(TSUFFIX).$(SUFFIX) zaxpy_k$(TSUFFIX).$(SUFFIX) zaxpyc_k$(TSUFFIX).$(SUFFIX) zcopy_k$(TSUFFIX).$(SUFFIX) \ | |||
| zdotc_k$(TSUFFIX).$(SUFFIX) zdotu_k$(TSUFFIX).$(SUFFIX) znrm2_k$(TSUFFIX).$(SUFFIX) zdrot_k$(TSUFFIX).$(SUFFIX) \ | |||
| zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX) | |||
| zscal_k$(TSUFFIX).$(SUFFIX) zswap_k$(TSUFFIX).$(SUFFIX) zaxpby_k$(TSUFFIX).$(SUFFIX) zsum_k$(TSUFFIX).$(SUFFIX) | |||
| XBLASOBJS += \ | |||
| xamax_k$(TSUFFIX).$(SUFFIX) xamin_k$(TSUFFIX).$(SUFFIX) ixamax_k$(TSUFFIX).$(SUFFIX) ixamin_k$(TSUFFIX).$(SUFFIX) \ | |||
| xasum_k$(TSUFFIX).$(SUFFIX) xaxpy_k$(TSUFFIX).$(SUFFIX) xaxpyc_k$(TSUFFIX).$(SUFFIX) xcopy_k$(TSUFFIX).$(SUFFIX) \ | |||
| xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \ | |||
| xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) | |||
| xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX) | |||
| ### AMAX ### | |||
| @@ -617,7 +644,7 @@ $(KDIR)idmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)idmin_k$(TPSUFFIX).$(PSUFFIX) : $(KE | |||
| $(KDIR)iqmin_k$(TSUFFIX).$(SUFFIX) $(KDIR)iqmin_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(IQMINKERNEL) | |||
| $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE -UUSE_ABS -DUSE_MIN $< -o $@ | |||
| ### ASUM ### | |||
| $(KDIR)sasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)sasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SASUMKERNEL) | |||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ | |||
| @@ -636,6 +663,26 @@ $(KDIR)zasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zasum_k$(TPSUFFIX).$(PSUFFIX) : $(KE | |||
| $(KDIR)xasum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xasum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XASUMKERNEL) | |||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ | |||
| ### SUM - plain-sum kernel objects, one rule per precision prefix: s/d/q are compiled with -UCOMPLEX and c/z/x with -DCOMPLEX, while -UDOUBLE / -DDOUBLE / -DXDOUBLE selects the floating-point width; each rule emits the regular and the profiled object from the same kernel source ### | |||
| $(KDIR)ssum_k$(TSUFFIX).$(SUFFIX) $(KDIR)ssum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SSUMKERNEL) | |||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ | |||
| $(KDIR)dsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)dsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DSUMKERNEL) | |||
| $(CC) -c $(CFLAGS) -UCOMPLEX -DDOUBLE $< -o $@ | |||
| $(KDIR)qsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)qsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QSUMKERNEL) | |||
| $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ | |||
| $(KDIR)csum_k$(TSUFFIX).$(SUFFIX) $(KDIR)csum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CSUMKERNEL) | |||
| $(CC) -c $(CFLAGS) -DCOMPLEX -UDOUBLE $< -o $@ | |||
| $(KDIR)zsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)zsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZSUMKERNEL) | |||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DDOUBLE $< -o $@ | |||
| $(KDIR)xsum_k$(TSUFFIX).$(SUFFIX) $(KDIR)xsum_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XSUMKERNEL) | |||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE $< -o $@ | |||
| ### AXPY ### | |||
| $(KDIR)saxpy_k$(TSUFFIX).$(SUFFIX) $(KDIR)saxpy_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SAXPYKERNEL) | |||
| $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ | |||
| @@ -24,7 +24,7 @@ ifeq ($(TARGET), LOONGSON3B) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(TARGET), GENERIC) | |||
| ifeq ($(CORE), GENERIC) | |||
| USE_TRMM = 1 | |||
| endif | |||
| @@ -44,10 +44,18 @@ ifeq ($(CORE), POWER8) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(CORE), POWER9) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(ARCH), zarch) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(CORE), Z14) | |||
| USE_TRMM = 1 | |||
| endif | |||
| @@ -0,0 +1,206 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "version.h" | |||
| #define PREFETCHSIZE 88 | |||
| #define N $16 | |||
| #define X $17 | |||
| #define INCX $18 | |||
| #define I $19 | |||
| #define s0 $f0 | |||
| #define s1 $f1 | |||
| #define s2 $f10 | |||
| #define s3 $f11 | |||
| #define a0 $f12 | |||
| #define a1 $f13 | |||
| #define a2 $f14 | |||
| #define a3 $f15 | |||
| #define a4 $f16 | |||
| #define a5 $f17 | |||
| #define a6 $f18 | |||
| #define a7 $f19 | |||
| #define t0 $f20 | |||
| #define t1 $f21 | |||
| #define t2 $f22 | |||
| #define t3 $f23 | |||
| PROLOGUE /* Sums N values read from X with stride INCX into s0. PROLOGUE, PROFCODE, LD, ADD, SXADDQ and SIZE are macros from the included headers; their definitions are not visible in this file. */ | |||
| PROFCODE | |||
| fclr s0 /* s0 = 0: accumulator that carries the final result */ | |||
| unop /* no-op, padding between instructions */ | |||
| fclr t0 /* t0 = 0: t0..t3 hold loaded values that are waiting to be added */ | |||
| ble N, $L999 /* N <= 0: return s0 + t0 = 0 */ | |||
| sra N, 3, I /* I = N >> 3: number of full 8-element groups */ | |||
| fclr s1 | |||
| fclr s2 | |||
| ble I, $L15 /* no full group: only the remainder loop runs */ | |||
| LD a0, 0 * SIZE(X) /* preload a0..a5 with the first six elements */ | |||
| fclr t1 | |||
| SXADDQ INCX, X, X /* NOTE(review): presumably X += INCX * SIZE, i.e. step to the next element - confirm against common.h */ | |||
| fclr t2 | |||
| LD a1, 0 * SIZE(X) | |||
| fclr t3 | |||
| SXADDQ INCX, X, X | |||
| fclr s3 | |||
| LD a2, 0 * SIZE(X) | |||
| SXADDQ INCX, X, X | |||
| LD a3, 0 * SIZE(X) | |||
| SXADDQ INCX, X, X | |||
| LD a4, 0 * SIZE(X) | |||
| SXADDQ INCX, X, X | |||
| LD a5, 0 * SIZE(X) | |||
| SXADDQ INCX, X, X | |||
| lda I, -1(I) /* I = I - 1 */ | |||
| ble I, $L13 /* exactly one group: go straight to the drain code */ | |||
| .align 4 | |||
| $L12: /* main loop: each pass adds eight staged values into s0..s3 and loads the next eight elements */ | |||
| ADD s0, t0, s0 | |||
| ldl $31, PREFETCHSIZE * 2 * SIZE(X) /* destination $31 discards the data: this load only prefetches ahead of X */ | |||
| fmov a0, t0 /* stage a0 in t0; the next ADD into s0 consumes it */ | |||
| lda I, -1(I) | |||
| ADD s1, t1, s1 | |||
| LD a6, 0 * SIZE(X) | |||
| fmov a1, t1 | |||
| SXADDQ INCX, X, X | |||
| ADD s2, t2, s2 | |||
| LD a7, 0 * SIZE(X) | |||
| fmov a2, t2 | |||
| SXADDQ INCX, X, X | |||
| ADD s3, t3, s3 | |||
| LD a0, 0 * SIZE(X) /* a0..a5 are reloaded here for the following pass */ | |||
| fmov a3, t3 | |||
| SXADDQ INCX, X, X | |||
| ADD s0, t0, s0 | |||
| LD a1, 0 * SIZE(X) | |||
| fmov a4, t0 | |||
| SXADDQ INCX, X, X | |||
| ADD s1, t1, s1 | |||
| LD a2, 0 * SIZE(X) | |||
| fmov a5, t1 | |||
| SXADDQ INCX, X, X | |||
| ADD s2, t2, s2 | |||
| LD a3, 0 * SIZE(X) | |||
| fmov a6, t2 | |||
| SXADDQ INCX, X, X | |||
| ADD s3, t3, s3 | |||
| LD a4, 0 * SIZE(X) | |||
| fmov a7, t3 | |||
| SXADDQ INCX, X, X | |||
| LD a5, 0 * SIZE(X) | |||
| unop | |||
| SXADDQ INCX, X, X | |||
| bne I, $L12 | |||
| .align 4 | |||
| $L13: /* drain: load the last two elements of the final group (a6, a7) and add everything still in flight */ | |||
| ADD s0, t0, s0 | |||
| LD a6, 0 * SIZE(X) | |||
| fmov a0, t0 | |||
| SXADDQ INCX, X, X | |||
| ADD s1, t1, s1 | |||
| LD a7, 0 * SIZE(X) | |||
| fmov a1, t1 | |||
| SXADDQ INCX, X, X | |||
| ADD s2, t2, s2 | |||
| fmov a2, t2 | |||
| ADD s3, t3, s3 | |||
| fmov a3, t3 | |||
| ADD s0, t0, s0 | |||
| fmov a4, t0 | |||
| ADD s1, t1, s1 | |||
| fmov a5, t1 | |||
| ADD s2, t2, s2 | |||
| fmov a6, t2 | |||
| ADD s3, t3, s3 | |||
| fmov a7, t3 | |||
| ADD s1, t1, s1 /* add the staged a5, a6, a7; t0 (a4) stays pending and is added at $L17 or $L999 */ | |||
| ADD s2, t2, s2 | |||
| ADD s3, t3, s3 | |||
| ADD s0, s1, s0 /* start folding the four partial sums */ | |||
| ADD s2, s3, s2 | |||
| .align 4 | |||
| $L15: | |||
| and N, 7, I /* I = N & 7: elements left over after the 8-element groups */ | |||
| ADD s0, s2, s0 /* fold s2 into s0 (s2 is still 0 when arriving from the early branch) */ | |||
| unop | |||
| ble I, $L999 | |||
| .align 4 | |||
| $L17: /* remainder loop: one element per pass */ | |||
| ADD s0, t0, s0 /* add the value staged by the previous step */ | |||
| LD a0, 0 * SIZE(X) | |||
| SXADDQ INCX, X, X | |||
| fmov a0, t0 | |||
| lda I, -1(I) | |||
| bne I, $L17 | |||
| .align 4 | |||
| $L999: | |||
| ADD s0, t0, s0 /* add the last staged value */ | |||
| ret /* the result is left in s0 */ | |||
| EPILOGUE | |||
| @@ -0,0 +1,208 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "version.h" | |||
| #define PREFETCHSIZE 88 | |||
| #define N $16 | |||
| #define X $17 | |||
| #define INCX $18 | |||
| #define I $19 | |||
| #define s0 $f0 | |||
| #define s1 $f1 | |||
| #define s2 $f10 | |||
| #define s3 $f11 | |||
| #define a0 $f12 | |||
| #define a1 $f13 | |||
| #define a2 $f14 | |||
| #define a3 $f15 | |||
| #define a4 $f16 | |||
| #define a5 $f17 | |||
| #define a6 $f18 | |||
| #define a7 $f19 | |||
| #define t0 $f20 | |||
| #define t1 $f21 | |||
| #define t2 $f22 | |||
| #define t3 $f23 | |||
| PROLOGUE /* Sums both stored parts of N two-value elements read from X with stride INCX into s0. PROLOGUE, PROFCODE, LD, ADD, SXADDQ and SIZE are macros from the included headers; their definitions are not visible in this file. */ | |||
| PROFCODE | |||
| fclr s0 /* s0..s3 are partial sums, t0..t3 hold loaded values waiting to be added */ | |||
| unop /* no-op, padding between instructions */ | |||
| fclr t0 | |||
| addq INCX, INCX, INCX /* INCX *= 2: every element is a pair of values, read at 0 * SIZE and 1 * SIZE */ | |||
| fclr s1 | |||
| unop | |||
| fclr t1 | |||
| ble N, $L999 /* N <= 0: return 0 */ | |||
| fclr s2 | |||
| sra N, 2, I /* I = N >> 2: number of full 4-element groups */ | |||
| fclr s3 | |||
| ble I, $L15 /* no full group: only the remainder loop runs */ | |||
| LD a0, 0 * SIZE(X) /* preload a0..a5: both parts of the first three elements */ | |||
| fclr t2 | |||
| LD a1, 1 * SIZE(X) | |||
| SXADDQ INCX, X, X /* NOTE(review): presumably X += INCX * SIZE, i.e. step to the next element - confirm against common.h */ | |||
| LD a2, 0 * SIZE(X) | |||
| fclr t3 | |||
| LD a3, 1 * SIZE(X) | |||
| SXADDQ INCX, X, X | |||
| LD a4, 0 * SIZE(X) | |||
| LD a5, 1 * SIZE(X) | |||
| SXADDQ INCX, X, X | |||
| lda I, -1(I) /* I = I - 1 */ | |||
| ble I, $L13 /* exactly one group: go straight to the drain code */ | |||
| .align 4 | |||
| $L12: /* main loop: each pass adds eight staged values into s0..s3 and loads both parts of four more elements */ | |||
| ADD s0, t0, s0 | |||
| ldl $31, PREFETCHSIZE * SIZE(X) /* destination $31 discards the data: this load only prefetches ahead of X */ | |||
| fmov a0, t0 /* stage a0 in t0; the next ADD into s0 consumes it */ | |||
| lda I, -1(I) | |||
| ADD s1, t1, s1 | |||
| LD a6, 0 * SIZE(X) | |||
| fmov a1, t1 | |||
| unop | |||
| ADD s2, t2, s2 | |||
| LD a7, 1 * SIZE(X) | |||
| fmov a2, t2 | |||
| SXADDQ INCX, X, X | |||
| ADD s3, t3, s3 | |||
| LD a0, 0 * SIZE(X) /* a0..a5 are reloaded here for the following pass */ | |||
| fmov a3, t3 | |||
| unop | |||
| ADD s0, t0, s0 | |||
| LD a1, 1 * SIZE(X) | |||
| fmov a4, t0 | |||
| SXADDQ INCX, X, X | |||
| ADD s1, t1, s1 | |||
| LD a2, 0 * SIZE(X) | |||
| fmov a5, t1 | |||
| unop | |||
| ADD s2, t2, s2 | |||
| LD a3, 1 * SIZE(X) | |||
| fmov a6, t2 | |||
| SXADDQ INCX, X, X | |||
| ADD s3, t3, s3 | |||
| LD a4, 0 * SIZE(X) | |||
| fmov a7, t3 | |||
| unop | |||
| LD a5, 1 * SIZE(X) | |||
| unop | |||
| SXADDQ INCX, X, X | |||
| bne I, $L12 | |||
| .align 4 | |||
| $L13: /* drain: load both parts of the last element of the final group (a6, a7) and add what is still in flight */ | |||
| ADD s0, t0, s0 | |||
| LD a6, 0 * SIZE(X) | |||
| fmov a0, t0 | |||
| ADD s1, t1, s1 | |||
| LD a7, 1 * SIZE(X) | |||
| fmov a1, t1 | |||
| SXADDQ INCX, X, X | |||
| ADD s2, t2, s2 | |||
| fmov a2, t2 | |||
| ADD s3, t3, s3 | |||
| fmov a3, t3 | |||
| ADD s0, t0, s0 | |||
| fmov a4, t0 | |||
| ADD s1, t1, s1 | |||
| fmov a5, t1 | |||
| ADD s2, t2, s2 | |||
| fmov a6, t2 | |||
| ADD s3, t3, s3 | |||
| fmov a7, t3 | |||
| ADD s2, t2, s2 /* add the staged a6 and a7; t0 and t1 (a4, a5) stay pending and are added at $L17 or $L999 */ | |||
| ADD s3, t3, s3 | |||
| .align 4 | |||
| $L15: | |||
| ADD s0, s2, s0 /* s0 and s2 collected the 0 * SIZE parts, s1 and s3 the 1 * SIZE parts */ | |||
| and N, 3, I /* I = N & 3: elements left over after the 4-element groups */ | |||
| ADD s1, s3, s1 | |||
| ble I, $L999 | |||
| .align 4 | |||
| $L17: /* remainder loop: one element (two values) per pass */ | |||
| ADD s0, t0, s0 /* add the values staged by the previous step */ | |||
| LD a0, 0 * SIZE(X) | |||
| fmov a0, t0 | |||
| lda I, -1(I) | |||
| ADD s1, t1, s1 | |||
| LD a1, 1 * SIZE(X) | |||
| fmov a1, t1 | |||
| SXADDQ INCX, X, X | |||
| bne I, $L17 | |||
| .align 4 | |||
| $L999: | |||
| ADD s0, t0, s0 /* add the last staged pair */ | |||
| ADD s1, t1, s1 | |||
| ADD s0, s1, s0 /* result = sum of the 0 * SIZE parts plus sum of the 1 * SIZE parts */ | |||
| ret /* the result is left in s0 */ | |||
| EPILOGUE | |||
| @@ -35,6 +35,11 @@ DASUMKERNEL = ../arm/asum.c | |||
| CASUMKERNEL = ../arm/zasum.c | |||
| ZASUMKERNEL = ../arm/zasum.c | |||
| SSUMKERNEL = ../arm/sum.c | |||
| DSUMKERNEL = ../arm/sum.c | |||
| CSUMKERNEL = ../arm/zsum.c | |||
| ZSUMKERNEL = ../arm/zsum.c | |||
| SAXPYKERNEL = ../arm/axpy.c | |||
| DAXPYKERNEL = ../arm/axpy.c | |||
| CAXPYKERNEL = ../arm/zaxpy.c | |||
| @@ -37,6 +37,9 @@ DASUMKERNEL = asum_vfp.S | |||
| CASUMKERNEL = asum_vfp.S | |||
| ZASUMKERNEL = asum_vfp.S | |||
| SSUMKERNEL = sum_vfp.S | |||
| DSUMKERNEL = sum_vfp.S | |||
| SAXPYKERNEL = axpy_vfp.S | |||
| DAXPYKERNEL = axpy_vfp.S | |||
| CAXPYKERNEL = axpy_vfp.S | |||
| @@ -53,7 +53,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| while(i < n) | |||
| { | |||
| if( x[ix] > minf ) | |||
| if( x[ix] < minf ) | |||
| { | |||
| min = i; | |||
| minf = x[ix]; | |||
| @@ -0,0 +1,51 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * trivial copy of asum.c with the ABS() removed * | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| /* | |||
|  * Plain (signed) sum of n entries of x, taking every inc_x-th value | |||
|  * starting at x[0].  Returns 0.0 when n or inc_x is not positive. | |||
|  */ | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
|     FLOAT total = 0.0; | |||
|     BLASLONG pos; | |||
|     BLASLONG limit; | |||
|  | |||
|     if (n <= 0 || inc_x <= 0) | |||
|         return total; | |||
|  | |||
|     /* one past the last index to visit */ | |||
|     limit = n * inc_x; | |||
|     for (pos = 0; pos < limit; pos += inc_x) | |||
|         total += x[pos]; | |||
|  | |||
|     return total; | |||
| } | |||
| @@ -0,0 +1,425 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * trivial copy of asum_vfp.S with the in-place vabs.f64 calls removed * | |||
| **************************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define STACKSIZE 256 | |||
| #define N r0 | |||
| #define X r1 | |||
| #define INC_X r2 | |||
| #define I r12 | |||
| #define X_PRE 512 | |||
| /************************************************************************************** | |||
| * Macro definitions | |||
| **************************************************************************************/ | |||
| #if !defined(COMPLEX) | |||
| #if defined(DOUBLE) | |||
| .macro KERNEL_F4 /* real double, unit stride: add four consecutive values, alternating between accumulators d0 and d1 */ | |||
| pld [ X, #X_PRE ] | |||
| vldmia.f64 X!, { d4 - d5 } | |||
| vadd.f64 d0 , d0, d4 | |||
| vldmia.f64 X!, { d6 - d7 } | |||
| vadd.f64 d1 , d1, d5 | |||
| vadd.f64 d0 , d0, d6 | |||
| vadd.f64 d1 , d1, d7 | |||
| .endm | |||
| .macro KERNEL_F1 /* real double, unit stride: add one value to d0 and post-increment X */ | |||
| vldmia.f64 X!, { d4 } | |||
| vadd.f64 d0 , d0, d4 | |||
| .endm | |||
| .macro KERNEL_S4 /* real double, strided: add four values to d0, stepping X by INC_X (already scaled to bytes by the routine) after each */ | |||
| vldmia.f64 X, { d4 } | |||
| vadd.f64 d0 , d0, d4 | |||
| add X, X, INC_X | |||
| vldmia.f64 X, { d4 } | |||
| vadd.f64 d0 , d0, d4 | |||
| add X, X, INC_X | |||
| vldmia.f64 X, { d4 } | |||
| vadd.f64 d0 , d0, d4 | |||
| add X, X, INC_X | |||
| vldmia.f64 X, { d4 } | |||
| vadd.f64 d0 , d0, d4 | |||
| add X, X, INC_X | |||
| .endm | |||
| .macro KERNEL_S1 /* real double, strided: add one value to d0 and step X by INC_X */ | |||
| vldmia.f64 X, { d4 } | |||
| vadd.f64 d0 , d0, d4 | |||
| add X, X, INC_X | |||
| .endm | |||
| #else | |||
| .macro KERNEL_F4 /* real single, unit stride: add four consecutive values, alternating between s0 and s1; no pld here, the F4 loop of the routine issues it for this case */ | |||
| vldmia.f32 X!, { s4 - s5 } | |||
| vadd.f32 s0 , s0, s4 | |||
| vldmia.f32 X!, { s6 - s7 } | |||
| vadd.f32 s1 , s1, s5 | |||
| vadd.f32 s0 , s0, s6 | |||
| vadd.f32 s1 , s1, s7 | |||
| .endm | |||
| .macro KERNEL_F1 /* real single, unit stride: add one value to s0 and post-increment X */ | |||
| vldmia.f32 X!, { s4 } | |||
| vadd.f32 s0 , s0, s4 | |||
| .endm | |||
| .macro KERNEL_S4 /* real single, strided: add four values to s0, stepping X by INC_X after each */ | |||
| vldmia.f32 X, { s4 } | |||
| vadd.f32 s0 , s0, s4 | |||
| add X, X, INC_X | |||
| vldmia.f32 X, { s4 } | |||
| vadd.f32 s0 , s0, s4 | |||
| add X, X, INC_X | |||
| vldmia.f32 X, { s4 } | |||
| vadd.f32 s0 , s0, s4 | |||
| add X, X, INC_X | |||
| vldmia.f32 X, { s4 } | |||
| vadd.f32 s0 , s0, s4 | |||
| add X, X, INC_X | |||
| .endm | |||
| .macro KERNEL_S1 /* real single, strided: add one value to s0 and step X by INC_X */ | |||
| vldmia.f32 X, { s4 } | |||
| vadd.f32 s0 , s0, s4 | |||
| add X, X, INC_X | |||
| .endm | |||
| #endif | |||
| #else | |||
| #if defined(DOUBLE) | |||
| .macro KERNEL_F4 /* complex double, unit stride: add eight consecutive values (four two-value elements), alternating between d0 and d1 */ | |||
| pld [ X, #X_PRE ] | |||
| vldmia.f64 X!, { d4 - d5 } | |||
| vadd.f64 d0 , d0, d4 | |||
| vldmia.f64 X!, { d6 - d7 } | |||
| vadd.f64 d1 , d1, d5 | |||
| vadd.f64 d0 , d0, d6 | |||
| vadd.f64 d1 , d1, d7 | |||
| pld [ X, #X_PRE ] | |||
| vldmia.f64 X!, { d4 - d5 } | |||
| vadd.f64 d0 , d0, d4 | |||
| vldmia.f64 X!, { d6 - d7 } | |||
| vadd.f64 d1 , d1, d5 | |||
| vadd.f64 d0 , d0, d6 | |||
| vadd.f64 d1 , d1, d7 | |||
| .endm | |||
| .macro KERNEL_F1 /* complex double, unit stride: add the two values of one element to d0 */ | |||
| vldmia.f64 X!, { d4 } | |||
| vadd.f64 d0 , d0, d4 | |||
| vldmia.f64 X!, { d4 } | |||
| vadd.f64 d0 , d0, d4 | |||
| .endm | |||
| .macro KERNEL_S4 /* complex double, strided: for four elements, load the value pair at X, add both to d0, then step X by INC_X */ | |||
| vldmia.f64 X, { d4 -d5 } | |||
| vadd.f64 d0 , d0, d4 | |||
| vadd.f64 d0 , d0, d5 | |||
| add X, X, INC_X | |||
| vldmia.f64 X, { d4 -d5 } | |||
| vadd.f64 d0 , d0, d4 | |||
| vadd.f64 d0 , d0, d5 | |||
| add X, X, INC_X | |||
| vldmia.f64 X, { d4 -d5 } | |||
| vadd.f64 d0 , d0, d4 | |||
| vadd.f64 d0 , d0, d5 | |||
| add X, X, INC_X | |||
| vldmia.f64 X, { d4 -d5 } | |||
| vadd.f64 d0 , d0, d4 | |||
| vadd.f64 d0 , d0, d5 | |||
| add X, X, INC_X | |||
| .endm | |||
| .macro KERNEL_S1 /* complex double, strided: add the value pair of one element to d0 and step X by INC_X */ | |||
| vldmia.f64 X, { d4 -d5 } | |||
| vadd.f64 d0 , d0, d4 | |||
| vadd.f64 d0 , d0, d5 | |||
| add X, X, INC_X | |||
| .endm | |||
| #else | |||
| .macro KERNEL_F4 /* complex single, unit stride: add eight consecutive values (four two-value elements), alternating between s0 and s1 */ | |||
| pld [ X, #X_PRE ] | |||
| vldmia.f32 X!, { s4 - s5 } | |||
| vadd.f32 s0 , s0, s4 | |||
| vldmia.f32 X!, { s6 - s7 } | |||
| vadd.f32 s1 , s1, s5 | |||
| vadd.f32 s0 , s0, s6 | |||
| vadd.f32 s1 , s1, s7 | |||
| vldmia.f32 X!, { s4 - s5 } | |||
| vadd.f32 s0 , s0, s4 | |||
| vldmia.f32 X!, { s6 - s7 } | |||
| vadd.f32 s1 , s1, s5 | |||
| vadd.f32 s0 , s0, s6 | |||
| vadd.f32 s1 , s1, s7 | |||
| .endm | |||
| .macro KERNEL_F1 /* complex single, unit stride: add the two values of one element to s0 */ | |||
| vldmia.f32 X!, { s4 } | |||
| vadd.f32 s0 , s0, s4 | |||
| vldmia.f32 X!, { s4 } | |||
| vadd.f32 s0 , s0, s4 | |||
| .endm | |||
| .macro KERNEL_S4 /* complex single, strided: for four elements, load the value pair at X, add both to s0, then step X by INC_X */ | |||
| vldmia.f32 X, { s4 -s5 } | |||
| vadd.f32 s0 , s0, s4 | |||
| vadd.f32 s0 , s0, s5 | |||
| add X, X, INC_X | |||
| vldmia.f32 X, { s4 -s5 } | |||
| vadd.f32 s0 , s0, s4 | |||
| vadd.f32 s0 , s0, s5 | |||
| add X, X, INC_X | |||
| vldmia.f32 X, { s4 -s5 } | |||
| vadd.f32 s0 , s0, s4 | |||
| vadd.f32 s0 , s0, s5 | |||
| add X, X, INC_X | |||
| vldmia.f32 X, { s4 -s5 } | |||
| vadd.f32 s0 , s0, s4 | |||
| vadd.f32 s0 , s0, s5 | |||
| add X, X, INC_X | |||
| .endm | |||
| .macro KERNEL_S1 /* complex single, strided: add the value pair of one element to s0 and step X by INC_X */ | |||
| vldmia.f32 X, { s4 -s5 } | |||
| vadd.f32 s0 , s0, s4 | |||
| vadd.f32 s0 , s0, s5 | |||
| add X, X, INC_X | |||
| .endm | |||
| #endif | |||
| #endif | |||
| /************************************************************************************** | |||
| * End of macro definitions | |||
| **************************************************************************************/ | |||
| PROLOGUE /* Sum kernel entry: N in r0, X in r1, INC_X in r2 (see the defines above). The labels keep the asum_kernel_ prefix of asum_vfp.S, which this file was copied from. */ | |||
| .align 5 | |||
| movs r12, #0 // clear floating point register | |||
| vmov s0, r12 /* s0 = 0.0f (all-zero bit pattern) */ | |||
| vmov s1, r12 /* s1 = 0.0f: second accumulator */ | |||
| #if defined(DOUBLE) | |||
| vcvt.f64.f32 d0, s0 /* d0 = 0.0 as a double */ | |||
| vcvt.f64.f32 d1, s1 /* d1 = 0.0 as a double */ | |||
| #endif | |||
| cmp N, #0 | |||
| ble asum_kernel_L999 /* N <= 0: return 0 */ | |||
| cmp INC_X, #0 | |||
| beq asum_kernel_L999 /* INC_X == 0: return 0. NOTE(review): a negative INC_X is not rejected here and takes the strided path below - confirm callers never pass one */ | |||
| cmp INC_X, #1 | |||
| bne asum_kernel_S_BEGIN /* any stride other than 1 uses the strided macros */ | |||
| asum_kernel_F_BEGIN: /* unit-stride path */ | |||
| asrs I, N, #2 // I = N / 4 | |||
| ble asum_kernel_F1 /* fewer than four elements: only the single-element loop runs */ | |||
| .align 5 | |||
| asum_kernel_F4: /* loop unrolled twice: two KERNEL_F4 per pass with an exit check between them */ | |||
| #if !defined(DOUBLE) && !defined(COMPLEX) | |||
| pld [ X, #X_PRE ] /* prefetch for the real single-precision case, whose KERNEL_F4 contains no pld of its own */ | |||
| #endif | |||
| KERNEL_F4 | |||
| subs I, I, #1 | |||
| ble asum_kernel_F1 | |||
| KERNEL_F4 | |||
| subs I, I, #1 | |||
| bne asum_kernel_F4 | |||
| asum_kernel_F1: | |||
| ands I, N, #3 /* I = N & 3: elements left after the groups of four */ | |||
| ble asum_kernel_L999 | |||
| asum_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne asum_kernel_F10 | |||
| b asum_kernel_L999 | |||
| asum_kernel_S_BEGIN: /* strided path: first convert INC_X from elements to bytes */ | |||
| #if defined(COMPLEX) | |||
| #if defined(DOUBLE) | |||
| lsl INC_X, INC_X, #4 // INC_X * SIZE * 2 | |||
| #else | |||
| lsl INC_X, INC_X, #3 // INC_X * SIZE * 2 | |||
| #endif | |||
| #else | |||
| #if defined(DOUBLE) | |||
| lsl INC_X, INC_X, #3 // INC_X * SIZE | |||
| #else | |||
| lsl INC_X, INC_X, #2 // INC_X * SIZE | |||
| #endif | |||
| #endif | |||
| asrs I, N, #2 // I = N / 4 | |||
| ble asum_kernel_S1 | |||
| .align 5 | |||
| asum_kernel_S4: | |||
| KERNEL_S4 | |||
| subs I, I, #1 | |||
| bne asum_kernel_S4 | |||
| asum_kernel_S1: | |||
| ands I, N, #3 /* I = N & 3: elements left after the groups of four */ | |||
| ble asum_kernel_L999 | |||
| asum_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne asum_kernel_S10 | |||
| asum_kernel_L999: /* common exit: combine the two accumulators */ | |||
| #if defined(DOUBLE) | |||
| vadd.f64 d0 , d0, d1 // set return value | |||
| #else | |||
| vadd.f32 s0 , s0, s1 // set return value | |||
| #endif | |||
| #if !defined(__ARM_PCS_VFP) | |||
| #if !defined(DOUBLE) | |||
| vmov r0, s0 /* not the VFP procedure-call standard: also copy the result into r0 */ | |||
| #else | |||
| vmov r0, r1, d0 /* not the VFP procedure-call standard: also copy the result into r0:r1 */ | |||
| #endif | |||
| #endif | |||
| bx lr | |||
| EPILOGUE | |||
| @@ -0,0 +1,57 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /************************************************************************************** | |||
| * trivial copy of zasum.c with the ABS() removed * | |||
| **************************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| /* | |||
|  * Sum of the two stored parts (real and imaginary) of the complex | |||
|  * element that starts at x[i].  The expansion and both arguments are | |||
|  * parenthesized so the macro stays a single operand wherever it is used. | |||
|  */ | |||
| #define CSUM1(x,i) ((x)[(i)] + (x)[(i)+1]) | |||
|  | |||
| /* | |||
|  * Plain (signed) sum over n complex entries of x: for every inc_x-th | |||
|  * element, starting at x[0], the real and imaginary parts are both added | |||
|  * to the result.  Returns 0.0 when n or inc_x is not positive. | |||
|  */ | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
|     BLASLONG i = 0; | |||
|     FLOAT sumf = 0.0; | |||
|     BLASLONG inc_x2; | |||
|  | |||
|     if (n <= 0 || inc_x <= 0) return(sumf); | |||
|  | |||
|     /* each complex element occupies two FLOAT slots */ | |||
|     inc_x2 = 2 * inc_x; | |||
|     /* from here on n is the index one past the last element to visit */ | |||
|     n *= inc_x2; | |||
|     while (i < n) | |||
|     { | |||
|         sumf += CSUM1(x, i); | |||
|         i += inc_x2; | |||
|     } | |||
|     return(sumf); | |||
| } | |||
| @@ -0,0 +1,175 @@ | |||
| SAMINKERNEL = ../arm/amin.c | |||
| DAMINKERNEL = ../arm/amin.c | |||
| CAMINKERNEL = ../arm/zamin.c | |||
| ZAMINKERNEL = ../arm/zamin.c | |||
| SMAXKERNEL = ../arm/max.c | |||
| DMAXKERNEL = ../arm/max.c | |||
| SMINKERNEL = ../arm/min.c | |||
| DMINKERNEL = ../arm/min.c | |||
| ISAMINKERNEL = ../arm/iamin.c | |||
| IDAMINKERNEL = ../arm/iamin.c | |||
| ICAMINKERNEL = ../arm/izamin.c | |||
| IZAMINKERNEL = ../arm/izamin.c | |||
| ISMAXKERNEL = ../arm/imax.c | |||
| IDMAXKERNEL = ../arm/imax.c | |||
| ISMINKERNEL = ../arm/imin.c | |||
| IDMINKERNEL = ../arm/imin.c | |||
| STRMMKERNEL = ../generic/trmmkernel_4x4.c | |||
| DTRMMKERNEL = ../generic/trmmkernel_2x2.c | |||
| CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
| ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| SAMAXKERNEL = amax.S | |||
| DAMAXKERNEL = amax.S | |||
| CAMAXKERNEL = zamax.S | |||
| ZAMAXKERNEL = zamax.S | |||
| ISAMAXKERNEL = iamax.S | |||
| IDAMAXKERNEL = iamax.S | |||
| ICAMAXKERNEL = izamax.S | |||
| IZAMAXKERNEL = izamax.S | |||
| SASUMKERNEL = asum.S | |||
| DASUMKERNEL = asum.S | |||
| CASUMKERNEL = casum.S | |||
| ZASUMKERNEL = zasum.S | |||
| SAXPYKERNEL = axpy.S | |||
| DAXPYKERNEL = axpy.S | |||
| CAXPYKERNEL = zaxpy.S | |||
| ZAXPYKERNEL = zaxpy.S | |||
| SCOPYKERNEL = copy.S | |||
| DCOPYKERNEL = copy.S | |||
| CCOPYKERNEL = copy.S | |||
| ZCOPYKERNEL = copy.S | |||
| SDOTKERNEL = dot.S | |||
| DDOTKERNEL = dot.S | |||
| CDOTKERNEL = zdot.S | |||
| ZDOTKERNEL = zdot.S | |||
| DSDOTKERNEL = dot.S | |||
| SNRM2KERNEL = nrm2.S | |||
| DNRM2KERNEL = nrm2.S | |||
| CNRM2KERNEL = znrm2.S | |||
| ZNRM2KERNEL = znrm2.S | |||
| SROTKERNEL = rot.S | |||
| DROTKERNEL = rot.S | |||
| CROTKERNEL = zrot.S | |||
| ZROTKERNEL = zrot.S | |||
| SSCALKERNEL = scal.S | |||
| DSCALKERNEL = scal.S | |||
| CSCALKERNEL = zscal.S | |||
| ZSCALKERNEL = zscal.S | |||
| SSWAPKERNEL = swap.S | |||
| DSWAPKERNEL = swap.S | |||
| CSWAPKERNEL = swap.S | |||
| ZSWAPKERNEL = swap.S | |||
| SGEMVNKERNEL = gemv_n.S | |||
| DGEMVNKERNEL = gemv_n.S | |||
| CGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| SGEMVTKERNEL = gemv_t.S | |||
| DGEMVTKERNEL = gemv_t.S | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||
| ifeq ($(DGEMM_UNROLL_M), 8) | |||
| DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S | |||
| DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S | |||
| else | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||
| endif | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ifeq ($(DGEMM_UNROLL_N), 4) | |||
| DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| else | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||
| endif | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| @@ -0,0 +1,164 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define N x0 /* vector length */ | |||
| #define X x1 /* X vector address */ | |||
| #define INC_X x2 /* X stride */ | |||
| #define I x5 /* loop variable */ | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
| #define REG0 wzr | |||
| #define SUMF s0 | |||
| #define TMPF s1 | |||
| #define TMPVF {v1.s}[0] | |||
| #define SZ 4 | |||
| /******************************************************************************/ | |||
| .macro KERNEL_F1 /* Add the two adjacent floats at X (presumably the real and imaginary parts of one complex element) to SUMF; X advances by 8 bytes. */ | |||
| ld1 {v1.2s}, [X], #8 /* v1 lanes 0-1 <- two floats at X, then X += 8 */ | |||
| ext v2.8b, v1.8b, v1.8b, #4 /* rotate v1 by 4 bytes so its lane 1 becomes lane 0 of v2 (s2) */ | |||
| fadd TMPF, TMPF, s2 /* TMPF is s1, i.e. lane 0 of v1: s1 = first float + second float */ | |||
| fadd SUMF, SUMF, TMPF | |||
| .endm | |||
| .macro KERNEL_F8 /* Add 16 contiguous floats (64 bytes) at X lane-wise into the v0.4s accumulator; X advances by 64 bytes. */ | |||
| ld1 {v1.4s, v2.4s, v3.4s, v4.4s}, [X] /* load four 4-float vectors from X */ | |||
| add X, X, #64 | |||
| PRFM PLDL1KEEP, [X, #1024] /* prefetch 1024 bytes past the updated X */ | |||
| fadd v1.4s, v1.4s, v2.4s /* combine the loaded vectors in pairs first ... */ | |||
| fadd v3.4s, v3.4s, v4.4s | |||
| fadd v0.4s, v0.4s, v1.4s /* ... then fold both partial sums into v0 */ | |||
| fadd v0.4s, v0.4s, v3.4s | |||
| .endm | |||
| .macro KERNEL_F8_FINALIZE /* Collapse the four float lanes of v0 into the scalar SUMF (s0). */ | |||
| ext v1.16b, v0.16b, v0.16b, #8 /* low half of v1 <- high half of v0 */ | |||
| fadd v0.2s, v0.2s, v1.2s /* lanes 0-1 = (lane0 + lane2, lane1 + lane3) */ | |||
| faddp SUMF, v0.2s /* SUMF = lane0 + lane1 */ | |||
| .endm | |||
| .macro INIT_S /* Multiply INC_X by 8 so KERNEL_S1 can use it as a byte post-increment between 8-byte (two-float) elements. */ | |||
| lsl INC_X, INC_X, #3 | |||
| .endm | |||
| .macro KERNEL_S1 /* Strided form of KERNEL_F1: add the two adjacent floats at X to SUMF, then advance X by INC_X (a byte count once INIT_S has run). */ | |||
| ld1 {v1.2s}, [X], INC_X /* v1 lanes 0-1 <- two floats at X, then X += INC_X */ | |||
| ext v2.8b, v1.8b, v1.8b, #4 /* rotate v1 by 4 bytes so its lane 1 becomes lane 0 of v2 (s2) */ | |||
| fadd TMPF, TMPF, s2 /* s1 = first float + second float */ | |||
| fadd SUMF, SUMF, TMPF | |||
| .endm | |||
| /******************************************************************************* | |||
| * End of macro definitions | |||
| *******************************************************************************/ | |||
| PROLOGUE /* Sums N two-float elements of X spaced INC_X elements apart (N, X, INC_X are the registers #defined above); the total is left in SUMF at ret. Returns 0 when N <= 0 or INC_X <= 0. */ | |||
| fmov SUMF, REG0 /* SUMF = 0; the v0.4s accumulation in KERNEL_F8 presumably relies on this scalar write clearing the rest of v0 - confirm against the A64 ISA */ | |||
| fmov s1, SUMF /* TMPF = 0 */ | |||
| cmp N, xzr | |||
| ble .Lcsum_kernel_L999 /* N <= 0: nothing to add */ | |||
| cmp INC_X, xzr | |||
| ble .Lcsum_kernel_L999 /* INC_X <= 0: nothing to add */ | |||
| cmp INC_X, #1 | |||
| bne .Lcsum_kernel_S_BEGIN /* non-unit stride takes the strided path */ | |||
| .Lcsum_kernel_F_BEGIN: | |||
| asr I, N, #3 /* I = N / 8 passes of KERNEL_F8 */ | |||
| cmp I, xzr | |||
| beq .Lcsum_kernel_F1 | |||
| .Lcsum_kernel_F8: | |||
| KERNEL_F8 | |||
| subs I, I, #1 | |||
| bne .Lcsum_kernel_F8 | |||
| KERNEL_F8_FINALIZE /* reduce the vector accumulator to SUMF before the scalar tail */ | |||
| .Lcsum_kernel_F1: | |||
| ands I, N, #7 /* I = N % 8 leftover elements */ | |||
| ble .Lcsum_kernel_L999 | |||
| .Lcsum_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne .Lcsum_kernel_F10 | |||
| .Lcsum_kernel_L999: | |||
| ret | |||
| .Lcsum_kernel_S_BEGIN: | |||
| INIT_S /* turn INC_X into a byte stride */ | |||
| asr I, N, #2 /* I = N / 4 passes of the 4x-unrolled strided loop */ | |||
| cmp I, xzr | |||
| ble .Lcsum_kernel_S1 | |||
| .Lcsum_kernel_S4: | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne .Lcsum_kernel_S4 | |||
| .Lcsum_kernel_S1: | |||
| ands I, N, #3 /* I = N % 4 leftover elements */ | |||
| ble .Lcsum_kernel_L999 | |||
| .Lcsum_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne .Lcsum_kernel_S10 | |||
| ret | |||
| EPILOGUE | |||
| @@ -0,0 +1,186 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define N x0 /* vector length */ | |||
| #define X x1 /* X vector address */ | |||
| #define INC_X x2 /* X stride */ | |||
| #define I x5 /* loop variable */ | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
| #if !defined(DOUBLE) | |||
| #define REG0 wzr | |||
| #define SUMF s0 | |||
| #define TMPF s1 | |||
| #define TMPVF {v1.s}[0] | |||
| #define SZ 4 | |||
| #else | |||
| #define REG0 xzr | |||
| #define SUMF d0 | |||
| #define TMPF d1 | |||
| #define TMPVF {v1.d}[0] | |||
| #define SZ 8 | |||
| #endif | |||
| /******************************************************************************/ | |||
| .macro KERNEL_F1 /* Add one element at X to SUMF; X advances by SZ bytes. */ | |||
| ldr TMPF, [X], #SZ /* TMPF <- element at X, then X += SZ */ | |||
| fadd SUMF, SUMF, TMPF | |||
| .endm | |||
| .macro KERNEL_F8 /* Add 8 contiguous elements at X lane-wise into the v0 accumulator; X advances past them (32 bytes for float, 64 bytes for double). */ | |||
| #if !defined(DOUBLE) | |||
| ld1 {v1.4s, v2.4s}, [X], #32 // Load 8 floats: v1 = [X3, X2, X1, X0], v2 = [X7, X6, X5, X4]; X += 32 | |||
| fadd v1.4s, v1.4s, v2.4s // v1 = [X3+X7, X2+X6, X1+X5, X0+X4] | |||
| fadd v0.4s, v0.4s, v1.4s // v0 += v1, lane by lane | |||
| PRFM PLDL1KEEP, [X, #1024] | |||
| #else // DOUBLE | |||
| ld1 {v2.2d, v3.2d, v4.2d, v5.2d}, [X] /* load 8 doubles; v1 is not used here (d1 is TMPF) */ | |||
| add X, X, #64 | |||
| PRFM PLDL1KEEP, [X, #1024] /* prefetch 1024 bytes past the updated X */ | |||
| fadd v2.2d, v2.2d, v3.2d /* combine the loaded vectors in pairs first ... */ | |||
| fadd v4.2d, v4.2d, v5.2d | |||
| fadd v0.2d, v0.2d, v2.2d /* ... then fold both partial sums into v0 */ | |||
| fadd v0.2d, v0.2d, v4.2d | |||
| #endif | |||
| .endm | |||
| .macro KERNEL_F8_FINALIZE /* Collapse the lanes of the v0 accumulator into the scalar SUMF. */ | |||
| #if !defined(DOUBLE) | |||
| ext v1.16b, v0.16b, v0.16b, #8 /* low half of v1 <- high half of v0 */ | |||
| fadd v0.2s, v0.2s, v1.2s /* lanes 0-1 = (lane0 + lane2, lane1 + lane3) */ | |||
| faddp SUMF, v0.2s /* SUMF = lane0 + lane1 */ | |||
| #else | |||
| faddp SUMF, v0.2d /* SUMF = lane0 + lane1 of the two-double accumulator */ | |||
| #endif | |||
| .endm | |||
| .macro INIT_S /* Scale INC_X from elements to bytes: x4 for float, x8 for double (matches SZ). */ | |||
| #if !defined(DOUBLE) | |||
| lsl INC_X, INC_X, #2 | |||
| #else | |||
| lsl INC_X, INC_X, #3 | |||
| #endif | |||
| .endm | |||
| .macro KERNEL_S1 /* Strided step: add the element at X to SUMF, then advance X by INC_X (a byte count once INIT_S has run). */ | |||
| ld1 TMPVF, [X], INC_X /* lane 0 of v1 <- element at X, then X += INC_X */ | |||
| fadd SUMF, SUMF, TMPF | |||
| .endm | |||
| /******************************************************************************* | |||
| * End of macro definitions | |||
| *******************************************************************************/ | |||
| PROLOGUE /* Sums N elements of X spaced INC_X elements apart (N, X, INC_X are the registers #defined above); the total is left in SUMF at ret. Returns 0 when N <= 0 or INC_X <= 0. */ | |||
| fmov SUMF, REG0 /* SUMF = 0; the vector accumulation in KERNEL_F8 presumably relies on this scalar write clearing the rest of v0 - confirm against the A64 ISA */ | |||
| #if !defined(DOUBLE) | |||
| fmov s1, SUMF /* TMPF = 0 */ | |||
| #else | |||
| fmov d1, SUMF /* TMPF = 0 */ | |||
| #endif | |||
| cmp N, xzr | |||
| ble .Lsum_kernel_L999 /* N <= 0: nothing to add */ | |||
| cmp INC_X, xzr | |||
| ble .Lsum_kernel_L999 /* INC_X <= 0: nothing to add */ | |||
| cmp INC_X, #1 | |||
| bne .Lsum_kernel_S_BEGIN /* non-unit stride takes the strided path */ | |||
| .Lsum_kernel_F_BEGIN: | |||
| asr I, N, #3 /* I = N / 8 passes of KERNEL_F8 */ | |||
| cmp I, xzr | |||
| beq .Lsum_kernel_F1 | |||
| .Lsum_kernel_F8: | |||
| KERNEL_F8 | |||
| subs I, I, #1 | |||
| bne .Lsum_kernel_F8 | |||
| KERNEL_F8_FINALIZE /* reduce the vector accumulator to SUMF before the scalar tail */ | |||
| .Lsum_kernel_F1: | |||
| ands I, N, #7 /* I = N % 8 leftover elements */ | |||
| ble .Lsum_kernel_L999 | |||
| .Lsum_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne .Lsum_kernel_F10 | |||
| .Lsum_kernel_L999: | |||
| ret | |||
| .Lsum_kernel_S_BEGIN: | |||
| INIT_S /* turn INC_X into a byte stride */ | |||
| asr I, N, #2 /* I = N / 4 passes of the 4x-unrolled strided loop */ | |||
| cmp I, xzr | |||
| ble .Lsum_kernel_S1 | |||
| .Lsum_kernel_S4: | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne .Lsum_kernel_S4 | |||
| .Lsum_kernel_S1: | |||
| ands I, N, #3 /* I = N % 4 leftover elements */ | |||
| ble .Lsum_kernel_L999 | |||
| .Lsum_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne .Lsum_kernel_S10 | |||
| ret | |||
| EPILOGUE | |||
| @@ -0,0 +1,158 @@ | |||
| /******************************************************************************* | |||
| Copyright (c) 2015, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *******************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #define N x0 /* vector length */ | |||
| #define X x1 /* X vector address */ | |||
| #define INC_X x2 /* X stride */ | |||
| #define I x5 /* loop variable */ | |||
| /******************************************************************************* | |||
| * Macro definitions | |||
| *******************************************************************************/ | |||
| #define REG0 xzr | |||
| #define SUMF d0 | |||
| #define TMPF d1 | |||
| #define TMPVF {v1.d}[0] | |||
| #define SZ 8 | |||
| /******************************************************************************/ | |||
| .macro KERNEL_F1 /* Add the two adjacent doubles at X (presumably the real and imaginary parts of one complex element) to SUMF; X advances by 16 bytes. */ | |||
| ld1 {v1.2d}, [X], #16 /* v1 <- two doubles at X, then X += 16 */ | |||
| faddp TMPF, v1.2d /* TMPF = lane0 + lane1 */ | |||
| fadd SUMF, SUMF, TMPF | |||
| .endm | |||
| .macro KERNEL_F4 /* Add 8 contiguous doubles (64 bytes, i.e. four 16-byte elements) at X lane-wise into the v0.2d accumulator; X advances by 64 bytes. */ | |||
| ld1 {v1.2d, v2.2d, v3.2d, v4.2d}, [X], #64 /* load four 2-double vectors, then X += 64 */ | |||
| fadd v1.2d, v1.2d, v2.2d /* combine the loaded vectors in pairs first ... */ | |||
| fadd v3.2d, v3.2d, v4.2d | |||
| fadd v0.2d, v0.2d, v1.2d /* ... then fold both partial sums into v0 */ | |||
| fadd v0.2d, v0.2d, v3.2d | |||
| PRFM PLDL1KEEP, [X, #1024] /* prefetch 1024 bytes past the updated X */ | |||
| .endm | |||
| .macro KERNEL_F4_FINALIZE /* Collapse the two double lanes of v0 into the scalar SUMF (d0). */ | |||
| faddp SUMF, v0.2d /* SUMF = lane0 + lane1 */ | |||
| .endm | |||
| .macro INIT_S /* Multiply INC_X by 16 so KERNEL_S1 can use it as a byte post-increment between 16-byte (two-double) elements. */ | |||
| lsl INC_X, INC_X, #4 | |||
| .endm | |||
| .macro KERNEL_S1 /* Strided form of KERNEL_F1: add the two adjacent doubles at X to SUMF, then advance X by INC_X (a byte count once INIT_S has run). */ | |||
| ld1 {v1.2d}, [X], INC_X /* v1 <- two doubles at X, then X += INC_X */ | |||
| faddp TMPF, v1.2d /* TMPF = lane0 + lane1 */ | |||
| fadd SUMF, SUMF, TMPF | |||
| .endm | |||
| /******************************************************************************* | |||
| * End of macro definitions | |||
| *******************************************************************************/ | |||
| PROLOGUE /* Sums N two-double elements of X spaced INC_X elements apart (N, X, INC_X are the registers #defined above); the total is left in SUMF at ret. Returns 0 when N <= 0 or INC_X <= 0. */ | |||
| fmov SUMF, REG0 /* SUMF = 0; the v0.2d accumulation in KERNEL_F4 presumably relies on this scalar write clearing the rest of v0 - confirm against the A64 ISA */ | |||
| cmp N, xzr | |||
| ble .Lzsum_kernel_L999 /* N <= 0: nothing to add */ | |||
| cmp INC_X, xzr | |||
| ble .Lzsum_kernel_L999 /* INC_X <= 0: nothing to add */ | |||
| cmp INC_X, #1 | |||
| bne .Lzsum_kernel_S_BEGIN /* non-unit stride takes the strided path */ | |||
| .Lzsum_kernel_F_BEGIN: | |||
| asr I, N, #2 /* I = N / 4 passes of KERNEL_F4 */ | |||
| cmp I, xzr | |||
| beq .Lzsum_kernel_F1 | |||
| .Lzsum_kernel_F4: | |||
| KERNEL_F4 | |||
| subs I, I, #1 | |||
| bne .Lzsum_kernel_F4 | |||
| KERNEL_F4_FINALIZE /* reduce the vector accumulator to SUMF before the scalar tail */ | |||
| .Lzsum_kernel_F1: | |||
| ands I, N, #3 /* I = N % 4 leftover elements */ | |||
| ble .Lzsum_kernel_L999 | |||
| .Lzsum_kernel_F10: | |||
| KERNEL_F1 | |||
| subs I, I, #1 | |||
| bne .Lzsum_kernel_F10 | |||
| .Lzsum_kernel_L999: | |||
| ret | |||
| .Lzsum_kernel_S_BEGIN: | |||
| INIT_S /* turn INC_X into a byte stride */ | |||
| asr I, N, #2 /* I = N / 4 passes of the 4x-unrolled strided loop */ | |||
| cmp I, xzr | |||
| ble .Lzsum_kernel_S1 | |||
| .Lzsum_kernel_S4: | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne .Lzsum_kernel_S4 | |||
| .Lzsum_kernel_S1: | |||
| ands I, N, #3 /* I = N % 4 leftover elements */ | |||
| ble .Lzsum_kernel_L999 | |||
| .Lzsum_kernel_S10: | |||
| KERNEL_S1 | |||
| subs I, I, #1 | |||
| bne .Lzsum_kernel_S10 | |||
| ret | |||
| EPILOGUE | |||
| @@ -60,6 +60,10 @@ CASUMKERNEL = asum.S | |||
| ZASUMKERNEL = asum.S | |||
| XASUMKERNEL = asum.S | |||
| CSUMKERNEL = sum.S | |||
| ZSUMKERNEL = sum.S | |||
| XSUMKERNEL = sum.S | |||
| CNRM2KERNEL = nrm2.S | |||
| ZNRM2KERNEL = nrm2.S | |||
| XNRM2KERNEL = nrm2.S | |||
| @@ -0,0 +1,358 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* Copyright 2019, The OpenBLAS project */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #ifdef XDOUBLE | |||
| #define PREFETCH_SIZE ( 8 * 16 + 4) | |||
| #elif defined(DOUBLE) | |||
| #define PREFETCH_SIZE (16 * 16 + 8) | |||
| #else | |||
| #define PREFETCH_SIZE (32 * 16 + 16) | |||
| #endif | |||
| #ifndef COMPLEX | |||
| #define COMPADD 0 | |||
| #define STRIDE INCX | |||
| #else | |||
| #define COMPADD 1 | |||
| #define STRIDE SIZE | |||
| #endif | |||
| #define PRE1 r2 | |||
| #define I r17 | |||
| #define J r18 | |||
| #define INCX16 r21 | |||
| #define PR r30 | |||
| #define ARLC r31 | |||
| #define N r32 | |||
| #define X r33 | |||
| #define INCX r34 | |||
| PROLOGUE /* Sums values read from X with stride INCX into four partial accumulators f8-f11 and leaves their total in f8. With COMPLEX defined each element is read as two values (a STRIDE step, then an INCX step). Returns early when N <= 0 or INCX <= 0. */ | |||
| .prologue | |||
| PROFCODE | |||
| { .mfi | |||
| adds PRE1 = PREFETCH_SIZE * SIZE, X /* PRE1: prefetch pointer running ahead of X */ | |||
| mov f8 = f0 /* accumulator f8 <- f0 (NOTE(review): relies on f0 reading as 0.0 on IA-64 - confirm) */ | |||
| .save ar.lc, ARLC | |||
| mov ARLC = ar.lc /* save the loop counter; restored at .L998 */ | |||
| } | |||
| ;; | |||
| .body | |||
| #ifdef F_INTERFACE | |||
| { .mmi | |||
| LDINT N = [N] /* with F_INTERFACE, N and INCX hold addresses; load the values they point to */ | |||
| LDINT INCX = [INCX] | |||
| nop.i 0 | |||
| } | |||
| ;; | |||
| #ifndef USE64BITINT | |||
| { .mii | |||
| nop.m 0 | |||
| sxt4 N = N /* sign-extend the 4-byte values */ | |||
| sxt4 INCX = INCX | |||
| } | |||
| ;; | |||
| #endif | |||
| #endif | |||
| { .mmi | |||
| cmp.lt p0, p6 = r0, INCX /* p6 = !(0 < INCX) */ | |||
| cmp.lt p0, p7 = r0, N /* p7 = !(0 < N) */ | |||
| shr I = N, (4 - COMPADD) /* I = number of full unrolled passes (16 loads each) */ | |||
| } | |||
| { .mbb | |||
| and J = ((1 << (4 - COMPADD)) - 1), N /* J = elements left over after the full passes */ | |||
| (p6) br.ret.sptk.many b0 /* INCX <= 0: return */ | |||
| (p7) br.ret.sptk.many b0 /* N <= 0: return */ | |||
| } | |||
| ;; | |||
| { .mfi | |||
| adds I = -1, I /* I - 1 is what gets written to ar.lc below */ | |||
| mov f10 = f0 /* accumulator f10 <- f0 */ | |||
| mov PR = pr /* save the predicate registers; restored at .L998 */ | |||
| } | |||
| { .mfi | |||
| cmp.eq p9, p0 = r0, J /* p9 = (J == 0): no tail elements */ | |||
| mov f9 = f0 /* accumulator f9 <- f0 */ | |||
| tbit.z p0, p12 = N, 3 - COMPADD /* p12 = bit (3 - COMPADD) of N is set: the tail has a group of 8 loads */ | |||
| } | |||
| ;; | |||
| { .mmi | |||
| cmp.eq p16, p0 = r0, r0 /* stage predicates: p16 starts true, p17-p19 start false */ | |||
| cmp.ne p17, p0 = r0, r0 | |||
| mov ar.ec= 3 | |||
| } | |||
| { .mfi | |||
| cmp.ne p18, p0 = r0, r0 | |||
| mov f11 = f0 /* accumulator f11 <- f0 */ | |||
| shl INCX = INCX, BASE_SHIFT + COMPADD /* scale the stride (presumably elements -> bytes; BASE_SHIFT is defined outside this file) */ | |||
| } | |||
| ;; | |||
| { .mmi | |||
| #ifdef XDOUBLE | |||
| shladd INCX16 = INCX, (3 - COMPADD), r0 /* prefetch step; XDOUBLE prefetches twice per pass, so it uses half the distance */ | |||
| #else | |||
| shladd INCX16 = INCX, (4 - COMPADD), r0 /* prefetch step = INCX << (4 - COMPADD) */ | |||
| #endif | |||
| cmp.ne p19, p0 = r0, r0 | |||
| mov ar.lc = I | |||
| } | |||
| { .mmb | |||
| cmp.gt p8 ,p0 = r0, I /* p8 = (I < 0): no full pass to run */ | |||
| #ifdef COMPLEX | |||
| adds INCX = - SIZE, INCX /* the first value of each element is stepped over with STRIDE (= SIZE), so INCX covers only the remainder */ | |||
| #else | |||
| nop.m 0 | |||
| #endif | |||
| (p8) br.cond.dpnt .L55 | |||
| } | |||
| ;; | |||
| .align 32 | |||
| .L52: /* main loop: 16 loads per pass; the FADD operands are the load targets shifted by 2 (p18) or 3 (p19) register numbers, consistent with rotating-register software pipelining via br.ctop - NOTE(review): confirm */ | |||
| { .mmf | |||
| (p16) lfetch.nt1 [PRE1], INCX16 | |||
| (p16) LDFD f32 = [X], STRIDE | |||
| } | |||
| { .mfb | |||
| (p19) FADD f8 = f8, f71 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f35 = [X], INCX | |||
| } | |||
| { .mfb | |||
| (p19) FADD f9 = f9, f74 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f38 = [X], STRIDE | |||
| } | |||
| { .mfb | |||
| (p19) FADD f10 = f10, f77 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f41 = [X], INCX | |||
| } | |||
| { .mfb | |||
| (p19) FADD f11 = f11, f80 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f44 = [X], STRIDE | |||
| } | |||
| { .mfb | |||
| (p18) FADD f8 = f8, f34 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f47 = [X], INCX | |||
| } | |||
| { .mfb | |||
| (p18) FADD f9 = f9, f37 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f50 = [X], STRIDE | |||
| } | |||
| { .mfb | |||
| (p18) FADD f10 = f10, f40 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f53 = [X], INCX | |||
| } | |||
| { .mfb | |||
| (p18) FADD f11 = f11, f43 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| #ifdef XDOUBLE | |||
| (p16) lfetch.nt1 [PRE1], INCX16 | |||
| #endif | |||
| (p16) LDFD f56 = [X], STRIDE | |||
| } | |||
| { .mfb | |||
| (p18) FADD f8 = f8, f46 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f59 = [X], INCX | |||
| } | |||
| { .mfb | |||
| (p18) FADD f9 = f9, f49 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f62 = [X], STRIDE | |||
| } | |||
| { .mfb | |||
| (p18) FADD f10 = f10, f52 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f65 = [X], INCX | |||
| } | |||
| { .mfb | |||
| (p18) FADD f11 = f11, f55 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f68 = [X], STRIDE | |||
| } | |||
| { .mfb | |||
| (p18) FADD f8 = f8, f58 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f71 = [X], INCX | |||
| } | |||
| { .mfb | |||
| (p18) FADD f9 = f9, f61 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f74 = [X], STRIDE | |||
| } | |||
| { .mfb | |||
| (p18) FADD f10 = f10, f64 | |||
| } | |||
| ;; | |||
| { .mmf | |||
| (p16) LDFD f77 = [X], INCX | |||
| } | |||
| { .mfb | |||
| (p18) FADD f11 = f11, f67 | |||
| br.ctop.sptk.few .L52 | |||
| } | |||
| ;; | |||
| FADD f8 = f8, f71 /* same four adds as the p19 stage at the top of the loop, issued once after it exits (presumably draining the last pass - confirm) */ | |||
| FADD f9 = f9, f74 | |||
| FADD f10 = f10, f77 | |||
| FADD f11 = f11, f80 | |||
| .align 32 | |||
| ;; | |||
| .L55: /* tail: p12 guards 8 loads, p13 guards 4, p14 guards 2, p15 guards a single load (non-COMPLEX only) */ | |||
| (p12) LDFD f32 = [X], STRIDE | |||
| (p9) br.cond.dptk .L998 /* J == 0: nothing left, go combine the accumulators */ | |||
| ;; | |||
| (p12) LDFD f33 = [X], INCX | |||
| ;; | |||
| (p12) LDFD f34 = [X], STRIDE | |||
| ;; | |||
| (p12) LDFD f35 = [X], INCX | |||
| tbit.z p0, p13 = N, (2 - COMPADD) /* p13 = bit (2 - COMPADD) of N is set */ | |||
| ;; | |||
| (p12) LDFD f36 = [X], STRIDE | |||
| tbit.z p0, p14 = N, (1 - COMPADD) /* p14 = bit (1 - COMPADD) of N is set */ | |||
| ;; | |||
| (p12) LDFD f37 = [X], INCX | |||
| #ifndef COMPLEX | |||
| tbit.z p0, p15 = N, 0 /* p15 = N is odd */ | |||
| #endif | |||
| ;; | |||
| (p12) LDFD f38 = [X], STRIDE | |||
| ;; | |||
| (p12) LDFD f39 = [X], INCX | |||
| ;; | |||
| (p13) LDFD f40 = [X], STRIDE | |||
| ;; | |||
| (p13) LDFD f41 = [X], INCX | |||
| ;; | |||
| (p13) LDFD f42 = [X], STRIDE | |||
| (p12) FADD f8 = f8, f32 | |||
| ;; | |||
| (p13) LDFD f43 = [X], INCX | |||
| (p12) FADD f9 = f9, f33 | |||
| ;; | |||
| (p14) LDFD f44 = [X], STRIDE | |||
| (p12) FADD f10 = f10, f34 | |||
| ;; | |||
| (p14) LDFD f45 = [X], INCX | |||
| (p12) FADD f11 = f11, f35 | |||
| ;; | |||
| #ifndef COMPLEX | |||
| (p15) LDFD f46 = [X] | |||
| #endif | |||
| (p12) FADD f8 = f8, f36 | |||
| ;; | |||
| (p12) FADD f9 = f9, f37 | |||
| (p12) FADD f10 = f10, f38 | |||
| (p12) FADD f11 = f11, f39 | |||
| ;; | |||
| (p13) FADD f8 = f8, f40 | |||
| (p13) FADD f9 = f9, f41 | |||
| #ifndef COMPLEX | |||
| #endif | |||
| (p13) FADD f10 = f10, f42 | |||
| ;; | |||
| (p13) FADD f11 = f11, f43 | |||
| (p14) FADD f8 = f8, f44 | |||
| (p14) FADD f9 = f9, f45 | |||
| #ifndef COMPLEX | |||
| (p15) FADD f10 = f10, f46 | |||
| #endif | |||
| ;; | |||
| .align 32 | |||
| .L998: /* combine the partial sums: f8 = (f8 + f9) + (f10 + f11) */ | |||
| { .mfi | |||
| FADD f8 = f8, f9 | |||
| mov ar.lc = ARLC /* restore the saved loop counter */ | |||
| } | |||
| { .mmf | |||
| FADD f10 = f10, f11 | |||
| } | |||
| ;; | |||
| { .mii | |||
| mov pr = PR, -65474 /* restore saved predicates under the given mask */ | |||
| } | |||
| ;; | |||
| { .mfb | |||
| FADD f8 = f8, f10 | |||
| br.ret.sptk.many b0 | |||
| } | |||
| EPILOGUE | |||
| @@ -30,6 +30,11 @@ IDMAXKERNEL = ../mips/imax.c | |||
| ISMINKERNEL = ../mips/imin.c | |||
| IDMINKERNEL = ../mips/imin.c | |||
| SSUMKERNEL = ../mips/sum.c | |||
| DSUMKERNEL = ../mips/sum.c | |||
| CSUMKERNEL = ../mips/zsum.c | |||
| ZSUMKERNEL = ../mips/zsum.c | |||
| ifdef HAVE_MSA | |||
| SASUMKERNEL = ../mips/sasum_msa.c | |||
| DASUMKERNEL = ../mips/dasum_msa.c | |||
| @@ -45,7 +45,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| while(i < n) | |||
| { | |||
| if( x[ix] > minf ) | |||
| if( x[ix] < minf ) | |||
| { | |||
| min = i; | |||
| minf = x[ix]; | |||
| @@ -0,0 +1,47 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| /* | |||
|  * Plain (non-absolute) sum of n values of x taken every inc_x entries: | |||
|  * x[0] + x[inc_x] + ... + x[(n-1)*inc_x]. | |||
|  * Returns 0 when n or inc_x is not positive. | |||
|  */ | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
|     FLOAT total = 0.0; | |||
|     BLASLONG pos; | |||
|     BLASLONG limit; | |||
|     if (n <= 0 || inc_x <= 0) { | |||
|         return total; | |||
|     } | |||
|     /* One past the index of the last value to visit. */ | |||
|     limit = n * inc_x; | |||
|     for (pos = 0; pos < limit; pos += inc_x) { | |||
|         total += x[pos]; | |||
|     } | |||
|     return total; | |||
| } | |||
| @@ -0,0 +1,52 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #include <math.h> | |||
| /* | |||
|  * Sum of the two adjacent values x[i] and x[i+1]. | |||
|  * The arguments and the whole replacement list are parenthesized so the | |||
|  * macro keeps its meaning inside a larger expression or when called with | |||
|  * expression arguments (e.g. CSUM1(p, k + 2) or 2 * CSUM1(p, k)). | |||
|  */ | |||
| #define CSUM1(x,i) ((x)[(i)] + (x)[(i) + 1]) | |||
| /* | |||
|  * Sums n pairs of adjacent values of x (presumably the interleaved real and | |||
|  * imaginary parts of n complex entries), where consecutive pairs are | |||
|  * 2 * inc_x values apart. Returns 0 when n or inc_x is not positive. | |||
|  */ | |||
| FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| { | |||
| BLASLONG i=0; | |||
| FLOAT sumf = 0.0; | |||
| BLASLONG inc_x2; | |||
| if (n <= 0 || inc_x <= 0) return(sumf); | |||
| /* Each entry occupies two FLOAT slots, so the index step is doubled. */ | |||
| inc_x2 = 2 * inc_x; | |||
| /* From here on n is one past the index of the last pair to visit. */ | |||
| n *= inc_x2; | |||
| while(i < n) | |||
| { | |||
| sumf += CSUM1(x,i); | |||
| i += inc_x2; | |||
| } | |||
| return(sumf); | |||
| } | |||
| @@ -0,0 +1,332 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
/* Register aliases.
   N, X and INCX are the incoming arguments (element count, vector base,
   stride); I is the loop counter and TEMP an integer scratch register. */
| #define N $4 | |||
| #define X $5 | |||
| #define INCX $6 | |||
| #define I $2 | |||
| #define TEMP $3 | |||
/* a1..a8: values loaded from X. */
| #define a1 $f2 | |||
| #define a2 $f3 | |||
| #define a3 $f4 | |||
| #define a4 $f5 | |||
| #define a5 $f6 | |||
| #define a6 $f7 | |||
| #define a7 $f8 | |||
| #define a8 $f9 | |||
/* t1..t4: staging registers; each loaded value is moved here before it is
   added to an accumulator. */
| #define t1 $f10 | |||
| #define t2 $f11 | |||
| #define t3 $f12 | |||
| #define t4 $f13 | |||
/* s1, s2: two independent partial sums, merged into s1 at .L999. */
| #define s1 $f0 | |||
| #define s2 $f1 | |||
/* Adds up N values of X taken with stride INCX and leaves the total in s1.
   Structure: a unit-stride path (.L12/.L13/.L16) and a general-stride path
   (.L23/.L24/.L26); each handles 8 values per main-loop iteration and then
   the N & 7 leftovers one at a time.
   NOTE: the instruction written directly after each branch/jump sits in the
   branch delay slot and is executed whether or not the branch is taken. */
| PROLOGUE | |||
/* With F_INTERFACE the arguments arrive by reference: fetch the values. */
| #ifdef F_INTERFACE | |||
| LDINT N, 0(N) | |||
| LDINT INCX, 0(INCX) | |||
| #endif | |||
/* Zero both accumulators. */
| MTC $0, s1 | |||
| MTC $0, s2 | |||
/* Scale the stride (presumably from elements to bytes; BASE_SHIFT and SIZE
   come from common.h). */
| dsll INCX, INCX, BASE_SHIFT | |||
/* Nothing to do for N <= 0; TEMP = SIZE is set in the delay slot. */
| blez N, .L999 | |||
| li TEMP, SIZE | |||
/* Scaled stride != SIZE -> general-stride path. The delay slot computes
   I = N / 8, so I is valid on both paths (.L20 relies on this). */
| bne INCX, TEMP, .L20 | |||
| dsra I, N, 3 | |||
/* Fewer than 8 values: go straight to the leftover loop. */
| blez I, .L15 | |||
| NOP | |||
/* Pipeline prologue: load the first 8 values and stage the first four. */
| LD a1, 0 * SIZE(X) | |||
| LD a2, 1 * SIZE(X) | |||
| LD a3, 2 * SIZE(X) | |||
| LD a4, 3 * SIZE(X) | |||
| LD a5, 4 * SIZE(X) | |||
| MOV t1, a1 | |||
| LD a6, 5 * SIZE(X) | |||
| MOV t2, a2 | |||
| LD a7, 6 * SIZE(X) | |||
| MOV t3, a3 | |||
| MOV t4, a4 | |||
| daddiu I, I, -1 | |||
/* Only one group of 8: skip the steady-state loop (a8 loaded in delay slot). */
| blez I, .L13 | |||
| LD a8, 7 * SIZE(X) | |||
| .align 3 | |||
/* Steady state: accumulate the 8 values already in registers while loading
   the next 8. X is advanced by 8 * SIZE in the middle of the body, which is
   why the second half of the loads uses offsets 4..7 again. */
| .L12: | |||
| ADD s1, s1, t1 | |||
| LD a1, 8 * SIZE(X) | |||
| MOV t1, a5 | |||
| daddiu I, I, -1 | |||
| ADD s2, s2, t2 | |||
| LD a2, 9 * SIZE(X) | |||
| MOV t2, a6 | |||
| NOP | |||
| ADD s1, s1, t3 | |||
| LD a3, 10 * SIZE(X) | |||
| MOV t3, a7 | |||
| NOP | |||
| ADD s2, s2, t4 | |||
| LD a4, 11 * SIZE(X) | |||
| MOV t4, a8 | |||
| daddiu X, X, 8 * SIZE | |||
| ADD s1, s1, t1 | |||
| LD a5, 4 * SIZE(X) | |||
| MOV t1, a1 | |||
| NOP | |||
| ADD s2, s2, t2 | |||
| LD a6, 5 * SIZE(X) | |||
| MOV t2, a2 | |||
| NOP | |||
| ADD s1, s1, t3 | |||
| LD a7, 6 * SIZE(X) | |||
| MOV t3, a3 | |||
| NOP | |||
| ADD s2, s2, t4 | |||
| LD a8, 7 * SIZE(X) | |||
| bgtz I, .L12 | |||
| MOV t4, a4 | |||
| .align 3 | |||
/* Pipeline epilogue: add the last 8 values that are still in registers and
   step X past them. */
| .L13: | |||
| ADD s1, s1, t1 | |||
| daddiu X, X, 8 * SIZE | |||
| MOV t1, a5 | |||
| NOP | |||
| ADD s2, s2, t2 | |||
| MOV t2, a6 | |||
| ADD s1, s1, t3 | |||
| MOV t3, a7 | |||
| ADD s2, s2, t4 | |||
| MOV t4, a8 | |||
| ADD s1, s1, t1 | |||
| ADD s2, s2, t2 | |||
| ADD s1, s1, t3 | |||
| ADD s2, s2, t4 | |||
| .align 3 | |||
/* Unit-stride leftovers: I = N & 7. */
| .L15: | |||
| andi I, N, 7 | |||
| blez I, .L999 | |||
| NOP | |||
| .align 3 | |||
/* One value per iteration; X is advanced in the delay slot. */
| .L16: | |||
| LD a1, 0 * SIZE(X) | |||
| daddiu I, I, -1 | |||
| MOV t1, a1 | |||
| ADD s1, s1, t1 | |||
| bgtz I, .L16 | |||
| daddiu X, X, SIZE | |||
| j .L999 | |||
| NOP | |||
| .align 3 | |||
/* General-stride path. I already holds N / 8 (computed in the delay slot of
   the stride test above). */
| .L20: | |||
| blez I, .L25 | |||
| NOP | |||
/* Pipeline prologue: load 8 values, stepping X by INCX after each load. */
| LD a1, 0 * SIZE(X) | |||
| daddu X, X, INCX | |||
| LD a2, 0 * SIZE(X) | |||
| daddu X, X, INCX | |||
| LD a3, 0 * SIZE(X) | |||
| daddu X, X, INCX | |||
| LD a4, 0 * SIZE(X) | |||
| daddu X, X, INCX | |||
| LD a5, 0 * SIZE(X) | |||
| daddu X, X, INCX | |||
| LD a6, 0 * SIZE(X) | |||
| daddu X, X, INCX | |||
| MOV t1, a1 | |||
| LD a7, 0 * SIZE(X) | |||
| MOV t2, a2 | |||
| daddu X, X, INCX | |||
| MOV t3, a3 | |||
| LD a8, 0 * SIZE(X) | |||
| MOV t4, a4 | |||
| daddiu I, I, -1 | |||
| blez I, .L24 | |||
| daddu X, X, INCX | |||
| .align 3 | |||
/* Steady state for the strided case: same accumulate-while-loading scheme as
   .L12, with X advanced by INCX after every load. */
| .L23: | |||
| ADD s1, s1, t1 | |||
| LD a1, 0 * SIZE(X) | |||
| MOV t1, a5 | |||
| daddu X, X, INCX | |||
| ADD s2, s2, t2 | |||
| LD a2, 0 * SIZE(X) | |||
| MOV t2, a6 | |||
| daddu X, X, INCX | |||
| ADD s1, s1, t3 | |||
| LD a3, 0 * SIZE(X) | |||
| MOV t3, a7 | |||
| daddu X, X, INCX | |||
| ADD s2, s2, t4 | |||
| LD a4, 0 * SIZE(X) | |||
| MOV t4, a8 | |||
| daddu X, X, INCX | |||
| ADD s1, s1, t1 | |||
| LD a5, 0 * SIZE(X) | |||
| MOV t1, a1 | |||
| daddu X, X, INCX | |||
| ADD s2, s2, t2 | |||
| LD a6, 0 * SIZE(X) | |||
| MOV t2, a2 | |||
| daddu X, X, INCX | |||
| ADD s1, s1, t3 | |||
| LD a7, 0 * SIZE(X) | |||
| MOV t3, a3 | |||
| daddu X, X, INCX | |||
| ADD s2, s2, t4 | |||
| LD a8, 0 * SIZE(X) | |||
| MOV t4, a4 | |||
| daddiu I, I, -1 | |||
| bgtz I, .L23 | |||
| daddu X, X, INCX | |||
| .align 3 | |||
/* Pipeline epilogue for the strided case (X was already advanced by the
   loads, so no pointer update is needed here). */
| .L24: | |||
| ADD s1, s1, t1 | |||
| MOV t1, a5 | |||
| ADD s2, s2, t2 | |||
| MOV t2, a6 | |||
| ADD s1, s1, t3 | |||
| MOV t3, a7 | |||
| ADD s2, s2, t4 | |||
| MOV t4, a8 | |||
| ADD s1, s1, t1 | |||
| ADD s2, s2, t2 | |||
| ADD s1, s1, t3 | |||
| ADD s2, s2, t4 | |||
| .align 3 | |||
/* Strided leftovers: I = N & 7. */
| .L25: | |||
| andi I, N, 7 | |||
| blez I, .L999 | |||
| NOP | |||
| .align 3 | |||
/* One value per iteration; the accumulate happens in the delay slot. */
| .L26: | |||
| LD a1, 0 * SIZE(X) | |||
| daddiu I, I, -1 | |||
| MOV t1, a1 | |||
| daddu X, X, INCX | |||
| bgtz I, .L26 | |||
| ADD s1, s1, t1 | |||
| .align 3 | |||
/* Return; the delay slot folds the second partial sum into s1. */
| .L999: | |||
| j $31 | |||
| ADD s1, s1, s2 | |||
| EPILOGUE | |||
| @@ -0,0 +1,204 @@ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
/* Register aliases.
   N, X and INCX are the incoming arguments (element count, vector base,
   stride); I is the loop counter. TEMP is defined but not used below. */
| #define N $4 | |||
| #define X $5 | |||
| #define INCX $6 | |||
| #define I $2 | |||
| #define TEMP $3 | |||
/* a1..a8: values loaded from X (odd-numbered aliases take offset 0 of an
   element, even-numbered aliases take offset 1). */
| #define a1 $f2 | |||
| #define a2 $f3 | |||
| #define a3 $f4 | |||
| #define a4 $f5 | |||
| #define a5 $f6 | |||
| #define a6 $f7 | |||
| #define a7 $f8 | |||
| #define a8 $f9 | |||
/* t1..t4: staging registers feeding the accumulators. */
| #define t1 $f10 | |||
| #define t2 $f11 | |||
| #define t3 $f12 | |||
| #define t4 $f13 | |||
/* s1 accumulates the offset-0 values, s2 the offset-1 values; they are
   merged into s1 at .L999. */
| #define s1 $f0 | |||
| #define s2 $f1 | |||
/* Adds up both components (offsets 0 and 1 * SIZE) of N elements of X taken
   with stride INCX and leaves the total in s1. There is a single code path
   for every stride: 4 elements (8 values) per main-loop iteration, then the
   N & 3 leftovers one element at a time.
   NOTE: the instruction written directly after each branch/jump sits in the
   branch delay slot and is executed whether or not the branch is taken. */
| PROLOGUE | |||
/* With F_INTERFACE the arguments arrive by reference: fetch the values. */
| #ifdef F_INTERFACE | |||
| LDINT N, 0(N) | |||
| LDINT INCX, 0(INCX) | |||
| #endif | |||
/* Zero both accumulators. */
| MTC $0, s1 | |||
| MTC $0, s2 | |||
/* Scale the stride (presumably from two-value elements to bytes;
   ZBASE_SHIFT comes from common.h). */
| dsll INCX, INCX, ZBASE_SHIFT | |||
/* Nothing to do for N <= 0; I = N / 4 is computed in the delay slot. */
| blez N, .L999 | |||
| dsra I, N, 2 | |||
/* Fewer than 4 elements: go straight to the leftover loop. */
| blez I, .L25 | |||
| NOP | |||
/* Pipeline prologue: load the first 4 elements and stage the first two. */
| LD a1, 0 * SIZE(X) | |||
| LD a2, 1 * SIZE(X) | |||
| daddu X, X, INCX | |||
| LD a3, 0 * SIZE(X) | |||
| LD a4, 1 * SIZE(X) | |||
| daddu X, X, INCX | |||
| LD a5, 0 * SIZE(X) | |||
| LD a6, 1 * SIZE(X) | |||
| daddu X, X, INCX | |||
| MOV t1, a1 | |||
| MOV t2, a2 | |||
| LD a7, 0 * SIZE(X) | |||
| LD a8, 1 * SIZE(X) | |||
| MOV t3, a3 | |||
| MOV t4, a4 | |||
| daddiu I, I, -1 | |||
/* Only one group of 4: skip the steady-state loop (X advanced in delay slot). */
| blez I, .L24 | |||
| daddu X, X, INCX | |||
| .align 3 | |||
/* Steady state: accumulate the 8 values already in registers while loading
   the next 4 elements. */
| .L23: | |||
| ADD s1, s1, t1 | |||
| LD a1, 0 * SIZE(X) | |||
| MOV t1, a5 | |||
| daddiu I, I, -1 | |||
| ADD s2, s2, t2 | |||
| LD a2, 1 * SIZE(X) | |||
| MOV t2, a6 | |||
| daddu X, X, INCX | |||
| ADD s1, s1, t3 | |||
| LD a3, 0 * SIZE(X) | |||
| MOV t3, a7 | |||
| NOP | |||
| ADD s2, s2, t4 | |||
| LD a4, 1 * SIZE(X) | |||
| MOV t4, a8 | |||
| daddu X, X, INCX | |||
| ADD s1, s1, t1 | |||
| LD a5, 0 * SIZE(X) | |||
| MOV t1, a1 | |||
| NOP | |||
| ADD s2, s2, t2 | |||
| LD a6, 1 * SIZE(X) | |||
| MOV t2, a2 | |||
| daddu X, X, INCX | |||
| ADD s1, s1, t3 | |||
| LD a7, 0 * SIZE(X) | |||
| MOV t3, a3 | |||
| LD a8, 1 * SIZE(X) | |||
| ADD s2, s2, t4 | |||
| daddu X, X, INCX | |||
| bgtz I, .L23 | |||
| MOV t4, a4 | |||
| .align 3 | |||
/* Pipeline epilogue: add the last 8 values that are still in registers. */
| .L24: | |||
| ADD s1, s1, t1 | |||
| MOV t1, a5 | |||
| ADD s2, s2, t2 | |||
| MOV t2, a6 | |||
| ADD s1, s1, t3 | |||
| MOV t3, a7 | |||
| ADD s2, s2, t4 | |||
| MOV t4, a8 | |||
| ADD s1, s1, t1 | |||
| ADD s2, s2, t2 | |||
| ADD s1, s1, t3 | |||
| ADD s2, s2, t4 | |||
| .align 3 | |||
/* Leftovers: I = N & 3. */
| .L25: | |||
| andi I, N, 3 | |||
| blez I, .L999 | |||
| NOP | |||
| .align 3 | |||
/* One element (two values) per iteration; the second accumulate happens in
   the delay slot. */
| .L26: | |||
| LD a1, 0 * SIZE(X) | |||
| LD a2, 1 * SIZE(X) | |||
| MOV t1, a1 | |||
| daddiu I, I, -1 | |||
| MOV t2, a2 | |||
| daddu X, X, INCX | |||
| ADD s1, s1, t1 | |||
| bgtz I, .L26 | |||
| ADD s2, s2, t2 | |||
| .align 3 | |||
/* Return; the delay slot folds the second partial sum into s1. */
| .L999: | |||
| j $31 | |||
| ADD s1, s1, s2 | |||
| EPILOGUE | |||
| @@ -13,40 +13,40 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SGEMMITCOPY = sgemm_tcopy_16_power8.S | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
| SGEMMOTCOPY = sgemm_tcopy_8_power8.S | |||
| SGEMMINCOPYOBJ = sgemm_incopy.o | |||
| SGEMMITCOPYOBJ = sgemm_itcopy.o | |||
| SGEMMONCOPYOBJ = sgemm_oncopy.o | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy.o | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = dgemm_kernel_16x4_power8.S | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| DGEMMITCOPY = dgemm_tcopy_16_power8.S | |||
| DGEMMONCOPY = dgemm_ncopy_4_power8.S | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| DGEMMINCOPYOBJ = dgemm_incopy.o | |||
| DGEMMITCOPYOBJ = dgemm_itcopy.o | |||
| DGEMMONCOPYOBJ = dgemm_oncopy.o | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy.o | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = cgemm_kernel_8x4_power8.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
| CGEMMITCOPY = cgemm_tcopy_8_power8.S | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy.o | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy.o | |||
| CGEMMINCOPYOBJ = cgemm_incopy.o | |||
| CGEMMITCOPYOBJ = cgemm_itcopy.o | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_8x2_power8.S | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
| ZGEMMITCOPY = zgemm_tcopy_8_power8.S | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy.o | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy.o | |||
| ZGEMMINCOPYOBJ = zgemm_incopy.o | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy.o | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| @@ -89,14 +89,14 @@ ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| #SMINKERNEL = ../arm/min.c | |||
| #DMINKERNEL = ../arm/min.c | |||
| # | |||
| #ISAMAXKERNEL = ../arm/iamax.c | |||
| ISAMAXKERNEL = isamax.c | |||
| IDAMAXKERNEL = idamax.c | |||
| #ICAMAXKERNEL = ../arm/izamax.c | |||
| IZAMAXKERNEL = izamax.c | |||
| ICAMAXKERNEL = icamax.c | |||
| IZAMAXKERNEL = izamax.c | |||
| # | |||
| #ISAMINKERNEL = ../arm/iamin.c | |||
| IDAMINKERNEL = idamin.c | |||
| #ICAMINKERNEL = ../arm/izamin.c | |||
| ISAMINKERNEL = isamin.c | |||
| IDAMINKERNEL = idamin.c | |||
| ICAMINKERNEL = icamin.c | |||
| IZAMINKERNEL = izamin.c | |||
| # | |||
| #ISMAXKERNEL = ../arm/imax.c | |||
| @@ -110,9 +110,9 @@ DASUMKERNEL = dasum.c | |||
| CASUMKERNEL = casum.c | |||
| ZASUMKERNEL = zasum.c | |||
| # | |||
| #SAXPYKERNEL = ../arm/axpy.c | |||
| SAXPYKERNEL = saxpy.c | |||
| DAXPYKERNEL = daxpy.c | |||
| #CAXPYKERNEL = ../arm/zaxpy.c | |||
| CAXPYKERNEL = caxpy.c | |||
| ZAXPYKERNEL = zaxpy.c | |||
| # | |||
| SCOPYKERNEL = scopy.c | |||
| @@ -123,7 +123,7 @@ ZCOPYKERNEL = zcopy.c | |||
| SDOTKERNEL = sdot.c | |||
| DDOTKERNEL = ddot.c | |||
| DSDOTKERNEL = sdot.c | |||
| #CDOTKERNEL = ../arm/zdot.c | |||
| CDOTKERNEL = cdot.c | |||
| ZDOTKERNEL = zdot.c | |||
| # | |||
| SNRM2KERNEL = ../arm/nrm2.c | |||
| @@ -133,7 +133,7 @@ ZNRM2KERNEL = ../arm/znrm2.c | |||
| # | |||
| SROTKERNEL = srot.c | |||
| DROTKERNEL = drot.c | |||
| CROTKERNEL = zrot.c | |||
| CROTKERNEL = crot.c | |||
| ZROTKERNEL = zrot.c | |||
| # | |||
| SSCALKERNEL = sscal.c | |||
| @@ -147,14 +147,14 @@ CSWAPKERNEL = cswap.c | |||
| ZSWAPKERNEL = zswap.c | |||
| # | |||
| #SGEMVNKERNEL = ../arm/gemv_n.c | |||
| SGEMVNKERNEL = sgemv_n.c | |||
| DGEMVNKERNEL = dgemv_n.c | |||
| #CGEMVNKERNEL = ../arm/zgemv_n.c | |||
| CGEMVNKERNEL = cgemv_n.c | |||
| ZGEMVNKERNEL = zgemv_n_4.c | |||
| # | |||
| #SGEMVTKERNEL = ../arm/gemv_t.c | |||
| SGEMVTKERNEL = sgemv_t.c | |||
| DGEMVTKERNEL = dgemv_t.c | |||
| #CGEMVTKERNEL = ../arm/zgemv_t.c | |||
| CGEMVTKERNEL = cgemv_t.c | |||
| ZGEMVTKERNEL = zgemv_t_4.c | |||
| @@ -0,0 +1,184 @@ | |||
# Kernel selection list: each assignment names the source file used for one
# routine. Paths starting with ../ point at shared generic/arm sources; bare
# names are sources local to this directory. Assignments that are commented
# out are left unset here (presumably so a default elsewhere applies -
# confirm against the including Makefile).

# GEMM beta scaling (all disabled here).
| #SGEMM_BETA = ../generic/gemm_beta.c | |||
| #DGEMM_BETA = ../generic/gemm_beta.c | |||
| #CGEMM_BETA = ../generic/zgemm_beta.c | |||
| #ZGEMM_BETA = ../generic/zgemm_beta.c | |||
# TRMM kernels. Only the double-precision entry uses the power9 source; it is
# the same file as DGEMMKERNEL below.
| STRMMKERNEL = strmm_kernel_16x8_power8.S | |||
| DTRMMKERNEL = dgemm_kernel_power9.S | |||
| CTRMMKERNEL = ctrmm_kernel_8x4_power8.S | |||
| ZTRMMKERNEL = ztrmm_kernel_8x2_power8.S | |||
# Single-precision GEMM: compute kernel, the four copy routines, and the
# object names they are built to ($(TSUFFIX) and $(SUFFIX) are defined
# outside this list).
| SGEMMKERNEL = sgemm_kernel_16x8_power8.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SGEMMITCOPY = sgemm_tcopy_16_power8.S | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
| SGEMMOTCOPY = sgemm_tcopy_8_power8.S | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# Double-precision GEMM.
| DGEMMKERNEL = dgemm_kernel_power9.S | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| DGEMMITCOPY = dgemm_tcopy_16_power8.S | |||
| DGEMMONCOPY = dgemm_ncopy_4_power8.S | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
# Single-precision complex GEMM.
| CGEMMKERNEL = cgemm_kernel_8x4_power8.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
| CGEMMITCOPY = cgemm_tcopy_8_power8.S | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
# Double-precision complex GEMM.
| ZGEMMKERNEL = zgemm_kernel_8x2_power8.S | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
| ZGEMMITCOPY = zgemm_tcopy_8_power8.S | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
# TRSM kernels: all generic C except the double-precision LT variant.
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| #Todo: CGEMM3MKERNEL should be 4x4 blocksizes. | |||
| #CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S | |||
| #ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S | |||
| #Pure C for other kernels | |||
# amax / amin / max / min: all disabled here.
| #SAMAXKERNEL = ../arm/amax.c | |||
| #DAMAXKERNEL = ../arm/amax.c | |||
| #CAMAXKERNEL = ../arm/zamax.c | |||
| #ZAMAXKERNEL = ../arm/zamax.c | |||
| # | |||
| #SAMINKERNEL = ../arm/amin.c | |||
| #DAMINKERNEL = ../arm/amin.c | |||
| #CAMINKERNEL = ../arm/zamin.c | |||
| #ZAMINKERNEL = ../arm/zamin.c | |||
| # | |||
| #SMAXKERNEL = ../arm/max.c | |||
| #DMAXKERNEL = ../arm/max.c | |||
| # | |||
| #SMINKERNEL = ../arm/min.c | |||
| #DMINKERNEL = ../arm/min.c | |||
| # | |||
# i?amax / i?amin: local sources for all four precisions.
| ISAMAXKERNEL = isamax.c | |||
| IDAMAXKERNEL = idamax.c | |||
| ICAMAXKERNEL = icamax.c | |||
| IZAMAXKERNEL = izamax.c | |||
| # | |||
| ISAMINKERNEL = isamin.c | |||
| IDAMINKERNEL = idamin.c | |||
| ICAMINKERNEL = icamin.c | |||
| IZAMINKERNEL = izamin.c | |||
| # | |||
| #ISMAXKERNEL = ../arm/imax.c | |||
| #IDMAXKERNEL = ../arm/imax.c | |||
| # | |||
| #ISMINKERNEL = ../arm/imin.c | |||
| #IDMINKERNEL = ../arm/imin.c | |||
| # | |||
# asum, axpy, copy, dot: local sources.
| SASUMKERNEL = sasum.c | |||
| DASUMKERNEL = dasum.c | |||
| CASUMKERNEL = casum.c | |||
| ZASUMKERNEL = zasum.c | |||
| # | |||
| SAXPYKERNEL = saxpy.c | |||
| DAXPYKERNEL = daxpy.c | |||
| CAXPYKERNEL = caxpy.c | |||
| ZAXPYKERNEL = zaxpy.c | |||
| # | |||
| SCOPYKERNEL = scopy.c | |||
| DCOPYKERNEL = dcopy.c | |||
| CCOPYKERNEL = ccopy.c | |||
| ZCOPYKERNEL = zcopy.c | |||
| # | |||
# DSDOT is built from the same source as SDOT.
| SDOTKERNEL = sdot.c | |||
| DDOTKERNEL = ddot.c | |||
| DSDOTKERNEL = sdot.c | |||
| CDOTKERNEL = cdot.c | |||
| ZDOTKERNEL = zdot.c | |||
| # | |||
# nrm2: shared C sources from ../arm.
| SNRM2KERNEL = ../arm/nrm2.c | |||
| DNRM2KERNEL = ../arm/nrm2.c | |||
| CNRM2KERNEL = ../arm/znrm2.c | |||
| ZNRM2KERNEL = ../arm/znrm2.c | |||
| # | |||
| SROTKERNEL = srot.c | |||
| DROTKERNEL = drot.c | |||
| CROTKERNEL = crot.c | |||
| ZROTKERNEL = zrot.c | |||
| # | |||
# NOTE(review): CSCAL and ZSCAL both name zscal.c - presumably one source
# serves both precisions; confirm this is intended.
| SSCALKERNEL = sscal.c | |||
| DSCALKERNEL = dscal.c | |||
| CSCALKERNEL = zscal.c | |||
| ZSCALKERNEL = zscal.c | |||
| # | |||
| SSWAPKERNEL = sswap.c | |||
| DSWAPKERNEL = dswap.c | |||
| CSWAPKERNEL = cswap.c | |||
| ZSWAPKERNEL = zswap.c | |||
| # | |||
# GEMV (N and T variants).
| SGEMVNKERNEL = sgemv_n.c | |||
| DGEMVNKERNEL = dgemv_n.c | |||
| CGEMVNKERNEL = cgemv_n.c | |||
| ZGEMVNKERNEL = zgemv_n_4.c | |||
| # | |||
| SGEMVTKERNEL = sgemv_t.c | |||
| DGEMVTKERNEL = dgemv_t.c | |||
| CGEMVTKERNEL = cgemv_t.c | |||
| ZGEMVTKERNEL = zgemv_t_4.c | |||
# SYMV / HEMV: all disabled here.
| #SSYMV_U_KERNEL = ../generic/symv_k.c | |||
| #SSYMV_L_KERNEL = ../generic/symv_k.c | |||
| #DSYMV_U_KERNEL = ../generic/symv_k.c | |||
| #DSYMV_L_KERNEL = ../generic/symv_k.c | |||
| #QSYMV_U_KERNEL = ../generic/symv_k.c | |||
| #QSYMV_L_KERNEL = ../generic/symv_k.c | |||
| #CSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| #CSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| #ZSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| #ZSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| #XSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| #XSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| #ZHEMV_U_KERNEL = ../generic/zhemv_k.c | |||
| #ZHEMV_L_KERNEL = ../generic/zhemv_k.c | |||
# Miscellaneous helpers from ../generic.
| LSAME_KERNEL = ../generic/lsame.c | |||
| SCABS_KERNEL = ../generic/cabs.c | |||
| DCABS_KERNEL = ../generic/cabs.c | |||
| QCABS_KERNEL = ../generic/cabs.c | |||
| #Dump kernel | |||
| CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
| ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
| @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #if defined(POWER8) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "casum_microk_power8.c" | |||
| #endif | |||
| @@ -0,0 +1,145 @@ | |||
| /* | |||
| Copyright (c) 2013-2018, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
/* Vector kernel, compiled only when no assembly kernel is announced via
   HAVE_ASM_KERNEL. */
| #ifndef HAVE_ASM_KERNEL | |||
| #include <altivec.h> | |||
/*
 * y += alpha * x (or the conjugated variant selected by CONJ/XCONJ) for n
 * contiguous single-precision complex elements stored as interleaved
 * (re, im) floats.
 *
 *   n        number of complex elements; the caller passes n & -16, and the
 *            loop below only works in whole groups of 16 elements
 *   x, y     input and in/out vectors, unit stride
 *   alpha_r, alpha_i  real and imaginary parts of alpha
 *
 * NOTE(review): x and y are dereferenced through __vector float pointers;
 * whether arbitrarily aligned inputs are safe depends on the target and
 * compiler - confirm.
 */
| static void caxpy_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) | |||
| { | |||
/* The signs are folded into the alpha vectors so the loop body is two
   multiply-adds per vector: one with x as stored (re, im) and one with each
   pair swapped (im, re). */
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| register __vector float valpha_r = {alpha_r, alpha_r,alpha_r, alpha_r}; | |||
| register __vector float valpha_i = {-alpha_i, alpha_i,-alpha_i, alpha_i}; | |||
| #else | |||
| register __vector float valpha_r = {alpha_r, -alpha_r,alpha_r, -alpha_r}; | |||
| register __vector float valpha_i = {alpha_i, alpha_i,alpha_i, alpha_i}; | |||
| #endif | |||
/* Byte permutation that exchanges the two 4-byte floats inside each 8-byte
   pair, i.e. swaps re and im of both complex values in a vector. */
| __vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; | |||
| register __vector float *vy = (__vector float *) y; | |||
| register __vector float *vx = (__vector float *) x; | |||
| BLASLONG i=0; | |||
/* i counts 4-float vectors (2 complex values each), so n complex values are
   n/2 vectors; each iteration handles 8 vectors = 16 complex values. */
| for (; i < n/2; i += 8) { | |||
| register __vector float vy_0 = vy[i]; | |||
| register __vector float vy_1 = vy[i + 1]; | |||
| register __vector float vy_2 = vy[i + 2]; | |||
| register __vector float vy_3 = vy[i + 3]; | |||
| register __vector float vy_4 = vy[i + 4]; | |||
| register __vector float vy_5 = vy[i + 5]; | |||
| register __vector float vy_6 = vy[i + 6]; | |||
| register __vector float vy_7 = vy[i + 7]; | |||
| register __vector float vx_0 = vx[i]; | |||
| register __vector float vx_1 = vx[i + 1]; | |||
| register __vector float vx_2 = vx[i + 2]; | |||
| register __vector float vx_3 = vx[i + 3]; | |||
| register __vector float vx_4 = vx[i + 4]; | |||
| register __vector float vx_5 = vx[i + 5]; | |||
| register __vector float vx_6 = vx[i + 6]; | |||
| register __vector float vx_7 = vx[i + 7]; | |||
/* First half of the complex product: the alpha_r terms. */
| vy_0 += vx_0*valpha_r; | |||
| vy_1 += vx_1*valpha_r; | |||
| vy_2 += vx_2*valpha_r; | |||
| vy_3 += vx_3*valpha_r; | |||
| vy_4 += vx_4*valpha_r; | |||
| vy_5 += vx_5*valpha_r; | |||
| vy_6 += vx_6*valpha_r; | |||
| vy_7 += vx_7*valpha_r; | |||
/* Swap re/im of x so the alpha_i terms line up with the right output lane. */
| vx_0 = vec_perm(vx_0, vx_0, swap_mask); | |||
| vx_1 = vec_perm(vx_1, vx_1, swap_mask); | |||
| vx_2 = vec_perm(vx_2, vx_2, swap_mask); | |||
| vx_3 = vec_perm(vx_3, vx_3, swap_mask); | |||
| vx_4 = vec_perm(vx_4, vx_4, swap_mask); | |||
| vx_5 = vec_perm(vx_5, vx_5, swap_mask); | |||
| vx_6 = vec_perm(vx_6, vx_6, swap_mask); | |||
| vx_7 = vec_perm(vx_7, vx_7, swap_mask); | |||
/* Second half of the complex product: the alpha_i terms. */
| vy_0 += vx_0*valpha_i; | |||
| vy_1 += vx_1*valpha_i; | |||
| vy_2 += vx_2*valpha_i; | |||
| vy_3 += vx_3*valpha_i; | |||
| vy_4 += vx_4*valpha_i; | |||
| vy_5 += vx_5*valpha_i; | |||
| vy_6 += vx_6*valpha_i; | |||
| vy_7 += vx_7*valpha_i; | |||
/* Write the updated vectors back to y. */
| vy[i] = vy_0; | |||
| vy[i + 1] = vy_1; | |||
| vy[i + 2] = vy_2; | |||
| vy[i + 3] = vy_3; | |||
| vy[i + 4] = vy_4; | |||
| vy[i + 5] = vy_5 ; | |||
| vy[i + 6] = vy_6 ; | |||
| vy[i + 7] = vy_7 ; | |||
| } | |||
| } | |||
| #endif | |||
| int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) { | |||
| BLASLONG i = 0; | |||
| BLASLONG ix = 0, iy = 0; | |||
| if (n <= 0) return (0); | |||
| if ((inc_x == 1) && (inc_y == 1)) { | |||
| BLASLONG n1 = n & -16; | |||
| if (n1) { | |||
| caxpy_kernel_16(n1, x, y, da_r,da_i); | |||
| ix = 2 * n1; | |||
| } | |||
| i = n1; | |||
| while (i < n) { | |||
| #if !defined(CONJ) | |||
| y[ix] += (da_r * x[ix] - da_i * x[ix + 1]); | |||
| y[ix + 1] += (da_r * x[ix + 1] + da_i * x[ix]); | |||
| #else | |||
| y[ix] += (da_r * x[ix] + da_i * x[ix + 1]); | |||
| y[ix + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); | |||
| #endif | |||
| i++; | |||
| ix += 2; | |||
| } | |||
| return (0); | |||
| } | |||
| inc_x *= 2; | |||
| inc_y *= 2; | |||
| while (i < n) { | |||
| #if !defined(CONJ) | |||
| y[iy] += (da_r * x[ix] - da_i * x[ix + 1]); | |||
| y[iy + 1] += (da_r * x[ix + 1] + da_i * x[ix]); | |||
| #else | |||
| y[iy] += (da_r * x[ix] + da_i * x[ix + 1]); | |||
| y[iy + 1] -= (da_r * x[ix + 1] - da_i * x[ix]); | |||
| #endif | |||
| ix += inc_x; | |||
| iy += inc_y; | |||
| i++; | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "ccopy_microk_power8.c" | |||
| #endif | |||
| @@ -0,0 +1,164 @@ | |||
| /*Copyright (c) 2013-2018, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
/* Vector kernel, compiled only when no other kernel is announced via
   HAVE_KERNEL_8. */
| #ifndef HAVE_KERNEL_8 | |||
| #include <altivec.h> | |||
/*
 * Partial products for a single-precision complex dot product over n
 * contiguous elements stored as interleaved (re, im) floats.
 *
 *   n     number of complex elements; the caller passes n & -8, and the loop
 *         below only works in whole groups of 8 elements
 *   x, y  input vectors, unit stride
 *   dot   output, 4 floats, overwritten (not accumulated into):
 *           dot[0] = sum x_re*y_re    dot[1] = sum x_im*y_im
 *           dot[2] = sum x_re*y_im    dot[3] = sum x_im*y_re
 *
 * The caller combines the four sums according to CONJ.
 *
 * NOTE(review): x and y are dereferenced through __vector float pointers;
 * whether arbitrarily aligned inputs are safe depends on the target and
 * compiler - confirm.
 */
| static void cdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, float *dot) | |||
| { | |||
/* Byte permutation that exchanges the two 4-byte floats inside each 8-byte
   pair, i.e. swaps re and im of both complex values in a vector. */
| __vector unsigned char swap_mask = { 4,5,6,7,0,1,2,3, 12,13,14,15, 8,9,10,11}; | |||
| register __vector float *vy = (__vector float *) y; | |||
| register __vector float *vx = (__vector float *) x; | |||
| BLASLONG i = 0; | |||
/* vd_* accumulate x*y lane by lane (re*re, im*im); vdd_* accumulate x times
   the swapped y (re*im, im*re). Four of each, one per unrolled vector. */
| register __vector float vd_0 = { 0 }; | |||
| register __vector float vd_1 = { 0 }; | |||
| register __vector float vd_2 = { 0 }; | |||
| register __vector float vd_3 = { 0 }; | |||
| register __vector float vdd_0 = { 0 }; | |||
| register __vector float vdd_1 = { 0 }; | |||
| register __vector float vdd_2 = { 0 }; | |||
| register __vector float vdd_3 = { 0 }; | |||
/* i counts 4-float vectors (2 complex values each), so n complex values are
   n/2 vectors; each iteration handles 4 vectors = 8 complex values. */
| for (; i < n/2; i += 4) { | |||
| register __vector float vyy_0 ; | |||
| register __vector float vyy_1 ; | |||
| register __vector float vyy_2 ; | |||
| register __vector float vyy_3 ; | |||
| register __vector float vy_0 = vy[i]; | |||
| register __vector float vy_1 = vy[i + 1]; | |||
| register __vector float vy_2 = vy[i + 2]; | |||
| register __vector float vy_3 = vy[i + 3]; | |||
| register __vector float vx_0= vx[i]; | |||
| register __vector float vx_1 = vx[i + 1]; | |||
| register __vector float vx_2 = vx[i + 2]; | |||
| register __vector float vx_3 = vx[i + 3]; | |||
/* vyy_* = y with re/im swapped inside every complex value. */
| vyy_0 = vec_perm(vy_0, vy_0, swap_mask); | |||
| vyy_1 = vec_perm(vy_1, vy_1, swap_mask); | |||
| vyy_2 = vec_perm(vy_2, vy_2, swap_mask); | |||
| vyy_3 = vec_perm(vy_3, vy_3, swap_mask); | |||
| vd_0 += vx_0 * vy_0; | |||
| vd_1 += vx_1 * vy_1; | |||
| vd_2 += vx_2 * vy_2; | |||
| vd_3 += vx_3 * vy_3; | |||
| vdd_0 += vx_0 * vyy_0; | |||
| vdd_1 += vx_1 * vyy_1; | |||
| vdd_2 += vx_2 * vyy_2; | |||
| vdd_3 += vx_3 * vyy_3; | |||
| } | |||
| //aggregate | |||
| vd_0 = vd_0 + vd_1 +vd_2 +vd_3; | |||
| vdd_0= vdd_0 + vdd_1 +vdd_2 +vdd_3; | |||
| //reverse and aggregate | |||
/* Presumably vec_xxpermdi(v, v, 2) exchanges the two 64-bit halves of v, so
   adding it to v leaves lane0+lane2 in element 0 and lane1+lane3 in
   element 1 - confirm against the intrinsic's documentation. */
| vd_1=vec_xxpermdi(vd_0,vd_0,2) ; | |||
| vdd_1=vec_xxpermdi(vdd_0,vdd_0,2); | |||
| vd_2=vd_0+vd_1; | |||
| vdd_2=vdd_0+vdd_1; | |||
/* Only elements 0 and 1 of each reduced vector are stored. */
| dot[0]=vd_2[0]; | |||
| dot[1]=vd_2[1]; | |||
| dot[2]=vdd_2[0]; | |||
| dot[3]=vdd_2[1]; | |||
| } | |||
| #endif | |||
| OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) { | |||
| BLASLONG i = 0; | |||
| BLASLONG ix=0, iy=0; | |||
| OPENBLAS_COMPLEX_FLOAT result; | |||
| FLOAT dot[4] __attribute__ ((aligned(16))) = {0.0, 0.0, 0.0, 0.0}; | |||
| if (n <= 0) { | |||
| CREAL(result) = 0.0; | |||
| CIMAG(result) = 0.0; | |||
| return (result); | |||
| } | |||
| if ((inc_x == 1) && (inc_y == 1)) { | |||
| BLASLONG n1 = n & -8; | |||
| BLASLONG j=0; | |||
| if (n1){ | |||
| cdot_kernel_8(n1, x, y, dot); | |||
| i = n1; | |||
| j = n1 <<1; | |||
| } | |||
| while (i < n) { | |||
| dot[0] += x[j] * y[j]; | |||
| dot[1] += x[j + 1] * y[j + 1]; | |||
| dot[2] += x[j] * y[j + 1]; | |||
| dot[3] += x[j + 1] * y[j]; | |||
| j += 2; | |||
| i++; | |||
| } | |||
| } else { | |||
| i = 0; | |||
| ix = 0; | |||
| iy = 0; | |||
| inc_x <<= 1; | |||
| inc_y <<= 1; | |||
| while (i < n) { | |||
| dot[0] += x[ix] * y[iy]; | |||
| dot[1] += x[ix + 1] * y[iy + 1]; | |||
| dot[2] += x[ix] * y[iy + 1]; | |||
| dot[3] += x[ix + 1] * y[iy]; | |||
| ix += inc_x; | |||
| iy += inc_y; | |||
| i++; | |||
| } | |||
| } | |||
| #if !defined(CONJ) | |||
| CREAL(result) = dot[0] - dot[1]; | |||
| CIMAG(result) = dot[2] + dot[3]; | |||
| #else | |||
| CREAL(result) = dot[0] + dot[1]; | |||
| CIMAG(result) = dot[2] - dot[3]; | |||
| #endif | |||
| return (result); | |||
| } | |||
| @@ -0,0 +1,585 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include <stdlib.h> | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| #include <altivec.h> | |||
/* Largest number of matrix rows the driver processes in one pass. */
#define NBMAX 1024

/*
 * Byte selector used with vec_perm(v, v, mask): inside each 8-byte half of a
 * 16-byte vector the two 4-byte words trade places, i.e. the two floats of
 * every (even, odd) pair are exchanged.
 */
static const unsigned char swap_mask_arr[] = {
     4,  5,  6,  7,    0,  1,  2,  3,
    12, 13, 14, 15,    8,  9, 10, 11
};
/*
 * Add the contribution of four matrix columns to y:
 *     y += A(:,0)*x0 + A(:,1)*x1 + A(:,2)*x2 + A(:,3)*x3
 * using complex arithmetic on interleaved (re, im) floats.  The conjugation
 * variant is chosen at compile time through CONJ / XCONJ.
 *
 *   n    number of complex rows; each loop iteration consumes four rows
 *        (two 4-float vectors), so n is expected to be a multiple of 4
 *   lda  distance between consecutive columns of ap, in floats
 *   ap   first of the four columns
 *   x    the four complex multipliers, read as x[0..7]
 *   y    accumulator, updated in place
 *
 * The columns and y are accessed through __vector float pointers.
 */
static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
    __vector unsigned char pair_swap = *((__vector unsigned char *) swap_mask_arr);

    __vector float *col0 = (__vector float *) ap;
    __vector float *col1 = (__vector float *) (ap + lda);
    __vector float *col2 = (__vector float *) (ap + 2 * lda);
    __vector float *col3 = (__vector float *) (ap + 3 * lda);
    __vector float *yv = (__vector float *) y;

    /*
     * xrN multiplies the column as loaded, xiN multiplies the column with
     * re/im exchanged; the signs baked into the lanes implement the complex
     * product for the selected conjugation variant.
     */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    __vector float xr0 = {x[0], x[0], x[0], x[0]};
    __vector float xi0 = {-x[1], x[1], -x[1], x[1]};
    __vector float xr1 = {x[2], x[2], x[2], x[2]};
    __vector float xi1 = {-x[3], x[3], -x[3], x[3]};
    __vector float xr2 = {x[4], x[4], x[4], x[4]};
    __vector float xi2 = {-x[5], x[5], -x[5], x[5]};
    __vector float xr3 = {x[6], x[6], x[6], x[6]};
    __vector float xi3 = {-x[7], x[7], -x[7], x[7]};
#else
    __vector float xr0 = {x[0], -x[0], x[0], -x[0]};
    __vector float xi0 = {x[1], x[1], x[1], x[1]};
    __vector float xr1 = {x[2], -x[2], x[2], -x[2]};
    __vector float xi1 = {x[3], x[3], x[3], x[3]};
    __vector float xr2 = {x[4], -x[4], x[4], -x[4]};
    __vector float xi2 = {x[5], x[5], x[5], x[5]};
    __vector float xr3 = {x[6], -x[6], x[6], -x[6]};
    __vector float xi3 = {x[7], x[7], x[7], x[7]};
#endif

    const BLASLONG nvec = n / 2;    /* 4-float vectors per column */
    BLASLONG k;

    for (k = 0; k < nvec; k += 2) {
        __vector float y_lo = yv[k];
        __vector float y_hi = yv[k + 1];

        __vector float a0_lo = col0[k];
        __vector float a1_lo = col1[k];
        __vector float a2_lo = col2[k];
        __vector float a3_lo = col3[k];
        __vector float a0_hi = col0[k + 1];
        __vector float a1_hi = col1[k + 1];
        __vector float a2_hi = col2[k + 1];
        __vector float a3_hi = col3[k + 1];

        /* Same data with the floats of each pair exchanged. */
        __vector float s0_lo = vec_perm(a0_lo, a0_lo, pair_swap);
        __vector float s0_hi = vec_perm(a0_hi, a0_hi, pair_swap);
        __vector float s1_lo = vec_perm(a1_lo, a1_lo, pair_swap);
        __vector float s1_hi = vec_perm(a1_hi, a1_hi, pair_swap);
        __vector float s2_lo = vec_perm(a2_lo, a2_lo, pair_swap);
        __vector float s2_hi = vec_perm(a2_hi, a2_hi, pair_swap);
        __vector float s3_lo = vec_perm(a3_lo, a3_lo, pair_swap);
        __vector float s3_hi = vec_perm(a3_hi, a3_hi, pair_swap);

        /* Two separate accumulation steps, direct terms first. */
        y_lo += a0_lo*xr0 + a1_lo*xr1 + a2_lo*xr2 + a3_lo*xr3;
        y_hi += a0_hi*xr0 + a1_hi*xr1 + a2_hi*xr2 + a3_hi*xr3;
        y_lo += s0_lo*xi0 + s1_lo*xi1 + s2_lo*xi2 + s3_lo*xi3;
        y_hi += s0_hi*xi0 + s1_hi*xi1 + s2_hi*xi2 + s3_hi*xi3;

        yv[k] = y_lo;
        yv[k + 1] = y_hi;
    }
}
/*
 * Add the contribution of two matrix columns to y:
 *     y += A(:,0)*x0 + A(:,1)*x1
 * on interleaved (re, im) floats; conjugation variant selected by
 * CONJ / XCONJ at compile time.
 *
 *   n    number of complex rows; four rows (two vectors) per iteration
 *   lda  distance between the two columns, in floats
 *   ap   first column
 *   x    two complex multipliers, read as x[0..3]
 *   y    accumulator, updated in place
 */
static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
    __vector unsigned char pair_swap = *((__vector unsigned char *) swap_mask_arr);

    __vector float *col0 = (__vector float *) ap;
    __vector float *col1 = (__vector float *) (ap + lda);
    __vector float *yv = (__vector float *) y;

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    __vector float xr0 = {x[0], x[0], x[0], x[0]};
    __vector float xi0 = {-x[1], x[1], -x[1], x[1]};
    __vector float xr1 = {x[2], x[2], x[2], x[2]};
    __vector float xi1 = {-x[3], x[3], -x[3], x[3]};
#else
    __vector float xr0 = {x[0], -x[0], x[0], -x[0]};
    __vector float xi0 = {x[1], x[1], x[1], x[1]};
    __vector float xr1 = {x[2], -x[2], x[2], -x[2]};
    __vector float xi1 = {x[3], x[3], x[3], x[3]};
#endif

    const BLASLONG nvec = n / 2;    /* 4-float vectors per column */
    BLASLONG k;

    for (k = 0; k < nvec; k += 2) {
        __vector float y_lo = yv[k];
        __vector float y_hi = yv[k + 1];

        __vector float a0_lo = col0[k];
        __vector float a1_lo = col1[k];
        __vector float a0_hi = col0[k + 1];
        __vector float a1_hi = col1[k + 1];

        /* Pair-swapped copies feed the cross (re*im) terms. */
        __vector float s0_lo = vec_perm(a0_lo, a0_lo, pair_swap);
        __vector float s0_hi = vec_perm(a0_hi, a0_hi, pair_swap);
        __vector float s1_lo = vec_perm(a1_lo, a1_lo, pair_swap);
        __vector float s1_hi = vec_perm(a1_hi, a1_hi, pair_swap);

        y_lo += a0_lo*xr0 + a1_lo*xr1 + s0_lo*xi0 + s1_lo*xi1;
        y_hi += a0_hi*xr0 + a1_hi*xr1 + s0_hi*xi0 + s1_hi*xi1;

        yv[k] = y_lo;
        yv[k + 1] = y_hi;
    }
}
/*
 * Add the contribution of a single matrix column to y:
 *     y += A(:,0) * x0
 * on interleaved (re, im) floats; conjugation variant selected by
 * CONJ / XCONJ at compile time.
 *
 *   n   number of complex rows; four rows (two vectors) per iteration
 *   ap  the column
 *   x   one complex multiplier, read as x[0..1]
 *   y   accumulator, updated in place
 */
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
    __vector unsigned char pair_swap = *((__vector unsigned char *) swap_mask_arr);

    __vector float *col = (__vector float *) ap;
    __vector float *yv = (__vector float *) y;

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    __vector float xr = {x[0], x[0], x[0], x[0]};
    __vector float xi = {-x[1], x[1], -x[1], x[1]};
#else
    __vector float xr = {x[0], -x[0], x[0], -x[0]};
    __vector float xi = {x[1], x[1], x[1], x[1]};
#endif

    BLASLONG remaining;    /* vectors still to be processed */

    for (remaining = n / 2; remaining > 0; remaining -= 2, col += 2, yv += 2) {
        __vector float y_lo = yv[0];
        __vector float y_hi = yv[1];
        __vector float a_lo = col[0];
        __vector float a_hi = col[1];
        __vector float s_lo = vec_perm(a_lo, a_lo, pair_swap);
        __vector float s_hi = vec_perm(a_hi, a_hi, pair_swap);

        y_lo += a_lo*xr + s_lo*xi;
        y_hi += a_hi*xr + s_hi*xi;

        yv[0] = y_lo;
        yv[1] = y_hi;
    }
}
/*
 * dest += alpha * src for n complex values, where alpha = (alpha_r, alpha_i).
 * With XCONJ defined the product is formed as
 *     re = alpha_r*s_re + alpha_i*s_im,  im = -alpha_r*s_im + alpha_i*s_re.
 *
 *   n         number of complex values
 *   src       contiguous interleaved (re, im) input
 *   dest      output, updated in place
 *   inc_dest  distance between consecutive outputs, in floats
 *
 * inc_dest == 2 (contiguous output) takes the vector path, which handles
 * four complex values per iteration through __vector float pointers; any
 * other stride uses the scalar loop.
 */
static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) {
    BLASLONG k;

    if (inc_dest == 2) {
        __vector unsigned char pair_swap = *((__vector unsigned char *) swap_mask_arr);
#if !defined(XCONJ)
        __vector float scale_direct = {alpha_r, alpha_r, alpha_r, alpha_r};
        __vector float scale_swapped = {-alpha_i, alpha_i, -alpha_i, alpha_i};
#else
        __vector float scale_direct = {alpha_r, -alpha_r, alpha_r, -alpha_r};
        __vector float scale_swapped = {alpha_i, alpha_i, alpha_i, alpha_i};
#endif
        __vector float *sv = (__vector float *) src;
        __vector float *dv = (__vector float *) dest;
        const BLASLONG nvec = n / 2;

        for (k = 0; k < nvec; k += 2) {
            __vector float d_lo = dv[k];
            __vector float d_hi = dv[k + 1];
            __vector float s_lo = sv[k];
            __vector float s_hi = sv[k + 1];
            __vector float t_lo = vec_perm(s_lo, s_lo, pair_swap);
            __vector float t_hi = vec_perm(s_hi, s_hi, pair_swap);

            d_lo += s_lo*scale_direct + t_lo*scale_swapped;
            d_hi += s_hi*scale_direct + t_hi*scale_swapped;

            dv[k] = d_lo;
            dv[k + 1] = d_hi;
        }
        return;
    }

    /* Strided output: one complex value at a time. */
    for (k = 0; k < n; k++) {
        FLOAT add_re;
        FLOAT add_im;
#if !defined(XCONJ)
        add_re = alpha_r * src[0] - alpha_i * src[1];
        add_im = alpha_r * src[1] + alpha_i * src[0];
#else
        add_re = alpha_r * src[0] + alpha_i * src[1];
        add_im = -alpha_r * src[1] + alpha_i * src[0];
#endif
        dest[0] += add_re;
        dest[1] += add_im;
        src += 2;
        dest += inc_dest;
    }
}
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) { | |||
| BLASLONG i; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| FLOAT xbuffer[8], *ybuffer; | |||
| if (m < 1) return (0); | |||
| if (n < 1) return (0); | |||
| ybuffer = buffer; | |||
| inc_x *= 2; | |||
| inc_y *= 2; | |||
| lda *= 2; | |||
| n1 = n / 4; | |||
| n2 = n % 4; | |||
| m3 = m % 4; | |||
| m1 = m - (m % 4); | |||
| m2 = (m % NBMAX) - (m % 4); | |||
| y_ptr = y; | |||
| BLASLONG NB = NBMAX; | |||
| while (NB == NBMAX) { | |||
| m1 -= NB; | |||
| if (m1 < 0) { | |||
| if (m2 == 0) break; | |||
| NB = m2; | |||
| } | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| memset(ybuffer, 0, NB * 2*sizeof(FLOAT)); | |||
| if (inc_x == 2) { | |||
| for (i = 0; i < n1; i++) { | |||
| cgemv_kernel_4x4(NB, lda, a_ptr, x_ptr, ybuffer); | |||
| a_ptr += lda << 2; | |||
| x_ptr += 8; | |||
| } | |||
| if (n2 & 2) { | |||
| cgemv_kernel_4x2(NB, lda, a_ptr, x_ptr, ybuffer); | |||
| x_ptr += 4; | |||
| a_ptr += 2 * lda; | |||
| } | |||
| if (n2 & 1) { | |||
| cgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer); | |||
| x_ptr += 2; | |||
| a_ptr += lda; | |||
| } | |||
| } else { | |||
| for (i = 0; i < n1; i++) { | |||
| xbuffer[0] = x_ptr[0]; | |||
| xbuffer[1] = x_ptr[1]; | |||
| x_ptr += inc_x; | |||
| xbuffer[2] = x_ptr[0]; | |||
| xbuffer[3] = x_ptr[1]; | |||
| x_ptr += inc_x; | |||
| xbuffer[4] = x_ptr[0]; | |||
| xbuffer[5] = x_ptr[1]; | |||
| x_ptr += inc_x; | |||
| xbuffer[6] = x_ptr[0]; | |||
| xbuffer[7] = x_ptr[1]; | |||
| x_ptr += inc_x; | |||
| cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer); | |||
| a_ptr += lda << 2; | |||
| } | |||
| for (i = 0; i < n2; i++) { | |||
| xbuffer[0] = x_ptr[0]; | |||
| xbuffer[1] = x_ptr[1]; | |||
| x_ptr += inc_x; | |||
| cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer); | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i); | |||
| a += 2 * NB; | |||
| y_ptr += NB * inc_y; | |||
| } | |||
| if (m3 == 0) return (0); | |||
| if (m3 == 1) { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp_r = 0.0; | |||
| FLOAT temp_i = 0.0; | |||
| if (lda == 2 && inc_x == 2) { | |||
| for (i = 0; i < (n & -2); i += 2) { | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||
| temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||
| temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3]; | |||
| temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2]; | |||
| #else | |||
| temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||
| temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||
| temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3]; | |||
| temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2]; | |||
| #endif | |||
| a_ptr += 4; | |||
| x_ptr += 4; | |||
| } | |||
| for (; i < n; i++) { | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||
| temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||
| #else | |||
| temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||
| temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||
| #endif | |||
| a_ptr += 2; | |||
| x_ptr += 2; | |||
| } | |||
| } else { | |||
| for (i = 0; i < n; i++) { | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||
| temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||
| #else | |||
| temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||
| temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||
| #endif | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| #if !defined(XCONJ) | |||
| y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; | |||
| y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; | |||
| #else | |||
| y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; | |||
| y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; | |||
| #endif | |||
| return (0); | |||
| } | |||
| if (m3 == 2) { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp_r0 = 0.0; | |||
| FLOAT temp_i0 = 0.0; | |||
| FLOAT temp_r1 = 0.0; | |||
| FLOAT temp_i1 = 0.0; | |||
| if (lda == 4 && inc_x == 2) { | |||
| for (i = 0; i < (n & -2); i += 2) { | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||
| temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||
| temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; | |||
| temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; | |||
| temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3]; | |||
| temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2]; | |||
| temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3]; | |||
| temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2]; | |||
| #else | |||
| temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||
| temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||
| temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; | |||
| temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3]; | |||
| temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2]; | |||
| temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3]; | |||
| temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2]; | |||
| #endif | |||
| a_ptr += 8; | |||
| x_ptr += 4; | |||
| } | |||
| for (; i < n; i++) { | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||
| temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||
| temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; | |||
| temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; | |||
| #else | |||
| temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||
| temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||
| temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; | |||
| #endif | |||
| a_ptr += 4; | |||
| x_ptr += 2; | |||
| } | |||
| } else { | |||
| for (i = 0; i < n; i++) { | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||
| temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||
| temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; | |||
| temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; | |||
| #else | |||
| temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||
| temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||
| temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; | |||
| #endif | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| #if !defined(XCONJ) | |||
| y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; | |||
| y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; | |||
| y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; | |||
| #else | |||
| y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; | |||
| y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; | |||
| y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; | |||
| #endif | |||
| return (0); | |||
| } | |||
| if (m3 == 3) { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp_r0 = 0.0; | |||
| FLOAT temp_i0 = 0.0; | |||
| FLOAT temp_r1 = 0.0; | |||
| FLOAT temp_i1 = 0.0; | |||
| FLOAT temp_r2 = 0.0; | |||
| FLOAT temp_i2 = 0.0; | |||
| if (lda == 6 && inc_x == 2) { | |||
| for (i = 0; i < n; i++) { | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||
| temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||
| temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; | |||
| temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; | |||
| temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; | |||
| temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; | |||
| #else | |||
| temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||
| temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||
| temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; | |||
| temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||
| temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; | |||
| #endif | |||
| a_ptr += 6; | |||
| x_ptr += 2; | |||
| } | |||
| } else { | |||
| for (i = 0; i < n; i++) { | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1]; | |||
| temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0]; | |||
| temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1]; | |||
| temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0]; | |||
| temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1]; | |||
| temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0]; | |||
| #else | |||
| temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1]; | |||
| temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0]; | |||
| temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0]; | |||
| temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||
| temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0]; | |||
| #endif | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| #if !defined(XCONJ) | |||
| y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0; | |||
| y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; | |||
| y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2; | |||
| y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2; | |||
| #else | |||
| y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0; | |||
| y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; | |||
| y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2; | |||
| y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2; | |||
| #endif | |||
| return (0); | |||
| } | |||
| return (0); | |||
| } | |||
| @@ -0,0 +1,571 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #define NBMAX 1024 | |||
| #include <altivec.h> | |||
/*
 * Byte selector used with vec_perm(v, v, mask): inside each 8-byte half of a
 * 16-byte vector the two 4-byte words trade places, i.e. the two floats of
 * every (even, odd) pair are exchanged.
 */
static const unsigned char swap_mask_arr[] = {
     4,  5,  6,  7,    0,  1,  2,  3,
    12, 13, 14, 15,    8,  9, 10, 11
};
/*
 * Dot products of four matrix columns with x, scaled by alpha and added to
 * four consecutive complex outputs:
 *     y[c] += alpha * sum_k A(k, c) * x(k),   c = 0..3
 * on interleaved (re, im) floats; conjugation variant selected by
 * CONJ / XCONJ at compile time.
 *
 *   n                 number of complex elements per column; the loop
 *                     consumes four (two 4-float vectors) per iteration
 *   lda               distance between consecutive columns, in floats
 *   ap                first of the four columns
 *   x                 input vector, contiguous
 *   y                 four complex outputs, y[0..7], updated in place
 *   alpha_r, alpha_i  real / imaginary part of alpha
 *
 * The columns and x are accessed through __vector float pointers.
 */
static void cgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
    __vector unsigned char pair_swap = *((__vector unsigned char *) swap_mask_arr);

    __vector float *col0 = (__vector float *) ap;
    __vector float *col1 = (__vector float *) (ap + lda);
    __vector float *col2 = (__vector float *) (ap + 2 * lda);
    __vector float *col3 = (__vector float *) (ap + 3 * lda);
    __vector float *xv = (__vector float *) x;

    /*
     * dirN lanes hold (re*re, im*im, re*re, im*im) partial sums;
     * crsN lanes hold (im*re, re*im, im*re, re*im) partial sums.
     */
    __vector float dir0 = {0.0, 0.0, 0.0, 0.0};
    __vector float crs0 = {0.0, 0.0, 0.0, 0.0};
    __vector float dir1 = {0.0, 0.0, 0.0, 0.0};
    __vector float crs1 = {0.0, 0.0, 0.0, 0.0};
    __vector float dir2 = {0.0, 0.0, 0.0, 0.0};
    __vector float crs2 = {0.0, 0.0, 0.0, 0.0};
    __vector float dir3 = {0.0, 0.0, 0.0, 0.0};
    __vector float crs3 = {0.0, 0.0, 0.0, 0.0};

    FLOAT re0, im0, re1, im1, re2, im2, re3, im3;
    const BLASLONG nvec = n / 2;    /* 4-float vectors per column */
    BLASLONG k;

    for (k = 0; k < nvec; k += 2) {
        __vector float x_lo = xv[k];
        __vector float x_hi = xv[k + 1];
        __vector float xs_lo = vec_perm(x_lo, x_lo, pair_swap);
        __vector float xs_hi = vec_perm(x_hi, x_hi, pair_swap);

        dir0 += x_lo*col0[k] + x_hi*col0[k + 1];
        crs0 += xs_lo*col0[k] + xs_hi*col0[k + 1];
        dir1 += x_lo*col1[k] + x_hi*col1[k + 1];
        crs1 += xs_lo*col1[k] + xs_hi*col1[k + 1];
        dir2 += x_lo*col2[k] + x_hi*col2[k + 1];
        crs2 += xs_lo*col2[k] + xs_hi*col2[k + 1];
        dir3 += x_lo*col3[k] + x_hi*col3[k + 1];
        crs3 += xs_lo*col3[k] + xs_hi*col3[k + 1];
    }

    /* Horizontal reduction of the lanes into one complex sum per column. */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    re0 = dir0[0] - dir0[1] + dir0[2] - dir0[3];
    im0 = crs0[0] + crs0[1] + crs0[2] + crs0[3];
    re1 = dir1[0] - dir1[1] + dir1[2] - dir1[3];
    im1 = crs1[0] + crs1[1] + crs1[2] + crs1[3];
    re2 = dir2[0] - dir2[1] + dir2[2] - dir2[3];
    im2 = crs2[0] + crs2[1] + crs2[2] + crs2[3];
    re3 = dir3[0] - dir3[1] + dir3[2] - dir3[3];
    im3 = crs3[0] + crs3[1] + crs3[2] + crs3[3];
#else
    re0 = dir0[0] + dir0[1] + dir0[2] + dir0[3];
    im0 = crs0[0] - crs0[1] + crs0[2] - crs0[3];
    re1 = dir1[0] + dir1[1] + dir1[2] + dir1[3];
    im1 = crs1[0] - crs1[1] + crs1[2] - crs1[3];
    re2 = dir2[0] + dir2[1] + dir2[2] + dir2[3];
    im2 = crs2[0] - crs2[1] + crs2[2] - crs2[3];
    re3 = dir3[0] + dir3[1] + dir3[2] + dir3[3];
    im3 = crs3[0] - crs3[1] + crs3[2] - crs3[3];
#endif

    /* y[c] += alpha * (re, im), with the XCONJ sign convention if set. */
#if defined(XCONJ)
    y[0] += alpha_r * re0 + alpha_i * im0;
    y[1] -= alpha_r * im0 - alpha_i * re0;
    y[2] += alpha_r * re1 + alpha_i * im1;
    y[3] -= alpha_r * im1 - alpha_i * re1;
    y[4] += alpha_r * re2 + alpha_i * im2;
    y[5] -= alpha_r * im2 - alpha_i * re2;
    y[6] += alpha_r * re3 + alpha_i * im3;
    y[7] -= alpha_r * im3 - alpha_i * re3;
#else
    y[0] += alpha_r * re0 - alpha_i * im0;
    y[1] += alpha_r * im0 + alpha_i * re0;
    y[2] += alpha_r * re1 - alpha_i * im1;
    y[3] += alpha_r * im1 + alpha_i * re1;
    y[4] += alpha_r * re2 - alpha_i * im2;
    y[5] += alpha_r * im2 + alpha_i * re2;
    y[6] += alpha_r * re3 - alpha_i * im3;
    y[7] += alpha_r * im3 + alpha_i * re3;
#endif
}
/*
 * Dot products of two matrix columns with x, scaled by alpha and added to
 * two consecutive complex outputs:
 *     y[c] += alpha * sum_k A(k, c) * x(k),   c = 0..1
 * on interleaved (re, im) floats; conjugation variant selected by
 * CONJ / XCONJ at compile time.
 *
 *   n                 number of complex elements per column; four per
 *                     loop iteration
 *   lda               distance between the two columns, in floats
 *   ap                first column
 *   x                 input vector, contiguous
 *   y                 two complex outputs, y[0..3], updated in place
 *   alpha_r, alpha_i  real / imaginary part of alpha
 */
static void cgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
    __vector unsigned char pair_swap = *((__vector unsigned char *) swap_mask_arr);

    __vector float *col0 = (__vector float *) ap;
    __vector float *col1 = (__vector float *) (ap + lda);
    __vector float *xv = (__vector float *) x;

    /* dirN: (re*re, im*im, ...) sums; crsN: (im*re, re*im, ...) sums. */
    __vector float dir0 = {0.0, 0.0, 0.0, 0.0};
    __vector float crs0 = {0.0, 0.0, 0.0, 0.0};
    __vector float dir1 = {0.0, 0.0, 0.0, 0.0};
    __vector float crs1 = {0.0, 0.0, 0.0, 0.0};

    FLOAT re0, im0, re1, im1;
    const BLASLONG nvec = n / 2;    /* 4-float vectors per column */
    BLASLONG k;

    for (k = 0; k < nvec; k += 2) {
        __vector float x_lo = xv[k];
        __vector float x_hi = xv[k + 1];
        __vector float xs_lo = vec_perm(x_lo, x_lo, pair_swap);
        __vector float xs_hi = vec_perm(x_hi, x_hi, pair_swap);

        dir0 += x_lo*col0[k] + x_hi*col0[k + 1];
        crs0 += xs_lo*col0[k] + xs_hi*col0[k + 1];
        dir1 += x_lo*col1[k] + x_hi*col1[k + 1];
        crs1 += xs_lo*col1[k] + xs_hi*col1[k + 1];
    }

    /* Horizontal reduction of the lanes into one complex sum per column. */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    re0 = dir0[0] - dir0[1] + dir0[2] - dir0[3];
    im0 = crs0[0] + crs0[1] + crs0[2] + crs0[3];
    re1 = dir1[0] - dir1[1] + dir1[2] - dir1[3];
    im1 = crs1[0] + crs1[1] + crs1[2] + crs1[3];
#else
    re0 = dir0[0] + dir0[1] + dir0[2] + dir0[3];
    im0 = crs0[0] - crs0[1] + crs0[2] - crs0[3];
    re1 = dir1[0] + dir1[1] + dir1[2] + dir1[3];
    im1 = crs1[0] - crs1[1] + crs1[2] - crs1[3];
#endif

#if defined(XCONJ)
    y[0] += alpha_r * re0 + alpha_i * im0;
    y[1] -= alpha_r * im0 - alpha_i * re0;
    y[2] += alpha_r * re1 + alpha_i * im1;
    y[3] -= alpha_r * im1 - alpha_i * re1;
#else
    y[0] += alpha_r * re0 - alpha_i * im0;
    y[1] += alpha_r * im0 + alpha_i * re0;
    y[2] += alpha_r * re1 - alpha_i * im1;
    y[3] += alpha_r * im1 + alpha_i * re1;
#endif
}
/*
 * Dot product of one matrix column with x, scaled by alpha and added to a
 * single complex output:
 *     y[0] += alpha * sum_k A(k, 0) * x(k)
 * on interleaved (re, im) floats; conjugation variant selected by
 * CONJ / XCONJ at compile time.
 *
 *   n                 number of complex elements; four per loop iteration
 *   ap                the column
 *   x                 input vector, contiguous
 *   y                 one complex output, y[0..1], updated in place
 *   alpha_r, alpha_i  real / imaginary part of alpha
 */
static void cgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
    __vector unsigned char pair_swap = *((__vector unsigned char *) swap_mask_arr);

    __vector float *col = (__vector float *) ap;
    __vector float *xv = (__vector float *) x;

    /* dir: (re*re, im*im, ...) sums; crs: (im*re, re*im, ...) sums. */
    __vector float dir = {0.0, 0.0, 0.0, 0.0};
    __vector float crs = {0.0, 0.0, 0.0, 0.0};

    FLOAT sum_re, sum_im;
    const BLASLONG nvec = n / 2;    /* 4-float vectors in the column */
    BLASLONG k;

    for (k = 0; k < nvec; k += 2) {
        __vector float x_lo = xv[k];
        __vector float x_hi = xv[k + 1];
        __vector float xs_lo = vec_perm(x_lo, x_lo, pair_swap);
        __vector float xs_hi = vec_perm(x_hi, x_hi, pair_swap);

        dir += x_lo*col[k] + x_hi*col[k + 1];
        crs += xs_lo*col[k] + xs_hi*col[k + 1];
    }

    /* Horizontal reduction of the lanes into one complex sum. */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    sum_re = dir[0] - dir[1] + dir[2] - dir[3];
    sum_im = crs[0] + crs[1] + crs[2] + crs[3];
#else
    sum_re = dir[0] + dir[1] + dir[2] + dir[3];
    sum_im = crs[0] - crs[1] + crs[2] - crs[3];
#endif

#if defined(XCONJ)
    y[0] += alpha_r * sum_re + alpha_i * sum_im;
    y[1] -= alpha_r * sum_im - alpha_i * sum_re;
#else
    y[0] += alpha_r * sum_re - alpha_i * sum_im;
    y[1] += alpha_r * sum_im + alpha_i * sum_re;
#endif
}
| /* copy_x: gather n pairs of FLOATs from src into the contiguous array dest. | |||
|  * Consecutive source pairs are inc_src FLOATs apart; destination pairs are | |||
|  * packed back to back. | |||
|  */ | |||
| static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) { | |||
|     FLOAT *in = src; | |||
|     FLOAT *out = dest; | |||
|     BLASLONG remaining = n; | |||
|     while (remaining-- > 0) { | |||
|         out[0] = in[0]; | |||
|         out[1] = in[1]; | |||
|         out += 2; | |||
|         in += inc_src; | |||
|     } | |||
| } | |||
| /* Driver for the transposed complex matrix-vector product. | |||
|  * For every one of the n columns of a (columns are lda complex elements | |||
|  * apart) it accumulates y[j] += alpha * sum over the m rows of a[i][j] * x[i], | |||
|  * where the CONJ and XCONJ macros select the conjugation variant. | |||
|  * Rows are processed in blocks of at most NBMAX by the cgemv_kernel_4x* | |||
|  * helpers; the last m & 3 rows are handled by the scalar code at the end. | |||
|  * inc_x / inc_y are strides in complex elements. dummy1 is not used. | |||
|  * buffer receives a packed copy of the current block of x when x is strided. | |||
|  * Always returns 0. | |||
|  */ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) { | |||
| BLASLONG i; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| FLOAT ybuffer[8], *xbuffer; /* ybuffer: scratch for up to 4 complex results when y is strided */ | |||
| if (m < 1) return (0); /* empty matrix: nothing to accumulate */ | |||
| if (n < 1) return (0); | |||
| inc_x <<= 1; /* convert strides from complex elements to FLOAT units (2 FLOATs per element) */ | |||
| inc_y <<= 1; | |||
| lda <<= 1; | |||
| xbuffer = buffer; | |||
| n1 = n >> 2; /* number of full groups of 4 columns */ | |||
| n2 = n & 3; /* leftover columns (0..3) */ | |||
| m3 = m & 3; /* leftover rows (0..3), handled by the scalar tail below */ | |||
| m1 = m - m3; /* rows handled by the vector kernels */ | |||
| m2 = (m & (NBMAX - 1)) - m3; /* size of the final partial row block; assumes NBMAX is a power of two - defined outside this view, TODO confirm */ | |||
| BLASLONG NB = NBMAX; | |||
| while (NB == NBMAX) { /* one pass per row block; the loop ends after the first block shorter than NBMAX */ | |||
| m1 -= NB; | |||
| if (m1 < 0) { /* fewer than NBMAX kernel rows remain */ | |||
| if (m2 == 0) break; /* no partial block left */ | |||
| NB = m2; | |||
| } | |||
| y_ptr = y; | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| if (inc_x != 2) /* strided x: pack this block of x into the buffer */ | |||
| copy_x(NB, x_ptr, xbuffer, inc_x); | |||
| else | |||
| xbuffer = x_ptr; /* contiguous x is used in place */ | |||
| if (inc_y == 2) { /* contiguous y: the kernels accumulate straight into y */ | |||
| for (i = 0; i < n1; i++) { /* four columns per call */ | |||
| cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); | |||
| a_ptr += lda << 2; | |||
| y_ptr += 8; | |||
| } | |||
| if (n2 & 2) { /* two leftover columns */ | |||
| cgemv_kernel_4x2(NB, lda, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); | |||
| a_ptr += lda << 1; | |||
| y_ptr += 4; | |||
| } | |||
| if (n2 & 1) { /* one leftover column */ | |||
| cgemv_kernel_4x1(NB, a_ptr, xbuffer, y_ptr, alpha_r, alpha_i); | |||
| a_ptr += lda; | |||
| y_ptr += 2; | |||
| } | |||
| } else { /* strided y: kernels accumulate into a zeroed ybuffer, which is then added to y element by element */ | |||
| for (i = 0; i < n1; i++) { | |||
| memset(ybuffer, 0, sizeof (ybuffer)); | |||
| cgemv_kernel_4x4(NB, lda, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); | |||
| a_ptr += lda << 2; | |||
| y_ptr[0] += ybuffer[0]; | |||
| y_ptr[1] += ybuffer[1]; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += ybuffer[2]; | |||
| y_ptr[1] += ybuffer[3]; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += ybuffer[4]; | |||
| y_ptr[1] += ybuffer[5]; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += ybuffer[6]; | |||
| y_ptr[1] += ybuffer[7]; | |||
| y_ptr += inc_y; | |||
| } | |||
| for (i = 0; i < n2; i++) { /* leftover columns, one at a time */ | |||
| memset(ybuffer, 0, sizeof (ybuffer)); | |||
| cgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer, alpha_r, alpha_i); | |||
| a_ptr += lda; | |||
| y_ptr[0] += ybuffer[0]; | |||
| y_ptr[1] += ybuffer[1]; | |||
| y_ptr += inc_y; | |||
| } | |||
| } | |||
| a += 2 * NB; /* step a and x past the NB rows just processed */ | |||
| x += NB * inc_x; | |||
| } | |||
| if (m3 == 0) return (0); /* no leftover rows */ | |||
| x_ptr = x; | |||
| j = 0; | |||
| a_ptr = a; | |||
| y_ptr = y; | |||
| if (m3 == 3) { /* three leftover rows: one column per iteration */ | |||
| FLOAT temp_r; | |||
| FLOAT temp_i; | |||
| FLOAT x0 = x_ptr[0]; /* x0/x1, x2/x3, x4/x5: real/imaginary parts of the three leftover x elements */ | |||
| FLOAT x1 = x_ptr[1]; | |||
| x_ptr += inc_x; | |||
| FLOAT x2 = x_ptr[0]; | |||
| FLOAT x3 = x_ptr[1]; | |||
| x_ptr += inc_x; | |||
| FLOAT x4 = x_ptr[0]; | |||
| FLOAT x5 = x_ptr[1]; | |||
| while (j < n) { | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; | |||
| temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; | |||
| temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; | |||
| temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; | |||
| temp_r += a_ptr[4] * x4 - a_ptr[5] * x5; | |||
| temp_i += a_ptr[4] * x5 + a_ptr[5] * x4; | |||
| #else | |||
| temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; | |||
| temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; | |||
| temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; | |||
| temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; | |||
| temp_r += a_ptr[4] * x4 + a_ptr[5] * x5; | |||
| temp_i += a_ptr[4] * x5 - a_ptr[5] * x4; | |||
| #endif | |||
| #if !defined(XCONJ) | |||
| y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; | |||
| y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; | |||
| #else | |||
| y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; | |||
| y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; | |||
| #endif | |||
| a_ptr += lda; | |||
| y_ptr += inc_y; | |||
| j++; | |||
| } | |||
| return (0); | |||
| } | |||
| if (m3 == 2) { /* two leftover rows: columns are processed in pairs, then a possible odd column */ | |||
| FLOAT temp_r; | |||
| FLOAT temp_i; | |||
| FLOAT temp_r1; | |||
| FLOAT temp_i1; | |||
| FLOAT x0 = x_ptr[0]; | |||
| FLOAT x1 = x_ptr[1]; | |||
| x_ptr += inc_x; | |||
| FLOAT x2 = x_ptr[0]; | |||
| FLOAT x3 = x_ptr[1]; | |||
| while (j < (n & -2)) { /* n & -2 rounds n down to an even column count */ | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; | |||
| temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; | |||
| temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; | |||
| temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; | |||
| a_ptr += lda; | |||
| temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; | |||
| temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; | |||
| temp_r1 += a_ptr[2] * x2 - a_ptr[3] * x3; | |||
| temp_i1 += a_ptr[2] * x3 + a_ptr[3] * x2; | |||
| #else | |||
| temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; | |||
| temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; | |||
| temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; | |||
| temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; | |||
| a_ptr += lda; | |||
| temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; | |||
| temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; | |||
| temp_r1 += a_ptr[2] * x2 + a_ptr[3] * x3; | |||
| temp_i1 += a_ptr[2] * x3 - a_ptr[3] * x2; | |||
| #endif | |||
| #if !defined(XCONJ) | |||
| y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; | |||
| y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; | |||
| y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; | |||
| #else | |||
| y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; | |||
| y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; | |||
| y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; | |||
| #endif | |||
| a_ptr += lda; | |||
| y_ptr += inc_y; | |||
| j += 2; | |||
| } | |||
| while (j < n) { /* at most one odd column remains */ | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; | |||
| temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; | |||
| temp_r += a_ptr[2] * x2 - a_ptr[3] * x3; | |||
| temp_i += a_ptr[2] * x3 + a_ptr[3] * x2; | |||
| #else | |||
| temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; | |||
| temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; | |||
| temp_r += a_ptr[2] * x2 + a_ptr[3] * x3; | |||
| temp_i += a_ptr[2] * x3 - a_ptr[3] * x2; | |||
| #endif | |||
| #if !defined(XCONJ) | |||
| y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; | |||
| y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; | |||
| #else | |||
| y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; | |||
| y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; | |||
| #endif | |||
| a_ptr += lda; | |||
| y_ptr += inc_y; | |||
| j++; | |||
| } | |||
| return (0); | |||
| } | |||
| if (m3 == 1) { /* one leftover row: same pairing of columns as the two-row case */ | |||
| FLOAT temp_r; | |||
| FLOAT temp_i; | |||
| FLOAT temp_r1; | |||
| FLOAT temp_i1; | |||
| FLOAT x0 = x_ptr[0]; | |||
| FLOAT x1 = x_ptr[1]; | |||
| while (j < (n & -2)) { | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; | |||
| temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; | |||
| a_ptr += lda; | |||
| temp_r1 = a_ptr[0] * x0 - a_ptr[1] * x1; | |||
| temp_i1 = a_ptr[0] * x1 + a_ptr[1] * x0; | |||
| #else | |||
| temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; | |||
| temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; | |||
| a_ptr += lda; | |||
| temp_r1 = a_ptr[0] * x0 + a_ptr[1] * x1; | |||
| temp_i1 = a_ptr[0] * x1 - a_ptr[1] * x0; | |||
| #endif | |||
| #if !defined(XCONJ) | |||
| y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; | |||
| y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1; | |||
| y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1; | |||
| #else | |||
| y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; | |||
| y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1; | |||
| y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1; | |||
| #endif | |||
| a_ptr += lda; | |||
| y_ptr += inc_y; | |||
| j += 2; | |||
| } | |||
| while (j < n) { | |||
| #if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) | |||
| temp_r = a_ptr[0] * x0 - a_ptr[1] * x1; | |||
| temp_i = a_ptr[0] * x1 + a_ptr[1] * x0; | |||
| #else | |||
| temp_r = a_ptr[0] * x0 + a_ptr[1] * x1; | |||
| temp_i = a_ptr[0] * x1 - a_ptr[1] * x0; | |||
| #endif | |||
| #if !defined(XCONJ) | |||
| y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i; | |||
| y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r; | |||
| #else | |||
| y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i; | |||
| y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r; | |||
| #endif | |||
| a_ptr += lda; | |||
| y_ptr += inc_y; | |||
| j++; | |||
| } | |||
| return (0); | |||
| } | |||
| return (0); /* not reached: m3 is 1, 2 or 3 here and each case returns above */ | |||
| } | |||
| @@ -0,0 +1,231 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2018, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| /* crot_kernel_8: in-place plane rotation of two float arrays using VSX: | |||
|  *     x <- c*x + s*y,   y <- c*y - s*x   (applied to every float lane). | |||
|  * n counts pairs of floats (the memory operands cover 2*n floats); each | |||
|  * batch handles 64 bytes of x and of y and decrements n by 8. | |||
|  * The asm has no remainder handling, so n is expected to be a positive | |||
|  * multiple of 8 - the visible caller passes n & -8 and only when that is > 0. | |||
|  * Structure: the first batch is loaded up front, loop "1:" computes the | |||
|  * current batch while loading the next one, and "2:" finishes the last batch. | |||
|  */ static void crot_kernel_8 (long n, float *x, float *y, float c, float s) | |||
| { | |||
| __vector float t0; /* t0..t7: scratch vectors; the compiler picks their VSX registers (operands x0..x7) */ | |||
| __vector float t1; | |||
| __vector float t2; | |||
| __vector float t3; | |||
| __vector float t4; | |||
| __vector float t5; | |||
| __vector float t6; | |||
| __vector float t7; | |||
| __asm__ | |||
| ( | |||
| "xscvdpspn 36, %x[cos] \n\t" // load c to all words | |||
| "xxspltw 36, 36, 0 \n\t" | |||
| "xscvdpspn 37, %x[sin] \n\t" // load s to all words | |||
| "xxspltw 37, 37, 0 \n\t" | |||
| "lxvd2x 32, 0, %[x_ptr] \n\t" // load x | |||
| "lxvd2x 33, %[i16], %[x_ptr] \n\t" | |||
| "lxvd2x 34, %[i32], %[x_ptr] \n\t" | |||
| "lxvd2x 35, %[i48], %[x_ptr] \n\t" | |||
| "lxvd2x 48, 0, %[y_ptr] \n\t" // load y | |||
| "lxvd2x 49, %[i16], %[y_ptr] \n\t" | |||
| "lxvd2x 50, %[i32], %[y_ptr] \n\t" | |||
| "lxvd2x 51, %[i48], %[y_ptr] \n\t" | |||
| "addi %[x_ptr], %[x_ptr], 64 \n\t" /* point at the next batch */ | |||
| "addi %[y_ptr], %[y_ptr], 64 \n\t" | |||
| "addic. %[temp_n], %[temp_n], -8 \n\t" /* one batch consumed; sets cr0 for the branch */ | |||
| "ble 2f \n\t" /* only one batch in total: skip the pipelined loop */ | |||
| ".p2align 5 \n\t" | |||
| "1: \n\t" /* main loop: compute the loaded batch while loading the next */ | |||
| "xvmulsp 40, 32, 36 \n\t" // c * x | |||
| "xvmulsp 41, 33, 36 \n\t" | |||
| "xvmulsp 42, 34, 36 \n\t" | |||
| "xvmulsp 43, 35, 36 \n\t" | |||
| "xvmulsp %x[x0], 48, 36 \n\t" // c * y | |||
| "xvmulsp %x[x2], 49, 36 \n\t" | |||
| "xvmulsp %x[x1], 50, 36 \n\t" | |||
| "xvmulsp %x[x3], 51, 36 \n\t" | |||
| "xvmulsp 44, 32, 37 \n\t" // s * x | |||
| "xvmulsp 45, 33, 37 \n\t" | |||
| "lxvd2x 32, 0, %[x_ptr] \n\t" // load x | |||
| "lxvd2x 33, %[i16], %[x_ptr] \n\t" | |||
| "xvmulsp 46, 34, 37 \n\t" | |||
| "xvmulsp 47, 35, 37 \n\t" | |||
| "lxvd2x 34, %[i32], %[x_ptr] \n\t" | |||
| "lxvd2x 35, %[i48], %[x_ptr] \n\t" | |||
| "xvmulsp %x[x4], 48, 37 \n\t" // s * y | |||
| "xvmulsp %x[x5], 49, 37 \n\t" | |||
| "lxvd2x 48, 0, %[y_ptr] \n\t" // load y | |||
| "lxvd2x 49, %[i16], %[y_ptr] \n\t" | |||
| "xvmulsp %x[x6], 50, 37 \n\t" | |||
| "xvmulsp %x[x7], 51, 37 \n\t" | |||
| "lxvd2x 50, %[i32], %[y_ptr] \n\t" | |||
| "lxvd2x 51, %[i48], %[y_ptr] \n\t" | |||
| "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y | |||
| "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y | |||
| "addi %[x_ptr], %[x_ptr], -64 \n\t" /* step back to the batch whose results are stored next */ | |||
| "addi %[y_ptr], %[y_ptr], -64 \n\t" | |||
| "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y | |||
| "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y | |||
| "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x | |||
| "stxvd2x 40, 0, %[x_ptr] \n\t" // store x | |||
| "stxvd2x 41, %[i16], %[x_ptr] \n\t" | |||
| "stxvd2x 42, %[i32], %[x_ptr] \n\t" | |||
| "stxvd2x 43, %[i48], %[x_ptr] \n\t" | |||
| "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y | |||
| "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x3], %[i48], %[y_ptr] \n\t" | |||
| "addi %[x_ptr], %[x_ptr], 128 \n\t" /* skip the stored batch and the batch already loaded */ | |||
| "addi %[y_ptr], %[y_ptr], 128 \n\t" | |||
| "addic. %[temp_n], %[temp_n], -8 \n\t" | |||
| "bgt 1b \n\t" | |||
| "2: \n\t" /* last batch: compute and store only, no further loads */ | |||
| "xvmulsp 40, 32, 36 \n\t" // c * x | |||
| "xvmulsp 41, 33, 36 \n\t" | |||
| "xvmulsp 42, 34, 36 \n\t" | |||
| "xvmulsp 43, 35, 36 \n\t" | |||
| "xvmulsp %x[x0], 48, 36 \n\t" // c * y | |||
| "xvmulsp %x[x2], 49, 36 \n\t" | |||
| "xvmulsp %x[x1], 50, 36 \n\t" | |||
| "xvmulsp %x[x3], 51, 36 \n\t" | |||
| "xvmulsp 44, 32, 37 \n\t" // s * x | |||
| "xvmulsp 45, 33, 37 \n\t" | |||
| "xvmulsp 46, 34, 37 \n\t" | |||
| "xvmulsp 47, 35, 37 \n\t" | |||
| "xvmulsp %x[x4], 48, 37 \n\t" // s * y | |||
| "xvmulsp %x[x5], 49, 37 \n\t" | |||
| "xvmulsp %x[x6], 50, 37 \n\t" | |||
| "xvmulsp %x[x7], 51, 37 \n\t" | |||
| "addi %[x_ptr], %[x_ptr], -64 \n\t" | |||
| "addi %[y_ptr], %[y_ptr], -64 \n\t" | |||
| "xvaddsp 40, 40, %x[x4] \n\t" // c * x + s * y | |||
| "xvaddsp 41, 41, %x[x5] \n\t" // c * x + s * y | |||
| "xvaddsp 42, 42, %x[x6] \n\t" // c * x + s * y | |||
| "xvaddsp 43, 43, %x[x7] \n\t" // c * x + s * y | |||
| "xvsubsp %x[x0], %x[x0], 44 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x2], %x[x2], 45 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x1], %x[x1], 46 \n\t" // c * y - s * x | |||
| "xvsubsp %x[x3], %x[x3], 47 \n\t" // c * y - s * x | |||
| "stxvd2x 40, 0, %[x_ptr] \n\t" // store x | |||
| "stxvd2x 41, %[i16], %[x_ptr] \n\t" | |||
| "stxvd2x 42, %[i32], %[x_ptr] \n\t" | |||
| "stxvd2x 43, %[i48], %[x_ptr] \n\t" | |||
| "stxvd2x %x[x0], 0, %[y_ptr] \n\t" // store y | |||
| "stxvd2x %x[x2], %[i16], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x1], %[i32], %[y_ptr] \n\t" | |||
| "stxvd2x %x[x3], %[i48], %[y_ptr] " | |||
| : | |||
| [mem_x] "+m" (*(float (*)[2*n])x), /* declares that 2*n floats at x are both read and written */ | |||
| [mem_y] "+m" (*(float (*)[2*n])y), | |||
| [temp_n] "+r" (n), /* n, x and y are modified by the asm, hence read-write operands */ | |||
| [x_ptr] "+&b" (x), | |||
| [y_ptr] "+&b" (y), | |||
| [x0] "=wa" (t0), | |||
| [x1] "=wa" (t2), /* x1 is bound to t2 and x2 to t1; the asm uses the operand names consistently */ | |||
| [x2] "=wa" (t1), | |||
| [x3] "=wa" (t3), | |||
| [x4] "=wa" (t4), | |||
| [x5] "=wa" (t5), | |||
| [x6] "=wa" (t6), | |||
| [x7] "=wa" (t7) | |||
| : | |||
| [cos] "f" (c), | |||
| [sin] "f" (s), | |||
| [i16] "b" (16), /* byte offsets used as index registers by lxvd2x / stxvd2x */ | |||
| [i32] "b" (32), | |||
| [i48] "b" (48) | |||
| : | |||
| "cr0", /* written by addic. */ | |||
| "vs32","vs33","vs34","vs35","vs36","vs37", /* every VSX register named by number in the asm text */ | |||
| "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", | |||
| "vs48","vs49","vs50","vs51" | |||
| ); | |||
| } | |||
| #endif | |||
| int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT c, FLOAT s) | |||
| { | |||
| BLASLONG i=0; | |||
| BLASLONG ix=0,iy=0; | |||
| FLOAT temp[2]; | |||
| BLASLONG inc_x2; | |||
| BLASLONG inc_y2; | |||
| if ( n <= 0 ) return(0); | |||
| if ( (inc_x == 1) && (inc_y == 1) ) | |||
| { | |||
| BLASLONG n1 = n & -8; | |||
| if ( n1 > 0 ) | |||
| { | |||
| crot_kernel_8(n1, x, y, c, s); | |||
| i=n1; | |||
| ix=2*n1; | |||
| } | |||
| while(i < n) | |||
| { | |||
| temp[0] = c*x[ix] + s*y[ix] ; | |||
| temp[1] = c*x[ix+1] + s*y[ix+1] ; | |||
| y[ix] = c*y[ix] - s*x[ix] ; | |||
| y[ix+1] = c*y[ix+1] - s*x[ix+1] ; | |||
| x[ix] = temp[0] ; | |||
| x[ix+1] = temp[1] ; | |||
| ix += 2 ; | |||
| i++ ; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| inc_x2 = 2 * inc_x ; | |||
| inc_y2 = 2 * inc_y ; | |||
| while(i < n) | |||
| { | |||
| temp[0] = c*x[ix] + s*y[iy] ; | |||
| temp[1] = c*x[ix+1] + s*y[iy+1] ; | |||
| y[iy] = c*y[iy] - s*x[ix] ; | |||
| y[iy+1] = c*y[iy+1] - s*x[ix+1] ; | |||
| x[ix] = temp[0] ; | |||
| x[ix+1] = temp[1] ; | |||
| ix += inc_x2 ; | |||
| iy += inc_y2 ; | |||
| i++ ; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "cswap_microk_power8.c" | |||
| #endif | |||
| @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #if defined(POWER8) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "dasum_microk_power8.c" | |||
| #endif | |||
| @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "daxpy_microk_power8.c" | |||
| #endif | |||
| @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "dcopy_microk_power8.c" | |||
| #endif | |||
| @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "ddot_microk_power8.c" | |||
| #endif | |||
| @@ -0,0 +1,249 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2019, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define LOAD ld | |||
| #define STACKSIZE (512 ) /* bytes taken from the stack by the prologue and given back by the epilogue */ | |||
| #define ALPHA_SP (296+192)(SP) /* slot where f1 (alpha) is spilled; 488(SP), just above the v20-v31 save area */ | |||
| #define FZERO (304+192)(SP) /* slot that receives a 32-bit zero */ | |||
| #define M r3 /* M..OFFSET: names for r3..r10 - presumably the incoming kernel arguments, TODO confirm against the caller */ | |||
| #define N r4 | |||
| #define K r5 | |||
| #define A r7 | |||
| #define B r8 | |||
| #define C r9 | |||
| #define LDC r10 | |||
| #define OFFSET r6 | |||
| #define alpha_r vs18 /* vector register holding alpha replicated by lxvdsx below */ | |||
| #define o0 0 | |||
| #define T4 r12 | |||
| #define T3 r11 | |||
| #define C4 r14 /* r14-r31 are saved in the prologue and restored in the epilogue */ | |||
| #define o8 r15 /* o8..o48 are loaded with the constants 8, 16, 24, 32, 48 below */ | |||
| #define o24 r16 | |||
| #define C2 r17 | |||
| #define L r18 | |||
| #define T1 r19 | |||
| #define C3 r20 | |||
| #define TEMP_REG r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define o16 r27 | |||
| #define o32 r28 | |||
| #define o48 r29 | |||
| #define PRE r30 /* loaded with 384 below */ | |||
| #define T2 r31 | |||
| #include "dgemm_macros_power9.S" | |||
| #ifndef NEEDPARAM | |||
| /* Kernel entry. The prologue allocates STACKSIZE bytes, saves f14-f31, | |||
|  * r14-r31 and v20-v31, spills alpha, scales LDC, bails out when M, N or K | |||
|  * is <= 0 and sets up constant index registers. The computation itself comes | |||
|  * from the included dgemm_logic_power9.S. The epilogue at .L999 sets r3 to 0, | |||
|  * restores every saved register, releases the stack and returns. | |||
|  */ PROLOGUE | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE /* allocate the save area */ | |||
| li r0, 0 | |||
| stfd f14, 0(SP) /* f14-f31 at 0..136(SP), 8 bytes each */ | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| std r31, 144(SP) /* r31 down to r14 at 144..280(SP), 8 bytes each */ | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| stxv v20, 288(SP) /* v20-v31 at 288..464(SP), 16 bytes each (192 bytes in total) */ | |||
| stxv v21, 304(SP) | |||
| stxv v22, 320(SP) | |||
| stxv v23, 336(SP) | |||
| stxv v24, 352(SP) | |||
| stxv v25, 368(SP) | |||
| stxv v26, 384(SP) | |||
| stxv v27, 400(SP) | |||
| stxv v28, 416(SP) | |||
| stxv v29, 432(SP) | |||
| stxv v30, 448(SP) | |||
| stxv v31, 464(SP) | |||
| stfd f1, ALPHA_SP /* spill alpha so it can be reloaded as a replicated vector */ | |||
| stw r0, FZERO /* r0 is 0 here: store a 32-bit zero */ | |||
| slwi LDC, LDC, BASE_SHIFT /* LDC <<= BASE_SHIFT; BASE_SHIFT is defined elsewhere - presumably converts elements to bytes */ | |||
| #if defined(TRMMKERNEL) | |||
| ld OFFSET, FRAMESLOT(0) + STACKSIZE(SP) /* TRMM builds reload OFFSET from above this frame; FRAMESLOT is defined elsewhere */ | |||
| #endif | |||
| cmpwi cr0, M, 0 /* leave early when M, N or K is <= 0 */ | |||
| ble .L999_H1 /* .L999_H1 is not defined in the visible part - presumably in an included file */ | |||
| cmpwi cr0, N, 0 | |||
| ble .L999_H1 | |||
| cmpwi cr0, K, 0 | |||
| ble .L999_H1 | |||
| addi T1, SP, 296+192 /* T1 = address of the alpha slot (same offset as ALPHA_SP) */ | |||
| li PRE, 384 | |||
| li o8 , 8 /* constant byte offsets kept in registers */ | |||
| li o16, 16 | |||
| li o24, 24 | |||
| li o32, 32 | |||
| li o48, 48 | |||
| lxvdsx alpha_r, 0, T1 /* load the spilled alpha and replicate it into both doublewords */ | |||
| #include "dgemm_logic_power9.S" | |||
| .L999: /* common exit: restore everything saved by the prologue */ | |||
| addi r3, 0, 0 /* r3 = 0 */ | |||
| lfd f14, 0(SP) | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| lxv v20, 288(SP) | |||
| lxv v21, 304(SP) | |||
| lxv v22, 320(SP) | |||
| lxv v23, 336(SP) | |||
| lxv v24, 352(SP) | |||
| lxv v25, 368(SP) | |||
| lxv v26, 384(SP) | |||
| lxv v27, 400(SP) | |||
| lxv v28, 416(SP) | |||
| lxv v29, 432(SP) | |||
| lxv v30, 448(SP) | |||
| lxv v31, 464(SP) | |||
| addi SP, SP, STACKSIZE /* release the save area */ | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #include "dgemv_n_microk_power8.c" | |||
| #endif | |||