| @@ -149,7 +149,7 @@ matrix: | |||
| - &test-macos | |||
| os: osx | |||
| osx_image: xcode8.3 | |||
| osx_image: xcode10.1 | |||
| before_script: | |||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" | |||
| - brew update | |||
| @@ -160,6 +160,7 @@ matrix: | |||
| - BTYPE="BINARY=64 INTERFACE64=1" | |||
| - <<: *test-macos | |||
| osx_image: xcode8.3 | |||
| env: | |||
| - BTYPE="BINARY=32" | |||
| @@ -42,6 +42,19 @@ endif() | |||
| ####### | |||
| if(MSVC AND MSVC_STATIC_CRT) | |||
| set(CompilerFlags | |||
| CMAKE_CXX_FLAGS | |||
| CMAKE_CXX_FLAGS_DEBUG | |||
| CMAKE_CXX_FLAGS_RELEASE | |||
| CMAKE_C_FLAGS | |||
| CMAKE_C_FLAGS_DEBUG | |||
| CMAKE_C_FLAGS_RELEASE | |||
| ) | |||
| foreach(CompilerFlag ${CompilerFlags}) | |||
| string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") | |||
| endforeach() | |||
| endif() | |||
| message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") | |||
| @@ -62,10 +75,10 @@ endif () | |||
| set(SUBDIRS ${BLASDIRS}) | |||
| if (NOT NO_LAPACK) | |||
| list(APPEND SUBDIRS lapack) | |||
| if(BUILD_RELAPACK) | |||
| list(APPEND SUBDIRS relapack/src) | |||
| endif() | |||
| list(APPEND SUBDIRS lapack) | |||
| endif () | |||
| # set which float types we want to build for | |||
| @@ -134,7 +147,7 @@ endif () | |||
| # Only generate .def for dll on MSVC and always produce pdb files for debug and release | |||
| if(MSVC) | |||
| if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4) | |||
| if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4) | |||
| set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def") | |||
| endif() | |||
| set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi") | |||
| @@ -149,15 +162,9 @@ if (${DYNAMIC_ARCH}) | |||
| endforeach() | |||
| endif () | |||
| # Only build shared libs for MSVC | |||
| if (MSVC) | |||
| set(BUILD_SHARED_LIBS ON) | |||
| endif() | |||
| # add objects to the openblas lib | |||
| add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | |||
| target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>) | |||
| target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>) | |||
| # Android needs to explicitly link against libm | |||
| if(ANDROID) | |||
| @@ -166,7 +173,7 @@ endif() | |||
| # Handle MSVC exports | |||
| if(MSVC AND BUILD_SHARED_LIBS) | |||
| if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4) | |||
| if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4) | |||
| include("${PROJECT_SOURCE_DIR}/cmake/export.cmake") | |||
| else() | |||
| # Creates verbose .def file (51KB vs 18KB) | |||
| @@ -217,6 +224,14 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES | |||
| SOVERSION ${OpenBLAS_MAJOR_VERSION} | |||
| ) | |||
| if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) | |||
| if (NOT MSVC) | |||
| target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition") | |||
| else() | |||
| target_link_libraries(${OpenBLAS_LIBNAME} "/FORCE:MULTIPLE") | |||
| endif() | |||
| endif() | |||
| if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "") | |||
| if (NOT DEFINED ARCH) | |||
| set(ARCH_IN "x86_64") | |||
| @@ -314,7 +329,7 @@ install (FILES ${OPENBLAS_CONFIG_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
| if(NOT NOFORTRAN) | |||
| message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}") | |||
| set(F77BLAS_H ${CMAKE_BINARY_DIR}/f77blas.h) | |||
| set(F77BLAS_H ${CMAKE_BINARY_DIR}/generated/f77blas.h) | |||
| file(WRITE ${F77BLAS_H} "#ifndef OPENBLAS_F77BLAS_H\n") | |||
| file(APPEND ${F77BLAS_H} "#define OPENBLAS_F77BLAS_H\n") | |||
| file(APPEND ${F77BLAS_H} "#include \"openblas_config.h\"\n") | |||
| @@ -327,10 +342,11 @@ endif() | |||
| if(NOT NO_CBLAS) | |||
| message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") | |||
| set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) | |||
| file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) | |||
| string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| file(WRITE ${CMAKE_BINARY_DIR}/cblas.tmp "${CBLAS_H_CONTENTS_NEW}") | |||
| install (FILES ${CMAKE_BINARY_DIR}/cblas.tmp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} RENAME cblas.h) | |||
| file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") | |||
| install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
| endif() | |||
| if(NOT NO_LAPACKE) | |||
| @@ -96,7 +96,7 @@ endif | |||
| @echo | |||
| shared : | |||
| ifndef NO_SHARED | |||
| ifneq ($(NO_SHARED), 1) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) | |||
| @$(MAKE) -C exports so | |||
| @ln -fs $(LIBSONAME) $(LIBPREFIX).so | |||
| @@ -38,3 +38,8 @@ ifeq ($(CORE), THUNDERX2T99) | |||
| CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||
| FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||
| endif | |||
| ifeq ($(CORE), TSV110) | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | |||
| endif | |||
| @@ -58,14 +58,14 @@ ifndef NO_LAPACKE | |||
| endif | |||
| #for install static library | |||
| ifndef NO_STATIC | |||
| ifneq ($(NO_STATIC),1) | |||
| @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||
| endif | |||
| #for install shared library | |||
| ifndef NO_SHARED | |||
| ifneq ($(NO_SHARED),1) | |||
| @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku)) | |||
| @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @@ -106,14 +106,14 @@ ifndef NO_LAPACKE | |||
| endif | |||
| #for install static library | |||
| ifndef NO_STATIC | |||
| ifneq ($(NO_STATIC),1) | |||
| @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||
| endif | |||
| #for install shared library | |||
| ifndef NO_SHARED | |||
| ifneq ($(NO_SHARED),1) | |||
| @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| @@ -138,7 +138,7 @@ endif | |||
| @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
| @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
| ifndef NO_SHARED | |||
| ifneq ($(NO_SHARED),1) | |||
| #ifeq logical or | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly)) | |||
| @echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)" | |||
| @@ -48,6 +48,8 @@ VERSION = 0.3.6.dev | |||
| # HOSTCC = gcc | |||
| # If you need 32bit binary, define BINARY=32, otherwise define BINARY=64 | |||
| # Please note that AVX is not available on 32-bit. | |||
| # Setting BINARY=32 disables AVX/AVX2/AVX-512. | |||
| # BINARY=64 | |||
| # About threaded BLAS. It will be automatically detected if you don't | |||
| @@ -57,7 +59,7 @@ VERSION = 0.3.6.dev | |||
| # USE_THREAD = 0 | |||
| # If you're going to use this library with OpenMP, please comment it in. | |||
| # This flag is always set for POWER8. Don't modify the flag | |||
| # This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8. | |||
| # USE_OPENMP = 1 | |||
| # The OpenMP scheduler to use - by default this is "static" and you | |||
| @@ -68,36 +70,45 @@ VERSION = 0.3.6.dev | |||
| # allow you to select the scheduler from the environment variable OMP_SCHEDULE | |||
| # CCOMMON_OPT += -DOMP_SCHED=dynamic | |||
| # You can define maximum number of threads. Basically it should be | |||
| # less than actual number of cores. If you don't specify one, it's | |||
| # automatically detected by the the script. | |||
| # You can define the maximum number of threads. Basically it should be less | |||
| # than or equal to the number of CPU threads. If you don't specify one, it's | |||
| # automatically detected by the build system. | |||
| # If SMT (aka. HT) is enabled on the system, it may or may not be beneficial to | |||
| # restrict NUM_THREADS to the number of physical cores. By default, the automatic | |||
| # detection includes logical CPUs, thus allowing the use of SMT. | |||
| # Users may opt at runtime to use less than NUM_THREADS threads. | |||
| # | |||
| # Note for package maintainers: you can build OpenBLAS with a large NUM_THREADS | |||
| # value (eg. 32-256) if you expect your users to use that many threads. Due to the way | |||
| # some internal structures are allocated, using a large NUM_THREADS value has a RAM | |||
| # footprint penalty, even if users reduce the actual number of threads at runtime. | |||
| # NUM_THREADS = 24 | |||
| # If you have enabled USE_OPENMP and your application would call | |||
| # OpenBLAS's calculation API from multi threads, please comment it in. | |||
| # This flag defines how many instances of OpenBLAS's calculation API can | |||
| # actually run in parallel. If more threads call OpenBLAS's calculation API, | |||
| # OpenBLAS's calculation API from multiple threads, please comment this in. | |||
| # This flag defines how many instances of OpenBLAS's calculation API can actually | |||
| # run in parallel. If more than NUM_PARALLEL threads call OpenBLAS's calculation API, | |||
| # they need to wait for the preceding API calls to finish or risk data corruption. | |||
| # NUM_PARALLEL = 2 | |||
| # if you don't need to install the static library, please comment it in. | |||
| # If you don't need to install the static library, please comment this in. | |||
| # NO_STATIC = 1 | |||
| # if you don't need generate the shared library, please comment it in. | |||
| # If you don't need to generate the shared library, please comment this in. | |||
| # NO_SHARED = 1 | |||
| # If you don't need CBLAS interface, please comment it in. | |||
| # If you don't need the CBLAS interface, please comment this in. | |||
| # NO_CBLAS = 1 | |||
| # If you only want CBLAS interface without installing Fortran compiler, | |||
| # please comment it in. | |||
| # If you only want the CBLAS interface without installing a Fortran compiler, | |||
| # please comment this in. | |||
| # ONLY_CBLAS = 1 | |||
| # If you don't need LAPACK, please comment it in. | |||
| # If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1. | |||
| # If you don't need LAPACK, please comment this in. | |||
| # If you set NO_LAPACK=1, the build system automatically sets NO_LAPACKE=1. | |||
| # NO_LAPACK = 1 | |||
| # If you don't need LAPACKE (C Interface to LAPACK), please comment it in. | |||
| # If you don't need LAPACKE (C Interface to LAPACK), please comment this in. | |||
| # NO_LAPACKE = 1 | |||
| # Build LAPACK Deprecated functions since LAPACK 3.6.0 | |||
| @@ -106,7 +117,7 @@ BUILD_LAPACK_DEPRECATED = 1 | |||
| # Build RecursiveLAPACK on top of LAPACK | |||
| # BUILD_RELAPACK = 1 | |||
| # If you want to use legacy threaded Level 3 implementation. | |||
| # If you want to use the legacy threaded Level 3 implementation. | |||
| # USE_SIMPLE_THREADED_LEVEL3 = 1 | |||
| # If you want to use the new, still somewhat experimental code that uses | |||
| @@ -116,8 +127,8 @@ BUILD_LAPACK_DEPRECATED = 1 | |||
| # USE_TLS = 1 | |||
| # If you want to drive whole 64bit region by BLAS. Not all Fortran | |||
| # compiler supports this. It's safe to keep comment it out if you | |||
| # are not sure(equivalent to "-i8" option). | |||
| # compilers support this. It's safe to keep this commented out if you | |||
| # are not sure. (This is equivalent to the "-i8" ifort option). | |||
| # INTERFACE64 = 1 | |||
| # Unfortunately most of kernel won't give us high quality buffer. | |||
| @@ -125,10 +136,18 @@ BUILD_LAPACK_DEPRECATED = 1 | |||
| # but it will consume time. If you don't like it, you can disable one. | |||
| NO_WARMUP = 1 | |||
| # If you want to disable CPU/Memory affinity on Linux. | |||
| # Comment this in if you want to disable OpenBLAS's CPU/Memory affinity handling. | |||
| # This feature is only implemented on Linux, and is always disabled on other platforms. | |||
| # Enabling affinity handling may improve performance, especially on NUMA systems, but | |||
| # it may conflict with certain applications that also try to manage affinity. | |||
| # This conflict can result in threads of the application calling OpenBLAS ending up locked | |||
| # to the same core(s) as OpenBLAS, possibly binding all threads to a single core. | |||
| # For this reason, affinity handling is disabled by default. Can be safely enabled if nothing | |||
| # else modifies affinity settings. | |||
| # Note: enabling affinity has been known to cause problems with NumPy and R | |||
| NO_AFFINITY = 1 | |||
| # if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus | |||
| # If you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus | |||
| # BIGNUMA = 1 | |||
| # Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers | |||
| @@ -180,7 +199,7 @@ NO_AFFINITY = 1 | |||
| # been reported to be optimal for certain workloads (50 is the recommended value for Julia). | |||
| # GEMM_MULTITHREAD_THRESHOLD = 4 | |||
| # If you need santy check by comparing reference BLAS. It'll be very | |||
| # If you need sanity check by comparing results to reference BLAS. It'll be very | |||
| # slow (Not implemented yet). | |||
| # SANITY_CHECK = 1 | |||
| @@ -95,6 +95,9 @@ endif | |||
| ifeq ($(TARGET), ZEN) | |||
| GETARCH_FLAGS := -DFORCE_BARCELONA | |||
| endif | |||
| ifeq ($(TARGET), ARMV8) | |||
| GETARCH_FLAGS := -DFORCE_ARMV7 | |||
| endif | |||
| endif | |||
| @@ -152,7 +155,8 @@ GETARCH_FLAGS += -DNO_AVX | |||
| endif | |||
| ifeq ($(BINARY), 32) | |||
| GETARCH_FLAGS += -DNO_AVX | |||
| GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512 | |||
| NO_AVX512 = 1 | |||
| endif | |||
| ifeq ($(NO_AVX2), 1) | |||
| @@ -4,3 +4,7 @@ CCOMMON_OPT += -march=z13 -mzvector | |||
| FCOMMON_OPT += -march=z13 -mzvector | |||
| endif | |||
| ifeq ($(CORE), Z14) | |||
| CCOMMON_OPT += -march=z14 -mzvector | |||
| FCOMMON_OPT += -march=z14 -mzvector | |||
| endif | |||
| @@ -91,7 +91,9 @@ CORTEXA73 | |||
| FALKOR | |||
| THUNDERX | |||
| THUNDERX2T99 | |||
| TSV110 | |||
| 9.System Z: | |||
| ZARCH_GENERIC | |||
| Z13 | |||
| Z14 | |||
| @@ -53,9 +53,9 @@ before_build: | |||
| - ps: if (-Not (Test-Path .\build)) { mkdir build } | |||
| - cd build | |||
| - if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" .. | |||
| - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl .. | |||
| - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. | |||
| - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. | |||
| - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON .. | |||
| - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. | |||
| build_script: | |||
| - cmake --build . | |||
| @@ -2,6 +2,8 @@ | |||
| argv <- commandArgs(trailingOnly = TRUE) | |||
| if (!is.null(options("matprod")[[1]])) options(matprod = "blas") | |||
| nfrom <- 128 | |||
| nto <- 2048 | |||
| nstep <- 128 | |||
| @@ -19,7 +21,6 @@ if (length(argv) > 0) { | |||
| loops <- as.numeric(argv[z]) | |||
| } | |||
| } | |||
| } | |||
| p <- Sys.getenv("OPENBLAS_LOOPS") | |||
| @@ -27,29 +28,21 @@ if (p != "") { | |||
| loops <- as.numeric(p) | |||
| } | |||
| cat(sprintf( | |||
| "From %.0f To %.0f Step=%.0f Loops=%.0f\n", | |||
| nfrom, | |||
| nto, | |||
| nstep, | |||
| loops | |||
| )) | |||
| cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) | |||
| cat(sprintf(" SIZE Flops Time\n")) | |||
| n <- nfrom | |||
| while (n <= nto) { | |||
| A <- matrix(rnorm(n * n), ncol = n, nrow = n) | |||
| A <- matrix(rnorm(n * n), nrow = n) | |||
| ev <- 0 | |||
| z <- system.time(for (l in 1:loops) { | |||
| ev <- eigen(A) | |||
| }) | |||
| mflops <- (26.66 * n * n * n) * loops / (z[3] * 1.0e6) | |||
| mflops <- (26.66 * n * n * n) * loops / (z[3] * 1e+06) | |||
| st <- sprintf("%.0fx%.0f :", n, n) | |||
| cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) | |||
| n <- n + nstep | |||
| } | |||
| @@ -2,6 +2,8 @@ | |||
| argv <- commandArgs(trailingOnly = TRUE) | |||
| if (!is.null(options("matprod")[[1]])) options(matprod = "blas") | |||
| nfrom <- 128 | |||
| nto <- 2048 | |||
| nstep <- 128 | |||
| @@ -19,7 +21,6 @@ if (length(argv) > 0) { | |||
| loops <- as.numeric(argv[z]) | |||
| } | |||
| } | |||
| } | |||
| p <- Sys.getenv("OPENBLAS_LOOPS") | |||
| @@ -27,26 +28,13 @@ if (p != "") { | |||
| loops <- as.numeric(p) | |||
| } | |||
| cat(sprintf( | |||
| "From %.0f To %.0f Step=%.0f Loops=%.0f\n", | |||
| nfrom, | |||
| nto, | |||
| nstep, | |||
| loops | |||
| )) | |||
| cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) | |||
| cat(sprintf(" SIZE Flops Time\n")) | |||
| n <- nfrom | |||
| while (n <= nto) { | |||
| A <- matrix(runif(n * n), | |||
| ncol = n, | |||
| nrow = n, | |||
| byrow = TRUE) | |||
| B <- matrix(runif(n * n), | |||
| ncol = n, | |||
| nrow = n, | |||
| byrow = TRUE) | |||
| A <- matrix(runif(n * n), nrow = n) | |||
| B <- matrix(runif(n * n), nrow = n) | |||
| C <- 1 | |||
| z <- system.time(for (l in 1:loops) { | |||
| @@ -54,11 +42,10 @@ while (n <= nto) { | |||
| l <- l + 1 | |||
| }) | |||
| mflops <- (2.0 * n * n * n) * loops / (z[3] * 1.0e6) | |||
| mflops <- (2.0 * n * n * n) * loops / (z[3] * 1e+06) | |||
| st <- sprintf("%.0fx%.0f :", n, n) | |||
| cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) | |||
| n <- n + nstep | |||
| } | |||
| @@ -2,6 +2,8 @@ | |||
| argv <- commandArgs(trailingOnly = TRUE) | |||
| if (!is.null(options("matprod")[[1]])) options(matprod = "blas") | |||
| nfrom <- 128 | |||
| nto <- 2048 | |||
| nstep <- 128 | |||
| @@ -19,7 +21,6 @@ if (length(argv) > 0) { | |||
| loops <- as.numeric(argv[z]) | |||
| } | |||
| } | |||
| } | |||
| p <- Sys.getenv("OPENBLAS_LOOPS") | |||
| @@ -27,31 +28,22 @@ if (p != "") { | |||
| loops <- as.numeric(p) | |||
| } | |||
| cat(sprintf( | |||
| "From %.0f To %.0f Step=%.0f Loops=%.0f\n", | |||
| nfrom, | |||
| nto, | |||
| nstep, | |||
| loops | |||
| )) | |||
| cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops)) | |||
| cat(sprintf(" SIZE Flops Time\n")) | |||
| n <- nfrom | |||
| while (n <= nto) { | |||
| A <- matrix(rnorm(n * n), ncol = n, nrow = n) | |||
| B <- matrix(rnorm(n * n), ncol = n, nrow = n) | |||
| A <- matrix(rnorm(n * n), nrow = n) | |||
| B <- matrix(rnorm(n * n), nrow = n) | |||
| z <- system.time(for (l in 1:loops) { | |||
| solve(A, B) | |||
| }) | |||
| mflops <- | |||
| (2.0 / 3.0 * n * n * n + 2.0 * n * n * n) * loops / (z[3] * 1.0e6) | |||
| mflops <- (8.0 / 3 * n * n * n) * loops / (z[3] * 1e+06) | |||
| st <- sprintf("%.0fx%.0f :", n, n) | |||
| cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3])) | |||
| n <- n + nstep | |||
| } | |||
| @@ -1,7 +1,7 @@ | |||
| #!/usr/bin/perl | |||
| use File::Basename; | |||
| use File::Temp qw(tempfile); | |||
| #use File::Basename; | |||
| # use File::Temp qw(tempfile); | |||
| # Checking cross compile | |||
| $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); | |||
| @@ -12,7 +12,7 @@ $hostarch = "arm64" if ($hostarch eq "aarch64"); | |||
| $hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/); | |||
| $hostarch = "zarch" if ($hostarch eq "s390x"); | |||
| $tmpf = new File::Temp( UNLINK => 1 ); | |||
| #$tmpf = new File::Temp( UNLINK => 1 ); | |||
| $binary = $ENV{"BINARY"}; | |||
| $makefile = shift(@ARGV); | |||
| @@ -31,12 +31,25 @@ if ($?) { | |||
| $cross_suffix = ""; | |||
| if (dirname($compiler_name) ne ".") { | |||
| $cross_suffix .= dirname($compiler_name) . "/"; | |||
| } | |||
| eval "use File::Basename"; | |||
| if ($@){ | |||
| warn "could not load PERL module File::Basename, emulating its functionality"; | |||
| my $dirnam = substr($compiler_name, 0, rindex($compiler_name, "/")-1 ); | |||
| if ($dirnam ne ".") { | |||
| $cross_suffix .= $dirnam . "/"; | |||
| } | |||
| my $basnam = substr($compiler_name, rindex($compiler_name,"/")+1, length($compiler_name)-rindex($compiler_name,"/")-1); | |||
| if ($basnam =~ /([^\s]*-)(.*)/) { | |||
| $cross_suffix .= $1; | |||
| } | |||
| } else { | |||
| if (dirname($compiler_name) ne ".") { | |||
| $cross_suffix .= dirname($compiler_name) . "/"; | |||
| } | |||
| if (basename($compiler_name) =~ /([^\s]*-)(.*)/) { | |||
| $cross_suffix .= $1; | |||
| if (basename($compiler_name) =~ /([^\s]*-)(.*)/) { | |||
| $cross_suffix .= $1; | |||
| } | |||
| } | |||
| $compiler = ""; | |||
| @@ -171,20 +184,26 @@ if ($?) { | |||
| $have_msa = 0; | |||
| if (($architecture eq "mips") || ($architecture eq "mips64")) { | |||
| $code = '"addvi.b $w0, $w1, 1"'; | |||
| $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; | |||
| print $tmpf "#include <msa.h>\n\n"; | |||
| print $tmpf "void main(void){ __asm__ volatile($code); }\n"; | |||
| $args = "$msa_flags -o $tmpf.o -x c $tmpf"; | |||
| my @cmd = ("$compiler_name $args"); | |||
| system(@cmd) == 0; | |||
| if ($? != 0) { | |||
| $have_msa = 0; | |||
| eval "use File::Temp qw(tempfile)"; | |||
| if ($@){ | |||
| warn "could not load PERL module File::Temp, so could not check MSA capatibility"; | |||
| } else { | |||
| $have_msa = 1; | |||
| $tmpf = new File::Temp( UNLINK => 1 ); | |||
| $code = '"addvi.b $w0, $w1, 1"'; | |||
| $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; | |||
| print $tmpf "#include <msa.h>\n\n"; | |||
| print $tmpf "void main(void){ __asm__ volatile($code); }\n"; | |||
| $args = "$msa_flags -o $tmpf.o -x c $tmpf"; | |||
| my @cmd = ("$compiler_name $args"); | |||
| system(@cmd) == 0; | |||
| if ($? != 0) { | |||
| $have_msa = 0; | |||
| } else { | |||
| $have_msa = 1; | |||
| } | |||
| unlink("$tmpf.o"); | |||
| } | |||
| unlink("$tmpf.o"); | |||
| } | |||
| $architecture = x86 if ($data =~ /ARCH_X86/); | |||
| @@ -204,17 +223,25 @@ $binformat = bin64 if ($data =~ /BINARY_64/); | |||
| $no_avx512= 0; | |||
| if (($architecture eq "x86") || ($architecture eq "x86_64")) { | |||
| $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; | |||
| print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n"; | |||
| $args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf"; | |||
| my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); | |||
| system(@cmd) == 0; | |||
| if ($? != 0) { | |||
| $no_avx512 = 1; | |||
| } else { | |||
| eval "use File::Temp qw(tempfile)"; | |||
| if ($@){ | |||
| warn "could not load PERL module File::Temp, so could not check compiler compatibility with AVX512"; | |||
| $no_avx512 = 0; | |||
| } else { | |||
| # $tmpf = new File::Temp( UNLINK => 1 ); | |||
| ($fh,$tmpf) = tempfile( UNLINK => 1 ); | |||
| $code = '"vbroadcastss -4 * 4(%rsi), %zmm2"'; | |||
| print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n"; | |||
| $args = " -march=skylake-avx512 -c -o $tmpf.o -x c $tmpf"; | |||
| my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null"); | |||
| system(@cmd) == 0; | |||
| if ($? != 0) { | |||
| $no_avx512 = 1; | |||
| } else { | |||
| $no_avx512 = 0; | |||
| } | |||
| unlink("tmpf.o"); | |||
| } | |||
| unlink("tmpf.o"); | |||
| } | |||
| $data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; | |||
| @@ -74,6 +74,9 @@ if (DYNAMIC_ARCH) | |||
| if (NOT NO_AVX512) | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) | |||
| endif () | |||
| if (DYNAMIC_LIST) | |||
| set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST}) | |||
| endif () | |||
| endif () | |||
| if (NOT DYNAMIC_CORE) | |||
| @@ -39,6 +39,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) | |||
| if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") | |||
| set(TARGET "BARCELONA") | |||
| endif () | |||
| if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53") | |||
| set(TARGET "ARMV7") | |||
| endif () | |||
| endif () | |||
| if (DEFINED TARGET) | |||
| @@ -184,6 +187,13 @@ if (DYNAMIC_ARCH) | |||
| endif () | |||
| endif () | |||
| if (DYNAMIC_LIST) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_LIST") | |||
| foreach(DCORE ${DYNAMIC_LIST}) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DDYN_${DCORE}") | |||
| endforeach () | |||
| endif () | |||
| if (NO_LAPACK) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACK") | |||
| #Disable LAPACK C interface | |||
| @@ -39,7 +39,11 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") | |||
| set(MIPS64 1) | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") | |||
| set(X86_64 1) | |||
| if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") | |||
| set(X86_64 1) | |||
| else() | |||
| set(X86 1) | |||
| endif() | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*") | |||
| set(X86 1) | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)") | |||
| @@ -78,7 +82,7 @@ endif() | |||
| if (X86_64 OR X86) | |||
| file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }") | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -c -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512) | |||
| if (NO_AVX512 EQUAL 1) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") | |||
| endif() | |||
| @@ -444,7 +444,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246 | |||
| typedef char env_var_t[MAX_PATH]; | |||
| #define readenv(p, n) 0 | |||
| #else | |||
| #ifdef OS_WINDOWS | |||
| #if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) | |||
| typedef char env_var_t[MAX_PATH]; | |||
| #define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p)) | |||
| #else | |||
| @@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| #define HAVE_PREFETCH | |||
| #endif | |||
| #if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && defined(OS_DARWIN) ) | |||
| #define DCBT_ARG 0 | |||
| #else | |||
| #define DCBT_ARG 8 | |||
| @@ -598,9 +598,14 @@ REALNAME:;\ | |||
| #ifndef __64BIT__ | |||
| #define PROLOGUE \ | |||
| .machine "any";\ | |||
| .toc;\ | |||
| .globl .REALNAME;\ | |||
| .globl REALNAME;\ | |||
| .csect REALNAME[DS],3;\ | |||
| REALNAME:;\ | |||
| .long .REALNAME, TOC[tc0], 0;\ | |||
| .csect .text[PR],5;\ | |||
| .REALNAME:; | |||
| .REALNAME: | |||
| #define EPILOGUE \ | |||
| _section_.text:;\ | |||
| @@ -611,9 +616,14 @@ _section_.text:;\ | |||
| #define PROLOGUE \ | |||
| .machine "any";\ | |||
| .toc;\ | |||
| .globl .REALNAME;\ | |||
| .globl REALNAME;\ | |||
| .csect REALNAME[DS],3;\ | |||
| REALNAME:;\ | |||
| .llong .REALNAME, TOC[tc0], 0;\ | |||
| .csect .text[PR], 5;\ | |||
| .REALNAME:; | |||
| .REALNAME: | |||
| #define EPILOGUE \ | |||
| _section_.text:;\ | |||
| @@ -187,7 +187,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||
| y = blas_quick_divide_table[y]; | |||
| __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); | |||
| __asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x): "0" (y)); | |||
| return result; | |||
| #endif | |||
| @@ -210,7 +210,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||
| y = blas_quick_divide_table[y]; | |||
| __asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y)); | |||
| __asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y)); | |||
| return result; | |||
| } | |||
| @@ -39,6 +39,8 @@ | |||
| // Cavium | |||
| #define CPU_THUNDERX 7 | |||
| #define CPU_THUNDERX2T99 8 | |||
| //Hisilicon | |||
| #define CPU_TSV110 9 | |||
| static char *cpuname[] = { | |||
| "UNKNOWN", | |||
| @@ -49,7 +51,8 @@ static char *cpuname[] = { | |||
| "CORTEXA73", | |||
| "FALKOR", | |||
| "THUNDERX", | |||
| "THUNDERX2T99" | |||
| "THUNDERX2T99", | |||
| "TSV110" | |||
| }; | |||
| static char *cpuname_lower[] = { | |||
| @@ -61,7 +64,8 @@ static char *cpuname_lower[] = { | |||
| "cortexa73", | |||
| "falkor", | |||
| "thunderx", | |||
| "thunderx2t99" | |||
| "thunderx2t99", | |||
| "tsv110" | |||
| }; | |||
| int get_feature(char *search) | |||
| @@ -145,6 +149,9 @@ int detect(void) | |||
| return CPU_THUNDERX; | |||
| else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af")) | |||
| return CPU_THUNDERX2T99; | |||
| // HiSilicon | |||
| else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01")) | |||
| return CPU_TSV110; | |||
| } | |||
| p = (char *) NULL ; | |||
| @@ -286,6 +293,21 @@ void get_cpuconfig(void) | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| case CPU_TSV110: | |||
| printf("#define TSV110 \n"); | |||
| printf("#define L1_CODE_SIZE 65536 \n"); | |||
| printf("#define L1_CODE_LINESIZE 64 \n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 4 \n"); | |||
| printf("#define L1_DATA_SIZE 65536 \n"); | |||
| printf("#define L1_DATA_LINESIZE 64 \n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 4 \n"); | |||
| printf("#define L2_SIZE 524228 \n"); | |||
| printf("#define L2_LINESIZE 64 \n"); | |||
| printf("#define L2_ASSOCIATIVE 8 \n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| } | |||
| } | |||
| @@ -228,7 +228,7 @@ int support_avx2(){ | |||
| } | |||
| int support_avx512(){ | |||
| #ifndef NO_AVX512 | |||
| #if !defined(NO_AVX) && !defined(NO_AVX512) | |||
| int eax, ebx, ecx, edx; | |||
| int ret=0; | |||
| @@ -1359,6 +1359,8 @@ int get_cpuname(void){ | |||
| return CPUTYPE_NEHALEM; | |||
| case 12: | |||
| // Apollo Lake | |||
| case 15: | |||
| // Denverton | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| @@ -1376,9 +1378,9 @@ int get_cpuname(void){ | |||
| } | |||
| break; | |||
| case 9: | |||
| case 8: | |||
| case 8: | |||
| switch (model) { | |||
| case 14: // Kaby Lake | |||
| case 14: // Kaby Lake and refreshes | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| @@ -27,9 +27,9 @@ | |||
| #include <string.h> | |||
| #define CPU_GENERIC 0 | |||
| #define CPU_Z13 1 | |||
| #define CPU_Z14 2 | |||
| #define CPU_GENERIC 0 | |||
| #define CPU_Z13 1 | |||
| #define CPU_Z14 2 | |||
| static char *cpuname[] = { | |||
| "ZARCH_GENERIC", | |||
| @@ -64,10 +64,8 @@ int detect(void) | |||
| if (strstr(p, "2964")) return CPU_Z13; | |||
| if (strstr(p, "2965")) return CPU_Z13; | |||
| /* detect z14, but fall back to z13 */ | |||
| if (strstr(p, "3906")) return CPU_Z13; | |||
| if (strstr(p, "3907")) return CPU_Z13; | |||
| if (strstr(p, "3906")) return CPU_Z14; | |||
| if (strstr(p, "3907")) return CPU_Z14; | |||
| return CPU_GENERIC; | |||
| } | |||
| @@ -116,7 +114,14 @@ void get_cpuconfig(void) | |||
| break; | |||
| case CPU_Z14: | |||
| printf("#define Z14\n"); | |||
| printf("#define L1_DATA_SIZE 131072\n"); | |||
| printf("#define L1_DATA_LINESIZE 256\n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 8\n"); | |||
| printf("#define L2_SIZE 4194304\n"); | |||
| printf("#define L2_LINESIZE 256\n"); | |||
| printf("#define L2_ASSOCIATIVE 8\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| } | |||
| } | |||
| @@ -113,7 +113,7 @@ ARCH_X86 | |||
| ARCH_X86_64 | |||
| #endif | |||
| #if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) | |||
| #if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) || defined(__POWERPC__) | |||
| ARCH_POWER | |||
| #endif | |||
| @@ -346,7 +346,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu | |||
| range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width; | |||
| range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | |||
| if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||
| if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu; | |||
| queue[num_cpu].mode = mode; | |||
| queue[num_cpu].routine = trmv_kernel; | |||
| @@ -386,7 +386,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu | |||
| range_m[num_cpu + 1] = range_m[num_cpu] + width; | |||
| range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16); | |||
| if (range_n[num_cpu] > m) range_n[num_cpu] = m; | |||
| if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu; | |||
| queue[num_cpu].mode = mode; | |||
| queue[num_cpu].routine = trmv_kernel; | |||
| @@ -461,13 +461,18 @@ int BLASFUNC(blas_thread_shutdown)(void){ | |||
| SetEvent(pool.killed); | |||
| for(i = 0; i < blas_num_threads - 1; i++){ | |||
| // Could also just use WaitForMultipleObjects | |||
| WaitForSingleObject(blas_threads[i], 5); //INFINITE); | |||
| #ifndef OS_WINDOWSSTORE | |||
| // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP | |||
| TerminateThread(blas_threads[i],0); | |||
| #endif | |||
| CloseHandle(blas_threads[i]); | |||
| } | |||
| CloseHandle(pool.filled); | |||
| CloseHandle(pool.killed); | |||
| blas_server_avail = 0; | |||
| } | |||
| @@ -322,7 +322,7 @@ int support_avx2(){ | |||
| } | |||
| int support_avx512(){ | |||
| #ifndef NO_AVX512 | |||
| #if !defined(NO_AVX) && !defined(NO_AVX512) | |||
| int eax, ebx, ecx, edx; | |||
| int ret=0; | |||
| @@ -566,8 +566,8 @@ static gotoblas_t *get_coretype(void){ | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| //Apollo Lake | |||
| if (model == 12) { | |||
| //Apollo Lake or Denverton | |||
| if (model == 12 || model == 15) { | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| return NULL; | |||
| @@ -198,45 +198,68 @@ int get_num_procs(void); | |||
| #else | |||
| int get_num_procs(void) { | |||
| static int nums = 0; | |||
| cpu_set_t *cpusetp; | |||
| size_t size; | |||
| int ret; | |||
| int i,n; | |||
| cpu_set_t cpuset,*cpusetp; | |||
| size_t size; | |||
| int ret; | |||
| #if defined(__GLIBC_PREREQ) | |||
| #if !__GLIBC_PREREQ(2, 7) | |||
| int i; | |||
| #if !__GLIBC_PREREQ(2, 6) | |||
| int n; | |||
| #endif | |||
| #endif | |||
| #endif | |||
| if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); | |||
| #if !defined(OS_LINUX) | |||
| return nums; | |||
| return nums; | |||
| #endif | |||
| #if !defined(__GLIBC_PREREQ) | |||
| return nums; | |||
| return nums; | |||
| #else | |||
| #if !__GLIBC_PREREQ(2, 3) | |||
| return nums; | |||
| return nums; | |||
| #endif | |||
| #if !__GLIBC_PREREQ(2, 7) | |||
| ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); | |||
| ret = sched_getaffinity(0,sizeof(cpuset), &cpuset); | |||
| if (ret!=0) return nums; | |||
| n=0; | |||
| #if !__GLIBC_PREREQ(2, 6) | |||
| for (i=0;i<nums;i++) | |||
| if (CPU_ISSET(i,cpusetp)) n++; | |||
| if (CPU_ISSET(i,cpuset)) n++; | |||
| nums=n; | |||
| #else | |||
| nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp); | |||
| nums = CPU_COUNT(sizeof(cpuset),&cpuset); | |||
| #endif | |||
| return nums; | |||
| #else | |||
| cpusetp = CPU_ALLOC(nums); | |||
| if (cpusetp == NULL) return nums; | |||
| size = CPU_ALLOC_SIZE(nums); | |||
| ret = sched_getaffinity(0,size,cpusetp); | |||
| if (ret!=0) return nums; | |||
| ret = CPU_COUNT_S(size,cpusetp); | |||
| if (ret > 0 && ret < nums) nums = ret; | |||
| CPU_FREE(cpusetp); | |||
| return nums; | |||
| if (nums >= CPU_SETSIZE) { | |||
| cpusetp = CPU_ALLOC(nums); | |||
| if (cpusetp == NULL) { | |||
| return nums; | |||
| } | |||
| size = CPU_ALLOC_SIZE(nums); | |||
| ret = sched_getaffinity(0,size,cpusetp); | |||
| if (ret!=0) { | |||
| CPU_FREE(cpusetp); | |||
| return nums; | |||
| } | |||
| ret = CPU_COUNT_S(size,cpusetp); | |||
| if (ret > 0 && ret < nums) nums = ret; | |||
| CPU_FREE(cpusetp); | |||
| return nums; | |||
| } else { | |||
| ret = sched_getaffinity(0,sizeof(cpuset),&cpuset); | |||
| if (ret!=0) { | |||
| return nums; | |||
| } | |||
| ret = CPU_COUNT(&cpuset); | |||
| if (ret > 0 && ret < nums) nums = ret; | |||
| return nums; | |||
| } | |||
| #endif | |||
| #endif | |||
| } | |||
| @@ -1290,6 +1313,13 @@ void blas_memory_free_nolock(void * map_address) { | |||
| free(map_address); | |||
| } | |||
| #ifdef SMP | |||
| void blas_thread_memory_cleanup(void) { | |||
| blas_memory_cleanup((void*)get_memory_table()); | |||
| } | |||
| #endif | |||
| void blas_shutdown(void){ | |||
| #ifdef SMP | |||
| BLASFUNC(blas_thread_shutdown)(); | |||
| @@ -1299,7 +1329,7 @@ void blas_shutdown(void){ | |||
| /* Only cleanupIf we were built for threading and TLS was initialized */ | |||
| if (local_storage_key) | |||
| #endif | |||
| blas_memory_cleanup((void*)get_memory_table()); | |||
| blas_thread_memory_cleanup(); | |||
| #ifdef SEEK_ADDRESS | |||
| base_address = 0UL; | |||
| @@ -1529,7 +1559,7 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser | |||
| break; | |||
| case DLL_THREAD_DETACH: | |||
| #if defined(SMP) | |||
| blas_memory_cleanup((void*)get_memory_table()); | |||
| blas_thread_memory_cleanup(); | |||
| #endif | |||
| break; | |||
| case DLL_PROCESS_DETACH: | |||
| @@ -1603,9 +1633,11 @@ void gotoblas_dummy_for_PGI(void) { | |||
| #endif | |||
| #else | |||
| /* USE_TLS / COMPILE_TLS not set */ | |||
| #include <errno.h> | |||
| #ifdef OS_WINDOWS | |||
| #if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT) | |||
| #define ALLOC_WINDOWS | |||
| #ifndef MEM_LARGE_PAGES | |||
| #define MEM_LARGE_PAGES 0x20000000 | |||
| @@ -1619,7 +1651,7 @@ void gotoblas_dummy_for_PGI(void) { | |||
| #include <stdio.h> | |||
| #include <fcntl.h> | |||
| #ifndef OS_WINDOWS | |||
| #if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT) | |||
| #include <sys/mman.h> | |||
| #ifndef NO_SYSV_IPC | |||
| #include <sys/shm.h> | |||
| @@ -1639,7 +1671,7 @@ void gotoblas_dummy_for_PGI(void) { | |||
| #include <sys/resource.h> | |||
| #endif | |||
| #if defined(OS_FREEBSD) || defined(OS_DARWIN) | |||
| #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) | |||
| #include <sys/sysctl.h> | |||
| #include <sys/resource.h> | |||
| #endif | |||
| @@ -1678,9 +1710,12 @@ void gotoblas_dummy_for_PGI(void) { | |||
| #elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC) | |||
| #define CONSTRUCTOR __attribute__ ((constructor)) | |||
| #define DESTRUCTOR __attribute__ ((destructor)) | |||
| #else | |||
| #elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900)) | |||
| #define CONSTRUCTOR __attribute__ ((constructor(101))) | |||
| #define DESTRUCTOR __attribute__ ((destructor(101))) | |||
| #else | |||
| #define CONSTRUCTOR __attribute__ ((constructor)) | |||
| #define DESTRUCTOR __attribute__ ((destructor)) | |||
| #endif | |||
| #ifdef DYNAMIC_ARCH | |||
| @@ -1704,45 +1739,70 @@ void goto_set_num_threads(int num_threads) {}; | |||
| int get_num_procs(void); | |||
| #else | |||
| int get_num_procs(void) { | |||
| static int nums = 0; | |||
| cpu_set_t *cpusetp; | |||
| size_t size; | |||
| int ret; | |||
| int i,n; | |||
| cpu_set_t cpuset,*cpusetp; | |||
| size_t size; | |||
| int ret; | |||
| #if defined(__GLIBC_PREREQ) | |||
| #if !__GLIBC_PREREQ(2, 7) | |||
| int i; | |||
| #if !__GLIBC_PREREQ(2, 6) | |||
| int n; | |||
| #endif | |||
| #endif | |||
| #endif | |||
| if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); | |||
| #if !defined(OS_LINUX) | |||
| return nums; | |||
| return nums; | |||
| #endif | |||
| #if !defined(__GLIBC_PREREQ) | |||
| return nums; | |||
| return nums; | |||
| #else | |||
| #if !__GLIBC_PREREQ(2, 3) | |||
| return nums; | |||
| return nums; | |||
| #endif | |||
| #if !__GLIBC_PREREQ(2, 7) | |||
| ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp); | |||
| ret = sched_getaffinity(0,sizeof(cpuset), &cpuset); | |||
| if (ret!=0) return nums; | |||
| n=0; | |||
| #if !__GLIBC_PREREQ(2, 6) | |||
| for (i=0;i<nums;i++) | |||
| if (CPU_ISSET(i,cpusetp)) n++; | |||
| if (CPU_ISSET(i,cpuset)) n++; | |||
| nums=n; | |||
| #else | |||
| nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp); | |||
| nums = CPU_COUNT(sizeof(cpuset),&cpuset); | |||
| #endif | |||
| return nums; | |||
| #else | |||
| cpusetp = CPU_ALLOC(nums); | |||
| if (cpusetp == NULL) return nums; | |||
| size = CPU_ALLOC_SIZE(nums); | |||
| ret = sched_getaffinity(0,size,cpusetp); | |||
| if (ret!=0) return nums; | |||
| nums = CPU_COUNT_S(size,cpusetp); | |||
| CPU_FREE(cpusetp); | |||
| return nums; | |||
| if (nums >= CPU_SETSIZE) { | |||
| cpusetp = CPU_ALLOC(nums); | |||
| if (cpusetp == NULL) { | |||
| return nums; | |||
| } | |||
| size = CPU_ALLOC_SIZE(nums); | |||
| ret = sched_getaffinity(0,size,cpusetp); | |||
| if (ret!=0) { | |||
| CPU_FREE(cpusetp); | |||
| return nums; | |||
| } | |||
| ret = CPU_COUNT_S(size,cpusetp); | |||
| if (ret > 0 && ret < nums) nums = ret; | |||
| CPU_FREE(cpusetp); | |||
| return nums; | |||
| } else { | |||
| ret = sched_getaffinity(0,sizeof(cpuset),&cpuset); | |||
| if (ret!=0) { | |||
| return nums; | |||
| } | |||
| ret = CPU_COUNT(&cpuset); | |||
| if (ret > 0 && ret < nums) nums = ret; | |||
| return nums; | |||
| } | |||
| #endif | |||
| #endif | |||
| } | |||
| @@ -1756,7 +1816,7 @@ int get_num_procs(void) { | |||
| return nums; | |||
| } | |||
| #endif | |||
| #ifdef OS_HAIKU | |||
| int get_num_procs(void) { | |||
| static int nums = 0; | |||
| @@ -1793,7 +1853,7 @@ int get_num_procs(void) { | |||
| #endif | |||
| #if defined(OS_FREEBSD) | |||
| #if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) | |||
| int get_num_procs(void) { | |||
| @@ -1870,7 +1930,7 @@ void openblas_fork_handler() | |||
| // http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035 | |||
| // In the mean time build with USE_OPENMP=0 or link against another | |||
| // implementation of OpenMP. | |||
| #if !(defined(OS_WINDOWS) || defined(OS_ANDROID)) && defined(SMP_SERVER) | |||
| #if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER) | |||
| int err; | |||
| err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL); | |||
| if(err != 0) | |||
| @@ -1883,7 +1943,7 @@ extern int openblas_goto_num_threads_env(); | |||
| extern int openblas_omp_num_threads_env(); | |||
| int blas_get_cpu_number(void){ | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| int max_num; | |||
| #endif | |||
| int blas_goto_num = 0; | |||
| @@ -1891,11 +1951,11 @@ int blas_get_cpu_number(void){ | |||
| if (blas_num_threads) return blas_num_threads; | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| max_num = get_num_procs(); | |||
| #endif | |||
| blas_goto_num = 0; | |||
| // blas_goto_num = 0; | |||
| #ifndef USE_OPENMP | |||
| blas_goto_num=openblas_num_threads_env(); | |||
| if (blas_goto_num < 0) blas_goto_num = 0; | |||
| @@ -1907,7 +1967,7 @@ int blas_get_cpu_number(void){ | |||
| #endif | |||
| blas_omp_num = 0; | |||
| // blas_omp_num = 0; | |||
| blas_omp_num=openblas_omp_num_threads_env(); | |||
| if (blas_omp_num < 0) blas_omp_num = 0; | |||
| @@ -1915,7 +1975,7 @@ int blas_get_cpu_number(void){ | |||
| else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; | |||
| else blas_num_threads = MAX_CPU_NUMBER; | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| if (blas_num_threads > max_num) blas_num_threads = max_num; | |||
| #endif | |||
| @@ -2002,11 +2062,15 @@ static void *alloc_mmap(void *address){ | |||
| } | |||
| if (map_address != (void *)-1) { | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| LOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| release_info[release_pos].address = map_address; | |||
| release_info[release_pos].func = alloc_mmap_free; | |||
| release_pos ++; | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| } | |||
| #ifdef OS_LINUX | |||
| @@ -2148,14 +2212,18 @@ static void *alloc_mmap(void *address){ | |||
| #if defined(OS_LINUX) && !defined(NO_WARMUP) | |||
| } | |||
| #endif | |||
| LOCK_COMMAND(&alloc_lock); | |||
| if (map_address != (void *)-1) { | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| LOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| release_info[release_pos].address = map_address; | |||
| release_info[release_pos].func = alloc_mmap_free; | |||
| release_pos ++; | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| } | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| return map_address; | |||
| } | |||
| @@ -2523,7 +2591,7 @@ void *blas_memory_alloc(int procpos){ | |||
| int position; | |||
| #if defined(WHEREAMI) && !defined(USE_OPENMP) | |||
| int mypos; | |||
| int mypos = 0; | |||
| #endif | |||
| void *map_address; | |||
| @@ -2554,6 +2622,11 @@ void *blas_memory_alloc(int procpos){ | |||
| NULL, | |||
| }; | |||
| void *(**func)(void *address); | |||
| #if defined(USE_OPENMP) | |||
| if (!memory_initialized) { | |||
| #endif | |||
| LOCK_COMMAND(&alloc_lock); | |||
| if (!memory_initialized) { | |||
| @@ -2589,6 +2662,9 @@ void *blas_memory_alloc(int procpos){ | |||
| } | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| #if defined(USE_OPENMP) | |||
| } | |||
| #endif | |||
| #ifdef DEBUG | |||
| printf("Alloc Start ...\n"); | |||
| @@ -2603,13 +2679,17 @@ void *blas_memory_alloc(int procpos){ | |||
| do { | |||
| if (!memory[position].used && (memory[position].pos == mypos)) { | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| LOCK_COMMAND(&alloc_lock); | |||
| // blas_lock(&memory[position].lock); | |||
| #else | |||
| blas_lock(&memory[position].lock); | |||
| #endif | |||
| if (!memory[position].used) goto allocation; | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| // blas_unlock(&memory[position].lock); | |||
| #else | |||
| blas_unlock(&memory[position].lock); | |||
| #endif | |||
| } | |||
| position ++; | |||
| @@ -2621,21 +2701,26 @@ void *blas_memory_alloc(int procpos){ | |||
| position = 0; | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| LOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| do { | |||
| /* if (!memory[position].used) { */ | |||
| /* blas_lock(&memory[position].lock);*/ | |||
| #if defined(USE_OPENMP) | |||
| if (!memory[position].used) { | |||
| blas_lock(&memory[position].lock); | |||
| #endif | |||
| if (!memory[position].used) goto allocation; | |||
| /* blas_unlock(&memory[position].lock);*/ | |||
| /* } */ | |||
| #if defined(USE_OPENMP) | |||
| blas_unlock(&memory[position].lock); | |||
| } | |||
| #endif | |||
| position ++; | |||
| } while (position < NUM_BUFFERS); | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| goto error; | |||
| allocation : | |||
| @@ -2645,10 +2730,11 @@ void *blas_memory_alloc(int procpos){ | |||
| #endif | |||
| memory[position].used = 1; | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| /* blas_unlock(&memory[position].lock);*/ | |||
| #else | |||
| blas_unlock(&memory[position].lock); | |||
| #endif | |||
| if (!memory[position].addr) { | |||
| do { | |||
| #ifdef DEBUG | |||
| @@ -2693,9 +2779,13 @@ void *blas_memory_alloc(int procpos){ | |||
| } while ((BLASLONG)map_address == -1); | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| LOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| memory[position].addr = map_address; | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| #ifdef DEBUG | |||
| printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position); | |||
| @@ -2749,8 +2839,9 @@ void blas_memory_free(void *free_area){ | |||
| #endif | |||
| position = 0; | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| LOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) | |||
| position++; | |||
| @@ -2764,7 +2855,9 @@ void blas_memory_free(void *free_area){ | |||
| WMB; | |||
| memory[position].used = 0; | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| #ifdef DEBUG | |||
| printf("Unmap Succeeded.\n\n"); | |||
| @@ -2779,8 +2872,9 @@ void blas_memory_free(void *free_area){ | |||
| for (position = 0; position < NUM_BUFFERS; position++) | |||
| printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used); | |||
| #endif | |||
| #if defined(SMP) && !defined(USE_OPENMP) | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| return; | |||
| } | |||
| @@ -141,6 +141,14 @@ else | |||
| $(OBJCOPY) --redefine-syms objcopy.def ../$(LIBNAME) ../$(LIBNAME).renamed | |||
| ../$(LIBSONAME) : ../$(LIBNAME).renamed linktest.c | |||
| endif | |||
| ifeq ($(F_COMPILER), INTEL) | |||
| $(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | |||
| -Wl,--whole-archive $< -Wl,--no-whole-archive \ | |||
| -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) | |||
| $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | |||
| else | |||
| ifneq ($(C_COMPILER), LSB) | |||
| $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | |||
| -Wl,--whole-archive $< -Wl,--no-whole-archive \ | |||
| @@ -152,6 +160,7 @@ else | |||
| -Wl,--whole-archive $< -Wl,--no-whole-archive \ | |||
| -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) | |||
| $(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | |||
| endif | |||
| endif | |||
| rm -f linktest | |||
| @@ -40,15 +40,25 @@ | |||
| void gotoblas_init(void); | |||
| void gotoblas_quit(void); | |||
| #if defined(SMP) && defined(USE_TLS) | |||
| void blas_thread_memory_cleanup(void); | |||
| #endif | |||
| BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) { | |||
| if (reason == DLL_PROCESS_ATTACH) { | |||
| gotoblas_init(); | |||
| } | |||
| if (reason == DLL_PROCESS_DETACH) { | |||
| gotoblas_quit(); | |||
| switch(reason) { | |||
| case DLL_PROCESS_ATTACH: | |||
| gotoblas_init(); | |||
| break; | |||
| case DLL_PROCESS_DETACH: | |||
| gotoblas_quit(); | |||
| break; | |||
| case DLL_THREAD_ATTACH: | |||
| break; | |||
| case DLL_THREAD_DETACH: | |||
| #if defined(SMP) && defined(USE_TLS) | |||
| blas_thread_memory_cleanup(); | |||
| #endif | |||
| break; | |||
| } | |||
| return TRUE; | |||
| @@ -91,6 +91,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include <unistd.h> | |||
| #endif | |||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) | |||
| #else | |||
| #define NO_AVX512 | |||
| #endif | |||
| /* #define FORCE_P2 */ | |||
| /* #define FORCE_KATMAI */ | |||
| /* #define FORCE_COPPERMINE */ | |||
| @@ -327,6 +331,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #ifdef FORCE_SKYLAKEX | |||
| #ifdef NO_AVX512 | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| #define ARCHITECTURE "X86" | |||
| #define SUBARCHITECTURE "HASWELL" | |||
| #define ARCHCONFIG "-DHASWELL " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ | |||
| "-DFMA3" | |||
| #define LIBNAME "haswell" | |||
| #define CORENAME "HASWELL" | |||
| #else | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| #define ARCHITECTURE "X86" | |||
| @@ -340,6 +358,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define LIBNAME "skylakex" | |||
| #define CORENAME "SKYLAKEX" | |||
| #endif | |||
| #endif | |||
| #ifdef FORCE_ATOM | |||
| #define FORCE | |||
| @@ -1058,6 +1077,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_TSV110 | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM64" | |||
| #define SUBARCHITECTURE "TSV110" | |||
| #define SUBDIRNAME "arm64" | |||
| #define ARCHCONFIG "-DTSV110 " \ | |||
| "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \ | |||
| "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \ | |||
| "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
| #define LIBNAME "tsv110" | |||
| #define CORENAME "TSV110" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_ZARCH_GENERIC | |||
| #define FORCE | |||
| #define ARCHITECTURE "ZARCH" | |||
| @@ -1078,6 +1114,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "Z13" | |||
| #endif | |||
| #ifdef FORCE_Z14 | |||
| #define FORCE | |||
| #define ARCHITECTURE "ZARCH" | |||
| #define SUBARCHITECTURE "Z14" | |||
| #define ARCHCONFIG "-DZ14 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64" | |||
| #define LIBNAME "z14" | |||
| #define CORENAME "Z14" | |||
| #endif | |||
| #ifndef FORCE | |||
| #ifdef USER_TARGET | |||
| @@ -218,11 +218,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| buffer = (FLOAT *)blas_memory_alloc(1); | |||
| #ifdef SMP | |||
| /* nthreads = num_cpu_avail(2); | |||
| nthreads = num_cpu_avail(2); | |||
| FIXME trmv_thread was found to be broken, see issue 1332 */ | |||
| nthreads = 1; | |||
| if (nthreads == 1) { | |||
| #endif | |||
| @@ -81,6 +81,12 @@ | |||
| #endif | |||
| #endif | |||
| #ifndef COMPLEX | |||
| #define SMP_FACTOR 256 | |||
| #else | |||
| #define SMP_FACTOR 128 | |||
| #endif | |||
| static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { | |||
| #ifndef TRMM | |||
| TRSM_LNUU, TRSM_LNUN, TRSM_LNLU, TRSM_LNLN, | |||
| @@ -366,11 +372,15 @@ void CNAME(enum CBLAS_ORDER order, | |||
| mode |= (trans << BLAS_TRANSA_SHIFT); | |||
| mode |= (side << BLAS_RSIDE_SHIFT); | |||
| if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD ) | |||
| /* | |||
| if ( args.m < 2 * GEMM_MULTITHREAD_THRESHOLD ) | |||
| args.nthreads = 1; | |||
| else | |||
| if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD ) | |||
| if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD ) | |||
| args.nthreads = 1; | |||
| */ | |||
| if ( args.m * args.n < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD) | |||
| args.nthreads = 1; | |||
| else | |||
| args.nthreads = num_cpu_avail(3); | |||
| @@ -239,9 +239,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, | |||
| } else | |||
| nthreads = 1; | |||
| /* FIXME TRMV multithreading appears to be broken, see issue 1332*/ | |||
| nthreads = 1; | |||
| if(nthreads > 1) { | |||
| buffer_size = n > 16 ? 0 : n * 4 + 40; | |||
| } | |||
| @@ -24,7 +24,7 @@ ifeq ($(TARGET), LOONGSON3B) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(TARGET), GENERIC) | |||
| ifeq ($(CORE), GENERIC) | |||
| USE_TRMM = 1 | |||
| endif | |||
| @@ -52,6 +52,10 @@ ifeq ($(ARCH), zarch) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(CORE), Z14) | |||
| USE_TRMM = 1 | |||
| endif | |||
| @@ -53,7 +53,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| while(i < n) | |||
| { | |||
| if( x[ix] > minf ) | |||
| if( x[ix] < minf ) | |||
| { | |||
| min = i; | |||
| minf = x[ix]; | |||
| @@ -0,0 +1,175 @@ | |||
| SAMINKERNEL = ../arm/amin.c | |||
| DAMINKERNEL = ../arm/amin.c | |||
| CAMINKERNEL = ../arm/zamin.c | |||
| ZAMINKERNEL = ../arm/zamin.c | |||
| SMAXKERNEL = ../arm/max.c | |||
| DMAXKERNEL = ../arm/max.c | |||
| SMINKERNEL = ../arm/min.c | |||
| DMINKERNEL = ../arm/min.c | |||
| ISAMINKERNEL = ../arm/iamin.c | |||
| IDAMINKERNEL = ../arm/iamin.c | |||
| ICAMINKERNEL = ../arm/izamin.c | |||
| IZAMINKERNEL = ../arm/izamin.c | |||
| ISMAXKERNEL = ../arm/imax.c | |||
| IDMAXKERNEL = ../arm/imax.c | |||
| ISMINKERNEL = ../arm/imin.c | |||
| IDMINKERNEL = ../arm/imin.c | |||
| STRMMKERNEL = ../generic/trmmkernel_4x4.c | |||
| DTRMMKERNEL = ../generic/trmmkernel_2x2.c | |||
| CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
| ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| SAMAXKERNEL = amax.S | |||
| DAMAXKERNEL = amax.S | |||
| CAMAXKERNEL = zamax.S | |||
| ZAMAXKERNEL = zamax.S | |||
| ISAMAXKERNEL = iamax.S | |||
| IDAMAXKERNEL = iamax.S | |||
| ICAMAXKERNEL = izamax.S | |||
| IZAMAXKERNEL = izamax.S | |||
| SASUMKERNEL = asum.S | |||
| DASUMKERNEL = asum.S | |||
| CASUMKERNEL = casum.S | |||
| ZASUMKERNEL = zasum.S | |||
| SAXPYKERNEL = axpy.S | |||
| DAXPYKERNEL = axpy.S | |||
| CAXPYKERNEL = zaxpy.S | |||
| ZAXPYKERNEL = zaxpy.S | |||
| SCOPYKERNEL = copy.S | |||
| DCOPYKERNEL = copy.S | |||
| CCOPYKERNEL = copy.S | |||
| ZCOPYKERNEL = copy.S | |||
| SDOTKERNEL = dot.S | |||
| DDOTKERNEL = dot.S | |||
| CDOTKERNEL = zdot.S | |||
| ZDOTKERNEL = zdot.S | |||
| DSDOTKERNEL = dot.S | |||
| SNRM2KERNEL = nrm2.S | |||
| DNRM2KERNEL = nrm2.S | |||
| CNRM2KERNEL = znrm2.S | |||
| ZNRM2KERNEL = znrm2.S | |||
| SROTKERNEL = rot.S | |||
| DROTKERNEL = rot.S | |||
| CROTKERNEL = zrot.S | |||
| ZROTKERNEL = zrot.S | |||
| SSCALKERNEL = scal.S | |||
| DSCALKERNEL = scal.S | |||
| CSCALKERNEL = zscal.S | |||
| ZSCALKERNEL = zscal.S | |||
| SSWAPKERNEL = swap.S | |||
| DSWAPKERNEL = swap.S | |||
| CSWAPKERNEL = swap.S | |||
| ZSWAPKERNEL = swap.S | |||
| SGEMVNKERNEL = gemv_n.S | |||
| DGEMVNKERNEL = gemv_n.S | |||
| CGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| SGEMVTKERNEL = gemv_t.S | |||
| DGEMVTKERNEL = gemv_t.S | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||
| ifeq ($(DGEMM_UNROLL_M), 8) | |||
| DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S | |||
| DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S | |||
| else | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||
| endif | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ifeq ($(DGEMM_UNROLL_N), 4) | |||
| DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| else | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||
| endif | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| @@ -45,7 +45,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) | |||
| while(i < n) | |||
| { | |||
| if( x[ix] > minf ) | |||
| if( x[ix] < minf ) | |||
| { | |||
| min = i; | |||
| minf = x[ix]; | |||
| @@ -129,7 +129,7 @@ LL(12): | |||
| STFD f0, 14 * SIZE(CO1) | |||
| STFD f0, 15 * SIZE(CO1) | |||
| dcbst PRE, CO1 | |||
| dcbtst PRE, CO1 | |||
| addi CO1, CO1, 16 * SIZE | |||
| bdnz LL(12) | |||
| .align 4 | |||
| @@ -134,7 +134,7 @@ LL(12): | |||
| STFD f0, 14 * SIZE(CO1) | |||
| STFD f0, 15 * SIZE(CO1) | |||
| dcbst PRE, CO1 | |||
| dcbtst PRE, CO1 | |||
| addi CO1, CO1, 16 * SIZE | |||
| bdnz LL(12) | |||
| .align 4 | |||
| @@ -114,9 +114,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha), // 4 | |||
| @@ -180,10 +180,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "jnz 1b \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha), // 4 | |||
| @@ -112,9 +112,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha), // 4 | |||
| @@ -95,10 +95,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "jnz 1b \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha), // 4 | |||
| @@ -113,10 +113,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "jnz 1b \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha), // 4 | |||
| @@ -181,9 +181,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha), // 4 | |||
| @@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| @@ -175,10 +175,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "vmovups %%xmm4, 16(%4) \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| @@ -98,9 +98,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| @@ -105,10 +105,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "vmovups %%xmm4, 16(%4) \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| @@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| @@ -175,10 +175,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "vmovups %%xmm4, 16(%4) \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| @@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (n), // 0 | |||
| "r" (x), // 1 | |||
| "+r" (n), // 0 | |||
| "+r" (x) // 1 | |||
| : | |||
| "r" (alpha) // 2 | |||
| : "cc", //"%0", "%1", | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| @@ -208,11 +208,11 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (n), // 0 | |||
| "r" (x), // 1 | |||
| "+r" (n), // 0 | |||
| "+r" (x) // 1 | |||
| : | |||
| "r" (alpha) // 2 | |||
| : "cc", //"%0", "%1", | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| @@ -285,11 +285,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (n), // 0 | |||
| "r" (x), // 1 | |||
| "+r" (n), // 0 | |||
| "+r" (x) // 1 | |||
| : | |||
| "r" (alpha) // 2 | |||
| : "cc", //"%0", "%1", | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| @@ -330,11 +330,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (n), // 0 | |||
| "r" (x), // 1 | |||
| "+r" (n), // 0 | |||
| "+r" (x) // 1 | |||
| : | |||
| "r" (alpha) // 2 | |||
| : "cc", //"%0", "%1", | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| @@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (n), // 0 | |||
| "r" (x), // 1 | |||
| "+r" (n), // 0 | |||
| "+r" (x) // 1 | |||
| : | |||
| "r" (alpha) // 2 | |||
| : "cc", //"0", "1", | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| @@ -208,9 +208,9 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (n), // 0 | |||
| "r" (x), // 1 | |||
| "+r" (n), // 0 | |||
| "+r" (x) // 1 | |||
| : | |||
| "r" (alpha) // 2 | |||
| : "cc", // "0", "1", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| @@ -285,9 +285,9 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (n), // 0 | |||
| "r" (x), // 1 | |||
| "+r" (n), // 0 | |||
| "+r" (x) // 1 | |||
| : | |||
| "r" (alpha) // 2 | |||
| : "cc", //"%0", "%1", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| @@ -329,12 +329,12 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (n), // 0 | |||
| "r" (x), // 1 | |||
| : | |||
| "+r" (n), // 0 | |||
| "+r" (x) // 1 | |||
| : | |||
| "r" (alpha) // 2 | |||
| : "cc", //"0", "1", | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| @@ -117,11 +117,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (n), // 0 | |||
| "r" (x), // 1 | |||
| "+r" (n), // 0 | |||
| "+r" (x) // 1 | |||
| : | |||
| "r" (alpha) // 2 | |||
| : "cc", //"0", "1", | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| @@ -208,12 +208,12 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) | |||
| "vzeroupper \n\t" | |||
| : | |||
| "+r" (n), // 0 | |||
| "+r" (x) // 1 | |||
| : | |||
| : | |||
| "r" (n), // 0 | |||
| "r" (x), // 1 | |||
| "r" (alpha) // 2 | |||
| : "cc", //"0", "1", | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| @@ -286,11 +286,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (n), // 0 | |||
| "r" (x), // 1 | |||
| "+r" (n), // 0 | |||
| "+r" (x) // 1 | |||
| : | |||
| "r" (alpha) // 2 | |||
| : "cc", //"%0", "%1", | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| @@ -331,11 +331,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (n), // 0 | |||
| "r" (x), // 1 | |||
| "+r" (n), // 0 | |||
| "+r" (x) // 1 | |||
| : | |||
| "r" (alpha) // 2 | |||
| : "cc", //"0", "1", | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| @@ -64,9 +64,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "jnz 1b \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| @@ -59,10 +59,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "jnz 1b \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| @@ -73,9 +73,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "jnz 1b \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| @@ -78,10 +78,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "subq $16, %1 \n\t" | |||
| "jnz 1b \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| @@ -140,10 +140,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "subq $16, %1 \n\t" | |||
| "jnz 1b \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| @@ -99,10 +99,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| @@ -78,10 +78,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "subq $16, %1 \n\t" | |||
| "jnz 1b \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| @@ -140,10 +140,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "subq $16, %1 \n\t" | |||
| "jnz 1b \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| @@ -65,10 +65,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "vmovsd %%xmm4, (%4) \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| @@ -77,9 +77,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| @@ -75,10 +75,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "movsd %%xmm4, (%4) \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| @@ -81,10 +81,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "vmovsd %%xmm4, (%4) \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| @@ -145,10 +145,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "vmovsd %%xmm4, (%4) \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| @@ -81,10 +81,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "vmovsd %%xmm4, (%4) \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| @@ -78,10 +78,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "vmovsd %%xmm4, (%4) \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| @@ -111,9 +111,9 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| "jnz 1b \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| @@ -166,9 +166,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a | |||
| "jnz 1b \n\t" | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap), // 4 | |||
| @@ -104,6 +104,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| "r" (ap[3]), // 7 | |||
| "r" (alpha) // 8 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", | |||
| @@ -38,42 +38,42 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vbroadcastsd (%2), %%ymm12 \n\t" // x0 | |||
| "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 | |||
| "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 | |||
| "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 | |||
| "vbroadcastsd 32(%2), %%ymm0 \n\t" // x4 | |||
| "vbroadcastsd 40(%2), %%ymm1 \n\t" // x5 | |||
| "vbroadcastsd 48(%2), %%ymm2 \n\t" // x6 | |||
| "vbroadcastsd 56(%2), %%ymm3 \n\t" // x7 | |||
| "vbroadcastsd (%3), %%ymm12 \n\t" // x0 | |||
| "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1 | |||
| "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2 | |||
| "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3 | |||
| "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4 | |||
| "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5 | |||
| "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6 | |||
| "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7 | |||
| "vbroadcastsd (%9), %%ymm6 \n\t" // alpha | |||
| "testq $0x04, %1 \n\t" | |||
| "jz 2f \n\t" | |||
| "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y | |||
| "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y | |||
| "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" | |||
| "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t" | |||
| "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" | |||
| "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t" | |||
| "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" | |||
| "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t" | |||
| "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" | |||
| "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t" | |||
| "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" | |||
| "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t" | |||
| "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" | |||
| "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y | |||
| "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y | |||
| "addq $4 , %8 \n\t" | |||
| "addq $4 , %2 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| @@ -88,35 +88,35 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y | |||
| "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y | |||
| "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" | |||
| "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" | |||
| "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" | |||
| "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" | |||
| "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y | |||
| "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y | |||
| "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t" | |||
| "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t" | |||
| "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t" | |||
| "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t" | |||
| "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t" | |||
| "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t" | |||
| "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t" | |||
| "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t" | |||
| "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t" | |||
| "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t" | |||
| "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t" | |||
| "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" | |||
| "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" | |||
| "addq $8 , %8 \n\t" | |||
| "addq $8 , %2 \n\t" | |||
| "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y | |||
| "subq $8 , %1 \n\t" | |||
| "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y | |||
| "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y | |||
| "jnz 1b \n\t" | |||
| @@ -125,15 +125,15 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| "+r" (n), // 1 | |||
| "+r" (lda4) // 2 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (lda4), // 8 | |||
| "r" (x), // 3 | |||
| "r" (y), // 4 | |||
| "r" (ap[0]), // 5 | |||
| "r" (ap[1]), // 6 | |||
| "r" (ap[2]), // 7 | |||
| "r" (ap[3]), // 8 | |||
| "r" (alpha) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| @@ -127,9 +127,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT | |||
| "movsd %%xmm11,8(%2) \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (y), // 2 | |||
| "r" (ap0), // 3 | |||
| "r" (ap1), // 4 | |||
| @@ -195,9 +195,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) | |||
| "movsd %%xmm10, (%2) \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (y), // 2 | |||
| "r" (ap), // 3 | |||
| "r" (x) // 4 | |||
| @@ -259,9 +259,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d | |||
| "jnz 1b \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (&da), // 2 | |||
| "r" (src), // 3 | |||
| "r" (dest) // 4 | |||
| @@ -105,9 +105,9 @@ static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| @@ -136,8 +136,8 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_ | |||
| "jnz 1b \n\t" | |||
| : | |||
| "+r" (n) // 0 | |||
| : | |||
| "r" (n), // 0 | |||
| "r" (x), // 1 | |||
| "r" (x1), // 2 | |||
| "r" (alpha), // 3 | |||
| @@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (n1), // 0 | |||
| "r" (x), // 1 | |||
| "+r" (n1), // 0 | |||
| "+r" (x) // 1 | |||
| : | |||
| "r" (alpha), // 2 | |||
| "r" (n2) // 3 | |||
| : "cc", | |||
| @@ -188,9 +188,9 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (n1), // 0 | |||
| "r" (x), // 1 | |||
| "+r" (n1), // 0 | |||
| "+r" (x) // 1 | |||
| : | |||
| "r" (alpha), // 2 | |||
| "r" (n2) // 3 | |||
| : "cc", | |||
| @@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (n1), // 0 | |||
| "r" (x), // 1 | |||
| "+r" (n1), // 0 | |||
| "+r" (x) // 1 | |||
| : | |||
| "r" (alpha), // 2 | |||
| "r" (n2) // 3 | |||
| : "cc", | |||
| @@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) | |||
| "vzeroupper \n\t" | |||
| : | |||
| "+r" (n1), // 0 | |||
| "+r" (x) // 1 | |||
| : | |||
| : | |||
| "r" (n1), // 0 | |||
| "r" (x), // 1 | |||
| "r" (alpha), // 2 | |||
| "r" (n2) // 3 | |||
| : "cc", | |||
| @@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (n1), // 0 | |||
| "r" (x), // 1 | |||
| "+r" (n1), // 0 | |||
| "+r" (x) // 1 | |||
| : | |||
| "r" (alpha), // 2 | |||
| "r" (n2) // 3 | |||
| : "cc", | |||
| @@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) | |||
| "vzeroupper \n\t" | |||
| : | |||
| "+r" (n1), // 0 | |||
| "+r" (x) // 1 | |||
| : | |||
| : | |||
| "r" (n1), // 0 | |||
| "r" (x), // 1 | |||
| "r" (alpha), // 2 | |||
| "r" (n2) // 3 | |||
| : "cc", | |||
| @@ -113,8 +113,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL | |||
| "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 | |||
| : | |||
| : | |||
| "r" (from), // 0 | |||
| "+r" (from) // 0 | |||
| : | |||
| "r" (to), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| @@ -105,8 +105,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (from), // 0 | |||
| "+r" (from) // 0 | |||
| : | |||
| "r" (to), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| @@ -108,8 +108,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL | |||
| "movsd %%xmm3 , 24(%9) \n\t" // save temp2 | |||
| : | |||
| : | |||
| "r" (from), // 0 | |||
| "+r" (from) // 0 | |||
| : | |||
| "r" (to), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| @@ -114,8 +114,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (from), // 0 | |||
| "+r" (from) // 0 | |||
| : | |||
| "r" (to), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| @@ -106,9 +106,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT | |||
| "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (a0), // 4 | |||
| @@ -107,9 +107,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (a0), // 4 | |||
| @@ -101,9 +101,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT | |||
| "movsd %%xmm3 , 24(%9) \n\t" // save temp2 | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (a0), // 4 | |||
| @@ -116,9 +116,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (a0), // 4 | |||
| @@ -119,9 +119,9 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON | |||
| " cmpq $0, %0 \n\t" | |||
| " je 4f \n\t" | |||
| " vmovups (%2,%1,4), %%ymm0 \n\t" // read a | |||
| " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 | |||
| " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 | |||
| " vmovups (%8,%1,4), %%ymm0 \n\t" // read a | |||
| " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 | |||
| " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1 | |||
| " addq $8, %1 \n\t" | |||
| @@ -131,18 +131,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON | |||
| " .p2align 4 \n\t" | |||
| "1: \n\t" | |||
| " vmovups (%2,%1,4), %%ymm4 \n\t" // read a | |||
| " vmovups (%8,%1,4), %%ymm4 \n\t" // read a | |||
| " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" | |||
| " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" | |||
| " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" | |||
| " vmovups (%3,%1,8), %%ymm5 \n\t" // read b0 | |||
| " vmovups (%9,%1,8), %%ymm5 \n\t" // read b0 | |||
| " vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t" | |||
| " vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t" | |||
| " vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t" | |||
| " vmovups 32(%3,%1,8), %%ymm6 \n\t" // read b1 | |||
| " vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1 | |||
| " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" | |||
| " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" | |||
| " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" | |||
| @@ -155,18 +155,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON | |||
| " jz 22f \n\t" | |||
| " vmovups (%2,%1,4), %%ymm0 \n\t" // read a | |||
| " vmovups (%8,%1,4), %%ymm0 \n\t" // read a | |||
| " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" | |||
| " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" | |||
| " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" | |||
| " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 | |||
| " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 | |||
| " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" | |||
| " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" | |||
| " vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t" | |||
| " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 | |||
| " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1 | |||
| " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" | |||
| " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" | |||
| @@ -268,7 +268,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON | |||
| " vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7 | |||
| " vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t" | |||
| " vmovups (%9), %%ymm0 \n\t" | |||
| " vmovups (%3), %%ymm0 \n\t" | |||
| " vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t" | |||
| " vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t" | |||
| " vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t" | |||
| @@ -278,7 +278,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON | |||
| " vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t" | |||
| " vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t" | |||
| " vmovups 32(%9), %%ymm4 \n\t" | |||
| " vmovups 32(%3), %%ymm4 \n\t" | |||
| " vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t" | |||
| " vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t" | |||
| " vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t" | |||
| @@ -290,15 +290,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON | |||
| "5: \n\t" // i = 0 | |||
| " addq $64, %9 \n\t" // b=b+8 | |||
| " addq $64, %3 \n\t" // b=b+8 | |||
| " vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb | |||
| " vmovups (%9), %%ymm0 \n\t" | |||
| " vmovups %%ymm8 , (%8) \n\t" // write a | |||
| " vmovups (%3), %%ymm0 \n\t" | |||
| " vmovups %%ymm8 , (%2) \n\t" // write a | |||
| " vmovups %%ymm8 , (%4) \n\t" // write c | |||
| " vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t" | |||
| " vmovups 32(%9), %%ymm1 \n\t" | |||
| " vmovups 32(%3), %%ymm1 \n\t" | |||
| " vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t" | |||
| " vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t" | |||
| " vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t" | |||
| @@ -313,15 +313,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON | |||
| " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" | |||
| " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" | |||
| " addq $64, %9 \n\t" // b=b+8 | |||
| " addq $32, %8 \n\t" // a=a+8 | |||
| " addq $64, %3 \n\t" // b=b+8 | |||
| " addq $32, %2 \n\t" // a=a+8 | |||
| " vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb | |||
| " vmovups (%9), %%ymm0 \n\t" | |||
| " vmovups 32(%9), %%ymm1 \n\t" | |||
| " vmovups %%ymm9 , (%8) \n\t" // write a | |||
| " vmovups (%3), %%ymm0 \n\t" | |||
| " vmovups 32(%3), %%ymm1 \n\t" | |||
| " vmovups %%ymm9 , (%2) \n\t" // write a | |||
| " vmovups %%ymm9 , (%4,%7,1) \n\t" // write c | |||
| " vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t" | |||
| @@ -337,13 +337,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON | |||
| " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" | |||
| " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" | |||
| " addq $64, %9 \n\t" // b=b+8 | |||
| " addq $32, %8 \n\t" // a=a+8 | |||
| " addq $64, %3 \n\t" // b=b+8 | |||
| " addq $32, %2 \n\t" // a=a+8 | |||
| " vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb | |||
| " vmovups (%9), %%ymm0 \n\t" | |||
| " vmovups 32(%9), %%ymm1 \n\t" | |||
| " vmovups %%ymm10, (%8) \n\t" // write a | |||
| " vmovups (%3), %%ymm0 \n\t" | |||
| " vmovups 32(%3), %%ymm1 \n\t" | |||
| " vmovups %%ymm10, (%2) \n\t" // write a | |||
| " vmovups %%ymm10, (%4,%7,2) \n\t" // write c | |||
| " vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t" | |||
| @@ -358,14 +358,14 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON | |||
| " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" | |||
| " addq $64, %9 \n\t" // b=b+8 | |||
| " addq $32, %8 \n\t" // a=a+8 | |||
| " addq $64, %3 \n\t" // b=b+8 | |||
| " addq $32, %2 \n\t" // a=a+8 | |||
| " vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb | |||
| " vmovups 32(%9), %%ymm1 \n\t" | |||
| " vmovups %%ymm11, (%8) \n\t" // write a | |||
| " vmovups 32(%3), %%ymm1 \n\t" | |||
| " vmovups %%ymm11, (%2) \n\t" // write a | |||
| " vmovups %%ymm11, (%5) \n\t" // write c | |||
| " vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t" | |||
| @@ -378,13 +378,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON | |||
| " vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t" | |||
| " addq $64, %9 \n\t" // b=b+8 | |||
| " addq $32, %8 \n\t" // a=a+8 | |||
| " addq $64, %3 \n\t" // b=b+8 | |||
| " addq $32, %2 \n\t" // a=a+8 | |||
| " vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb | |||
| " vmovups 32(%9), %%ymm1 \n\t" | |||
| " vmovups %%ymm12, (%8) \n\t" // write a | |||
| " vmovups 32(%3), %%ymm1 \n\t" | |||
| " vmovups %%ymm12, (%2) \n\t" // write a | |||
| " vmovups %%ymm12, (%5,%7,1) \n\t" // write c | |||
| " vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t" | |||
| @@ -394,12 +394,12 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON | |||
| " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" | |||
| " vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t" | |||
| " addq $64, %9 \n\t" // b=b+8 | |||
| " addq $32, %8 \n\t" // a=a+8 | |||
| " addq $64, %3 \n\t" // b=b+8 | |||
| " addq $32, %2 \n\t" // a=a+8 | |||
| " vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb | |||
| " vmovups 32(%9), %%ymm1 \n\t" | |||
| " vmovups %%ymm13, (%8) \n\t" // write a | |||
| " vmovups 32(%3), %%ymm1 \n\t" | |||
| " vmovups %%ymm13, (%2) \n\t" // write a | |||
| " vmovups %%ymm13, (%5,%7,2) \n\t" // write c | |||
| " vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t" | |||
| @@ -408,39 +408,39 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON | |||
| " vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t" | |||
| " addq $64, %9 \n\t" // b=b+8 | |||
| " addq $32, %8 \n\t" // a=a+8 | |||
| " addq $64, %3 \n\t" // b=b+8 | |||
| " addq $32, %2 \n\t" // a=a+8 | |||
| " vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb | |||
| " vmovups 32(%9), %%ymm1 \n\t" | |||
| " vmovups %%ymm14, (%8) \n\t" // write a | |||
| " vmovups 32(%3), %%ymm1 \n\t" | |||
| " vmovups %%ymm14, (%2) \n\t" // write a | |||
| " vmovups %%ymm14, (%6) \n\t" // write c | |||
| " vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t" | |||
| " vpermpd $0xff , %%ymm1 , %%ymm0 \n\t" | |||
| " addq $32, %8 \n\t" // a=a+8 | |||
| " addq $32, %2 \n\t" // a=a+8 | |||
| " vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb | |||
| " vmovups %%ymm15, (%8) \n\t" // write a | |||
| " vmovups %%ymm15, (%2) \n\t" // write a | |||
| " vmovups %%ymm15, (%6,%7,1) \n\t" // write c | |||
| " vzeroupper \n\t" | |||
| : | |||
| "+r" (n1), // 0 | |||
| "+a" (i), // 1 | |||
| "+r" (as), // 2 | |||
| "+r" (bs) // 3 | |||
| : | |||
| "r" (n1), // 0 | |||
| "a" (i), // 1 | |||
| "r" (a), // 2 | |||
| "r" (b), // 3 | |||
| "r" (c), // 4 | |||
| "r" (c3), // 5 | |||
| "r" (c6), // 6 | |||
| "r" (ldc), // 7 | |||
| "r" (as), // 8 | |||
| "r" (bs) // 9 | |||
| "r" (a), // 8 | |||
| "r" (b) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| @@ -125,14 +125,14 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON | |||
| " .align 16 \n\t" | |||
| "1: \n\t" | |||
| " prefetcht0 384(%2,%1,8) \n\t" | |||
| " prefetcht0 384(%3,%1,8) \n\t" | |||
| " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b | |||
| " vmovups (%2,%1,8), %%xmm4 \n\t" | |||
| " vmovddup 8(%3,%1,2), %%xmm1 \n\t" | |||
| " vmovups 16(%2,%1,8), %%xmm5 \n\t" | |||
| " vmovups 32(%2,%1,8), %%xmm6 \n\t" | |||
| " vmovups 48(%2,%1,8), %%xmm7 \n\t" | |||
| " prefetcht0 384(%6,%1,8) \n\t" | |||
| " prefetcht0 384(%7,%1,8) \n\t" | |||
| " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b | |||
| " vmovups (%6,%1,8), %%xmm4 \n\t" | |||
| " vmovddup 8(%7,%1,2), %%xmm1 \n\t" | |||
| " vmovups 16(%6,%1,8), %%xmm5 \n\t" | |||
| " vmovups 32(%6,%1,8), %%xmm6 \n\t" | |||
| " vmovups 48(%6,%1,8), %%xmm7 \n\t" | |||
| " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" | |||
| " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" | |||
| @@ -147,13 +147,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON | |||
| " jz 2f \n\t" | |||
| " prefetcht0 384(%2,%1,8) \n\t" | |||
| " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b | |||
| " vmovups (%2,%1,8), %%xmm4 \n\t" | |||
| " vmovddup 8(%3,%1,2), %%xmm1 \n\t" | |||
| " vmovups 16(%2,%1,8), %%xmm5 \n\t" | |||
| " vmovups 32(%2,%1,8), %%xmm6 \n\t" | |||
| " vmovups 48(%2,%1,8), %%xmm7 \n\t" | |||
| " prefetcht0 384(%6,%1,8) \n\t" | |||
| " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b | |||
| " vmovups (%6,%1,8), %%xmm4 \n\t" | |||
| " vmovddup 8(%7,%1,2), %%xmm1 \n\t" | |||
| " vmovups 16(%6,%1,8), %%xmm5 \n\t" | |||
| " vmovups 32(%6,%1,8), %%xmm6 \n\t" | |||
| " vmovups 48(%6,%1,8), %%xmm7 \n\t" | |||
| " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" | |||
| " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" | |||
| @@ -168,13 +168,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON | |||
| " jz 2f \n\t" | |||
| " prefetcht0 384(%2,%1,8) \n\t" | |||
| " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b | |||
| " vmovups (%2,%1,8), %%xmm4 \n\t" | |||
| " vmovddup 8(%3,%1,2), %%xmm1 \n\t" | |||
| " vmovups 16(%2,%1,8), %%xmm5 \n\t" | |||
| " vmovups 32(%2,%1,8), %%xmm6 \n\t" | |||
| " vmovups 48(%2,%1,8), %%xmm7 \n\t" | |||
| " prefetcht0 384(%6,%1,8) \n\t" | |||
| " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b | |||
| " vmovups (%6,%1,8), %%xmm4 \n\t" | |||
| " vmovddup 8(%7,%1,2), %%xmm1 \n\t" | |||
| " vmovups 16(%6,%1,8), %%xmm5 \n\t" | |||
| " vmovups 32(%6,%1,8), %%xmm6 \n\t" | |||
| " vmovups 48(%6,%1,8), %%xmm7 \n\t" | |||
| " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" | |||
| " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" | |||
| @@ -189,13 +189,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON | |||
| " jz 2f \n\t" | |||
| " prefetcht0 384(%2,%1,8) \n\t" | |||
| " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b | |||
| " vmovddup 8(%3,%1,2), %%xmm1 \n\t" | |||
| " vmovups (%2,%1,8), %%xmm4 \n\t" | |||
| " vmovups 16(%2,%1,8), %%xmm5 \n\t" | |||
| " vmovups 32(%2,%1,8), %%xmm6 \n\t" | |||
| " vmovups 48(%2,%1,8), %%xmm7 \n\t" | |||
| " prefetcht0 384(%6,%1,8) \n\t" | |||
| " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b | |||
| " vmovddup 8(%7,%1,2), %%xmm1 \n\t" | |||
| " vmovups (%6,%1,8), %%xmm4 \n\t" | |||
| " vmovups 16(%6,%1,8), %%xmm5 \n\t" | |||
| " vmovups 32(%6,%1,8), %%xmm6 \n\t" | |||
| " vmovups 48(%6,%1,8), %%xmm7 \n\t" | |||
| " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" | |||
| " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" | |||
| @@ -235,18 +235,18 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON | |||
| "3: \n\t" // i = 1 | |||
| " vmovddup (%7), %%xmm1 \n\t" // read b | |||
| " vmovddup 8(%7), %%xmm0 \n\t" // read bb | |||
| " vmovddup (%3), %%xmm1 \n\t" // read b | |||
| " vmovddup 8(%3), %%xmm0 \n\t" // read bb | |||
| " vmulpd %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb | |||
| " vmulpd %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb | |||
| " vmulpd %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb | |||
| " vmulpd %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb | |||
| " vmovups %%xmm12 , (%6) \n\t" // write a | |||
| " vmovups %%xmm13 , 16(%6) \n\t" // write a | |||
| " vmovups %%xmm14 , 32(%6) \n\t" // write a | |||
| " vmovups %%xmm15 , 48(%6) \n\t" // write a | |||
| " vmovups %%xmm12 , (%2) \n\t" // write a | |||
| " vmovups %%xmm13 , 16(%2) \n\t" // write a | |||
| " vmovups %%xmm14 , 32(%2) \n\t" // write a | |||
| " vmovups %%xmm15 , 48(%2) \n\t" // write a | |||
| " vmovups %%xmm12 , (%5) \n\t" // write c1 | |||
| " vmovups %%xmm13 , 16(%5) \n\t" | |||
| @@ -259,20 +259,20 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON | |||
| " vfnmaddpd %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t" | |||
| " \n\t" // i = 0 | |||
| " subq $16 , %7 \n\t" // b = b - 2 | |||
| " subq $64 , %6 \n\t" // a = a - 8 | |||
| " subq $16 , %3 \n\t" // b = b - 2 | |||
| " subq $64 , %2 \n\t" // a = a - 8 | |||
| " vmovddup (%7), %%xmm0 \n\t" // read bb | |||
| " vmovddup (%3), %%xmm0 \n\t" // read bb | |||
| " vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb | |||
| " vmulpd %%xmm9 , %%xmm0 , %%xmm9 \n\t" | |||
| " vmulpd %%xmm10 , %%xmm0 , %%xmm10 \n\t" | |||
| " vmulpd %%xmm11 , %%xmm0 , %%xmm11 \n\t" | |||
| " vmovups %%xmm8 , (%6) \n\t" // write a | |||
| " vmovups %%xmm9 , 16(%6) \n\t" | |||
| " vmovups %%xmm10 , 32(%6) \n\t" | |||
| " vmovups %%xmm11 , 48(%6) \n\t" | |||
| " vmovups %%xmm8 , (%2) \n\t" // write a | |||
| " vmovups %%xmm9 , 16(%2) \n\t" | |||
| " vmovups %%xmm10 , 32(%2) \n\t" | |||
| " vmovups %%xmm11 , 48(%2) \n\t" | |||
| " vmovups %%xmm8 , (%4) \n\t" // write c0 | |||
| " vmovups %%xmm9 , 16(%4) \n\t" | |||
| @@ -282,15 +282,15 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON | |||
| " vzeroupper \n\t" | |||
| : | |||
| "+r" (n1), // 0 | |||
| "+a" (i), // 1 | |||
| "+r" (as), // 2 | |||
| "+r" (bs) // 3 | |||
| : | |||
| "r" (n1), // 0 | |||
| "a" (i), // 1 | |||
| "r" (a), // 2 | |||
| "r" (b), // 3 | |||
| "r" (c), // 4 | |||
| "r" (c1), // 5 | |||
| "r" (as), // 6 | |||
| "r" (bs) // 7 | |||
| "r" (a), // 6 | |||
| "r" (b) // 7 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| @@ -135,7 +135,7 @@ | |||
| #endif | |||
| movq %rsp, %rbx # save old stack | |||
| subq $128 + LOCAL_BUFFER_SIZE, %rsp | |||
| subq $256 + LOCAL_BUFFER_SIZE, %rsp | |||
| andq $-4096, %rsp # align stack | |||
| STACK_TOUCHING | |||
| @@ -383,7 +383,7 @@ | |||
| EMMS | |||
| movq %rsp, %rbx # save old stack | |||
| subq $128 + LOCAL_BUFFER_SIZE, %rsp | |||
| subq $256 + LOCAL_BUFFER_SIZE, %rsp | |||
| andq $-4096, %rsp # align stack | |||
| STACK_TOUCHING | |||
| @@ -59,10 +59,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "jnz 1b \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| @@ -73,9 +73,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "jnz 1b \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| @@ -78,10 +78,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "jnz 1b \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| @@ -139,10 +139,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "jnz 1b \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| @@ -99,10 +99,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (alpha) // 4 | |||
| @@ -66,10 +66,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "vmovss %%xmm4, (%4) \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| @@ -79,10 +79,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "vmovss %%xmm4, (%4) \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| @@ -75,10 +75,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "movss %%xmm4, (%4) \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| @@ -82,10 +82,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "vmovss %%xmm4, (%4) \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| @@ -80,10 +80,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "vmovss %%xmm4, (%4) \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| @@ -143,10 +143,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) | |||
| "vmovss %%xmm4, (%4) \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (dot) // 4 | |||
| @@ -149,9 +149,9 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| "jnz 1b \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| @@ -223,9 +223,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a | |||
| "3: \n\t" | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n1) // 1 | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n1), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap), // 4 | |||
| @@ -277,9 +277,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) | |||
| "jnz 1b \n\t" | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (src), // 2 | |||
| "r" (dest) // 3 | |||
| : "cc", | |||
| @@ -37,14 +37,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vbroadcastss (%2), %%xmm12 \n\t" // x0 | |||
| "vbroadcastss 4(%2), %%xmm13 \n\t" // x1 | |||
| "vbroadcastss 8(%2), %%xmm14 \n\t" // x2 | |||
| "vbroadcastss 12(%2), %%xmm15 \n\t" // x3 | |||
| "vbroadcastss 16(%2), %%xmm0 \n\t" // x4 | |||
| "vbroadcastss 20(%2), %%xmm1 \n\t" // x5 | |||
| "vbroadcastss 24(%2), %%xmm2 \n\t" // x6 | |||
| "vbroadcastss 28(%2), %%xmm3 \n\t" // x7 | |||
| "vbroadcastss (%3), %%xmm12 \n\t" // x0 | |||
| "vbroadcastss 4(%3), %%xmm13 \n\t" // x1 | |||
| "vbroadcastss 8(%3), %%xmm14 \n\t" // x2 | |||
| "vbroadcastss 12(%3), %%xmm15 \n\t" // x3 | |||
| "vbroadcastss 16(%3), %%xmm0 \n\t" // x4 | |||
| "vbroadcastss 20(%3), %%xmm1 \n\t" // x5 | |||
| "vbroadcastss 24(%3), %%xmm2 \n\t" // x6 | |||
| "vbroadcastss 28(%3), %%xmm3 \n\t" // x7 | |||
| "vbroadcastss (%9), %%xmm8 \n\t" // alpha | |||
| @@ -54,22 +54,22 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" | |||
| "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, (%6,%0,4), %%xmm13, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, (%8,%0,4), %%xmm15, %%xmm5 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t" | |||
| "addq $4 , %8 \n\t" | |||
| "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, (%6,%2,4), %%xmm1 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, (%8,%2,4), %%xmm3 , %%xmm5 \n\t" | |||
| "addq $4 , %2 \n\t" | |||
| "vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t" | |||
| "vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t" | |||
| "vfmaddps -16(%4,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y | |||
| "vmovups %%xmm6, -16(%4,%0,4) \n\t" // 4 * y | |||
| "2: \n\t" | |||
| @@ -79,31 +79,31 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" | |||
| "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t" | |||
| "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" | |||
| "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" | |||
| "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y | |||
| "vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y | |||
| "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" | |||
| "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" | |||
| "vmovups %%xmm4, (%4,%0,4) \n\t" // 4 * y | |||
| "vmovups %%xmm5, 16(%4,%0,4) \n\t" // 4 * y | |||
| "addq $8 , %0 \n\t" | |||
| "addq $8 , %8 \n\t" | |||
| "addq $8 , %2 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| @@ -120,62 +120,62 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| "vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t" | |||
| "vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t" | |||
| "prefetcht0 192(%4,%0,4) \n\t" | |||
| "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" | |||
| "prefetcht0 192(%5,%0,4) \n\t" | |||
| "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t" | |||
| "prefetcht0 192(%6,%0,4) \n\t" | |||
| "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t" | |||
| "prefetcht0 192(%7,%0,4) \n\t" | |||
| "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t" | |||
| "prefetcht0 192(%8,%0,4) \n\t" | |||
| "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t" | |||
| ".align 2 \n\t" | |||
| "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t" | |||
| "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t" | |||
| "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t" | |||
| "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t" | |||
| "prefetcht0 192(%4,%8,4) \n\t" | |||
| "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" | |||
| "prefetcht0 192(%5,%8,4) \n\t" | |||
| "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" | |||
| "prefetcht0 192(%6,%8,4) \n\t" | |||
| "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" | |||
| "prefetcht0 192(%7,%8,4) \n\t" | |||
| "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm12, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm12, %%xmm7 \n\t" | |||
| "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm13, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm13, %%xmm7 \n\t" | |||
| "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm14, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm14, %%xmm7 \n\t" | |||
| "vfmaddps %%xmm6, 32(%8,%0,4), %%xmm15, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%8,%0,4), %%xmm15, %%xmm7 \n\t" | |||
| "prefetcht0 192(%5,%2,4) \n\t" | |||
| "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t" | |||
| "prefetcht0 192(%6,%2,4) \n\t" | |||
| "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t" | |||
| "prefetcht0 192(%7,%2,4) \n\t" | |||
| "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t" | |||
| "prefetcht0 192(%8,%2,4) \n\t" | |||
| "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t" | |||
| "vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t" | |||
| "vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t" | |||
| "vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t" | |||
| "vfmaddps %%xmm6, 32(%5,%2,4), %%xmm0 , %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%5,%2,4), %%xmm0 , %%xmm7 \n\t" | |||
| "vfmaddps %%xmm6, 32(%6,%2,4), %%xmm1 , %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%6,%2,4), %%xmm1 , %%xmm7 \n\t" | |||
| "vfmaddps %%xmm6, 32(%7,%2,4), %%xmm2 , %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%7,%2,4), %%xmm2 , %%xmm7 \n\t" | |||
| "vfmaddps %%xmm6, 32(%8,%2,4), %%xmm3 , %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%8,%2,4), %%xmm3 , %%xmm7 \n\t" | |||
| "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" | |||
| "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" | |||
| "vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t" | |||
| "vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t" | |||
| "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" | |||
| "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" | |||
| "vfmaddps 32(%4,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t" | |||
| "vfmaddps 48(%4,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t" | |||
| "addq $16, %0 \n\t" | |||
| "vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y | |||
| "vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y | |||
| "addq $16, %8 \n\t" | |||
| "vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y | |||
| "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y | |||
| "vmovups %%xmm4,-64(%4,%0,4) \n\t" // 4 * y | |||
| "vmovups %%xmm5,-48(%4,%0,4) \n\t" // 4 * y | |||
| "addq $16, %2 \n\t" | |||
| "vmovups %%xmm6,-32(%4,%0,4) \n\t" // 4 * y | |||
| "vmovups %%xmm7,-16(%4,%0,4) \n\t" // 4 * y | |||
| "subq $16, %1 \n\t" | |||
| "jnz 1b \n\t" | |||
| @@ -184,15 +184,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| "+r" (n), // 1 | |||
| "+r" (lda4) // 2 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (lda4), // 8 | |||
| "r" (x), // 3 | |||
| "r" (y), // 4 | |||
| "r" (ap[0]), // 5 | |||
| "r" (ap[1]), // 6 | |||
| "r" (ap[2]), // 7 | |||
| "r" (ap[3]), // 8 | |||
| "r" (alpha) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| @@ -26,7 +26,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x8 1 | |||
| static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); | |||
| @@ -38,41 +37,41 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vbroadcastss (%2), %%ymm12 \n\t" // x0 | |||
| "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 | |||
| "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 | |||
| "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 | |||
| "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 | |||
| "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 | |||
| "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 | |||
| "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 | |||
| "vbroadcastss (%3), %%ymm12 \n\t" // x0 | |||
| "vbroadcastss 4(%3), %%ymm13 \n\t" // x1 | |||
| "vbroadcastss 8(%3), %%ymm14 \n\t" // x2 | |||
| "vbroadcastss 12(%3), %%ymm15 \n\t" // x3 | |||
| "vbroadcastss 16(%3), %%ymm0 \n\t" // x4 | |||
| "vbroadcastss 20(%3), %%ymm1 \n\t" // x5 | |||
| "vbroadcastss 24(%3), %%ymm2 \n\t" // x6 | |||
| "vbroadcastss 28(%3), %%ymm3 \n\t" // x7 | |||
| "vbroadcastss (%9), %%ymm6 \n\t" // alpha | |||
| "testq $0x04, %1 \n\t" | |||
| "jz 2f \n\t" | |||
| "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y | |||
| "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y | |||
| "vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t" | |||
| "vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t" | |||
| "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t" | |||
| "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" | |||
| "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t" | |||
| "vfmadd231ps (%5,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmadd231ps (%6,%0,4), %%xmm13, %%xmm5 \n\t" | |||
| "vfmadd231ps (%7,%0,4), %%xmm14, %%xmm4 \n\t" | |||
| "vfmadd231ps (%8,%0,4), %%xmm15, %%xmm5 \n\t" | |||
| "vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t" | |||
| "vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t" | |||
| "vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t" | |||
| "vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t" | |||
| "vfmadd231ps (%5,%2,4), %%xmm0 , %%xmm4 \n\t" | |||
| "vfmadd231ps (%6,%2,4), %%xmm1 , %%xmm5 \n\t" | |||
| "vfmadd231ps (%7,%2,4), %%xmm2 , %%xmm4 \n\t" | |||
| "vfmadd231ps (%8,%2,4), %%xmm3 , %%xmm5 \n\t" | |||
| "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" | |||
| "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t" | |||
| "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t" | |||
| "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y | |||
| "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y | |||
| "addq $4 , %8 \n\t" | |||
| "addq $4 , %2 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| @@ -81,28 +80,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| "testq $0x08, %1 \n\t" | |||
| "jz 3f \n\t" | |||
| "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y | |||
| "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y | |||
| "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t" | |||
| "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm5 \n\t" | |||
| "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" | |||
| "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t" | |||
| "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" | |||
| "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t" | |||
| "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t" | |||
| "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm5 \n\t" | |||
| "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t" | |||
| "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm5 \n\t" | |||
| "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t" | |||
| "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y | |||
| "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y | |||
| "addq $8 , %8 \n\t" | |||
| "addq $8 , %2 \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| @@ -117,35 +116,35 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y | |||
| "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y | |||
| "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" | |||
| "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" | |||
| "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" | |||
| "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" | |||
| "vmovups (%4,%0,4), %%ymm8 \n\t" // 8 * y | |||
| "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 8 * y | |||
| "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%5,%0,4), %%ymm12, %%ymm5 \n\t" | |||
| "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%7,%0,4), %%ymm14, %%ymm5 \n\t" | |||
| "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%8,%0,4), %%ymm15, %%ymm5 \n\t" | |||
| "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t" | |||
| "addq $16, %0 \n\t" | |||
| "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t" | |||
| "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t" | |||
| "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t" | |||
| "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t" | |||
| "vfmadd231ps 32(%5,%2,4), %%ymm0 , %%ymm5 \n\t" | |||
| "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%6,%2,4), %%ymm1 , %%ymm5 \n\t" | |||
| "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%7,%2,4), %%ymm2 , %%ymm5 \n\t" | |||
| "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%8,%2,4), %%ymm3 , %%ymm5 \n\t" | |||
| "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t" | |||
| "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t" | |||
| "addq $16, %8 \n\t" | |||
| "vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y | |||
| "addq $16, %2 \n\t" | |||
| "vmovups %%ymm8,-64(%4,%0,4) \n\t" // 8 * y | |||
| "subq $16, %1 \n\t" | |||
| "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y | |||
| "vmovups %%ymm9,-32(%4,%0,4) \n\t" // 8 * y | |||
| "jnz 1b \n\t" | |||
| @@ -154,15 +153,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| "+r" (n), // 1 | |||
| "+r" (lda4) // 2 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (lda4), // 8 | |||
| "r" (x), // 3 | |||
| "r" (y), // 4 | |||
| "r" (ap[0]), // 5 | |||
| "r" (ap[1]), // 6 | |||
| "r" (ap[2]), // 7 | |||
| "r" (ap[3]), // 8 | |||
| "r" (alpha) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| @@ -177,7 +176,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| } | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| @@ -196,6 +194,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT | |||
| "vbroadcastss (%8), %%ymm6 \n\t" // alpha | |||
| "testq $0x04, %1 \n\t" | |||
| "jz 2f \n\t" | |||
| @@ -37,19 +37,19 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movss (%2), %%xmm12 \n\t" // x0 | |||
| "movss 4(%2), %%xmm13 \n\t" // x1 | |||
| "movss 8(%2), %%xmm14 \n\t" // x2 | |||
| "movss 12(%2), %%xmm15 \n\t" // x3 | |||
| "movss (%3), %%xmm12 \n\t" // x0 | |||
| "movss 4(%3), %%xmm13 \n\t" // x1 | |||
| "movss 8(%3), %%xmm14 \n\t" // x2 | |||
| "movss 12(%3), %%xmm15 \n\t" // x3 | |||
| "shufps $0, %%xmm12, %%xmm12\n\t" | |||
| "shufps $0, %%xmm13, %%xmm13\n\t" | |||
| "shufps $0, %%xmm14, %%xmm14\n\t" | |||
| "shufps $0, %%xmm15, %%xmm15\n\t" | |||
| "movss 16(%2), %%xmm0 \n\t" // x4 | |||
| "movss 20(%2), %%xmm1 \n\t" // x5 | |||
| "movss 24(%2), %%xmm2 \n\t" // x6 | |||
| "movss 28(%2), %%xmm3 \n\t" // x7 | |||
| "movss 16(%3), %%xmm0 \n\t" // x4 | |||
| "movss 20(%3), %%xmm1 \n\t" // x5 | |||
| "movss 24(%3), %%xmm2 \n\t" // x6 | |||
| "movss 28(%3), %%xmm3 \n\t" // x7 | |||
| "shufps $0, %%xmm0 , %%xmm0 \n\t" | |||
| "shufps $0, %%xmm1 , %%xmm1 \n\t" | |||
| "shufps $0, %%xmm2 , %%xmm2 \n\t" | |||
| @@ -63,13 +63,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| "1: \n\t" | |||
| "xorps %%xmm4 , %%xmm4 \n\t" | |||
| "xorps %%xmm5 , %%xmm5 \n\t" | |||
| "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y | |||
| "movups (%4,%0,4), %%xmm7 \n\t" // 4 * y | |||
| ".p2align 1 \n\t" | |||
| "movups (%4,%0,4), %%xmm8 \n\t" | |||
| "movups (%5,%0,4), %%xmm9 \n\t" | |||
| "movups (%6,%0,4), %%xmm10 \n\t" | |||
| "movups (%7,%0,4), %%xmm11 \n\t" | |||
| "movups (%5,%0,4), %%xmm8 \n\t" | |||
| "movups (%6,%0,4), %%xmm9 \n\t" | |||
| "movups (%7,%0,4), %%xmm10 \n\t" | |||
| "movups (%8,%0,4), %%xmm11 \n\t" | |||
| ".p2align 1 \n\t" | |||
| "mulps %%xmm12, %%xmm8 \n\t" | |||
| "mulps %%xmm13, %%xmm9 \n\t" | |||
| @@ -80,10 +80,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| "addps %%xmm10, %%xmm4 \n\t" | |||
| "addps %%xmm11, %%xmm5 \n\t" | |||
| "movups (%4,%8,4), %%xmm8 \n\t" | |||
| "movups (%5,%8,4), %%xmm9 \n\t" | |||
| "movups (%6,%8,4), %%xmm10 \n\t" | |||
| "movups (%7,%8,4), %%xmm11 \n\t" | |||
| "movups (%5,%2,4), %%xmm8 \n\t" | |||
| "movups (%6,%2,4), %%xmm9 \n\t" | |||
| "movups (%7,%2,4), %%xmm10 \n\t" | |||
| "movups (%8,%2,4), %%xmm11 \n\t" | |||
| ".p2align 1 \n\t" | |||
| "mulps %%xmm0 , %%xmm8 \n\t" | |||
| "mulps %%xmm1 , %%xmm9 \n\t" | |||
| @@ -94,28 +94,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| "addps %%xmm10, %%xmm4 \n\t" | |||
| "addps %%xmm11, %%xmm5 \n\t" | |||
| "addq $4 , %8 \n\t" | |||
| "addq $4 , %2 \n\t" | |||
| "addps %%xmm5 , %%xmm4 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "mulps %%xmm6 , %%xmm4 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "addps %%xmm4 , %%xmm7 \n\t" | |||
| "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y | |||
| "movups %%xmm7 , -16(%4,%0,4) \n\t" // 4 * y | |||
| "jnz 1b \n\t" | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| "+r" (n), // 1 | |||
| "+r" (lda4) // 2 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (lda4), // 8 | |||
| "r" (x), // 3 | |||
| "r" (y), // 4 | |||
| "r" (ap[0]), // 5 | |||
| "r" (ap[1]), // 6 | |||
| "r" (ap[2]), // 7 | |||
| "r" (ap[3]), // 8 | |||
| "r" (alpha) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| @@ -39,14 +39,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vbroadcastss (%2), %%ymm12 \n\t" // x0 | |||
| "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 | |||
| "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 | |||
| "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 | |||
| "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 | |||
| "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 | |||
| "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 | |||
| "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 | |||
| "vbroadcastss (%3), %%ymm12 \n\t" // x0 | |||
| "vbroadcastss 4(%3), %%ymm13 \n\t" // x1 | |||
| "vbroadcastss 8(%3), %%ymm14 \n\t" // x2 | |||
| "vbroadcastss 12(%3), %%ymm15 \n\t" // x3 | |||
| "vbroadcastss 16(%3), %%ymm0 \n\t" // x4 | |||
| "vbroadcastss 20(%3), %%ymm1 \n\t" // x5 | |||
| "vbroadcastss 24(%3), %%ymm2 \n\t" // x6 | |||
| "vbroadcastss 28(%3), %%ymm3 \n\t" // x7 | |||
| "vbroadcastss (%9), %%ymm6 \n\t" // alpha | |||
| @@ -55,21 +55,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| "vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t" | |||
| "vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t" | |||
| "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y | |||
| "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y | |||
| "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" | |||
| "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" | |||
| "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" | |||
| "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" | |||
| "vmulps (%5,%0,4), %%xmm12, %%xmm8 \n\t" | |||
| "vmulps (%6,%0,4), %%xmm13, %%xmm10 \n\t" | |||
| "vmulps (%7,%0,4), %%xmm14, %%xmm9 \n\t" | |||
| "vmulps (%8,%0,4), %%xmm15, %%xmm11 \n\t" | |||
| "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" | |||
| "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" | |||
| "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" | |||
| "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" | |||
| "vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t" | |||
| "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t" | |||
| "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t" | |||
| "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t" | |||
| "vmulps (%5,%2,4), %%xmm0 , %%xmm8 \n\t" | |||
| "vmulps (%6,%2,4), %%xmm1 , %%xmm10 \n\t" | |||
| "vmulps (%7,%2,4), %%xmm2 , %%xmm9 \n\t" | |||
| "vmulps (%8,%2,4), %%xmm3 , %%xmm11 \n\t" | |||
| "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" | |||
| "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" | |||
| "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" | |||
| @@ -79,9 +79,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t" | |||
| "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t" | |||
| "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y | |||
| "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y | |||
| "addq $4, %8 \n\t" | |||
| "addq $4, %2 \n\t" | |||
| "addq $4, %0 \n\t" | |||
| "subq $4, %1 \n\t" | |||
| @@ -92,21 +92,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y | |||
| "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y | |||
| "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" | |||
| "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" | |||
| "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" | |||
| "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" | |||
| "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t" | |||
| "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t" | |||
| "vmulps (%7,%0,4), %%ymm14, %%ymm9 \n\t" | |||
| "vmulps (%8,%0,4), %%ymm15, %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" | |||
| "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" | |||
| "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" | |||
| "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" | |||
| "vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t" | |||
| "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t" | |||
| "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t" | |||
| "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t" | |||
| "vmulps (%7,%2,4), %%ymm2 , %%ymm9 \n\t" | |||
| "vmulps (%8,%2,4), %%ymm3 , %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" | |||
| "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" | |||
| @@ -116,9 +116,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t" | |||
| "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t" | |||
| "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y | |||
| "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y | |||
| "addq $8, %8 \n\t" | |||
| "addq $8, %2 \n\t" | |||
| "addq $8, %0 \n\t" | |||
| "subq $8, %1 \n\t" | |||
| @@ -134,45 +134,45 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" | |||
| "prefetcht0 192(%4,%0,4) \n\t" | |||
| "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" | |||
| "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t" | |||
| "prefetcht0 192(%5,%0,4) \n\t" | |||
| "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" | |||
| "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" | |||
| "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t" | |||
| "vmulps 32(%5,%0,4), %%ymm12, %%ymm9 \n\t" | |||
| "prefetcht0 192(%6,%0,4) \n\t" | |||
| "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t" | |||
| "vmulps 32(%6,%0,4), %%ymm13, %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" | |||
| "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" | |||
| "prefetcht0 192(%6,%0,4) \n\t" | |||
| "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" | |||
| "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" | |||
| "prefetcht0 192(%7,%0,4) \n\t" | |||
| "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" | |||
| "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t" | |||
| "vmulps (%7,%0,4), %%ymm14, %%ymm8 \n\t" | |||
| "vmulps 32(%7,%0,4), %%ymm14, %%ymm9 \n\t" | |||
| "prefetcht0 192(%8,%0,4) \n\t" | |||
| "vmulps (%8,%0,4), %%ymm15, %%ymm10 \n\t" | |||
| "vmulps 32(%8,%0,4), %%ymm15, %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" | |||
| "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" | |||
| "prefetcht0 192(%4,%8,4) \n\t" | |||
| "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" | |||
| "vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t" | |||
| "prefetcht0 192(%5,%8,4) \n\t" | |||
| "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" | |||
| "vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t" | |||
| "prefetcht0 192(%5,%2,4) \n\t" | |||
| "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t" | |||
| "vmulps 32(%5,%2,4), %%ymm0 , %%ymm9 \n\t" | |||
| "prefetcht0 192(%6,%2,4) \n\t" | |||
| "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t" | |||
| "vmulps 32(%6,%2,4), %%ymm1 , %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" | |||
| "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" | |||
| "prefetcht0 192(%6,%8,4) \n\t" | |||
| "vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t" | |||
| "vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t" | |||
| "prefetcht0 192(%7,%8,4) \n\t" | |||
| "vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t" | |||
| "vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t" | |||
| "prefetcht0 192(%7,%2,4) \n\t" | |||
| "vmulps (%7,%2,4), %%ymm2 , %%ymm8 \n\t" | |||
| "vmulps 32(%7,%2,4), %%ymm2 , %%ymm9 \n\t" | |||
| "prefetcht0 192(%8,%2,4) \n\t" | |||
| "vmulps (%8,%2,4), %%ymm3 , %%ymm10 \n\t" | |||
| "vmulps 32(%8,%2,4), %%ymm3 , %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" | |||
| "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" | |||
| @@ -181,13 +181,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| "vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t" | |||
| "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t" | |||
| "vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y | |||
| "vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y | |||
| "vaddps (%4,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y | |||
| "vaddps 32(%4,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y | |||
| "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y | |||
| "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y | |||
| "vmovups %%ymm4, (%4,%0,4) \n\t" // 8 * y | |||
| "vmovups %%ymm5, 32(%4,%0,4) \n\t" // 8 * y | |||
| "addq $16, %8 \n\t" | |||
| "addq $16, %2 \n\t" | |||
| "addq $16, %0 \n\t" | |||
| "subq $16, %1 \n\t" | |||
| "jnz 1b \n\t" | |||
| @@ -197,15 +197,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO | |||
| : | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| "+r" (n), // 1 | |||
| "+r" (lda4) // 2 | |||
| : | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (lda4), // 8 | |||
| "r" (x), // 3 | |||
| "r" (y), // 4 | |||
| "r" (ap[0]), // 5 | |||
| "r" (ap[1]), // 6 | |||
| "r" (ap[2]), // 7 | |||
| "r" (ap[3]), // 8 | |||
| "r" (alpha) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| @@ -139,9 +139,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT | |||
| "movss %%xmm11,4(%2) \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (y), // 2 | |||
| "r" (ap0), // 3 | |||
| "r" (ap1), // 4 | |||
| @@ -208,9 +208,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) | |||
| "movss %%xmm10, (%2) \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (y), // 2 | |||
| "r" (ap), // 3 | |||
| "r" (x) // 4 | |||
| @@ -272,9 +272,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d | |||
| "jnz 1b \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "+r" (i), // 0 | |||
| "+r" (n) // 1 | |||
| : | |||
| "r" (&da), // 2 | |||
| "r" (src), // 3 | |||
| "r" (dest) // 4 | |||