Browse Source

Merge branch 'develop' of https://github.com/quickwritereader/OpenBLAS into develop

tags/v0.3.7
AbdelRauf 6 years ago
parent
commit
628b335e83
100 changed files with 1350 additions and 897 deletions
  1. +2
    -1
      .travis.yml
  2. +29
    -13
      CMakeLists.txt
  3. +1
    -1
      Makefile
  4. +5
    -0
      Makefile.arm64
  5. +5
    -5
      Makefile.install
  6. +40
    -21
      Makefile.rule
  7. +5
    -1
      Makefile.system
  8. +4
    -0
      Makefile.zarch
  9. +2
    -0
      TargetList.txt
  10. +2
    -2
      appveyor.yml
  11. +5
    -12
      benchmark/scripts/R/deig.R
  12. +6
    -19
      benchmark/scripts/R/dgemm.R
  13. +6
    -14
      benchmark/scripts/R/dsolve.R
  14. +56
    -29
      c_check
  15. +3
    -0
      cmake/arch.cmake
  16. +10
    -0
      cmake/system.cmake
  17. +6
    -2
      cmake/system_check.cmake
  18. +1
    -1
      common.h
  19. +13
    -3
      common_power.h
  20. +1
    -1
      common_x86.h
  21. +1
    -1
      common_x86_64.h
  22. +24
    -2
      cpuid_arm64.c
  23. +5
    -3
      cpuid_x86.c
  24. +12
    -7
      cpuid_zarch.c
  25. +1
    -1
      ctest.c
  26. +2
    -2
      driver/level2/trmv_thread.c
  27. +5
    -0
      driver/others/blas_server_win32.c
  28. +3
    -3
      driver/others/dynamic.c
  29. +165
    -71
      driver/others/memory.c
  30. +9
    -0
      exports/Makefile
  31. +17
    -7
      exports/dllinit.c
  32. +46
    -0
      getarch.c
  33. +1
    -4
      interface/trmv.c
  34. +12
    -2
      interface/trsm.c
  35. +0
    -3
      interface/ztrmv.c
  36. +5
    -1
      kernel/Makefile.L3
  37. +1
    -1
      kernel/arm/imin.c
  38. +175
    -0
      kernel/arm64/KERNEL.TSV110
  39. +1
    -1
      kernel/mips/imin.c
  40. +1
    -1
      kernel/power/gemm_beta.S
  41. +1
    -1
      kernel/power/zgemm_beta.S
  42. +7
    -7
      kernel/x86_64/caxpy_microk_bulldozer-2.c
  43. +3
    -3
      kernel/x86_64/caxpy_microk_haswell-2.c
  44. +4
    -4
      kernel/x86_64/caxpy_microk_sandy-2.c
  45. +7
    -7
      kernel/x86_64/caxpy_microk_steamroller-2.c
  46. +7
    -7
      kernel/x86_64/cdot_microk_bulldozer-2.c
  47. +3
    -3
      kernel/x86_64/cdot_microk_haswell-2.c
  48. +4
    -4
      kernel/x86_64/cdot_microk_sandy-2.c
  49. +7
    -7
      kernel/x86_64/cdot_microk_steamroller-2.c
  50. +16
    -16
      kernel/x86_64/cscal_microk_bulldozer-2.c
  51. +15
    -15
      kernel/x86_64/cscal_microk_haswell-2.c
  52. +16
    -16
      kernel/x86_64/cscal_microk_steamroller-2.c
  53. +3
    -3
      kernel/x86_64/daxpy_microk_bulldozer-2.c
  54. +4
    -4
      kernel/x86_64/daxpy_microk_haswell-2.c
  55. +3
    -3
      kernel/x86_64/daxpy_microk_nehalem-2.c
  56. +8
    -8
      kernel/x86_64/daxpy_microk_piledriver-2.c
  57. +4
    -4
      kernel/x86_64/daxpy_microk_sandy-2.c
  58. +8
    -8
      kernel/x86_64/daxpy_microk_steamroller-2.c
  59. +4
    -4
      kernel/x86_64/ddot_microk_bulldozer-2.c
  60. +3
    -3
      kernel/x86_64/ddot_microk_haswell-2.c
  61. +4
    -4
      kernel/x86_64/ddot_microk_nehalem-2.c
  62. +8
    -8
      kernel/x86_64/ddot_microk_piledriver-2.c
  63. +4
    -4
      kernel/x86_64/ddot_microk_sandy-2.c
  64. +4
    -4
      kernel/x86_64/ddot_microk_steamroller-2.c
  65. +5
    -5
      kernel/x86_64/dgemv_n_4.c
  66. +1
    -0
      kernel/x86_64/dgemv_n_microk_haswell-4.c
  67. +49
    -49
      kernel/x86_64/dgemv_n_microk_piledriver-4.c
  68. +9
    -9
      kernel/x86_64/dgemv_t_4.c
  69. +3
    -3
      kernel/x86_64/dger_microk_sandy-2.c
  70. +1
    -1
      kernel/x86_64/dscal.c
  71. +6
    -6
      kernel/x86_64/dscal_microk_bulldozer-2.c
  72. +6
    -6
      kernel/x86_64/dscal_microk_haswell-2.c
  73. +6
    -6
      kernel/x86_64/dscal_microk_sandy-2.c
  74. +2
    -2
      kernel/x86_64/dsymv_L_microk_bulldozer-2.c
  75. +2
    -2
      kernel/x86_64/dsymv_L_microk_haswell-2.c
  76. +2
    -2
      kernel/x86_64/dsymv_L_microk_nehalem-2.c
  77. +2
    -2
      kernel/x86_64/dsymv_L_microk_sandy-2.c
  78. +3
    -3
      kernel/x86_64/dsymv_U_microk_bulldozer-2.c
  79. +3
    -3
      kernel/x86_64/dsymv_U_microk_haswell-2.c
  80. +3
    -3
      kernel/x86_64/dsymv_U_microk_nehalem-2.c
  81. +3
    -3
      kernel/x86_64/dsymv_U_microk_sandy-2.c
  82. +49
    -49
      kernel/x86_64/dtrsm_kernel_RN_haswell.c
  83. +48
    -48
      kernel/x86_64/dtrsm_kernel_RT_bulldozer.c
  84. +1
    -1
      kernel/x86_64/gemm_kernel_4x8_nano.S
  85. +1
    -1
      kernel/x86_64/gemm_kernel_8x4_sse.S
  86. +4
    -4
      kernel/x86_64/saxpy_microk_haswell-2.c
  87. +3
    -3
      kernel/x86_64/saxpy_microk_nehalem-2.c
  88. +8
    -8
      kernel/x86_64/saxpy_microk_piledriver-2.c
  89. +4
    -4
      kernel/x86_64/saxpy_microk_sandy-2.c
  90. +4
    -4
      kernel/x86_64/sdot_microk_bulldozer-2.c
  91. +4
    -4
      kernel/x86_64/sdot_microk_haswell-2.c
  92. +4
    -4
      kernel/x86_64/sdot_microk_nehalem-2.c
  93. +4
    -4
      kernel/x86_64/sdot_microk_sandy-2.c
  94. +8
    -8
      kernel/x86_64/sdot_microk_steamroller-2.c
  95. +7
    -7
      kernel/x86_64/sgemv_n_4.c
  96. +97
    -97
      kernel/x86_64/sgemv_n_microk_bulldozer-4.c
  97. +62
    -63
      kernel/x86_64/sgemv_n_microk_haswell-4.c
  98. +27
    -27
      kernel/x86_64/sgemv_n_microk_nehalem-4.c
  99. +65
    -65
      kernel/x86_64/sgemv_n_microk_sandy-4.c
  100. +9
    -9
      kernel/x86_64/sgemv_t_4.c

+ 2
- 1
.travis.yml View File

@@ -149,7 +149,7 @@ matrix:

- &test-macos
os: osx
osx_image: xcode8.3
osx_image: xcode10.1
before_script:
- COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32"
- brew update
@@ -160,6 +160,7 @@ matrix:
- BTYPE="BINARY=64 INTERFACE64=1"

- <<: *test-macos
osx_image: xcode8.3
env:
- BTYPE="BINARY=32"



+ 29
- 13
CMakeLists.txt View File

@@ -42,6 +42,19 @@ endif()

#######

if(MSVC AND MSVC_STATIC_CRT)
set(CompilerFlags
CMAKE_CXX_FLAGS
CMAKE_CXX_FLAGS_DEBUG
CMAKE_CXX_FLAGS_RELEASE
CMAKE_C_FLAGS
CMAKE_C_FLAGS_DEBUG
CMAKE_C_FLAGS_RELEASE
)
foreach(CompilerFlag ${CompilerFlags})
string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}")
endforeach()
endif()

message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.")

@@ -62,10 +75,10 @@ endif ()

set(SUBDIRS ${BLASDIRS})
if (NOT NO_LAPACK)
list(APPEND SUBDIRS lapack)
if(BUILD_RELAPACK)
list(APPEND SUBDIRS relapack/src)
endif()
list(APPEND SUBDIRS lapack)
endif ()

# set which float types we want to build for
@@ -134,7 +147,7 @@ endif ()

# Only generate .def for dll on MSVC and always produce pdb files for debug and release
if(MSVC)
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4)
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4)
set(OpenBLAS_DEF_FILE "${PROJECT_BINARY_DIR}/openblas.def")
endif()
set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /Zi")
@@ -149,15 +162,9 @@ if (${DYNAMIC_ARCH})
endforeach()
endif ()

# Only build shared libs for MSVC
if (MSVC)
set(BUILD_SHARED_LIBS ON)
endif()


# add objects to the openblas lib
add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE})
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include>)
target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>)

# Android needs to explicitly link against libm
if(ANDROID)
@@ -166,7 +173,7 @@ endif()

# Handle MSVC exports
if(MSVC AND BUILD_SHARED_LIBS)
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} LESS 3.4)
if (${CMAKE_MAJOR_VERSION}.${CMAKE_MINOR_VERSION} VERSION_LESS 3.4)
include("${PROJECT_SOURCE_DIR}/cmake/export.cmake")
else()
# Creates verbose .def file (51KB vs 18KB)
@@ -217,6 +224,14 @@ set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
SOVERSION ${OpenBLAS_MAJOR_VERSION}
)

if (BUILD_SHARED_LIBS AND BUILD_RELAPACK)
if (NOT MSVC)
target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition")
else()
target_link_libraries(${OpenBLAS_LIBNAME} "/FORCE:MULTIPLE")
endif()
endif()

if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "")
if (NOT DEFINED ARCH)
set(ARCH_IN "x86_64")
@@ -314,7 +329,7 @@ install (FILES ${OPENBLAS_CONFIG_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
if(NOT NOFORTRAN)
message(STATUS "Generating f77blas.h in ${CMAKE_INSTALL_INCLUDEDIR}")

set(F77BLAS_H ${CMAKE_BINARY_DIR}/f77blas.h)
set(F77BLAS_H ${CMAKE_BINARY_DIR}/generated/f77blas.h)
file(WRITE ${F77BLAS_H} "#ifndef OPENBLAS_F77BLAS_H\n")
file(APPEND ${F77BLAS_H} "#define OPENBLAS_F77BLAS_H\n")
file(APPEND ${F77BLAS_H} "#include \"openblas_config.h\"\n")
@@ -327,10 +342,11 @@ endif()
if(NOT NO_CBLAS)
message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}")

set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h)
file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS)
string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}")
file(WRITE ${CMAKE_BINARY_DIR}/cblas.tmp "${CBLAS_H_CONTENTS_NEW}")
install (FILES ${CMAKE_BINARY_DIR}/cblas.tmp DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} RENAME cblas.h)
file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}")
install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
endif()

if(NOT NO_LAPACKE)


+ 1
- 1
Makefile View File

@@ -96,7 +96,7 @@ endif
@echo

shared :
ifndef NO_SHARED
ifneq ($(NO_SHARED), 1)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
@$(MAKE) -C exports so
@ln -fs $(LIBSONAME) $(LIBPREFIX).so


+ 5
- 0
Makefile.arm64 View File

@@ -38,3 +38,8 @@ ifeq ($(CORE), THUNDERX2T99)
CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99
endif

ifeq ($(CORE), TSV110)
CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110
endif

+ 5
- 5
Makefile.install View File

@@ -58,14 +58,14 @@ ifndef NO_LAPACKE
endif

#for install static library
ifndef NO_STATIC
ifneq ($(NO_STATIC),1)
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif
#for install shared library
ifndef NO_SHARED
ifneq ($(NO_SHARED),1)
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku))
@install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@@ -106,14 +106,14 @@ ifndef NO_LAPACKE
endif

#for install static library
ifndef NO_STATIC
ifneq ($(NO_STATIC),1)
@echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@installbsd -c -m 644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX)
endif
#for install shared library
ifndef NO_SHARED
ifneq ($(NO_SHARED),1)
@echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR)
@installbsd -c -m 755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
@cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \
@@ -138,7 +138,7 @@ endif
@echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"
@echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"

ifndef NO_SHARED
ifneq ($(NO_SHARED),1)
#ifeq logical or
ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD OpenBSD DragonFly))
@echo "SET(OpenBLAS_LIBRARIES ${OPENBLAS_LIBRARY_DIR}/$(LIBPREFIX).so)" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG)"


+ 40
- 21
Makefile.rule View File

@@ -48,6 +48,8 @@ VERSION = 0.3.6.dev
# HOSTCC = gcc

# If you need 32bit binary, define BINARY=32, otherwise define BINARY=64
# Please note that AVX is not available on 32-bit.
# Setting BINARY=32 disables AVX/AVX2/AVX-512.
# BINARY=64

# About threaded BLAS. It will be automatically detected if you don't
@@ -57,7 +59,7 @@ VERSION = 0.3.6.dev
# USE_THREAD = 0

# If you're going to use this library with OpenMP, please comment it in.
# This flag is always set for POWER8. Don't modify the flag
# This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8.
# USE_OPENMP = 1

# The OpenMP scheduler to use - by default this is "static" and you
@@ -68,36 +70,45 @@ VERSION = 0.3.6.dev
# allow you to select the scheduler from the environment variable OMP_SCHEDULE
# CCOMMON_OPT += -DOMP_SCHED=dynamic

# You can define maximum number of threads. Basically it should be
# less than actual number of cores. If you don't specify one, it's
# automatically detected by the the script.
# You can define the maximum number of threads. Basically it should be less
# than or equal to the number of CPU threads. If you don't specify one, it's
# automatically detected by the build system.
# If SMT (aka. HT) is enabled on the system, it may or may not be beneficial to
# restrict NUM_THREADS to the number of physical cores. By default, the automatic
# detection includes logical CPUs, thus allowing the use of SMT.
# Users may opt at runtime to use less than NUM_THREADS threads.
#
# Note for package maintainers: you can build OpenBLAS with a large NUM_THREADS
# value (eg. 32-256) if you expect your users to use that many threads. Due to the way
# some internal structures are allocated, using a large NUM_THREADS value has a RAM
# footprint penalty, even if users reduce the actual number of threads at runtime.
# NUM_THREADS = 24

# If you have enabled USE_OPENMP and your application would call
# OpenBLAS's calculation API from multi threads, please comment it in.
# This flag defines how many instances of OpenBLAS's calculation API can
# actually run in parallel. If more threads call OpenBLAS's calculation API,
# OpenBLAS's calculation API from multiple threads, please comment this in.
# This flag defines how many instances of OpenBLAS's calculation API can actually
# run in parallel. If more than NUM_PARALLEL threads call OpenBLAS's calculation API,
# they need to wait for the preceding API calls to finish or risk data corruption.
# NUM_PARALLEL = 2

# if you don't need to install the static library, please comment it in.
# If you don't need to install the static library, please comment this in.
# NO_STATIC = 1

# if you don't need generate the shared library, please comment it in.
# If you don't need to generate the shared library, please comment this in.
# NO_SHARED = 1

# If you don't need CBLAS interface, please comment it in.
# If you don't need the CBLAS interface, please comment this in.
# NO_CBLAS = 1

# If you only want CBLAS interface without installing Fortran compiler,
# please comment it in.
# If you only want the CBLAS interface without installing a Fortran compiler,
# please comment this in.
# ONLY_CBLAS = 1

# If you don't need LAPACK, please comment it in.
# If you set NO_LAPACK=1, the library automatically sets NO_LAPACKE=1.
# If you don't need LAPACK, please comment this in.
# If you set NO_LAPACK=1, the build system automatically sets NO_LAPACKE=1.
# NO_LAPACK = 1

# If you don't need LAPACKE (C Interface to LAPACK), please comment it in.
# If you don't need LAPACKE (C Interface to LAPACK), please comment this in.
# NO_LAPACKE = 1

# Build LAPACK Deprecated functions since LAPACK 3.6.0
@@ -106,7 +117,7 @@ BUILD_LAPACK_DEPRECATED = 1
# Build RecursiveLAPACK on top of LAPACK
# BUILD_RELAPACK = 1

# If you want to use legacy threaded Level 3 implementation.
# If you want to use the legacy threaded Level 3 implementation.
# USE_SIMPLE_THREADED_LEVEL3 = 1

# If you want to use the new, still somewhat experimental code that uses
@@ -116,8 +127,8 @@ BUILD_LAPACK_DEPRECATED = 1
# USE_TLS = 1

# If you want to drive whole 64bit region by BLAS. Not all Fortran
# compiler supports this. It's safe to keep comment it out if you
# are not sure(equivalent to "-i8" option).
# compilers support this. It's safe to keep this commented out if you
# are not sure. (This is equivalent to the "-i8" ifort option).
# INTERFACE64 = 1

# Unfortunately most of kernel won't give us high quality buffer.
@@ -125,10 +136,18 @@ BUILD_LAPACK_DEPRECATED = 1
# but it will consume time. If you don't like it, you can disable one.
NO_WARMUP = 1

# If you want to disable CPU/Memory affinity on Linux.
# Comment this in if you want to disable OpenBLAS's CPU/Memory affinity handling.
# This feature is only implemented on Linux, and is always disabled on other platforms.
# Enabling affinity handling may improve performance, especially on NUMA systems, but
# it may conflict with certain applications that also try to manage affinity.
# This conflict can result in threads of the application calling OpenBLAS ending up locked
# to the same core(s) as OpenBLAS, possibly binding all threads to a single core.
# For this reason, affinity handling is disabled by default. Can be safely enabled if nothing
# else modifies affinity settings.
# Note: enabling affinity has been known to cause problems with NumPy and R
NO_AFFINITY = 1

# if you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
# If you are compiling for Linux and you have more than 16 numa nodes or more than 256 cpus
# BIGNUMA = 1

# Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers
@@ -180,7 +199,7 @@ NO_AFFINITY = 1
# been reported to be optimal for certain workloads (50 is the recommended value for Julia).
# GEMM_MULTITHREAD_THRESHOLD = 4

# If you need santy check by comparing reference BLAS. It'll be very
# If you need sanity check by comparing results to reference BLAS. It'll be very
# slow (Not implemented yet).
# SANITY_CHECK = 1



+ 5
- 1
Makefile.system View File

@@ -95,6 +95,9 @@ endif
ifeq ($(TARGET), ZEN)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET), ARMV8)
GETARCH_FLAGS := -DFORCE_ARMV7
endif
endif


@@ -152,7 +155,8 @@ GETARCH_FLAGS += -DNO_AVX
endif

ifeq ($(BINARY), 32)
GETARCH_FLAGS += -DNO_AVX
GETARCH_FLAGS += -DNO_AVX -DNO_AVX2 -DNO_AVX512
NO_AVX512 = 1
endif

ifeq ($(NO_AVX2), 1)


+ 4
- 0
Makefile.zarch View File

@@ -4,3 +4,7 @@ CCOMMON_OPT += -march=z13 -mzvector
FCOMMON_OPT += -march=z13 -mzvector
endif

ifeq ($(CORE), Z14)
CCOMMON_OPT += -march=z14 -mzvector
FCOMMON_OPT += -march=z14 -mzvector
endif

+ 2
- 0
TargetList.txt View File

@@ -91,7 +91,9 @@ CORTEXA73
FALKOR
THUNDERX
THUNDERX2T99
TSV110

9.System Z:
ZARCH_GENERIC
Z13
Z14

+ 2
- 2
appveyor.yml View File

@@ -53,9 +53,9 @@ before_build:
- ps: if (-Not (Test-Path .\build)) { mkdir build }
- cd build
- if [%COMPILER%]==[cl] cmake -G "Visual Studio 15 2017 Win64" ..
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl ..
- if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON ..
- if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 ..
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON ..
- if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' ..

build_script:
- cmake --build .


+ 5
- 12
benchmark/scripts/R/deig.R View File

@@ -2,6 +2,8 @@

argv <- commandArgs(trailingOnly = TRUE)

if (!is.null(options("matprod")[[1]])) options(matprod = "blas")

nfrom <- 128
nto <- 2048
nstep <- 128
@@ -19,7 +21,6 @@ if (length(argv) > 0) {
loops <- as.numeric(argv[z])
}
}

}

p <- Sys.getenv("OPENBLAS_LOOPS")
@@ -27,29 +28,21 @@ if (p != "") {
loops <- as.numeric(p)
}


cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
cat(sprintf(" SIZE Flops Time\n"))

n <- nfrom
while (n <= nto) {
A <- matrix(rnorm(n * n), ncol = n, nrow = n)
A <- matrix(rnorm(n * n), nrow = n)
ev <- 0
z <- system.time(for (l in 1:loops) {
ev <- eigen(A)
})

mflops <- (26.66 * n * n * n) * loops / (z[3] * 1.0e6)
mflops <- (26.66 * n * n * n) * loops / (z[3] * 1e+06)

st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))

n <- n + nstep

}

+ 6
- 19
benchmark/scripts/R/dgemm.R View File

@@ -2,6 +2,8 @@

argv <- commandArgs(trailingOnly = TRUE)

if (!is.null(options("matprod")[[1]])) options(matprod = "blas")

nfrom <- 128
nto <- 2048
nstep <- 128
@@ -19,7 +21,6 @@ if (length(argv) > 0) {
loops <- as.numeric(argv[z])
}
}

}

p <- Sys.getenv("OPENBLAS_LOOPS")
@@ -27,26 +28,13 @@ if (p != "") {
loops <- as.numeric(p)
}


cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
cat(sprintf(" SIZE Flops Time\n"))

n <- nfrom
while (n <= nto) {
A <- matrix(runif(n * n),
ncol = n,
nrow = n,
byrow = TRUE)
B <- matrix(runif(n * n),
ncol = n,
nrow = n,
byrow = TRUE)
A <- matrix(runif(n * n), nrow = n)
B <- matrix(runif(n * n), nrow = n)
C <- 1

z <- system.time(for (l in 1:loops) {
@@ -54,11 +42,10 @@ while (n <= nto) {
l <- l + 1
})

mflops <- (2.0 * n * n * n) * loops / (z[3] * 1.0e6)
mflops <- (2.0 * n * n * n) * loops / (z[3] * 1e+06)

st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))

n <- n + nstep

}

+ 6
- 14
benchmark/scripts/R/dsolve.R View File

@@ -2,6 +2,8 @@

argv <- commandArgs(trailingOnly = TRUE)

if (!is.null(options("matprod")[[1]])) options(matprod = "blas")

nfrom <- 128
nto <- 2048
nstep <- 128
@@ -19,7 +21,6 @@ if (length(argv) > 0) {
loops <- as.numeric(argv[z])
}
}

}

p <- Sys.getenv("OPENBLAS_LOOPS")
@@ -27,31 +28,22 @@ if (p != "") {
loops <- as.numeric(p)
}


cat(sprintf(
"From %.0f To %.0f Step=%.0f Loops=%.0f\n",
nfrom,
nto,
nstep,
loops
))
cat(sprintf("From %.0f To %.0f Step=%.0f Loops=%.0f\n", nfrom, nto, nstep, loops))
cat(sprintf(" SIZE Flops Time\n"))

n <- nfrom
while (n <= nto) {
A <- matrix(rnorm(n * n), ncol = n, nrow = n)
B <- matrix(rnorm(n * n), ncol = n, nrow = n)
A <- matrix(rnorm(n * n), nrow = n)
B <- matrix(rnorm(n * n), nrow = n)

z <- system.time(for (l in 1:loops) {
solve(A, B)
})

mflops <-
(2.0 / 3.0 * n * n * n + 2.0 * n * n * n) * loops / (z[3] * 1.0e6)
mflops <- (8.0 / 3 * n * n * n) * loops / (z[3] * 1e+06)

st <- sprintf("%.0fx%.0f :", n, n)
cat(sprintf("%20s %10.2f MFlops %10.6f sec\n", st, mflops, z[3]))

n <- n + nstep

}

+ 56
- 29
c_check View File

@@ -1,7 +1,7 @@
#!/usr/bin/perl

use File::Basename;
use File::Temp qw(tempfile);
#use File::Basename;
# use File::Temp qw(tempfile);

# Checking cross compile
$hostos = `uname -s | sed -e s/\-.*//`; chop($hostos);
@@ -12,7 +12,7 @@ $hostarch = "arm64" if ($hostarch eq "aarch64");
$hostarch = "power" if ($hostarch =~ /^(powerpc|ppc).*/);
$hostarch = "zarch" if ($hostarch eq "s390x");

$tmpf = new File::Temp( UNLINK => 1 );
#$tmpf = new File::Temp( UNLINK => 1 );
$binary = $ENV{"BINARY"};

$makefile = shift(@ARGV);
@@ -31,12 +31,25 @@ if ($?) {

$cross_suffix = "";

if (dirname($compiler_name) ne ".") {
$cross_suffix .= dirname($compiler_name) . "/";
}
eval "use File::Basename";
if ($@){
warn "could not load PERL module File::Basename, emulating its functionality";
my $dirnam = substr($compiler_name, 0, rindex($compiler_name, "/")-1 );
if ($dirnam ne ".") {
$cross_suffix .= $dirnam . "/";
}
my $basnam = substr($compiler_name, rindex($compiler_name,"/")+1, length($compiler_name)-rindex($compiler_name,"/")-1);
if ($basnam =~ /([^\s]*-)(.*)/) {
$cross_suffix .= $1;
}
} else {
if (dirname($compiler_name) ne ".") {
$cross_suffix .= dirname($compiler_name) . "/";
}

if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
$cross_suffix .= $1;
if (basename($compiler_name) =~ /([^\s]*-)(.*)/) {
$cross_suffix .= $1;
}
}

$compiler = "";
@@ -171,20 +184,26 @@ if ($?) {

$have_msa = 0;
if (($architecture eq "mips") || ($architecture eq "mips64")) {
$code = '"addvi.b $w0, $w1, 1"';
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
print $tmpf "#include <msa.h>\n\n";
print $tmpf "void main(void){ __asm__ volatile($code); }\n";

$args = "$msa_flags -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args");
system(@cmd) == 0;
if ($? != 0) {
$have_msa = 0;
eval "use File::Temp qw(tempfile)";
if ($@){
warn "could not load PERL module File::Temp, so could not check MSA capatibility";
} else {
$have_msa = 1;
$tmpf = new File::Temp( UNLINK => 1 );
$code = '"addvi.b $w0, $w1, 1"';
$msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs";
print $tmpf "#include <msa.h>\n\n";
print $tmpf "void main(void){ __asm__ volatile($code); }\n";

$args = "$msa_flags -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args");
system(@cmd) == 0;
if ($? != 0) {
$have_msa = 0;
} else {
$have_msa = 1;
}
unlink("$tmpf.o");
}
unlink("$tmpf.o");
}

$architecture = x86 if ($data =~ /ARCH_X86/);
@@ -204,17 +223,25 @@ $binformat = bin64 if ($data =~ /BINARY_64/);

$no_avx512= 0;
if (($architecture eq "x86") || ($architecture eq "x86_64")) {
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
$args = " -march=skylake-avx512 -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
system(@cmd) == 0;
if ($? != 0) {
$no_avx512 = 1;
} else {
eval "use File::Temp qw(tempfile)";
if ($@){
warn "could not load PERL module File::Temp, so could not check compiler compatibility with AVX512";
$no_avx512 = 0;
} else {
# $tmpf = new File::Temp( UNLINK => 1 );
($fh,$tmpf) = tempfile( UNLINK => 1 );
$code = '"vbroadcastss -4 * 4(%rsi), %zmm2"';
print $tmpf "#include <immintrin.h>\n\nint main(void){ __asm__ volatile($code); }\n";
$args = " -march=skylake-avx512 -c -o $tmpf.o -x c $tmpf";
my @cmd = ("$compiler_name $args >/dev/null 2>/dev/null");
system(@cmd) == 0;
if ($? != 0) {
$no_avx512 = 1;
} else {
$no_avx512 = 0;
}
unlink("tmpf.o");
}
unlink("tmpf.o");
}

$data = `$compiler_name -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`;


+ 3
- 0
cmake/arch.cmake View File

@@ -74,6 +74,9 @@ if (DYNAMIC_ARCH)
if (NOT NO_AVX512)
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX)
endif ()
if (DYNAMIC_LIST)
set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST})
endif ()
endif ()

if (NOT DYNAMIC_CORE)


+ 10
- 0
cmake/system.cmake View File

@@ -39,6 +39,9 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32)
if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN")
set(TARGET "BARCELONA")
endif ()
if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53")
set(TARGET "ARMV7")
endif ()
endif ()

if (DEFINED TARGET)
@@ -184,6 +187,13 @@ if (DYNAMIC_ARCH)
endif ()
endif ()

if (DYNAMIC_LIST)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_LIST")
foreach(DCORE ${DYNAMIC_LIST})
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYN_${DCORE}")
endforeach ()
endif ()

if (NO_LAPACK)
set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_LAPACK")
#Disable LAPACK C interface


+ 6
- 2
cmake/system_check.cmake View File

@@ -39,7 +39,11 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
set(MIPS64 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*")
set(X86_64 1)
if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
set(X86_64 1)
else()
set(X86 1)
endif()
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i686.*|i386.*|x86.*|amd64.*|AMD64.*")
set(X86 1)
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm.*|ARM.*)")
@@ -78,7 +82,7 @@ endif()

if (X86_64 OR X86)
file(WRITE ${PROJECT_BINARY_DIR}/avx512.tmp "#include <immintrin.h>\n\nint main(void){ __asm__ volatile(\"vbroadcastss -4 * 4(%rsi), %zmm2\"); }")
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
execute_process(COMMAND ${CMAKE_C_COMPILER} -march=skylake-avx512 -c -v -o ${PROJECT_BINARY_DIR}/avx512.o -x c ${PROJECT_BINARY_DIR}/avx512.tmp OUTPUT_QUIET ERROR_QUIET RESULT_VARIABLE NO_AVX512)
if (NO_AVX512 EQUAL 1)
set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512")
endif()


+ 1
- 1
common.h View File

@@ -444,7 +444,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246
typedef char env_var_t[MAX_PATH];
#define readenv(p, n) 0
#else
#ifdef OS_WINDOWS
#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
typedef char env_var_t[MAX_PATH];
#define readenv(p, n) GetEnvironmentVariable((LPCTSTR)(n), (LPTSTR)(p), sizeof(p))
#else


+ 13
- 3
common_power.h View File

@@ -241,7 +241,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
#define HAVE_PREFETCH
#endif

#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9)
#if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || ( defined(PPC970) && defined(OS_DARWIN) )
#define DCBT_ARG 0
#else
#define DCBT_ARG 8
@@ -598,9 +598,14 @@ REALNAME:;\
#ifndef __64BIT__
#define PROLOGUE \
.machine "any";\
.toc;\
.globl .REALNAME;\
.globl REALNAME;\
.csect REALNAME[DS],3;\
REALNAME:;\
.long .REALNAME, TOC[tc0], 0;\
.csect .text[PR],5;\
.REALNAME:;
.REALNAME:

#define EPILOGUE \
_section_.text:;\
@@ -611,9 +616,14 @@ _section_.text:;\

#define PROLOGUE \
.machine "any";\
.toc;\
.globl .REALNAME;\
.globl REALNAME;\
.csect REALNAME[DS],3;\
REALNAME:;\
.llong .REALNAME, TOC[tc0], 0;\
.csect .text[PR], 5;\
.REALNAME:;
.REALNAME:

#define EPILOGUE \
_section_.text:;\


+ 1
- 1
common_x86.h View File

@@ -187,7 +187,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
y = blas_quick_divide_table[y];

__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x): "0" (y));

return result;
#endif


+ 1
- 1
common_x86_64.h View File

@@ -210,7 +210,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
y = blas_quick_divide_table[y];

__asm__ __volatile__ ("mull %0" :"=d" (result) :"a"(x), "0" (y));
__asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y));

return result;
}


+ 24
- 2
cpuid_arm64.c View File

@@ -39,6 +39,8 @@
// Cavium
#define CPU_THUNDERX 7
#define CPU_THUNDERX2T99 8
//Hisilicon
#define CPU_TSV110 9

static char *cpuname[] = {
"UNKNOWN",
@@ -49,7 +51,8 @@ static char *cpuname[] = {
"CORTEXA73",
"FALKOR",
"THUNDERX",
"THUNDERX2T99"
"THUNDERX2T99",
"TSV110"
};

static char *cpuname_lower[] = {
@@ -61,7 +64,8 @@ static char *cpuname_lower[] = {
"cortexa73",
"falkor",
"thunderx",
"thunderx2t99"
"thunderx2t99",
"tsv110"
};

int get_feature(char *search)
@@ -145,6 +149,9 @@ int detect(void)
return CPU_THUNDERX;
else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af"))
return CPU_THUNDERX2T99;
// HiSilicon
else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01"))
return CPU_TSV110;
}

p = (char *) NULL ;
@@ -286,6 +293,21 @@ void get_cpuconfig(void)
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
case CPU_TSV110:
printf("#define TSV110 \n");
printf("#define L1_CODE_SIZE 65536 \n");
printf("#define L1_CODE_LINESIZE 64 \n");
printf("#define L1_CODE_ASSOCIATIVE 4 \n");
printf("#define L1_DATA_SIZE 65536 \n");
printf("#define L1_DATA_LINESIZE 64 \n");
printf("#define L1_DATA_ASSOCIATIVE 4 \n");
printf("#define L2_SIZE 524228 \n");
printf("#define L2_LINESIZE 64 \n");
printf("#define L2_ASSOCIATIVE 8 \n");
printf("#define DTB_DEFAULT_ENTRIES 64 \n");
printf("#define DTB_SIZE 4096 \n");
break;
}
}



+ 5
- 3
cpuid_x86.c View File

@@ -228,7 +228,7 @@ int support_avx2(){
}

int support_avx512(){
#ifndef NO_AVX512
#if !defined(NO_AVX) && !defined(NO_AVX512)
int eax, ebx, ecx, edx;
int ret=0;

@@ -1359,6 +1359,8 @@ int get_cpuname(void){
return CPUTYPE_NEHALEM;
case 12:
// Apollo Lake
case 15:
// Denverton
return CPUTYPE_NEHALEM;
}
break;
@@ -1376,9 +1378,9 @@ int get_cpuname(void){
}
break;
case 9:
case 8:
case 8:
switch (model) {
case 14: // Kaby Lake
case 14: // Kaby Lake and refreshes
if(support_avx2())
return CPUTYPE_HASWELL;
if(support_avx())


+ 12
- 7
cpuid_zarch.c View File

@@ -27,9 +27,9 @@

#include <string.h>

#define CPU_GENERIC 0
#define CPU_Z13 1
#define CPU_Z14 2
#define CPU_GENERIC 0
#define CPU_Z13 1
#define CPU_Z14 2

static char *cpuname[] = {
"ZARCH_GENERIC",
@@ -64,10 +64,8 @@ int detect(void)

if (strstr(p, "2964")) return CPU_Z13;
if (strstr(p, "2965")) return CPU_Z13;

/* detect z14, but fall back to z13 */
if (strstr(p, "3906")) return CPU_Z13;
if (strstr(p, "3907")) return CPU_Z13;
if (strstr(p, "3906")) return CPU_Z14;
if (strstr(p, "3907")) return CPU_Z14;

return CPU_GENERIC;
}
@@ -116,7 +114,14 @@ void get_cpuconfig(void)
break;
case CPU_Z14:
printf("#define Z14\n");
printf("#define L1_DATA_SIZE 131072\n");
printf("#define L1_DATA_LINESIZE 256\n");
printf("#define L1_DATA_ASSOCIATIVE 8\n");
printf("#define L2_SIZE 4194304\n");
printf("#define L2_LINESIZE 256\n");
printf("#define L2_ASSOCIATIVE 8\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
break;
}
}

+ 1
- 1
ctest.c View File

@@ -113,7 +113,7 @@ ARCH_X86
ARCH_X86_64
#endif

#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER)
#if defined(__powerpc___) || defined(__PPC__) || defined(_POWER) || defined(__POWERPC__)
ARCH_POWER
#endif



+ 2
- 2
driver/level2/trmv_thread.c View File

@@ -346,7 +346,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu

range_m[MAX_CPU_NUMBER - num_cpu - 1] = range_m[MAX_CPU_NUMBER - num_cpu] - width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
if (range_n[num_cpu] > m) range_n[num_cpu] = m;
if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu;

queue[num_cpu].mode = mode;
queue[num_cpu].routine = trmv_kernel;
@@ -386,7 +386,7 @@ int CNAME(BLASLONG m, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *bu

range_m[num_cpu + 1] = range_m[num_cpu] + width;
range_n[num_cpu] = num_cpu * (((m + 15) & ~15) + 16);
if (range_n[num_cpu] > m) range_n[num_cpu] = m;
if (range_n[num_cpu] > m * num_cpu) range_n[num_cpu] = m * num_cpu;

queue[num_cpu].mode = mode;
queue[num_cpu].routine = trmv_kernel;


+ 5
- 0
driver/others/blas_server_win32.c View File

@@ -461,13 +461,18 @@ int BLASFUNC(blas_thread_shutdown)(void){
SetEvent(pool.killed);

for(i = 0; i < blas_num_threads - 1; i++){
// Could also just use WaitForMultipleObjects
WaitForSingleObject(blas_threads[i], 5); //INFINITE);
#ifndef OS_WINDOWSSTORE
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
TerminateThread(blas_threads[i],0);
#endif
CloseHandle(blas_threads[i]);
}

CloseHandle(pool.filled);
CloseHandle(pool.killed);

blas_server_avail = 0;
}



+ 3
- 3
driver/others/dynamic.c View File

@@ -322,7 +322,7 @@ int support_avx2(){
}

int support_avx512(){
#ifndef NO_AVX512
#if !defined(NO_AVX) && !defined(NO_AVX512)
int eax, ebx, ecx, edx;
int ret=0;

@@ -566,8 +566,8 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
}
}
//Apollo Lake
if (model == 12) {
//Apollo Lake or Denverton
if (model == 12 || model == 15) {
return &gotoblas_NEHALEM;
}
return NULL;


+ 165
- 71
driver/others/memory.c View File

@@ -198,45 +198,68 @@ int get_num_procs(void);
#else
int get_num_procs(void) {
static int nums = 0;
cpu_set_t *cpusetp;
size_t size;
int ret;
int i,n;
cpu_set_t cpuset,*cpusetp;
size_t size;
int ret;

#if defined(__GLIBC_PREREQ)
#if !__GLIBC_PREREQ(2, 7)
int i;
#if !__GLIBC_PREREQ(2, 6)
int n;
#endif
#endif
#endif

if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
#if !defined(OS_LINUX)
return nums;
return nums;
#endif

#if !defined(__GLIBC_PREREQ)
return nums;
return nums;
#else
#if !__GLIBC_PREREQ(2, 3)
return nums;
return nums;
#endif

#if !__GLIBC_PREREQ(2, 7)
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
if (ret!=0) return nums;
n=0;
#if !__GLIBC_PREREQ(2, 6)
for (i=0;i<nums;i++)
if (CPU_ISSET(i,cpusetp)) n++;
if (CPU_ISSET(i,cpuset)) n++;
nums=n;
#else
nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
#endif
return nums;
#else
cpusetp = CPU_ALLOC(nums);
if (cpusetp == NULL) return nums;
size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) return nums;
ret = CPU_COUNT_S(size,cpusetp);
if (ret > 0 && ret < nums) nums = ret;
CPU_FREE(cpusetp);
return nums;
if (nums >= CPU_SETSIZE) {
cpusetp = CPU_ALLOC(nums);
if (cpusetp == NULL) {
return nums;
}
size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) {
CPU_FREE(cpusetp);
return nums;
}
ret = CPU_COUNT_S(size,cpusetp);
if (ret > 0 && ret < nums) nums = ret;
CPU_FREE(cpusetp);
return nums;
} else {
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
if (ret!=0) {
return nums;
}
ret = CPU_COUNT(&cpuset);
if (ret > 0 && ret < nums) nums = ret;
return nums;
}
#endif
#endif
}
@@ -1290,6 +1313,13 @@ void blas_memory_free_nolock(void * map_address) {
free(map_address);
}

#ifdef SMP
void blas_thread_memory_cleanup(void) {
blas_memory_cleanup((void*)get_memory_table());
}
#endif


void blas_shutdown(void){
#ifdef SMP
BLASFUNC(blas_thread_shutdown)();
@@ -1299,7 +1329,7 @@ void blas_shutdown(void){
/* Only cleanupIf we were built for threading and TLS was initialized */
if (local_storage_key)
#endif
blas_memory_cleanup((void*)get_memory_table());
blas_thread_memory_cleanup();

#ifdef SEEK_ADDRESS
base_address = 0UL;
@@ -1529,7 +1559,7 @@ BOOL APIENTRY DllMain(HMODULE hModule, DWORD ul_reason_for_call, LPVOID lpReser
break;
case DLL_THREAD_DETACH:
#if defined(SMP)
blas_memory_cleanup((void*)get_memory_table());
blas_thread_memory_cleanup();
#endif
break;
case DLL_PROCESS_DETACH:
@@ -1603,9 +1633,11 @@ void gotoblas_dummy_for_PGI(void) {
#endif

#else
/* USE_TLS / COMPILE_TLS not set */

#include <errno.h>

#ifdef OS_WINDOWS
#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
#define ALLOC_WINDOWS
#ifndef MEM_LARGE_PAGES
#define MEM_LARGE_PAGES 0x20000000
@@ -1619,7 +1651,7 @@ void gotoblas_dummy_for_PGI(void) {
#include <stdio.h>
#include <fcntl.h>

#ifndef OS_WINDOWS
#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)
#include <sys/mman.h>
#ifndef NO_SYSV_IPC
#include <sys/shm.h>
@@ -1639,7 +1671,7 @@ void gotoblas_dummy_for_PGI(void) {
#include <sys/resource.h>
#endif

#if defined(OS_FREEBSD) || defined(OS_DARWIN)
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
#include <sys/sysctl.h>
#include <sys/resource.h>
#endif
@@ -1678,9 +1710,12 @@ void gotoblas_dummy_for_PGI(void) {
#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
#define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor))
#else
#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
#define CONSTRUCTOR __attribute__ ((constructor(101)))
#define DESTRUCTOR __attribute__ ((destructor(101)))
#else
#define CONSTRUCTOR __attribute__ ((constructor))
#define DESTRUCTOR __attribute__ ((destructor))
#endif

#ifdef DYNAMIC_ARCH
@@ -1704,45 +1739,70 @@ void goto_set_num_threads(int num_threads) {};
int get_num_procs(void);
#else
int get_num_procs(void) {

static int nums = 0;
cpu_set_t *cpusetp;
size_t size;
int ret;
int i,n;
cpu_set_t cpuset,*cpusetp;
size_t size;
int ret;

#if defined(__GLIBC_PREREQ)
#if !__GLIBC_PREREQ(2, 7)
int i;
#if !__GLIBC_PREREQ(2, 6)
int n;
#endif
#endif
#endif

if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
#if !defined(OS_LINUX)
return nums;
return nums;
#endif

#if !defined(__GLIBC_PREREQ)
return nums;
return nums;
#else
#if !__GLIBC_PREREQ(2, 3)
return nums;
return nums;
#endif

#if !__GLIBC_PREREQ(2, 7)
ret = sched_getaffinity(0,sizeof(cpu_set_t), cpusetp);
ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
if (ret!=0) return nums;
n=0;
#if !__GLIBC_PREREQ(2, 6)
for (i=0;i<nums;i++)
if (CPU_ISSET(i,cpusetp)) n++;
if (CPU_ISSET(i,cpuset)) n++;
nums=n;
#else
nums = CPU_COUNT(sizeof(cpu_set_t),cpusetp);
nums = CPU_COUNT(sizeof(cpuset),&cpuset);
#endif
return nums;
#else
cpusetp = CPU_ALLOC(nums);
if (cpusetp == NULL) return nums;
size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) return nums;
nums = CPU_COUNT_S(size,cpusetp);
CPU_FREE(cpusetp);
return nums;
if (nums >= CPU_SETSIZE) {
cpusetp = CPU_ALLOC(nums);
if (cpusetp == NULL) {
return nums;
}
size = CPU_ALLOC_SIZE(nums);
ret = sched_getaffinity(0,size,cpusetp);
if (ret!=0) {
CPU_FREE(cpusetp);
return nums;
}
ret = CPU_COUNT_S(size,cpusetp);
if (ret > 0 && ret < nums) nums = ret;
CPU_FREE(cpusetp);
return nums;
} else {
ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
if (ret!=0) {
return nums;
}
ret = CPU_COUNT(&cpuset);
if (ret > 0 && ret < nums) nums = ret;
return nums;
}
#endif
#endif
}
@@ -1756,7 +1816,7 @@ int get_num_procs(void) {
return nums;
}
#endif
#ifdef OS_HAIKU
int get_num_procs(void) {
static int nums = 0;
@@ -1793,7 +1853,7 @@ int get_num_procs(void) {

#endif

#if defined(OS_FREEBSD)
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY)

int get_num_procs(void) {

@@ -1870,7 +1930,7 @@ void openblas_fork_handler()
// http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
// In the mean time build with USE_OPENMP=0 or link against another
// implementation of OpenMP.
#if !(defined(OS_WINDOWS) || defined(OS_ANDROID)) && defined(SMP_SERVER)
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
int err;
err = pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL);
if(err != 0)
@@ -1883,7 +1943,7 @@ extern int openblas_goto_num_threads_env();
extern int openblas_omp_num_threads_env();

int blas_get_cpu_number(void){
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
int max_num;
#endif
int blas_goto_num = 0;
@@ -1891,11 +1951,11 @@ int blas_get_cpu_number(void){

if (blas_num_threads) return blas_num_threads;

#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
max_num = get_num_procs();
#endif

blas_goto_num = 0;
// blas_goto_num = 0;
#ifndef USE_OPENMP
blas_goto_num=openblas_num_threads_env();
if (blas_goto_num < 0) blas_goto_num = 0;
@@ -1907,7 +1967,7 @@ int blas_get_cpu_number(void){

#endif

blas_omp_num = 0;
// blas_omp_num = 0;
blas_omp_num=openblas_omp_num_threads_env();
if (blas_omp_num < 0) blas_omp_num = 0;

@@ -1915,7 +1975,7 @@ int blas_get_cpu_number(void){
else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
else blas_num_threads = MAX_CPU_NUMBER;

#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) || defined(OS_ANDROID)
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID)
if (blas_num_threads > max_num) blas_num_threads = max_num;
#endif

@@ -2002,11 +2062,15 @@ static void *alloc_mmap(void *address){
}

if (map_address != (void *)-1) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
}

#ifdef OS_LINUX
@@ -2148,14 +2212,18 @@ static void *alloc_mmap(void *address){
#if defined(OS_LINUX) && !defined(NO_WARMUP)
}
#endif
LOCK_COMMAND(&alloc_lock);

if (map_address != (void *)-1) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
release_info[release_pos].address = map_address;
release_info[release_pos].func = alloc_mmap_free;
release_pos ++;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
}
UNLOCK_COMMAND(&alloc_lock);

return map_address;
}
@@ -2523,7 +2591,7 @@ void *blas_memory_alloc(int procpos){

int position;
#if defined(WHEREAMI) && !defined(USE_OPENMP)
int mypos;
int mypos = 0;
#endif

void *map_address;
@@ -2554,6 +2622,11 @@ void *blas_memory_alloc(int procpos){
NULL,
};
void *(**func)(void *address);

#if defined(USE_OPENMP)
if (!memory_initialized) {
#endif

LOCK_COMMAND(&alloc_lock);

if (!memory_initialized) {
@@ -2589,6 +2662,9 @@ void *blas_memory_alloc(int procpos){

}
UNLOCK_COMMAND(&alloc_lock);
#if defined(USE_OPENMP)
}
#endif

#ifdef DEBUG
printf("Alloc Start ...\n");
@@ -2603,13 +2679,17 @@ void *blas_memory_alloc(int procpos){

do {
if (!memory[position].used && (memory[position].pos == mypos)) {
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
// blas_lock(&memory[position].lock);

#else
blas_lock(&memory[position].lock);
#endif
if (!memory[position].used) goto allocation;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
// blas_unlock(&memory[position].lock);
#else
blas_unlock(&memory[position].lock);
#endif
}

position ++;
@@ -2621,21 +2701,26 @@ void *blas_memory_alloc(int procpos){

position = 0;

#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
do {
/* if (!memory[position].used) { */
/* blas_lock(&memory[position].lock);*/

#if defined(USE_OPENMP)
if (!memory[position].used) {
blas_lock(&memory[position].lock);
#endif
if (!memory[position].used) goto allocation;
/* blas_unlock(&memory[position].lock);*/
/* } */

#if defined(USE_OPENMP)
blas_unlock(&memory[position].lock);
}
#endif
position ++;

} while (position < NUM_BUFFERS);
UNLOCK_COMMAND(&alloc_lock);

#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
goto error;

allocation :
@@ -2645,10 +2730,11 @@ void *blas_memory_alloc(int procpos){
#endif

memory[position].used = 1;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
/* blas_unlock(&memory[position].lock);*/

#else
blas_unlock(&memory[position].lock);
#endif
if (!memory[position].addr) {
do {
#ifdef DEBUG
@@ -2693,9 +2779,13 @@ void *blas_memory_alloc(int procpos){

} while ((BLASLONG)map_address == -1);

#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
memory[position].addr = map_address;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif

#ifdef DEBUG
printf(" Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
@@ -2749,8 +2839,9 @@ void blas_memory_free(void *free_area){
#endif

position = 0;
#if defined(SMP) && !defined(USE_OPENMP)
LOCK_COMMAND(&alloc_lock);
#endif
while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
position++;

@@ -2764,7 +2855,9 @@ void blas_memory_free(void *free_area){
WMB;

memory[position].used = 0;
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif

#ifdef DEBUG
printf("Unmap Succeeded.\n\n");
@@ -2779,8 +2872,9 @@ void blas_memory_free(void *free_area){
for (position = 0; position < NUM_BUFFERS; position++)
printf("%4ld %p : %d\n", position, memory[position].addr, memory[position].used);
#endif
#if defined(SMP) && !defined(USE_OPENMP)
UNLOCK_COMMAND(&alloc_lock);
#endif
return;
}



+ 9
- 0
exports/Makefile View File

@@ -141,6 +141,14 @@ else
$(OBJCOPY) --redefine-syms objcopy.def ../$(LIBNAME) ../$(LIBNAME).renamed
../$(LIBSONAME) : ../$(LIBNAME).renamed linktest.c
endif

ifeq ($(F_COMPILER), INTEL)
$(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive $< -Wl,--no-whole-archive \
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
else

ifneq ($(C_COMPILER), LSB)
$(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \
-Wl,--whole-archive $< -Wl,--no-whole-archive \
@@ -152,6 +160,7 @@ else
-Wl,--whole-archive $< -Wl,--no-whole-archive \
-Wl,-soname,$(INTERNALNAME) $(EXTRALIB)
$(FC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK.
endif
endif
rm -f linktest



+ 17
- 7
exports/dllinit.c View File

@@ -40,15 +40,25 @@

void gotoblas_init(void);
void gotoblas_quit(void);
#if defined(SMP) && defined(USE_TLS)
void blas_thread_memory_cleanup(void);
#endif

BOOL APIENTRY DllMain(HINSTANCE hInst, DWORD reason, LPVOID reserved) {

if (reason == DLL_PROCESS_ATTACH) {
gotoblas_init();
}

if (reason == DLL_PROCESS_DETACH) {
gotoblas_quit();
switch(reason) {
case DLL_PROCESS_ATTACH:
gotoblas_init();
break;
case DLL_PROCESS_DETACH:
gotoblas_quit();
break;
case DLL_THREAD_ATTACH:
break;
case DLL_THREAD_DETACH:
#if defined(SMP) && defined(USE_TLS)
blas_thread_memory_cleanup();
#endif
break;
}

return TRUE;


+ 46
- 0
getarch.c View File

@@ -91,6 +91,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <unistd.h>
#endif

#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6))
#else
#define NO_AVX512
#endif
/* #define FORCE_P2 */
/* #define FORCE_KATMAI */
/* #define FORCE_COPPERMINE */
@@ -327,6 +331,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif

#ifdef FORCE_SKYLAKEX
#ifdef NO_AVX512
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "HASWELL"
#define ARCHCONFIG "-DHASWELL " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \
"-DFMA3"
#define LIBNAME "haswell"
#define CORENAME "HASWELL"
#else
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
@@ -340,6 +358,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define LIBNAME "skylakex"
#define CORENAME "SKYLAKEX"
#endif
#endif

#ifdef FORCE_ATOM
#define FORCE
@@ -1058,6 +1077,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#endif

#ifdef FORCE_TSV110
#define FORCE
#define ARCHITECTURE "ARM64"
#define SUBARCHITECTURE "TSV110"
#define SUBDIRNAME "arm64"
#define ARCHCONFIG "-DTSV110 " \
"-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=4 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=4 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
"-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8"
#define LIBNAME "tsv110"
#define CORENAME "TSV110"
#else
#endif


#ifdef FORCE_ZARCH_GENERIC
#define FORCE
#define ARCHITECTURE "ZARCH"
@@ -1078,6 +1114,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "Z13"
#endif

#ifdef FORCE_Z14
#define FORCE
#define ARCHITECTURE "ZARCH"
#define SUBARCHITECTURE "Z14"
#define ARCHCONFIG "-DZ14 " \
"-DDTB_DEFAULT_ENTRIES=64"
#define LIBNAME "z14"
#define CORENAME "Z14"
#endif

#ifndef FORCE

#ifdef USER_TARGET


+ 1
- 4
interface/trmv.c View File

@@ -218,11 +218,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
buffer = (FLOAT *)blas_memory_alloc(1);

#ifdef SMP
/* nthreads = num_cpu_avail(2);
nthreads = num_cpu_avail(2);

FIXME trmv_thread was found to be broken, see issue 1332 */
nthreads = 1;
if (nthreads == 1) {
#endif



+ 12
- 2
interface/trsm.c View File

@@ -81,6 +81,12 @@
#endif
#endif

#ifndef COMPLEX
#define SMP_FACTOR 256
#else
#define SMP_FACTOR 128
#endif

static int (*trsm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
#ifndef TRMM
TRSM_LNUU, TRSM_LNUN, TRSM_LNLU, TRSM_LNLN,
@@ -366,11 +372,15 @@ void CNAME(enum CBLAS_ORDER order,
mode |= (trans << BLAS_TRANSA_SHIFT);
mode |= (side << BLAS_RSIDE_SHIFT);

if ( args.m < 2*GEMM_MULTITHREAD_THRESHOLD )
/*
if ( args.m < 2 * GEMM_MULTITHREAD_THRESHOLD )
args.nthreads = 1;
else
if ( args.n < 2*GEMM_MULTITHREAD_THRESHOLD )
if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD )
args.nthreads = 1;
*/
if ( args.m * args.n < SMP_FACTOR * GEMM_MULTITHREAD_THRESHOLD)
args.nthreads = 1;
else
args.nthreads = num_cpu_avail(3);


+ 0
- 3
interface/ztrmv.c View File

@@ -239,9 +239,6 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
} else
nthreads = 1;

/* FIXME TRMV multithreading appears to be broken, see issue 1332*/
nthreads = 1;

if(nthreads > 1) {
buffer_size = n > 16 ? 0 : n * 4 + 40;
}


+ 5
- 1
kernel/Makefile.L3 View File

@@ -24,7 +24,7 @@ ifeq ($(TARGET), LOONGSON3B)
USE_TRMM = 1
endif

ifeq ($(TARGET), GENERIC)
ifeq ($(CORE), GENERIC)
USE_TRMM = 1
endif

@@ -52,6 +52,10 @@ ifeq ($(ARCH), zarch)
USE_TRMM = 1
endif

ifeq ($(CORE), Z14)
USE_TRMM = 1
endif






+ 1
- 1
kernel/arm/imin.c View File

@@ -53,7 +53,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)

while(i < n)
{
if( x[ix] > minf )
if( x[ix] < minf )
{
min = i;
minf = x[ix];


+ 175
- 0
kernel/arm64/KERNEL.TSV110 View File

@@ -0,0 +1,175 @@
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c

SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c

SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c

ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c

ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c

ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c

STRMMKERNEL = ../generic/trmmkernel_4x4.c
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c

STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

SAMAXKERNEL = amax.S
DAMAXKERNEL = amax.S
CAMAXKERNEL = zamax.S
ZAMAXKERNEL = zamax.S

ISAMAXKERNEL = iamax.S
IDAMAXKERNEL = iamax.S
ICAMAXKERNEL = izamax.S
IZAMAXKERNEL = izamax.S

SASUMKERNEL = asum.S
DASUMKERNEL = asum.S
CASUMKERNEL = casum.S
ZASUMKERNEL = zasum.S

SAXPYKERNEL = axpy.S
DAXPYKERNEL = axpy.S
CAXPYKERNEL = zaxpy.S
ZAXPYKERNEL = zaxpy.S

SCOPYKERNEL = copy.S
DCOPYKERNEL = copy.S
CCOPYKERNEL = copy.S
ZCOPYKERNEL = copy.S

SDOTKERNEL = dot.S
DDOTKERNEL = dot.S
CDOTKERNEL = zdot.S
ZDOTKERNEL = zdot.S
DSDOTKERNEL = dot.S

SNRM2KERNEL = nrm2.S
DNRM2KERNEL = nrm2.S
CNRM2KERNEL = znrm2.S
ZNRM2KERNEL = znrm2.S

SROTKERNEL = rot.S
DROTKERNEL = rot.S
CROTKERNEL = zrot.S
ZROTKERNEL = zrot.S

SSCALKERNEL = scal.S
DSCALKERNEL = scal.S
CSCALKERNEL = zscal.S
ZSCALKERNEL = zscal.S

SSWAPKERNEL = swap.S
DSWAPKERNEL = swap.S
CSWAPKERNEL = swap.S
ZSWAPKERNEL = swap.S

SGEMVNKERNEL = gemv_n.S
DGEMVNKERNEL = gemv_n.S
CGEMVNKERNEL = zgemv_n.S
ZGEMVNKERNEL = zgemv_n.S

SGEMVTKERNEL = gemv_t.S
DGEMVTKERNEL = gemv_t.S
CGEMVTKERNEL = zgemv_t.S
ZGEMVTKERNEL = zgemv_t.S

SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)

DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S

ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))

ifeq ($(DGEMM_UNROLL_M), 8)
DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S
DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S
else
DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
endif

DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif

ifeq ($(DGEMM_UNROLL_N), 4)
DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S
DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S
else
DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
endif

DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)

CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)

ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)


+ 1
- 1
kernel/mips/imin.c View File

@@ -45,7 +45,7 @@ BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)

while(i < n)
{
if( x[ix] > minf )
if( x[ix] < minf )
{
min = i;
minf = x[ix];


+ 1
- 1
kernel/power/gemm_beta.S View File

@@ -129,7 +129,7 @@ LL(12):
STFD f0, 14 * SIZE(CO1)
STFD f0, 15 * SIZE(CO1)

dcbst PRE, CO1
dcbtst PRE, CO1
addi CO1, CO1, 16 * SIZE
bdnz LL(12)
.align 4


+ 1
- 1
kernel/power/zgemm_beta.S View File

@@ -134,7 +134,7 @@ LL(12):
STFD f0, 14 * SIZE(CO1)
STFD f0, 15 * SIZE(CO1)

dcbst PRE, CO1
dcbtst PRE, CO1
addi CO1, CO1, 16 * SIZE
bdnz LL(12)
.align 4


+ 7
- 7
kernel/x86_64/caxpy_microk_bulldozer-2.c View File

@@ -114,9 +114,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha), // 4
@@ -180,10 +180,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha), // 4


+ 3
- 3
kernel/x86_64/caxpy_microk_haswell-2.c View File

@@ -112,9 +112,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha), // 4


+ 4
- 4
kernel/x86_64/caxpy_microk_sandy-2.c View File

@@ -95,10 +95,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha), // 4


+ 7
- 7
kernel/x86_64/caxpy_microk_steamroller-2.c View File

@@ -113,10 +113,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha), // 4
@@ -181,9 +181,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha), // 4


+ 7
- 7
kernel/x86_64/cdot_microk_bulldozer-2.c View File

@@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4
@@ -175,10 +175,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovups %%xmm4, 16(%4) \n\t"
"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4


+ 3
- 3
kernel/x86_64/cdot_microk_haswell-2.c View File

@@ -98,9 +98,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4


+ 4
- 4
kernel/x86_64/cdot_microk_sandy-2.c View File

@@ -105,10 +105,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovups %%xmm4, 16(%4) \n\t"
"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4


+ 7
- 7
kernel/x86_64/cdot_microk_steamroller-2.c View File

@@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4
@@ -175,10 +175,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovups %%xmm4, 16(%4) \n\t"
"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4


+ 16
- 16
kernel/x86_64/cscal_microk_bulldozer-2.c View File

@@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"

:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"%0", "%1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -208,11 +208,11 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"

:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"%0", "%1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -285,11 +285,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"

:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"%0", "%1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -330,11 +330,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"

:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"%0", "%1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",


+ 15
- 15
kernel/x86_64/cscal_microk_haswell-2.c View File

@@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"

:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"0", "1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -208,9 +208,9 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"

:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", // "0", "1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
@@ -285,9 +285,9 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"

:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"%0", "%1",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
@@ -329,12 +329,12 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)

"vzeroupper \n\t"

:
:
"r" (n), // 0
"r" (x), // 1
:
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"0", "1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",


+ 16
- 16
kernel/x86_64/cscal_microk_steamroller-2.c View File

@@ -117,11 +117,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"

:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"0", "1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -208,12 +208,12 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)

"vzeroupper \n\t"

:
"+r" (n), // 0
"+r" (x) // 1
:
:
"r" (n), // 0
"r" (x), // 1
"r" (alpha) // 2
: "cc", //"0", "1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -286,11 +286,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"

:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"%0", "%1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
@@ -331,11 +331,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"

:
:
"r" (n), // 0
"r" (x), // 1
"+r" (n), // 0
"+r" (x) // 1
:
"r" (alpha) // 2
: "cc", //"0", "1",
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
"%xmm8", "%xmm9", "%xmm10", "%xmm11",


+ 3
- 3
kernel/x86_64/daxpy_microk_bulldozer-2.c View File

@@ -64,9 +64,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4


+ 4
- 4
kernel/x86_64/daxpy_microk_haswell-2.c View File

@@ -59,10 +59,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4


+ 3
- 3
kernel/x86_64/daxpy_microk_nehalem-2.c View File

@@ -73,9 +73,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4


+ 8
- 8
kernel/x86_64/daxpy_microk_piledriver-2.c View File

@@ -78,10 +78,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"subq $16, %1 \n\t"
"jnz 1b \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4
@@ -140,10 +140,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"subq $16, %1 \n\t"
"jnz 1b \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4


+ 4
- 4
kernel/x86_64/daxpy_microk_sandy-2.c View File

@@ -99,10 +99,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)

"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4


+ 8
- 8
kernel/x86_64/daxpy_microk_steamroller-2.c View File

@@ -78,10 +78,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"subq $16, %1 \n\t"
"jnz 1b \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4
@@ -140,10 +140,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"subq $16, %1 \n\t"
"jnz 1b \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4


+ 4
- 4
kernel/x86_64/ddot_microk_bulldozer-2.c View File

@@ -65,10 +65,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)

"vmovsd %%xmm4, (%4) \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4


+ 3
- 3
kernel/x86_64/ddot_microk_haswell-2.c View File

@@ -77,9 +77,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4


+ 4
- 4
kernel/x86_64/ddot_microk_nehalem-2.c View File

@@ -75,10 +75,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)

"movsd %%xmm4, (%4) \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4


+ 8
- 8
kernel/x86_64/ddot_microk_piledriver-2.c View File

@@ -81,10 +81,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovsd %%xmm4, (%4) \n\t"
"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4
@@ -145,10 +145,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovsd %%xmm4, (%4) \n\t"
"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4


+ 4
- 4
kernel/x86_64/ddot_microk_sandy-2.c View File

@@ -81,10 +81,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovsd %%xmm4, (%4) \n\t"
"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4


+ 4
- 4
kernel/x86_64/ddot_microk_steamroller-2.c View File

@@ -78,10 +78,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovsd %%xmm4, (%4) \n\t"
"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4


+ 5
- 5
kernel/x86_64/dgemv_n_4.c View File

@@ -111,9 +111,9 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"jnz 1b \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
@@ -166,9 +166,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
"jnz 1b \n\t"

:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap), // 4


+ 1
- 0
kernel/x86_64/dgemv_n_microk_haswell-4.c View File

@@ -104,6 +104,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"r" (ap[3]), // 7
"r" (alpha) // 8
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",


+ 49
- 49
kernel/x86_64/dgemv_n_microk_piledriver-4.c View File

@@ -38,42 +38,42 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastsd (%2), %%ymm12 \n\t" // x0
"vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
"vbroadcastsd 16(%2), %%ymm14 \n\t" // x2
"vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
"vbroadcastsd 32(%2), %%ymm0 \n\t" // x4
"vbroadcastsd 40(%2), %%ymm1 \n\t" // x5
"vbroadcastsd 48(%2), %%ymm2 \n\t" // x6
"vbroadcastsd 56(%2), %%ymm3 \n\t" // x7
"vbroadcastsd (%3), %%ymm12 \n\t" // x0
"vbroadcastsd 8(%3), %%ymm13 \n\t" // x1
"vbroadcastsd 16(%3), %%ymm14 \n\t" // x2
"vbroadcastsd 24(%3), %%ymm15 \n\t" // x3
"vbroadcastsd 32(%3), %%ymm0 \n\t" // x4
"vbroadcastsd 40(%3), %%ymm1 \n\t" // x5
"vbroadcastsd 48(%3), %%ymm2 \n\t" // x6
"vbroadcastsd 56(%3), %%ymm3 \n\t" // x7

"vbroadcastsd (%9), %%ymm6 \n\t" // alpha

"testq $0x04, %1 \n\t"
"jz 2f \n\t"

"vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
"vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"

"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t"
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t"
"vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
"vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t"

"vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t"
"vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t"
"vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t"
"vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t"
"vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
"vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t"
"vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
"vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t"

"vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
"vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
"vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"


"vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y
"vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y

"addq $4 , %8 \n\t"
"addq $4 , %2 \n\t"
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"

@@ -88,35 +88,35 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO

"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
"vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y
"vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
"vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
"vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
"vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
"vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
"vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
"vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t"
"vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y
"vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y
"vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
"vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t"
"vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t"
"vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t"
"vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
"vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t"
"vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t"
"vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t"
"vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
"addq $8 , %0 \n\t"
"vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t"
"vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t"
"vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t"
"vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t"
"vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t"
"vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t"
"vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t"
"vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t"
"vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t"
"vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t"
"vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
"vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t"
"vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t"
"vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t"

"vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
"vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"

"addq $8 , %8 \n\t"
"addq $8 , %2 \n\t"
"vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y
"subq $8 , %1 \n\t"
"vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y
"vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y

"jnz 1b \n\t"

@@ -125,15 +125,15 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO

:
"+r" (i), // 0
"+r" (n) // 1
"+r" (n), // 1
"+r" (lda4) // 2
:
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (x), // 3
"r" (y), // 4
"r" (ap[0]), // 5
"r" (ap[1]), // 6
"r" (ap[2]), // 7
"r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",


+ 9
- 9
kernel/x86_64/dgemv_t_4.c View File

@@ -127,9 +127,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
"movsd %%xmm11,8(%2) \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (y), // 2
"r" (ap0), // 3
"r" (ap1), // 4
@@ -195,9 +195,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"movsd %%xmm10, (%2) \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (y), // 2
"r" (ap), // 3
"r" (x) // 4
@@ -259,9 +259,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
"jnz 1b \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (&da), // 2
"r" (src), // 3
"r" (dest) // 4


+ 3
- 3
kernel/x86_64/dger_microk_sandy-2.c View File

@@ -105,9 +105,9 @@ static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4


+ 1
- 1
kernel/x86_64/dscal.c View File

@@ -136,8 +136,8 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_
"jnz 1b \n\t"

:
"+r" (n) // 0
:
"r" (n), // 0
"r" (x), // 1
"r" (x1), // 2
"r" (alpha), // 3


+ 6
- 6
kernel/x86_64/dscal_microk_bulldozer-2.c View File

@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"

:
:
"r" (n1), // 0
"r" (x), // 1
"+r" (n1), // 0
"+r" (x) // 1
:
"r" (alpha), // 2
"r" (n2) // 3
: "cc",
@@ -188,9 +188,9 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"

:
:
"r" (n1), // 0
"r" (x), // 1
"+r" (n1), // 0
"+r" (x) // 1
:
"r" (alpha), // 2
"r" (n2) // 3
: "cc",


+ 6
- 6
kernel/x86_64/dscal_microk_haswell-2.c View File

@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"

:
:
"r" (n1), // 0
"r" (x), // 1
"+r" (n1), // 0
"+r" (x) // 1
:
"r" (alpha), // 2
"r" (n2) // 3
: "cc",
@@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)

"vzeroupper \n\t"

:
"+r" (n1), // 0
"+r" (x) // 1
:
:
"r" (n1), // 0
"r" (x), // 1
"r" (alpha), // 2
"r" (n2) // 3
: "cc",


+ 6
- 6
kernel/x86_64/dscal_microk_sandy-2.c View File

@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
"vzeroupper \n\t"

:
:
"r" (n1), // 0
"r" (x), // 1
"+r" (n1), // 0
"+r" (x) // 1
:
"r" (alpha), // 2
"r" (n2) // 3
: "cc",
@@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)

"vzeroupper \n\t"

:
"+r" (n1), // 0
"+r" (x) // 1
:
:
"r" (n1), // 0
"r" (x), // 1
"r" (alpha), // 2
"r" (n2) // 3
: "cc",


+ 2
- 2
kernel/x86_64/dsymv_L_microk_bulldozer-2.c View File

@@ -113,8 +113,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2

:
:
"r" (from), // 0
"+r" (from) // 0
:
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3


+ 2
- 2
kernel/x86_64/dsymv_L_microk_haswell-2.c View File

@@ -105,8 +105,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vzeroupper \n\t"

:
:
"r" (from), // 0
"+r" (from) // 0
:
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3


+ 2
- 2
kernel/x86_64/dsymv_L_microk_nehalem-2.c View File

@@ -108,8 +108,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"movsd %%xmm3 , 24(%9) \n\t" // save temp2

:
:
"r" (from), // 0
"+r" (from) // 0
:
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3


+ 2
- 2
kernel/x86_64/dsymv_L_microk_sandy-2.c View File

@@ -114,8 +114,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vzeroupper \n\t"

:
:
"r" (from), // 0
"+r" (from) // 0
:
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3


+ 3
- 3
kernel/x86_64/dsymv_U_microk_bulldozer-2.c View File

@@ -106,9 +106,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2

:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4


+ 3
- 3
kernel/x86_64/dsymv_U_microk_haswell-2.c View File

@@ -107,9 +107,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4


+ 3
- 3
kernel/x86_64/dsymv_U_microk_nehalem-2.c View File

@@ -101,9 +101,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"movsd %%xmm3 , 24(%9) \n\t" // save temp2

:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4


+ 3
- 3
kernel/x86_64/dsymv_U_microk_sandy-2.c View File

@@ -116,9 +116,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4


+ 49
- 49
kernel/x86_64/dtrsm_kernel_RN_haswell.c View File

@@ -119,9 +119,9 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" cmpq $0, %0 \n\t"
" je 4f \n\t"

" vmovups (%2,%1,4), %%ymm0 \n\t" // read a
" vmovups (%3,%1,8), %%ymm1 \n\t" // read b0
" vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1
" vmovups (%8,%1,4), %%ymm0 \n\t" // read a
" vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
" vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1


" addq $8, %1 \n\t"
@@ -131,18 +131,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" .p2align 4 \n\t"
"1: \n\t"

" vmovups (%2,%1,4), %%ymm4 \n\t" // read a
" vmovups (%8,%1,4), %%ymm4 \n\t" // read a
" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"

" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t"
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t"

" vmovups (%3,%1,8), %%ymm5 \n\t" // read b0
" vmovups (%9,%1,8), %%ymm5 \n\t" // read b0
" vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t"
" vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t"

" vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t"
" vmovups 32(%3,%1,8), %%ymm6 \n\t" // read b1
" vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1
" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t"
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t"
@@ -155,18 +155,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON

" jz 22f \n\t"

" vmovups (%2,%1,4), %%ymm0 \n\t" // read a
" vmovups (%8,%1,4), %%ymm0 \n\t" // read a

" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t"

" vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t"
" vmovups (%3,%1,8), %%ymm1 \n\t" // read b0
" vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t"

" vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t"
" vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1
" vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t"

@@ -268,7 +268,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7

" vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t"
" vmovups (%9), %%ymm0 \n\t"
" vmovups (%3), %%ymm0 \n\t"
" vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t"
" vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t"
" vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t"
@@ -278,7 +278,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t"

" vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t"
" vmovups 32(%9), %%ymm4 \n\t"
" vmovups 32(%3), %%ymm4 \n\t"
" vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t"
" vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t"
" vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t"
@@ -290,15 +290,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON

"5: \n\t" // i = 0

" addq $64, %9 \n\t" // b=b+8
" addq $64, %3 \n\t" // b=b+8

" vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb
" vmovups (%9), %%ymm0 \n\t"
" vmovups %%ymm8 , (%8) \n\t" // write a
" vmovups (%3), %%ymm0 \n\t"
" vmovups %%ymm8 , (%2) \n\t" // write a
" vmovups %%ymm8 , (%4) \n\t" // write c

" vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t"
" vmovups 32(%9), %%ymm1 \n\t"
" vmovups 32(%3), %%ymm1 \n\t"
" vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t"
" vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t"
" vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t"
@@ -313,15 +313,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"

" addq $64, %9 \n\t" // b=b+8
" addq $32, %8 \n\t" // a=a+8
" addq $64, %3 \n\t" // b=b+8
" addq $32, %2 \n\t" // a=a+8



" vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb
" vmovups (%9), %%ymm0 \n\t"
" vmovups 32(%9), %%ymm1 \n\t"
" vmovups %%ymm9 , (%8) \n\t" // write a
" vmovups (%3), %%ymm0 \n\t"
" vmovups 32(%3), %%ymm1 \n\t"
" vmovups %%ymm9 , (%2) \n\t" // write a
" vmovups %%ymm9 , (%4,%7,1) \n\t" // write c

" vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t"
@@ -337,13 +337,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"

" addq $64, %9 \n\t" // b=b+8
" addq $32, %8 \n\t" // a=a+8
" addq $64, %3 \n\t" // b=b+8
" addq $32, %2 \n\t" // a=a+8

" vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb
" vmovups (%9), %%ymm0 \n\t"
" vmovups 32(%9), %%ymm1 \n\t"
" vmovups %%ymm10, (%8) \n\t" // write a
" vmovups (%3), %%ymm0 \n\t"
" vmovups 32(%3), %%ymm1 \n\t"
" vmovups %%ymm10, (%2) \n\t" // write a
" vmovups %%ymm10, (%4,%7,2) \n\t" // write c

" vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t"
@@ -358,14 +358,14 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"


" addq $64, %9 \n\t" // b=b+8
" addq $32, %8 \n\t" // a=a+8
" addq $64, %3 \n\t" // b=b+8
" addq $32, %2 \n\t" // a=a+8



" vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb
" vmovups 32(%9), %%ymm1 \n\t"
" vmovups %%ymm11, (%8) \n\t" // write a
" vmovups 32(%3), %%ymm1 \n\t"
" vmovups %%ymm11, (%2) \n\t" // write a
" vmovups %%ymm11, (%5) \n\t" // write c

" vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t"
@@ -378,13 +378,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t"


" addq $64, %9 \n\t" // b=b+8
" addq $32, %8 \n\t" // a=a+8
" addq $64, %3 \n\t" // b=b+8
" addq $32, %2 \n\t" // a=a+8


" vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb
" vmovups 32(%9), %%ymm1 \n\t"
" vmovups %%ymm12, (%8) \n\t" // write a
" vmovups 32(%3), %%ymm1 \n\t"
" vmovups %%ymm12, (%2) \n\t" // write a
" vmovups %%ymm12, (%5,%7,1) \n\t" // write c

" vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t"
@@ -394,12 +394,12 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
" vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t"

" addq $64, %9 \n\t" // b=b+8
" addq $32, %8 \n\t" // a=a+8
" addq $64, %3 \n\t" // b=b+8
" addq $32, %2 \n\t" // a=a+8

" vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb
" vmovups 32(%9), %%ymm1 \n\t"
" vmovups %%ymm13, (%8) \n\t" // write a
" vmovups 32(%3), %%ymm1 \n\t"
" vmovups %%ymm13, (%2) \n\t" // write a
" vmovups %%ymm13, (%5,%7,2) \n\t" // write c

" vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t"
@@ -408,39 +408,39 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t"


" addq $64, %9 \n\t" // b=b+8
" addq $32, %8 \n\t" // a=a+8
" addq $64, %3 \n\t" // b=b+8
" addq $32, %2 \n\t" // a=a+8


" vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb
" vmovups 32(%9), %%ymm1 \n\t"
" vmovups %%ymm14, (%8) \n\t" // write a
" vmovups 32(%3), %%ymm1 \n\t"
" vmovups %%ymm14, (%2) \n\t" // write a
" vmovups %%ymm14, (%6) \n\t" // write c

" vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t"

" vpermpd $0xff , %%ymm1 , %%ymm0 \n\t"

" addq $32, %8 \n\t" // a=a+8
" addq $32, %2 \n\t" // a=a+8

" vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb
" vmovups %%ymm15, (%8) \n\t" // write a
" vmovups %%ymm15, (%2) \n\t" // write a
" vmovups %%ymm15, (%6,%7,1) \n\t" // write c

" vzeroupper \n\t"

:
"+r" (n1), // 0
"+a" (i), // 1
"+r" (as), // 2
"+r" (bs) // 3
:
"r" (n1), // 0
"a" (i), // 1
"r" (a), // 2
"r" (b), // 3
"r" (c), // 4
"r" (c3), // 5
"r" (c6), // 6
"r" (ldc), // 7
"r" (as), // 8
"r" (bs) // 9
"r" (a), // 8
"r" (b) // 9
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",


+ 48
- 48
kernel/x86_64/dtrsm_kernel_RT_bulldozer.c View File

@@ -125,14 +125,14 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" .align 16 \n\t"
"1: \n\t"

" prefetcht0 384(%2,%1,8) \n\t"
" prefetcht0 384(%3,%1,8) \n\t"
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
" vmovups (%2,%1,8), %%xmm4 \n\t"
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
" prefetcht0 384(%6,%1,8) \n\t"
" prefetcht0 384(%7,%1,8) \n\t"
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
" vmovups (%6,%1,8), %%xmm4 \n\t"
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%6,%1,8), %%xmm7 \n\t"

" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -147,13 +147,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON

" jz 2f \n\t"

" prefetcht0 384(%2,%1,8) \n\t"
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
" vmovups (%2,%1,8), %%xmm4 \n\t"
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
" prefetcht0 384(%6,%1,8) \n\t"
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
" vmovups (%6,%1,8), %%xmm4 \n\t"
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%6,%1,8), %%xmm7 \n\t"

" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -168,13 +168,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON

" jz 2f \n\t"

" prefetcht0 384(%2,%1,8) \n\t"
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
" vmovups (%2,%1,8), %%xmm4 \n\t"
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
" prefetcht0 384(%6,%1,8) \n\t"
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
" vmovups (%6,%1,8), %%xmm4 \n\t"
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%6,%1,8), %%xmm7 \n\t"

" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -189,13 +189,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON

" jz 2f \n\t"

" prefetcht0 384(%2,%1,8) \n\t"
" vmovddup (%3,%1,2), %%xmm0 \n\t" // read b
" vmovddup 8(%3,%1,2), %%xmm1 \n\t"
" vmovups (%2,%1,8), %%xmm4 \n\t"
" vmovups 16(%2,%1,8), %%xmm5 \n\t"
" vmovups 32(%2,%1,8), %%xmm6 \n\t"
" vmovups 48(%2,%1,8), %%xmm7 \n\t"
" prefetcht0 384(%6,%1,8) \n\t"
" vmovddup (%7,%1,2), %%xmm0 \n\t" // read b
" vmovddup 8(%7,%1,2), %%xmm1 \n\t"
" vmovups (%6,%1,8), %%xmm4 \n\t"
" vmovups 16(%6,%1,8), %%xmm5 \n\t"
" vmovups 32(%6,%1,8), %%xmm6 \n\t"
" vmovups 48(%6,%1,8), %%xmm7 \n\t"

" vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t"
" vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t"
@@ -235,18 +235,18 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON

"3: \n\t" // i = 1

" vmovddup (%7), %%xmm1 \n\t" // read b
" vmovddup 8(%7), %%xmm0 \n\t" // read bb
" vmovddup (%3), %%xmm1 \n\t" // read b
" vmovddup 8(%3), %%xmm0 \n\t" // read bb

" vmulpd %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb
" vmulpd %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb
" vmulpd %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb
" vmulpd %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb

" vmovups %%xmm12 , (%6) \n\t" // write a
" vmovups %%xmm13 , 16(%6) \n\t" // write a
" vmovups %%xmm14 , 32(%6) \n\t" // write a
" vmovups %%xmm15 , 48(%6) \n\t" // write a
" vmovups %%xmm12 , (%2) \n\t" // write a
" vmovups %%xmm13 , 16(%2) \n\t" // write a
" vmovups %%xmm14 , 32(%2) \n\t" // write a
" vmovups %%xmm15 , 48(%2) \n\t" // write a

" vmovups %%xmm12 , (%5) \n\t" // write c1
" vmovups %%xmm13 , 16(%5) \n\t"
@@ -259,20 +259,20 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vfnmaddpd %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t"

" \n\t" // i = 0
" subq $16 , %7 \n\t" // b = b - 2
" subq $64 , %6 \n\t" // a = a - 8
" subq $16 , %3 \n\t" // b = b - 2
" subq $64 , %2 \n\t" // a = a - 8

" vmovddup (%7), %%xmm0 \n\t" // read bb
" vmovddup (%3), %%xmm0 \n\t" // read bb

" vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb
" vmulpd %%xmm9 , %%xmm0 , %%xmm9 \n\t"
" vmulpd %%xmm10 , %%xmm0 , %%xmm10 \n\t"
" vmulpd %%xmm11 , %%xmm0 , %%xmm11 \n\t"

" vmovups %%xmm8 , (%6) \n\t" // write a
" vmovups %%xmm9 , 16(%6) \n\t"
" vmovups %%xmm10 , 32(%6) \n\t"
" vmovups %%xmm11 , 48(%6) \n\t"
" vmovups %%xmm8 , (%2) \n\t" // write a
" vmovups %%xmm9 , 16(%2) \n\t"
" vmovups %%xmm10 , 32(%2) \n\t"
" vmovups %%xmm11 , 48(%2) \n\t"

" vmovups %%xmm8 , (%4) \n\t" // write c0
" vmovups %%xmm9 , 16(%4) \n\t"
@@ -282,15 +282,15 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vzeroupper \n\t"

:
"+r" (n1), // 0
"+a" (i), // 1
"+r" (as), // 2
"+r" (bs) // 3
:
"r" (n1), // 0
"a" (i), // 1
"r" (a), // 2
"r" (b), // 3
"r" (c), // 4
"r" (c1), // 5
"r" (as), // 6
"r" (bs) // 7
"r" (a), // 6
"r" (b) // 7
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",


+ 1
- 1
kernel/x86_64/gemm_kernel_4x8_nano.S View File

@@ -135,7 +135,7 @@
#endif

movq %rsp, %rbx # save old stack
subq $128 + LOCAL_BUFFER_SIZE, %rsp
subq $256 + LOCAL_BUFFER_SIZE, %rsp
andq $-4096, %rsp # align stack

STACK_TOUCHING


+ 1
- 1
kernel/x86_64/gemm_kernel_8x4_sse.S View File

@@ -383,7 +383,7 @@
EMMS

movq %rsp, %rbx # save old stack
subq $128 + LOCAL_BUFFER_SIZE, %rsp
subq $256 + LOCAL_BUFFER_SIZE, %rsp
andq $-4096, %rsp # align stack

STACK_TOUCHING


+ 4
- 4
kernel/x86_64/saxpy_microk_haswell-2.c View File

@@ -59,10 +59,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4


+ 3
- 3
kernel/x86_64/saxpy_microk_nehalem-2.c View File

@@ -73,9 +73,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4


+ 8
- 8
kernel/x86_64/saxpy_microk_piledriver-2.c View File

@@ -78,10 +78,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4
@@ -139,10 +139,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
"jnz 1b \n\t"
"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4


+ 4
- 4
kernel/x86_64/saxpy_microk_sandy-2.c View File

@@ -99,10 +99,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)

"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (alpha) // 4


+ 4
- 4
kernel/x86_64/sdot_microk_bulldozer-2.c View File

@@ -66,10 +66,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)

"vmovss %%xmm4, (%4) \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4


+ 4
- 4
kernel/x86_64/sdot_microk_haswell-2.c View File

@@ -79,10 +79,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovss %%xmm4, (%4) \n\t"
"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4


+ 4
- 4
kernel/x86_64/sdot_microk_nehalem-2.c View File

@@ -75,10 +75,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)

"movss %%xmm4, (%4) \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4


+ 4
- 4
kernel/x86_64/sdot_microk_sandy-2.c View File

@@ -82,10 +82,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
"vmovss %%xmm4, (%4) \n\t"
"vzeroupper \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4


+ 8
- 8
kernel/x86_64/sdot_microk_steamroller-2.c View File

@@ -80,10 +80,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)

"vmovss %%xmm4, (%4) \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4
@@ -143,10 +143,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)

"vmovss %%xmm4, (%4) \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (dot) // 4


+ 7
- 7
kernel/x86_64/sgemv_n_4.c View File

@@ -149,9 +149,9 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"jnz 1b \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
@@ -223,9 +223,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a

"3: \n\t"
:
"+r" (i), // 0
"+r" (n1) // 1
:
"r" (i), // 0
"r" (n1), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap), // 4
@@ -277,9 +277,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
"jnz 1b \n\t"

:
"+r" (i), // 0
"+r" (n) // 1
:
"r" (i), // 0
"r" (n), // 1
"r" (src), // 2
"r" (dest) // 3
: "cc",


+ 97
- 97
kernel/x86_64/sgemv_n_microk_bulldozer-4.c View File

@@ -37,14 +37,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO

__asm__ __volatile__
(
"vbroadcastss (%2), %%xmm12 \n\t" // x0
"vbroadcastss 4(%2), %%xmm13 \n\t" // x1
"vbroadcastss 8(%2), %%xmm14 \n\t" // x2
"vbroadcastss 12(%2), %%xmm15 \n\t" // x3
"vbroadcastss 16(%2), %%xmm0 \n\t" // x4
"vbroadcastss 20(%2), %%xmm1 \n\t" // x5
"vbroadcastss 24(%2), %%xmm2 \n\t" // x6
"vbroadcastss 28(%2), %%xmm3 \n\t" // x7
"vbroadcastss (%3), %%xmm12 \n\t" // x0
"vbroadcastss 4(%3), %%xmm13 \n\t" // x1
"vbroadcastss 8(%3), %%xmm14 \n\t" // x2
"vbroadcastss 12(%3), %%xmm15 \n\t" // x3
"vbroadcastss 16(%3), %%xmm0 \n\t" // x4
"vbroadcastss 20(%3), %%xmm1 \n\t" // x5
"vbroadcastss 24(%3), %%xmm2 \n\t" // x6
"vbroadcastss 28(%3), %%xmm3 \n\t" // x7

"vbroadcastss (%9), %%xmm8 \n\t" // alpha

@@ -54,22 +54,22 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"

"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, (%6,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, (%8,%0,4), %%xmm15, %%xmm5 \n\t"
"addq $4 , %0 \n\t"

"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
"addq $4 , %8 \n\t"
"vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, (%6,%2,4), %%xmm1 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, (%8,%2,4), %%xmm3 , %%xmm5 \n\t"
"addq $4 , %2 \n\t"
"vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t"
"vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
"vfmaddps -16(%4,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
"subq $4 , %1 \n\t"
"vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm6, -16(%4,%0,4) \n\t" // 4 * y

"2: \n\t"

@@ -79,31 +79,31 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"

"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t"
"vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t"
"vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
"vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
"vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y
"vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
"vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
"vmovups %%xmm4, (%4,%0,4) \n\t" // 4 * y
"vmovups %%xmm5, 16(%4,%0,4) \n\t" // 4 * y

"addq $8 , %0 \n\t"
"addq $8 , %8 \n\t"
"addq $8 , %2 \n\t"
"subq $8 , %1 \n\t"


@@ -120,62 +120,62 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t"
"vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t"

"prefetcht0 192(%4,%0,4) \n\t"
"vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
"prefetcht0 192(%5,%0,4) \n\t"
"vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t"
"prefetcht0 192(%6,%0,4) \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
"vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t"
"prefetcht0 192(%7,%0,4) \n\t"
"vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
"vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t"
"prefetcht0 192(%8,%0,4) \n\t"
"vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t"
".align 2 \n\t"
"vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"

"vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t"
"prefetcht0 192(%4,%8,4) \n\t"
"vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
"prefetcht0 192(%5,%8,4) \n\t"
"vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
"prefetcht0 192(%6,%8,4) \n\t"
"vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
"prefetcht0 192(%7,%8,4) \n\t"
"vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"
"vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmaddps %%xmm6, 32(%5,%0,4), %%xmm12, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%5,%0,4), %%xmm12, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%6,%0,4), %%xmm13, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%6,%0,4), %%xmm13, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%7,%0,4), %%xmm14, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%7,%0,4), %%xmm14, %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%8,%0,4), %%xmm15, %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%8,%0,4), %%xmm15, %%xmm7 \n\t"
"prefetcht0 192(%5,%2,4) \n\t"
"vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t"
"prefetcht0 192(%6,%2,4) \n\t"
"vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t"
"prefetcht0 192(%7,%2,4) \n\t"
"vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t"
"prefetcht0 192(%8,%2,4) \n\t"
"vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t"
"vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t"
"vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%5,%2,4), %%xmm0 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%5,%2,4), %%xmm0 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%6,%2,4), %%xmm1 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%6,%2,4), %%xmm1 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%7,%2,4), %%xmm2 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%7,%2,4), %%xmm2 , %%xmm7 \n\t"
"vfmaddps %%xmm6, 32(%8,%2,4), %%xmm3 , %%xmm6 \n\t"
"vfmaddps %%xmm7, 48(%8,%2,4), %%xmm3 , %%xmm7 \n\t"
"vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
"vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
"vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
"vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"
"vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
"vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
"vfmaddps 32(%4,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
"vfmaddps 48(%4,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"

"addq $16, %0 \n\t"
"vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y
"addq $16, %8 \n\t"
"vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm4,-64(%4,%0,4) \n\t" // 4 * y
"vmovups %%xmm5,-48(%4,%0,4) \n\t" // 4 * y
"addq $16, %2 \n\t"
"vmovups %%xmm6,-32(%4,%0,4) \n\t" // 4 * y
"vmovups %%xmm7,-16(%4,%0,4) \n\t" // 4 * y

"subq $16, %1 \n\t"
"jnz 1b \n\t"
@@ -184,15 +184,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO

:
"+r" (i), // 0
"+r" (n) // 1
"+r" (n), // 1
"+r" (lda4) // 2
:
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (x), // 3
"r" (y), // 4
"r" (ap[0]), // 5
"r" (ap[1]), // 6
"r" (ap[2]), // 7
"r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",


+ 62
- 63
kernel/x86_64/sgemv_n_microk_haswell-4.c View File

@@ -26,7 +26,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/



#define HAVE_KERNEL_4x8 1
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));

@@ -38,41 +37,41 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastss (%2), %%ymm12 \n\t" // x0
"vbroadcastss 4(%2), %%ymm13 \n\t" // x1
"vbroadcastss 8(%2), %%ymm14 \n\t" // x2
"vbroadcastss 12(%2), %%ymm15 \n\t" // x3
"vbroadcastss 16(%2), %%ymm0 \n\t" // x4
"vbroadcastss 20(%2), %%ymm1 \n\t" // x5
"vbroadcastss 24(%2), %%ymm2 \n\t" // x6
"vbroadcastss 28(%2), %%ymm3 \n\t" // x7
"vbroadcastss (%3), %%ymm12 \n\t" // x0
"vbroadcastss 4(%3), %%ymm13 \n\t" // x1
"vbroadcastss 8(%3), %%ymm14 \n\t" // x2
"vbroadcastss 12(%3), %%ymm15 \n\t" // x3
"vbroadcastss 16(%3), %%ymm0 \n\t" // x4
"vbroadcastss 20(%3), %%ymm1 \n\t" // x5
"vbroadcastss 24(%3), %%ymm2 \n\t" // x6
"vbroadcastss 28(%3), %%ymm3 \n\t" // x7

"vbroadcastss (%9), %%ymm6 \n\t" // alpha

"testq $0x04, %1 \n\t"
"jz 2f \n\t"

"vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
"vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y
"vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t"
"vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t"

"vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t"
"vfmadd231ps (%5,%0,4), %%xmm12, %%xmm4 \n\t"
"vfmadd231ps (%6,%0,4), %%xmm13, %%xmm5 \n\t"
"vfmadd231ps (%7,%0,4), %%xmm14, %%xmm4 \n\t"
"vfmadd231ps (%8,%0,4), %%xmm15, %%xmm5 \n\t"

"vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
"vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
"vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
"vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
"vfmadd231ps (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
"vfmadd231ps (%6,%2,4), %%xmm1 , %%xmm5 \n\t"
"vfmadd231ps (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
"vfmadd231ps (%8,%2,4), %%xmm3 , %%xmm5 \n\t"

"vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t"
"vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t"
"vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t"

"vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y

"addq $4 , %8 \n\t"
"addq $4 , %2 \n\t"
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"

@@ -81,28 +80,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"testq $0x08, %1 \n\t"
"jz 3f \n\t"

"vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
"vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"

"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t"
"vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm13, %%ymm5 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t"
"vfmadd231ps (%8,%0,4), %%ymm15, %%ymm5 \n\t"

"vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
"vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t"
"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
"vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t"
"vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t"
"vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm5 \n\t"
"vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t"
"vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm5 \n\t"

"vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t"
"vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t"
"vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t"


"vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
"vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y

"addq $8 , %8 \n\t"
"addq $8 , %2 \n\t"
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"

@@ -117,35 +116,35 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO

"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
"vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y
"vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y
"vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t"
"vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t"
"vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
"vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t"
"vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t"
"vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
"vmovups (%4,%0,4), %%ymm8 \n\t" // 8 * y
"vmovups 32(%4,%0,4), %%ymm9 \n\t" // 8 * y
"vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t"
"vfmadd231ps 32(%5,%0,4), %%ymm12, %%ymm5 \n\t"
"vfmadd231ps (%6,%0,4), %%ymm13, %%ymm4 \n\t"
"vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm5 \n\t"
"vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t"
"vfmadd231ps 32(%7,%0,4), %%ymm14, %%ymm5 \n\t"
"vfmadd231ps (%8,%0,4), %%ymm15, %%ymm4 \n\t"
"vfmadd231ps 32(%8,%0,4), %%ymm15, %%ymm5 \n\t"
"vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t"
"addq $16, %0 \n\t"
"vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t"
"vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t"
"vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t"
"vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
"vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t"
"vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t"
"vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t"
"vfmadd231ps 32(%5,%2,4), %%ymm0 , %%ymm5 \n\t"
"vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm4 \n\t"
"vfmadd231ps 32(%6,%2,4), %%ymm1 , %%ymm5 \n\t"
"vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t"
"vfmadd231ps 32(%7,%2,4), %%ymm2 , %%ymm5 \n\t"
"vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm4 \n\t"
"vfmadd231ps 32(%8,%2,4), %%ymm3 , %%ymm5 \n\t"

"vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t"
"vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t"

"addq $16, %8 \n\t"
"vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y
"addq $16, %2 \n\t"
"vmovups %%ymm8,-64(%4,%0,4) \n\t" // 8 * y
"subq $16, %1 \n\t"
"vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y
"vmovups %%ymm9,-32(%4,%0,4) \n\t" // 8 * y

"jnz 1b \n\t"

@@ -154,15 +153,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO

:
"+r" (i), // 0
"+r" (n) // 1
"+r" (n), // 1
"+r" (lda4) // 2
:
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (x), // 3
"r" (y), // 4
"r" (ap[0]), // 5
"r" (ap[1]), // 6
"r" (ap[2]), // 7
"r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",
@@ -177,7 +176,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
}



#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));

@@ -196,6 +194,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT

"vbroadcastss (%8), %%ymm6 \n\t" // alpha


"testq $0x04, %1 \n\t"
"jz 2f \n\t"



+ 27
- 27
kernel/x86_64/sgemv_n_microk_nehalem-4.c View File

@@ -37,19 +37,19 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO

__asm__ __volatile__
(
"movss (%2), %%xmm12 \n\t" // x0
"movss 4(%2), %%xmm13 \n\t" // x1
"movss 8(%2), %%xmm14 \n\t" // x2
"movss 12(%2), %%xmm15 \n\t" // x3
"movss (%3), %%xmm12 \n\t" // x0
"movss 4(%3), %%xmm13 \n\t" // x1
"movss 8(%3), %%xmm14 \n\t" // x2
"movss 12(%3), %%xmm15 \n\t" // x3
"shufps $0, %%xmm12, %%xmm12\n\t"
"shufps $0, %%xmm13, %%xmm13\n\t"
"shufps $0, %%xmm14, %%xmm14\n\t"
"shufps $0, %%xmm15, %%xmm15\n\t"

"movss 16(%2), %%xmm0 \n\t" // x4
"movss 20(%2), %%xmm1 \n\t" // x5
"movss 24(%2), %%xmm2 \n\t" // x6
"movss 28(%2), %%xmm3 \n\t" // x7
"movss 16(%3), %%xmm0 \n\t" // x4
"movss 20(%3), %%xmm1 \n\t" // x5
"movss 24(%3), %%xmm2 \n\t" // x6
"movss 28(%3), %%xmm3 \n\t" // x7
"shufps $0, %%xmm0 , %%xmm0 \n\t"
"shufps $0, %%xmm1 , %%xmm1 \n\t"
"shufps $0, %%xmm2 , %%xmm2 \n\t"
@@ -63,13 +63,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"1: \n\t"
"xorps %%xmm4 , %%xmm4 \n\t"
"xorps %%xmm5 , %%xmm5 \n\t"
"movups (%3,%0,4), %%xmm7 \n\t" // 4 * y
"movups (%4,%0,4), %%xmm7 \n\t" // 4 * y

".p2align 1 \n\t"
"movups (%4,%0,4), %%xmm8 \n\t"
"movups (%5,%0,4), %%xmm9 \n\t"
"movups (%6,%0,4), %%xmm10 \n\t"
"movups (%7,%0,4), %%xmm11 \n\t"
"movups (%5,%0,4), %%xmm8 \n\t"
"movups (%6,%0,4), %%xmm9 \n\t"
"movups (%7,%0,4), %%xmm10 \n\t"
"movups (%8,%0,4), %%xmm11 \n\t"
".p2align 1 \n\t"
"mulps %%xmm12, %%xmm8 \n\t"
"mulps %%xmm13, %%xmm9 \n\t"
@@ -80,10 +80,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"addps %%xmm10, %%xmm4 \n\t"
"addps %%xmm11, %%xmm5 \n\t"

"movups (%4,%8,4), %%xmm8 \n\t"
"movups (%5,%8,4), %%xmm9 \n\t"
"movups (%6,%8,4), %%xmm10 \n\t"
"movups (%7,%8,4), %%xmm11 \n\t"
"movups (%5,%2,4), %%xmm8 \n\t"
"movups (%6,%2,4), %%xmm9 \n\t"
"movups (%7,%2,4), %%xmm10 \n\t"
"movups (%8,%2,4), %%xmm11 \n\t"
".p2align 1 \n\t"
"mulps %%xmm0 , %%xmm8 \n\t"
"mulps %%xmm1 , %%xmm9 \n\t"
@@ -94,28 +94,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"addps %%xmm10, %%xmm4 \n\t"
"addps %%xmm11, %%xmm5 \n\t"

"addq $4 , %8 \n\t"
"addq $4 , %2 \n\t"
"addps %%xmm5 , %%xmm4 \n\t"
"addq $4 , %0 \n\t"
"mulps %%xmm6 , %%xmm4 \n\t"
"subq $4 , %1 \n\t"
"addps %%xmm4 , %%xmm7 \n\t"

"movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y
"movups %%xmm7 , -16(%4,%0,4) \n\t" // 4 * y

"jnz 1b \n\t"

:
"+r" (i), // 0
"+r" (n) // 1
"+r" (n), // 1
"+r" (lda4) // 2
:
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (x), // 3
"r" (y), // 4
"r" (ap[0]), // 5
"r" (ap[1]), // 6
"r" (ap[2]), // 7
"r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",


+ 65
- 65
kernel/x86_64/sgemv_n_microk_sandy-4.c View File

@@ -39,14 +39,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__
(
"vzeroupper \n\t"
"vbroadcastss (%2), %%ymm12 \n\t" // x0
"vbroadcastss 4(%2), %%ymm13 \n\t" // x1
"vbroadcastss 8(%2), %%ymm14 \n\t" // x2
"vbroadcastss 12(%2), %%ymm15 \n\t" // x3
"vbroadcastss 16(%2), %%ymm0 \n\t" // x4
"vbroadcastss 20(%2), %%ymm1 \n\t" // x5
"vbroadcastss 24(%2), %%ymm2 \n\t" // x6
"vbroadcastss 28(%2), %%ymm3 \n\t" // x7
"vbroadcastss (%3), %%ymm12 \n\t" // x0
"vbroadcastss 4(%3), %%ymm13 \n\t" // x1
"vbroadcastss 8(%3), %%ymm14 \n\t" // x2
"vbroadcastss 12(%3), %%ymm15 \n\t" // x3
"vbroadcastss 16(%3), %%ymm0 \n\t" // x4
"vbroadcastss 20(%3), %%ymm1 \n\t" // x5
"vbroadcastss 24(%3), %%ymm2 \n\t" // x6
"vbroadcastss 28(%3), %%ymm3 \n\t" // x7

"vbroadcastss (%9), %%ymm6 \n\t" // alpha

@@ -55,21 +55,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO

"vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t"
"vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
"vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y

"vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t"
"vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t"
"vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t"
"vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t"
"vmulps (%5,%0,4), %%xmm12, %%xmm8 \n\t"
"vmulps (%6,%0,4), %%xmm13, %%xmm10 \n\t"
"vmulps (%7,%0,4), %%xmm14, %%xmm9 \n\t"
"vmulps (%8,%0,4), %%xmm15, %%xmm11 \n\t"
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
"vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm11, %%xmm5 \n\t"

"vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t"
"vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t"
"vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t"
"vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t"
"vmulps (%5,%2,4), %%xmm0 , %%xmm8 \n\t"
"vmulps (%6,%2,4), %%xmm1 , %%xmm10 \n\t"
"vmulps (%7,%2,4), %%xmm2 , %%xmm9 \n\t"
"vmulps (%8,%2,4), %%xmm3 , %%xmm11 \n\t"
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
"vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
@@ -79,9 +79,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t"
"vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t"

"vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
"vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y

"addq $4, %8 \n\t"
"addq $4, %2 \n\t"
"addq $4, %0 \n\t"
"subq $4, %1 \n\t"

@@ -92,21 +92,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO

"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
"vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
"vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y

"vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
"vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
"vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t"
"vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t"
"vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t"
"vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t"
"vmulps (%7,%0,4), %%ymm14, %%ymm9 \n\t"
"vmulps (%8,%0,4), %%ymm15, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"

"vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t"
"vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
"vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t"
"vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t"
"vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t"
"vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t"
"vmulps (%7,%2,4), %%ymm2 , %%ymm9 \n\t"
"vmulps (%8,%2,4), %%ymm3 , %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
@@ -116,9 +116,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t"
"vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t"

"vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
"vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y

"addq $8, %8 \n\t"
"addq $8, %2 \n\t"
"addq $8, %0 \n\t"
"subq $8, %1 \n\t"

@@ -134,45 +134,45 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"

"prefetcht0 192(%4,%0,4) \n\t"
"vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
"vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t"
"prefetcht0 192(%5,%0,4) \n\t"
"vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
"vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t"
"vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t"
"vmulps 32(%5,%0,4), %%ymm12, %%ymm9 \n\t"
"prefetcht0 192(%6,%0,4) \n\t"
"vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t"
"vmulps 32(%6,%0,4), %%ymm13, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"

"prefetcht0 192(%6,%0,4) \n\t"
"vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t"
"vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t"
"prefetcht0 192(%7,%0,4) \n\t"
"vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t"
"vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t"
"vmulps (%7,%0,4), %%ymm14, %%ymm8 \n\t"
"vmulps 32(%7,%0,4), %%ymm14, %%ymm9 \n\t"
"prefetcht0 192(%8,%0,4) \n\t"
"vmulps (%8,%0,4), %%ymm15, %%ymm10 \n\t"
"vmulps 32(%8,%0,4), %%ymm15, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"

"prefetcht0 192(%4,%8,4) \n\t"
"vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t"
"vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t"
"prefetcht0 192(%5,%8,4) \n\t"
"vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
"vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t"
"prefetcht0 192(%5,%2,4) \n\t"
"vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t"
"vmulps 32(%5,%2,4), %%ymm0 , %%ymm9 \n\t"
"prefetcht0 192(%6,%2,4) \n\t"
"vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t"
"vmulps 32(%6,%2,4), %%ymm1 , %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"

"prefetcht0 192(%6,%8,4) \n\t"
"vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t"
"vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t"
"prefetcht0 192(%7,%8,4) \n\t"
"vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t"
"vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t"
"prefetcht0 192(%7,%2,4) \n\t"
"vmulps (%7,%2,4), %%ymm2 , %%ymm8 \n\t"
"vmulps 32(%7,%2,4), %%ymm2 , %%ymm9 \n\t"
"prefetcht0 192(%8,%2,4) \n\t"
"vmulps (%8,%2,4), %%ymm3 , %%ymm10 \n\t"
"vmulps 32(%8,%2,4), %%ymm3 , %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
@@ -181,13 +181,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t"
"vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t"

"vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y
"vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y
"vaddps (%4,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y
"vaddps 32(%4,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y

"vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y
"vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y
"vmovups %%ymm4, (%4,%0,4) \n\t" // 8 * y
"vmovups %%ymm5, 32(%4,%0,4) \n\t" // 8 * y

"addq $16, %8 \n\t"
"addq $16, %2 \n\t"
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz 1b \n\t"
@@ -197,15 +197,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO

:
"+r" (i), // 0
"+r" (n) // 1
"+r" (n), // 1
"+r" (lda4) // 2
:
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
"r" (ap[1]), // 5
"r" (ap[2]), // 6
"r" (ap[3]), // 7
"r" (lda4), // 8
"r" (x), // 3
"r" (y), // 4
"r" (ap[0]), // 5
"r" (ap[1]), // 6
"r" (ap[2]), // 7
"r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",


+ 9
- 9
kernel/x86_64/sgemv_t_4.c View File

@@ -139,9 +139,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
"movss %%xmm11,4(%2) \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (y), // 2
"r" (ap0), // 3
"r" (ap1), // 4
@@ -208,9 +208,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"movss %%xmm10, (%2) \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (y), // 2
"r" (ap), // 3
"r" (x) // 4
@@ -272,9 +272,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
"jnz 1b \n\t"

:
:
"r" (i), // 0
"r" (n), // 1
"+r" (i), // 0
"+r" (n) // 1
:
"r" (&da), // 2
"r" (src), // 3
"r" (dest) // 4


Some files were not shown because too many files changed in this diff

Loading…
Cancel
Save