| @@ -0,0 +1,103 @@ | |||
| name: continuous build | |||
| on: [push, pull_request] | |||
| jobs: | |||
| build: | |||
| runs-on: ${{ matrix.os }} | |||
| strategy: | |||
| fail-fast: false | |||
| matrix: | |||
| os: [ubuntu-latest, macos-latest] | |||
| fortran: [gfortran, flang] | |||
| build: [cmake, make] | |||
| steps: | |||
| - name: Checkout repository | |||
| uses: actions/checkout@v2 | |||
| - name: Compilation cache | |||
| uses: actions/cache@v2 | |||
| with: | |||
| path: ~/.ccache | |||
| # We include the commit sha in the cache key, as new cache entries are | |||
| # only created if there is no existing entry for the key yet. | |||
| key: ${{ runner.os }}-ccache-${{ github.sha }} | |||
| # Restore any ccache cache entry, if none for | |||
| # ${{ runner.os }}-ccache-${{ github.sha }} exists | |||
| restore-keys: | | |||
| ${{ runner.os }}-ccache- | |||
| - name: Print system information | |||
| run: | | |||
| if [ "$RUNNER_OS" == "Linux" ]; then | |||
| cat /proc/cpuinfo | |||
| elif [ "$RUNNER_OS" == "macOS" ]; then | |||
| sysctl -a | grep machdep.cpu | |||
| else | |||
| echo "$RUNNER_OS not supported" | |||
| exit 1 | |||
| fi | |||
| - name: Install Dependencies | |||
| # Installs the compilers/build tools used by the later build steps and caps the ccache size. | |||
| run: | | |||
| if [ "$RUNNER_OS" == "Linux" ]; then | |||
| # Refresh the package index first so the install does not fail on a stale index. | |||
| sudo apt-get update | |||
| sudo apt-get install -y gfortran cmake ccache | |||
| elif [ "$RUNNER_OS" == "macOS" ]; then | |||
| brew install coreutils cmake ccache | |||
| else | |||
| echo "$RUNNER_OS not supported" | |||
| exit 1 | |||
| fi | |||
| ccache -M 300M # Limit the ccache size; Github's overall cache limit is 5GB | |||
| - name: gfortran build | |||
| if: matrix.build == 'make' && matrix.fortran == 'gfortran' | |||
| run: | | |||
| if [ "$RUNNER_OS" == "Linux" ]; then | |||
| export PATH="/usr/lib/ccache:${PATH}" | |||
| elif [ "$RUNNER_OS" == "macOS" ]; then | |||
| export PATH="$(brew --prefix)/opt/ccache/libexec:${PATH}" | |||
| else | |||
| echo "$RUNNER_OS not supported" | |||
| exit 1 | |||
| fi | |||
| make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 | |||
| - name: flang build | |||
| if: matrix.build == 'make' && matrix.fortran == 'flang' | |||
| run: | | |||
| if [ "$RUNNER_OS" == "Linux" ]; then | |||
| export PATH="/usr/lib/ccache:${PATH}" | |||
| elif [ "$RUNNER_OS" == "macOS" ]; then | |||
| exit 0 | |||
| else | |||
| echo "$RUNNER_OS not supported" | |||
| exit 1 | |||
| fi | |||
| cd /usr/ | |||
| sudo wget -nv https://github.com/flang-compiler/flang/releases/download/flang_20190329/flang-20190329-x86-70.tgz | |||
| sudo tar xf flang-20190329-x86-70.tgz | |||
| sudo rm flang-20190329-x86-70.tgz | |||
| cd - | |||
| make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 FC=flang | |||
| - name: CMake gfortran build | |||
| if: matrix.build == 'cmake' && matrix.fortran == 'gfortran' | |||
| run: | | |||
| if [ "$RUNNER_OS" == "Linux" ]; then | |||
| export PATH="/usr/lib/ccache:${PATH}" | |||
| elif [ "$RUNNER_OS" == "macOS" ]; then | |||
| export PATH="$(brew --prefix)/opt/ccache/libexec:${PATH}" | |||
| else | |||
| echo "$RUNNER_OS not supported" | |||
| exit 1 | |||
| fi | |||
| mkdir build | |||
| cd build | |||
| cmake -DDYNAMIC_ARCH=1 -DNOFORTRAN=0 -DBUILD_WITHOUT_LAPACK=0 -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_BUILD_TYPE=Release .. | |||
| make -j$(nproc) | |||
| @@ -21,6 +21,7 @@ jobs: | |||
| build-OpenBLAS-with-Homebrew: | |||
| runs-on: macos-latest | |||
| env: | |||
| DEVELOPER_DIR: /Applications/Xcode_11.4.1.app/Contents/Developer | |||
| HOMEBREW_DEVELOPER: "ON" | |||
| HOMEBREW_DISPLAY_INSTALL_TIMES: "ON" | |||
| HOMEBREW_NO_ANALYTICS: "ON" | |||
| @@ -70,6 +70,7 @@ test/SBLAT2.SUMM | |||
| test/SBLAT3.SUMM | |||
| test/ZBLAT2.SUMM | |||
| test/ZBLAT3.SUMM | |||
| test/SHBLAT3.SUMM | |||
| test/cblat1 | |||
| test/cblat2 | |||
| test/cblat3 | |||
| @@ -79,6 +80,7 @@ test/dblat3 | |||
| test/sblat1 | |||
| test/sblat2 | |||
| test/sblat3 | |||
| test/test_shgemm | |||
| test/zblat1 | |||
| test/zblat2 | |||
| test/zblat3 | |||
| @@ -16,7 +16,6 @@ matrix: | |||
| before_script: &common-before | |||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" | |||
| script: | |||
| - set -e | |||
| - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| - make -C test $COMMON_FLAGS $BTYPE | |||
| - make -C ctest $COMMON_FLAGS $BTYPE | |||
| @@ -76,6 +75,23 @@ matrix: | |||
| - TARGET_BOX=LINUX32 | |||
| - BTYPE="BINARY=32" | |||
| - os: linux | |||
| arch: ppc64le | |||
| dist: bionic | |||
| compiler: gcc | |||
| before_script: | |||
| - sudo add-apt-repository 'ppa:ubuntu-toolchain-r/test' -y | |||
| - sudo apt-get update | |||
| - sudo apt-get install gcc-9 gfortran-9 -y | |||
| script: | |||
| - make QUIET_MAKE=1 BINARY=64 USE_OPENMP=1 CC=gcc-9 FC=gfortran-9 | |||
| - make -C test $COMMON_FLAGS $BTYPE | |||
| - make -C ctest $COMMON_FLAGS $BTYPE | |||
| - make -C utest $COMMON_FLAGS $BTYPE | |||
| env: | |||
| # for matrix annotation only | |||
| - TARGET_BOX=PPC64LE_LINUX_P9 | |||
| - os: linux | |||
| compiler: gcc | |||
| addons: | |||
| @@ -108,7 +124,6 @@ matrix: | |||
| - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' | |||
| before_script: *common-before | |||
| script: | |||
| - set -e | |||
| # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size. | |||
| - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types" | |||
| @@ -151,7 +166,6 @@ matrix: | |||
| before_script: | |||
| - COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32" | |||
| script: | |||
| - set -e | |||
| - mkdir build | |||
| - CONFIG=Release | |||
| - cmake -Bbuild -H. $CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG | |||
| @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) | |||
| project(OpenBLAS C ASM) | |||
| set(OpenBLAS_MAJOR_VERSION 0) | |||
| set(OpenBLAS_MINOR_VERSION 3) | |||
| set(OpenBLAS_PATCH_VERSION 9.dev) | |||
| set(OpenBLAS_PATCH_VERSION 10.dev) | |||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | |||
| # Adhere to GNU filesystem layout conventions | |||
| @@ -23,6 +23,7 @@ option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS fun | |||
| option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF) | |||
| option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) | |||
| option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF) | |||
| option(USE_LOCKING "Use locks even in single-threaded builds to make them callable from multiple threads" OFF) | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") | |||
| option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) | |||
| else() | |||
| @@ -86,10 +87,13 @@ if (NOT NO_LAPACK) | |||
| list(APPEND SUBDIRS lapack) | |||
| endif () | |||
| if (NOT DEFINED BUILD_HALF) | |||
| set (BUILD_HALF false) | |||
| endif () | |||
| # set which float types we want to build for | |||
| if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16) | |||
| # if none are defined, build for all | |||
| set(BUILD_HALF true) | |||
| # set(BUILD_HALF true) | |||
| set(BUILD_SINGLE true) | |||
| set(BUILD_DOUBLE true) | |||
| set(BUILD_COMPLEX true) | |||
| @@ -121,7 +125,7 @@ if (BUILD_COMPLEX16) | |||
| list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE | |||
| endif () | |||
| if (BUILD_SINGLE OR BUILD_HALF) | |||
| if (BUILD_HALF) | |||
| message(STATUS "Building Half Precision") | |||
| list(APPEND FLOAT_TYPES "HALF") # defines nothing | |||
| endif () | |||
| @@ -229,6 +233,7 @@ if (NOT MSVC AND NOT NOFORTRAN) | |||
| if(NOT NO_CBLAS) | |||
| add_subdirectory(ctest) | |||
| endif() | |||
| add_subdirectory(lapack-netlib/TESTING) | |||
| endif() | |||
| set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES | |||
| @@ -244,7 +249,7 @@ if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) | |||
| endif() | |||
| endif() | |||
| if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFIX} STREQUAL "") | |||
| if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") | |||
| if (NOT DEFINED ARCH) | |||
| set(ARCH_IN "x86_64") | |||
| else() | |||
| @@ -353,10 +358,21 @@ endif() | |||
| if(NOT NO_CBLAS) | |||
|   # Generate the installed cblas.h from the in-tree cblas.h: | |||
|   #  1. replace "common" with "openblas_config", | |||
|   #  2. optionally prepend SYMBOLPREFIX / append SYMBOLSUFFIX to the cblas*, openblas* and | |||
|   #     goto* identifiers, undoing the change for the openblas_complex* names. | |||
|   # The two buffers CBLAS_H_CONTENTS / CBLAS_H_CONTENTS_NEW are used alternately; each branch | |||
|   # performs an even number of steps so the final text always ends up in CBLAS_H_CONTENTS_NEW. | |||
|   message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") | |||
|   set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) | |||
|   file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) | |||
|   string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
|   # Quote the expansion so the condition stays a well-formed comparison when the | |||
|   # variable is empty or unset. | |||
|   if (NOT "${SYMBOLPREFIX}" STREQUAL "") | |||
|     string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
|     string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
|     # Change back any openblas_complex* names that were hit by the previous replacement. | |||
|     string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
|     string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
|   endif() | |||
|   if (NOT "${SYMBOLSUFFIX}" STREQUAL "") | |||
|     string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
|     string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
|     # Strip the suffix again from any openblas_complex* names. | |||
|     string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
|     string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
|   endif() | |||
|   file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") | |||
|   install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
| endif() | |||
| @@ -373,11 +389,9 @@ if(NOT NO_LAPACKE) | |||
| install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) | |||
| endif() | |||
| include(FindPkgConfig QUIET) | |||
| if(PKG_CONFIG_FOUND) | |||
| configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY) | |||
| install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) | |||
| endif() | |||
| # Install pkg-config files | |||
| configure_file(${PROJECT_SOURCE_DIR}/cmake/openblas.pc.in ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc @ONLY) | |||
| install (FILES ${PROJECT_BINARY_DIR}/openblas${SUFFIX64}.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig/) | |||
| # GNUInstallDirs "DATADIR" wrong here; CMake search path wants "share". | |||
| @@ -180,3 +180,13 @@ In chronological order: | |||
| * [2019-12-23] optimize AVX2 CGEMM and ZGEMM | |||
| * [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernels | |||
| * [2020-01-07] optimize AVX2 SGEMM and STRMM | |||
| * Rajalakshmi Srinivasaraghavan <https://github.com/RajalakshmiSR> | |||
| * [2020-04-15] Half-precision GEMM for bfloat16 | |||
| * Marius Hillenbrand <https://github.com/mhillenibm> | |||
| * [2020-05-12] Revise dynamic architecture detection for IBM z | |||
| * [2020-05-12] Add new sgemm and strmm kernel for IBM z14 | |||
| * Danfeng Zhang <https://github.com/craft-zhang> | |||
| * [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53 | |||
| @@ -1,4 +1,77 @@ | |||
| OpenBLAS ChangeLog | |||
| ==================================================================== | |||
| Version 0.3.10 | |||
| 14-Jun-2020 | |||
| common: | |||
| * Improved thread locking behaviour in blas_server and parallel getrf | |||
| * Imported bugfix 394 from LAPACK (spurious reference to "XERBL" | |||
| due to overlong lines) | |||
| * Imported bugfix 403 from LAPACK (compile option "recursive" required | |||
| for correctness with Intel and PGI) | |||
| * Imported bugfix 408 from LAPACK (wrong scaling in ZHEEQUB) | |||
| * Imported bugfix 411 from LAPACK (infinite loop in LARGV/LARTG/LARTGP) | |||
| * Fixed mismatches between BUFFERSIZE and GEMM_UNROLL parameters that | |||
| could lead to crashes at large matrix sizes | |||
| * Restored internal soname in dynamic libraries on FreeBSD and Dragonfly | |||
| * Added API (openblas_setaffinity) to set the thread affinity on Linux | |||
| * Added initial infrastructure for half-precision floating point | |||
| (bfloat16) support with a generic implementation of SHGEMM | |||
| * Added CMAKE build system support for building the cblas_Xgemm3m | |||
| functions | |||
| * Fixed CMAKE support for building in a path with embedded spaces | |||
| * Fixed CMAKE (non)handling of NO_EXPRECISION and MAX_STACK_ALLOC | |||
| * Fixed GCC version detection in the Makefiles | |||
| * Allowed overriding the names of AR, AS and LD in Makefile builds | |||
| POWER: | |||
| * Fixed big-endian POWER8 ELFv2 builds on FreeBSD | |||
| * Fixed GCC version checks and DYNAMIC_ARCH builds on POWER9 | |||
| * Fixed CMAKE build support for POWER9 | |||
| * Fixed a potential race condition in the thread buffer allocation | |||
| * Worked around LAPACK test failures on PPC G4 | |||
| MIPS: | |||
| * Fixed a potential race condition in the thread buffer allocation | |||
| * Added support for MIPS 24K/24KE family based on P5600 kernels | |||
| MIPS64: | |||
| * Fixed a potential race condition in the thread buffer allocation | |||
| * Added TARGET=GENERIC | |||
| ARMV7: | |||
| * Fixed a race condition in the thread buffer allocation | |||
| ARMV8: | |||
| * Fixed a race condition in the thread buffer allocation | |||
| * Fixed zero initialisation in the assembly for SGEMM and DGEMM BETA | |||
| * Improved performance of the ThunderX2 DAXPY kernel | |||
| * Added an optimized SGEMM kernel for Cortex A53 | |||
| * Fixed Makefile support for INTERFACE64 (8-byte integer) | |||
| x86_64: | |||
| * Fixed a syntax error in the CMAKE setup for SkylakeX | |||
| * Improved performance of STRSM on Haswell, SkylakeX and Ryzen | |||
| * Improved SGEMM performance for workloads with ldc a | |||
| multiple of 1024 | |||
| * Improved DGEMM performance on Skylake X | |||
| * Fixed unwanted AVX512-dependency of SGEMM in DYNAMIC_ARCH | |||
| builds created on SkylakeX | |||
| * Removed data alignment requirement in the SSE2 copy kernels | |||
| that could cause spurious crashes | |||
| * Added a workaround for an optimizer bug in AppleClang 11.0.3 | |||
| * Fixed LAPACK test failures due to wrong options for Intel Fortran | |||
| * Fixed compilation and LAPACK test results with recent Flang | |||
| and AMD AOCC | |||
| * Fixed DYNAMIC_ARCH builds with CMAKE on OS X | |||
| * Fixed missing exports of cblas_i?amin, cblas_i?min, cblas_i?max, | |||
| cblas_?sum, cblas_?gemm3m in the shared library on OS X | |||
| * Fixed reporting of cpu name in DYNAMIC_ARCH builds (would sometimes | |||
| show the name of an older generation chip supported by the same kernels) | |||
| IBM Z: | |||
| * Improved performance of SGEMM/STRMM and DGEMM/DTRMM on Z14 | |||
| ==================================================================== | |||
| Version 0.3.9 | |||
| 1-Mar-2020 | |||
| @@ -0,0 +1,9 @@ | |||
| // Scripted Jenkins pipeline: check out the sources, then run a plain `make`. | |||
| node { | |||
|     stage('Checkout') { | |||
|         // `scm` is the repository configuration of the job that triggered this build; | |||
|         // the checkout step needs it as its argument. | |||
|         checkout scm | |||
|     } | |||
|     stage('Build') { | |||
|         sh("make") | |||
|     } | |||
| } | |||
| @@ -141,7 +141,7 @@ ifndef NO_FBLAS | |||
| $(MAKE) -C test all | |||
| endif | |||
| $(MAKE) -C utest all | |||
| ifndef NO_CBLAS | |||
| ifneq ($(NO_CBLAS), 1) | |||
| $(MAKE) -C ctest all | |||
| ifeq ($(CPP_THREAD_SAFETY_TEST), 1) | |||
| $(MAKE) -C cpp_thread_test all | |||
| @@ -244,7 +244,7 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapacklib | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) tmglib | |||
| endif | |||
| ifndef NO_LAPACKE | |||
| ifneq ($(NO_LAPACKE), 1) | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) lapackelib | |||
| endif | |||
| endif | |||
| @@ -264,6 +264,7 @@ lapack_prebuild : | |||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| @@ -364,11 +365,12 @@ clean :: | |||
| @$(MAKE) -C kernel clean | |||
| #endif | |||
| @$(MAKE) -C reference clean | |||
| @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h | |||
| @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h *.so.renamed *.a.renamed *.so.0 | |||
| ifeq ($(OSNAME), Darwin) | |||
| @rm -rf getarch.dSYM getarch_2nd.dSYM | |||
| endif | |||
| @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib | |||
| @rm -f cblas.tmp cblas.tmp2 | |||
| @touch $(NETLIB_LAPACK_DIR)/make.inc | |||
| @$(MAKE) -C $(NETLIB_LAPACK_DIR) clean | |||
| @rm -f $(NETLIB_LAPACK_DIR)/make.inc $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling.h | |||
| @@ -56,6 +56,16 @@ CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||
| FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||
| endif | |||
| ifeq ($(CORE), THUNDERX3T110) | |||
| ifeq ($(GCCVERSIONGTEQ10), 1) | |||
| CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 | |||
| FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 | |||
| else | |||
| CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||
| FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||
| endif | |||
| endif | |||
| ifeq ($(GCCVERSIONGTEQ9), 1) | |||
| ifeq ($(CORE), TSV110) | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | |||
| @@ -13,6 +13,14 @@ OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas | |||
| OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake | |||
| OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake | |||
| OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig | |||
| PKG_EXTRALIB := $(EXTRALIB) | |||
| ifeq ($(USE_OPENMP), 1) | |||
| ifeq ($(C_COMPILER), PGI) | |||
| PKG_EXTRALIB += -lomp | |||
| else | |||
| PKG_EXTRALIB += -lgomp | |||
| endif | |||
| endif | |||
| .PHONY : install | |||
| .NOTPARALLEL : install | |||
| @@ -45,7 +53,22 @@ install : lib.grd | |||
| ifndef NO_CBLAS | |||
| @echo Generating cblas.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | |||
| @sed 's/common/openblas_config/g' cblas.h > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" | |||
| @cp cblas.h cblas.tmp | |||
| ifdef SYMBOLPREFIX | |||
| @sed 's/cblas[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp > cblas.tmp2 | |||
| @sed 's/openblas[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp2 > cblas.tmp | |||
| #change back any openblas_complex_float and double that got hit | |||
| @sed 's/$(SYMBOLPREFIX)openblas_complex_/openblas_complex_/g' cblas.tmp > cblas.tmp2 | |||
| @sed 's/goto[^( ]*/$(SYMBOLPREFIX)&/g' cblas.tmp2 > cblas.tmp | |||
| endif | |||
| ifdef SYMBOLSUFFIX | |||
| @sed 's/cblas[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp > cblas.tmp2 | |||
| @sed 's/openblas[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp2 > cblas.tmp | |||
| #change back any openblas_complex_float and double that got hit | |||
| @sed 's/\(openblas_complex_\)\([^ ]*\)$(SYMBOLSUFFIX)/\1\2 /g' cblas.tmp > cblas.tmp2 | |||
| @sed 's/goto[^( ]*/&$(SYMBOLSUFFIX)/g' cblas.tmp2 > cblas.tmp | |||
| endif | |||
| @sed 's/common/openblas_config/g' cblas.tmp > "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/cblas.h" | |||
| endif | |||
| ifneq ($(OSNAME), AIX) | |||
| @@ -132,7 +155,7 @@ endif | |||
| @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @echo 'extralib='$(EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" | |||
| @@ -168,4 +191,3 @@ endif | |||
| @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||
| @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" | |||
| @echo Install OK! | |||
| @@ -9,23 +9,63 @@ else | |||
| USE_OPENMP = 1 | |||
| endif | |||
| ifeq ($(CORE), POWER10) | |||
| COMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math | |||
| endif | |||
| ifeq ($(CORE), POWER9) | |||
| ifeq ($(USE_OPENMP), 1) | |||
| COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CCOMMON_OPT += -Ofast -mvsx -fno-fast-math | |||
| ifneq ($(GCCVERSIONGT4), 1) | |||
| $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) | |||
| CCOMMON_OPT += -mcpu=power8 -mtune=power8 | |||
| else | |||
| CCOMMON_OPT += -mcpu=power9 -mtune=power9 | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -fast -Mvect=simd -Mcache_align | |||
| endif | |||
| ifneq ($(F_COMPILER), PGI) | |||
| FCOMMON_OPT += -O2 -frecursive -fno-fast-math | |||
| ifneq ($(GCCVERSIONGT4), 1) | |||
| $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) | |||
| FCOMMON_OPT += -mcpu=power8 -mtune=power8 | |||
| else | |||
| FCOMMON_OPT += -mcpu=power9 -mtune=power9 | |||
| endif | |||
| else | |||
| COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -fno-fast-math | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power9 -mtune=power9 -malign-power -fno-fast-math | |||
| FCOMMON_OPT += -O2 -Mrecursive | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), POWER8) | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CCOMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math | |||
| else | |||
| CCOMMON_OPT += -fast -Mvect=simd -Mcache_align | |||
| endif | |||
| ifneq ($(F_COMPILER), PGI) | |||
| ifeq ($(OSNAME), AIX) | |||
| FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math | |||
| else | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math | |||
| endif | |||
| else | |||
| FCOMMON_OPT += -O2 -Mrecursive | |||
| endif | |||
| endif | |||
| ifeq ($(USE_OPENMP), 1) | |||
| COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CCOMMON_OPT += -DUSE_OPENMP -fopenmp | |||
| else | |||
| COMMON_OPT += -Ofast -mcpu=power8 -mtune=power8 -mvsx -malign-power -fno-fast-math | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -malign-power -fno-fast-math | |||
| CCOMMON_OPT += -DUSE_OPENMP -mp | |||
| endif | |||
| ifneq ($(F_COMPILER), PGI) | |||
| FCOMMON_OPT += -DUSE_OPENMP -fopenmp | |||
| else | |||
| FCOMMON_OPT += -DUSE_OPENMP -mp | |||
| endif | |||
| endif | |||
| @@ -68,6 +108,9 @@ CCOMMON_OPT += -mpowerpc64 -maix64 | |||
| ifeq ($(COMPILER_F77), g77) | |||
| FCOMMON_OPT += -mpowerpc64 -maix64 | |||
| endif | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| FCOMMON_OPT += -mpowerpc64 -maix64 | |||
| endif | |||
| ifeq ($(COMPILER_F77), xlf) | |||
| FCOMMON_OPT += -q64 | |||
| endif | |||
| @@ -3,7 +3,7 @@ | |||
| # | |||
| # This library's version | |||
| VERSION = 0.3.9.dev | |||
| VERSION = 0.3.10.dev | |||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | |||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | |||
| @@ -273,6 +273,9 @@ COMMON_PROF = -pg | |||
| # | |||
| # CPP_THREAD_SAFETY_TEST = 1 | |||
| # If you want to enable the experimental BFLOAT16 support | |||
| # BUILD_HALF = 1 | |||
| # | |||
| # End of user configuration | |||
| # | |||
| @@ -21,8 +21,14 @@ ifeq ($(ARCH), amd64) | |||
| override ARCH=x86_64 | |||
| else ifeq ($(ARCH), powerpc64) | |||
| override ARCH=power | |||
| else ifeq ($(ARCH), powerpc) | |||
| override ARCH=power | |||
| else ifeq ($(ARCH), i386) | |||
| override ARCH=x86 | |||
| else ifeq ($(ARCH), armv6) | |||
| override ARCH=arm | |||
| else ifeq ($(ARCH), armv7) | |||
| override ARCH=arm | |||
| else ifeq ($(ARCH), aarch64) | |||
| override ARCH=arm64 | |||
| else ifeq ($(ARCH), zarch) | |||
| @@ -86,6 +92,9 @@ endif | |||
| ifeq ($(TARGET), SKYLAKEX) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| ifeq ($(TARGET), COOPERLAKE) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| ifeq ($(TARGET), SANDYBRIDGE) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| @@ -107,6 +116,9 @@ endif | |||
| ifeq ($(TARGET), ARMV8) | |||
| GETARCH_FLAGS := -DFORCE_ARMV7 | |||
| endif | |||
| ifeq ($(TARGET), POWER8) | |||
| GETARCH_FLAGS := -DFORCE_POWER6 | |||
| endif | |||
| endif | |||
| @@ -125,6 +137,9 @@ endif | |||
| ifeq ($(TARGET_CORE), SKYLAKEX) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| ifeq ($(TARGET_CORE), COOPERLAKE) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| ifeq ($(TARGET_CORE), SANDYBRIDGE) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| @@ -266,10 +281,10 @@ endif | |||
| ARFLAGS = | |||
| CPP = $(COMPILER) -E | |||
| AR = $(CROSS_SUFFIX)ar | |||
| AS = $(CROSS_SUFFIX)as | |||
| LD = $(CROSS_SUFFIX)ld | |||
| RANLIB = $(CROSS_SUFFIX)ranlib | |||
| AR ?= $(CROSS_SUFFIX)ar | |||
| AS ?= $(CROSS_SUFFIX)as | |||
| LD ?= $(CROSS_SUFFIX)ld | |||
| RANLIB ?= $(CROSS_SUFFIX)ranlib | |||
| NM = $(CROSS_SUFFIX)nm | |||
| DLLWRAP = $(CROSS_SUFFIX)dllwrap | |||
| OBJCOPY = $(CROSS_SUFFIX)objcopy | |||
| @@ -282,6 +297,26 @@ NO_LAPACK = 1 | |||
| override FEXTRALIB = | |||
| endif | |||
| ifeq ($(C_COMPILER), GCC) | |||
| GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) | |||
| GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) | |||
| GCCVERSIONEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` = 5) | |||
| GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) | |||
| GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) | |||
| GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) | |||
| GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11) | |||
| GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) | |||
| # Note that the behavior of -dumpversion is compile-time-configurable for | |||
| # gcc-7.x and newer. Use -dumpfullversion there | |||
| ifeq ($(GCCVERSIONGTEQ7),1) | |||
| GCCDUMPVERSION_PARAM := -dumpfullversion | |||
| else | |||
| GCCDUMPVERSION_PARAM := -dumpversion | |||
| endif | |||
| GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2) | |||
| GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) | |||
| endif | |||
| # | |||
| # OS dependent settings | |||
| # | |||
| @@ -328,13 +363,7 @@ ifeq ($(C_COMPILER), CLANG) | |||
| CCOMMON_OPT += -DMS_ABI | |||
| endif | |||
| ifeq ($(C_COMPILER), GCC) | |||
| #Version tests for supporting specific features (MS_ABI, POWER9 intrinsics) | |||
| GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) | |||
| GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) | |||
| GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) | |||
| GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) | |||
| GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) | |||
| ifeq ($(GCCVERSIONGT4), 1) | |||
| # GCC Major version > 4 | |||
| # It is compatible with MSVC ABI. | |||
| @@ -348,7 +377,6 @@ ifeq ($(GCCMINORVERSIONGTEQ7), 1) | |||
| CCOMMON_OPT += -DMS_ABI | |||
| endif | |||
| endif | |||
| endif | |||
| # Ensure the correct stack alignment on Win32 | |||
| # http://permalink.gmane.org/gmane.comp.lib.openblas.general/97 | |||
| @@ -540,7 +568,7 @@ DYNAMIC_CORE += HASWELL ZEN | |||
| endif | |||
| ifneq ($(NO_AVX512), 1) | |||
| ifneq ($(NO_AVX2), 1) | |||
| DYNAMIC_CORE += SKYLAKEX | |||
| DYNAMIC_CORE += SKYLAKEX COOPERLAKE | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -565,11 +593,38 @@ DYNAMIC_CORE += THUNDERX | |||
| DYNAMIC_CORE += THUNDERX2T99 | |||
| DYNAMIC_CORE += TSV110 | |||
| DYNAMIC_CORE += EMAG8180 | |||
| DYNAMIC_CORE += THUNDERX3T110 | |||
| endif | |||
| ifeq ($(ARCH), zarch) | |||
| DYNAMIC_CORE = Z13 | |||
| DYNAMIC_CORE = ZARCH_GENERIC | |||
| # Z13 is supported since gcc-5.2, gcc-6, and in RHEL 7.3 and newer | |||
| ifeq ($(GCCVERSIONGT5), 1) | |||
| ZARCH_SUPPORT_Z13 := 1 | |||
| else ifeq ($(GCCVERSIONEQ5), 1) | |||
| ifeq ($(GCCMINORVERSIONGTEQ2), 1) | |||
| ZARCH_SUPPORT_Z13 := 1 | |||
| endif | |||
| endif | |||
| ifeq ($(wildcard /etc/redhat-release), /etc/redhat-release) | |||
| ifeq ($(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3"), 1) | |||
| ZARCH_SUPPORT_Z13 := 1 | |||
| endif | |||
| endif | |||
| ifeq ($(ZARCH_SUPPORT_Z13), 1) | |||
| DYNAMIC_CORE += Z13 | |||
| else | |||
| $(info OpenBLAS: Not building Z13 kernels because gcc is older than 5.2 or 6.x) | |||
| endif | |||
| ifeq ($(GCCVERSIONGTEQ7), 1) | |||
| DYNAMIC_CORE += Z14 | |||
| else | |||
| $(info OpenBLAS: Not building Z14 kernels because gcc is older than 7.x) | |||
| endif | |||
| endif | |||
| ifeq ($(ARCH), power) | |||
| @@ -577,14 +632,23 @@ DYNAMIC_CORE = POWER6 | |||
| DYNAMIC_CORE += POWER8 | |||
| ifneq ($(C_COMPILER), GCC) | |||
| DYNAMIC_CORE += POWER9 | |||
| DYNAMIC_CORE += POWER10 | |||
| endif | |||
| # With GCC, only add the POWER9/POWER10 kernels to DYNAMIC_CORE when the compiler is new enough: | |||
| #   POWER9  needs a gcc major version > 5 | |||
| #   POWER10 needs gcc >= 11, or gcc 10 with a minor version >= 2 | |||
| ifeq ($(C_COMPILER), GCC) | |||
| GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) | |||
| ifeq ($(GCCVERSIONGT5), 1) | |||
| DYNAMIC_CORE += POWER9 | |||
| else | |||
| # Note: no comma after "info" - "$(info, ...)" is not a call of the info function | |||
| # and would expand to nothing instead of printing the message. | |||
| $(info OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) | |||
| endif | |||
| ifeq ($(GCCVERSIONGTEQ11), 1) | |||
| DYNAMIC_CORE += POWER10 | |||
| else ifeq ($(GCCVERSIONGTEQ10), 1) | |||
| ifeq ($(GCCMINORVERSIONGTEQ2), 1) | |||
| DYNAMIC_CORE += POWER10 | |||
| else | |||
| # gcc 10.0/10.1: report the skipped kernels here as well. | |||
| $(info OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) | |||
| endif | |||
| else | |||
| $(info OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -745,8 +809,19 @@ endif | |||
| ifeq ($(C_COMPILER), PGI) | |||
| ifdef BINARY64 | |||
| ifeq ($(ARCH), x86_64) | |||
| CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm | |||
| else | |||
| ifeq ($(ARCH), power) | |||
| ifeq ($(CORE), POWER8) | |||
| CCOMMON_OPT += -tp pwr8 | |||
| endif | |||
| ifeq ($(CORE), POWER9) | |||
| CCOMMON_OPT += -tp pwr9 | |||
| endif | |||
| endif | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -tp p7 | |||
| endif | |||
| endif | |||
| @@ -765,6 +840,15 @@ endif | |||
| ifeq ($(F_COMPILER), FLANG) | |||
| CCOMMON_OPT += -DF_INTERFACE_FLANG | |||
| FCOMMON_OPT += -Mrecursive -Kieee | |||
| ifeq ($(OSNAME), Linux) | |||
| ifeq ($(ARCH), x86_64) | |||
| FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`) | |||
| ifeq ($(FLANG_VENDOR),AOCC) | |||
| FCOMMON_OPT += -fno-unroll-loops | |||
| endif | |||
| endif | |||
| endif | |||
| ifdef BINARY64 | |||
| ifdef INTERFACE64 | |||
| ifneq ($(INTERFACE64), 0) | |||
| @@ -860,7 +944,7 @@ ifneq ($(INTERFACE64), 0) | |||
| FCOMMON_OPT += -i8 | |||
| endif | |||
| endif | |||
| FCOMMON_OPT += -recursive | |||
| FCOMMON_OPT += -recursive -fp-model strict -assume protect-parens | |||
| ifeq ($(USE_OPENMP), 1) | |||
| FCOMMON_OPT += -fopenmp | |||
| endif | |||
| @@ -900,8 +984,19 @@ ifneq ($(INTERFACE64), 0) | |||
| FCOMMON_OPT += -i8 | |||
| endif | |||
| endif | |||
| # Choose the Fortran -tp flag from ARCH and CORE; other combinations add nothing. | |||
| ifeq ($(ARCH), x86_64) | |||
| FCOMMON_OPT += -tp p7-64 | |||
| else ifeq ($(ARCH)/$(CORE), power/POWER8) | |||
| FCOMMON_OPT += -tp pwr8 | |||
| else ifeq ($(ARCH)/$(CORE), power/POWER9) | |||
| FCOMMON_OPT += -tp pwr9 | |||
| endif | |||
| else | |||
| FCOMMON_OPT += -tp p7 | |||
| endif | |||
| FCOMMON_OPT += -Mrecursive | |||
| @@ -1129,6 +1224,10 @@ ifeq ($(USE_TLS), 1) | |||
| CCOMMON_OPT += -DUSE_TLS | |||
| endif | |||
| ifeq ($(BUILD_HALF), 1) | |||
| CCOMMON_OPT += -DBUILD_HALF | |||
| endif | |||
| CCOMMON_OPT += -DVERSION=\"$(VERSION)\" | |||
| ifndef SYMBOLPREFIX | |||
| @@ -1155,6 +1254,9 @@ KERNELDIR = $(TOPDIR)/kernel/$(ARCH) | |||
| include $(TOPDIR)/Makefile.$(ARCH) | |||
| ifneq ($(C_COMPILER), PGI) | |||
| CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME | |||
| endif | |||
| CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" | |||
| ifeq ($(CORE), PPC440) | |||
| @@ -1247,7 +1349,6 @@ endif | |||
| override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) | |||
| override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) | |||
| override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) | |||
| override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF) | |||
| #MAKEOVERRIDES = | |||
| @@ -1354,6 +1455,7 @@ export ARCH | |||
| export CORE | |||
| export LIBCORE | |||
| export __BYTE_ORDER__ | |||
| export ELF_VERSION | |||
| export PGCPATH | |||
| export CONFIG | |||
| export CC | |||
| @@ -1399,6 +1501,7 @@ export KERNELDIR | |||
| export FUNCTION_PROFILE | |||
| export TARGET_CORE | |||
| export NO_AVX512 | |||
| export BUILD_HALF | |||
| export SHGEMM_UNROLL_M | |||
| export SHGEMM_UNROLL_N | |||
| @@ -27,18 +27,54 @@ endif | |||
| endif | |||
| endif | |||
| # COOPERLAKE target, only for non-DYNAMIC_ARCH builds with AVX512 enabled: | |||
| # choose the -march flag for gcc and add the Windows/Cygwin unwind-table workaround. | |||
| ifeq ($(CORE), COOPERLAKE) | |||
| ifndef DYNAMIC_ARCH | |||
| ifndef NO_AVX512 | |||
| ifeq ($(C_COMPILER), GCC) | |||
| # cooperlake support was added in 10.1 | |||
| GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) | |||
| GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 1) | |||
| ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) | |||
| CCOMMON_OPT += -march=cooperlake | |||
| FCOMMON_OPT += -march=cooperlake | |||
| else | |||
| # Older gcc does not accept -march=cooperlake; fall back to skylake-avx512, | |||
| # matching what the CMake build of this project does for the same case. | |||
| CCOMMON_OPT += -march=skylake-avx512 | |||
| FCOMMON_OPT += -march=skylake-avx512 | |||
| endif | |||
| endif | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| CCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
| FCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
| endif | |||
| ifeq ($(OSNAME), WINNT) | |||
| ifeq ($(C_COMPILER), GCC) | |||
| CCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
| FCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| # HASWELL target, only for non-DYNAMIC_ARCH builds with AVX2 enabled: | |||
| # add -mavx2 for gcc / gfortran when the compiler is at least version 4.7. | |||
| ifeq ($(CORE), HASWELL) | |||
| ifndef DYNAMIC_ARCH | |||
| ifndef NO_AVX2 | |||
| ifeq ($(C_COMPILER), GCC) | |||
| # AVX2 support was added in 4.7.0 | |||
| GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) | |||
| GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) | |||
| # Any major version above 4 qualifies regardless of its minor number (e.g. 9.3); | |||
| # the minor number only matters for the 4.x series. | |||
| HASWELL_CC_MAJOR_GT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) | |||
| ifeq ($(HASWELL_CC_MAJOR_GT4), 1) | |||
| CCOMMON_OPT += -mavx2 | |||
| else ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11) | |||
| CCOMMON_OPT += -mavx2 | |||
| endif | |||
| endif | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| # AVX2 support was added in 4.7.0 | |||
| GCCVERSIONGTEQ4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \>= 4) | |||
| GCCMINORVERSIONGTEQ7 := $(shell expr `$(FC) -dumpversion | cut -f2 -d.` \>= 7) | |||
| # Same rule as for the C compiler, evaluated against the Fortran compiler's version. | |||
| HASWELL_FC_MAJOR_GT4 := $(shell expr `$(FC) -dumpversion | cut -f1 -d.` \> 4) | |||
| ifeq ($(HASWELL_FC_MAJOR_GT4), 1) | |||
| FCOMMON_OPT += -mavx2 | |||
| else ifeq ($(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7), 11) | |||
| FCOMMON_OPT += -mavx2 | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -5,6 +5,6 @@ FCOMMON_OPT += -march=z13 -mzvector | |||
| endif | |||
| ifeq ($(CORE), Z14) | |||
| CCOMMON_OPT += -march=z14 -mzvector | |||
| CCOMMON_OPT += -march=z14 -mzvector -O3 | |||
| FCOMMON_OPT += -march=z14 -mzvector | |||
| endif | |||
| @@ -28,7 +28,8 @@ You can download them from [file hosting on sourceforge.net](https://sourceforge | |||
| ## Installation from Source | |||
| Download from project homepage, https://xianyi.github.com/OpenBLAS/, or check out the code | |||
| using Git from https://github.com/xianyi/OpenBLAS.git. | |||
| using Git from https://github.com/xianyi/OpenBLAS.git. (If you want the most up-to-date version, be | |||
| sure to use the develop branch; master is several years out of date due to a change of maintainership.) | |||
| Buildtime parameters can be chosen in Makefile.rule, see there for a short description of each option. | |||
| Most can also be given directly on the make or cmake command line. | |||
| @@ -58,6 +59,10 @@ Examples: | |||
| ```sh | |||
| make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A | |||
| ``` | |||
| or the same with the newer MIPS cross-compiler released by Loongson, which defaults to the 32-bit ABI: | |||
| ```sh | |||
| make HOSTCC=gcc CC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gcc -mabi=64' FC='/opt/mips-loongson-gcc7.3-linux-gnu/2019.06-29/bin/mips-linux-gnu-gfortran -mabi=64' TARGET=LOONGSON3A | |||
| ``` | |||
| * On an x86 box, compile this library for a loongson3a CPU with the loongcc (Open64-based) compiler: | |||
| ```sh | |||
| @@ -22,6 +22,7 @@ SANDYBRIDGE | |||
| HASWELL | |||
| SKYLAKEX | |||
| ATOM | |||
| COOPERLAKE | |||
| b)AMD CPU: | |||
| ATHLON | |||
| @@ -49,6 +50,7 @@ POWER6 | |||
| POWER7 | |||
| POWER8 | |||
| POWER9 | |||
| POWER10 | |||
| PPCG4 | |||
| PPC970 | |||
| PPC970MP | |||
| @@ -95,6 +97,7 @@ FALKOR | |||
| THUNDERX | |||
| THUNDERX2T99 | |||
| TSV110 | |||
| THUNDERX3T110 | |||
| 9.System Z: | |||
| ZARCH_GENERIC | |||
| @@ -49,6 +49,12 @@ else | |||
| GOTO_LAPACK_TARGETS= | |||
| endif | |||
| ifeq ($(BUILD_HALF),1) | |||
| GOTO_HALF_TARGETS=shgemm.goto | |||
| else | |||
| GOTO_HALF_TARGETS= | |||
| endif | |||
| ifeq ($(OSNAME), WINNT) | |||
| goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | |||
| @@ -91,7 +97,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | |||
| sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ | |||
| spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ | |||
| ssymm.goto dsymm.goto csymm.goto zsymm.goto \ | |||
| saxpby.goto daxpby.goto caxpby.goto zaxpby.goto | |||
| saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_HALF_TARGETS) | |||
| acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ | |||
| scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ | |||
| @@ -264,7 +270,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ | |||
| samin.goto damin.goto camin.goto zamin.goto \ | |||
| smin.goto dmin.goto \ | |||
| saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ | |||
| snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) | |||
| snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS) | |||
| acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ | |||
| scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ | |||
| @@ -614,6 +620,11 @@ zcholesky.essl : zcholesky.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Sgemm #################################################### | |||
| ifeq ($(BUILD_HALF),1) | |||
| shgemm.goto : shgemm.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| endif | |||
| sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| @@ -1814,7 +1825,7 @@ zsymv.veclib : zsymv.$(SUFFIX) | |||
| ##################################### Sgeev #################################################### | |||
| sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| sgeev.acml : sgeev.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| @@ -1830,7 +1841,7 @@ sgeev.veclib : sgeev.$(SUFFIX) | |||
| ##################################### Dgeev #################################################### | |||
| dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| dgeev.acml : dgeev.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| @@ -1847,7 +1858,7 @@ dgeev.veclib : dgeev.$(SUFFIX) | |||
| ##################################### Cgeev #################################################### | |||
| cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| cgeev.acml : cgeev.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| @@ -1864,7 +1875,7 @@ cgeev.veclib : cgeev.$(SUFFIX) | |||
| ##################################### Zgeev #################################################### | |||
| zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| zgeev.acml : zgeev.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| @@ -1880,7 +1891,7 @@ zgeev.veclib : zgeev.$(SUFFIX) | |||
| ##################################### Sgetri #################################################### | |||
| sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| sgetri.acml : sgetri.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| @@ -1896,7 +1907,7 @@ sgetri.veclib : sgetri.$(SUFFIX) | |||
| ##################################### Dgetri #################################################### | |||
| dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| dgetri.acml : dgetri.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| @@ -1913,7 +1924,7 @@ dgetri.veclib : dgetri.$(SUFFIX) | |||
| ##################################### Cgetri #################################################### | |||
| cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| cgetri.acml : cgetri.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| @@ -1930,7 +1941,7 @@ cgetri.veclib : cgetri.$(SUFFIX) | |||
| ##################################### Zgetri #################################################### | |||
| zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| zgetri.acml : zgetri.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| @@ -2916,6 +2927,11 @@ ccholesky.$(SUFFIX) : cholesky.c | |||
| zcholesky.$(SUFFIX) : cholesky.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| ifeq ($(BUILD_HALF),1) | |||
| shgemm.$(SUFFIX) : gemm.c | |||
| $(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^ | |||
| endif | |||
| sgemm.$(SUFFIX) : gemm.c | |||
| $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ | |||
| @@ -39,6 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifdef DOUBLE | |||
| #define GEMM BLASFUNC(dgemm) | |||
| #elif defined(HALF) | |||
| #define GEMM BLASFUNC(shgemm) | |||
| #else | |||
| #define GEMM BLASFUNC(sgemm) | |||
| #endif | |||
| @@ -120,7 +122,8 @@ static void *huge_malloc(BLASLONG size){ | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *b, *c; | |||
| IFLOAT *a, *b; | |||
| FLOAT *c; | |||
| FLOAT alpha[] = {1.0, 0.0}; | |||
| FLOAT beta [] = {0.0, 0.0}; | |||
| char transa = 'N'; | |||
| @@ -184,10 +187,10 @@ int main(int argc, char *argv[]){ | |||
| k = to; | |||
| } | |||
| if (( a = (FLOAT *)malloc(sizeof(FLOAT) * m * k * COMPSIZE)) == NULL) { | |||
| if (( a = (IFLOAT *)malloc(sizeof(IFLOAT) * m * k * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| if (( b = (FLOAT *)malloc(sizeof(FLOAT) * k * n * COMPSIZE)) == NULL) { | |||
| if (( b = (IFLOAT *)malloc(sizeof(IFLOAT) * k * n * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| if (( c = (FLOAT *)malloc(sizeof(FLOAT) * m * n * COMPSIZE)) == NULL) { | |||
| @@ -199,10 +202,10 @@ int main(int argc, char *argv[]){ | |||
| #endif | |||
| for (i = 0; i < m * k * COMPSIZE; i++) { | |||
| a[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| a[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| for (i = 0; i < k * n * COMPSIZE; i++) { | |||
| b[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| b[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| for (i = 0; i < m * n * COMPSIZE; i++) { | |||
| c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| @@ -170,9 +170,11 @@ int main(int argc, char *argv[]){ | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| gettimeofday( &start, (struct timezone *)0); | |||
| #ifdef RETURN_BY_STACK | |||
| DOT (&result , &m, x, &inc_x, y, &inc_y ); | |||
| #else | |||
| result = DOT (&m, x, &inc_x, y, &inc_y ); | |||
| #endif | |||
| gettimeofday( &stop, (struct timezone *)0); | |||
| time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
| @@ -6,6 +6,7 @@ | |||
| # Checking cross compile | |||
| $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); | |||
| $hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); | |||
| $hostarch = `uname -p` if ($hostos eq "AIX"); | |||
| $hostarch = "x86_64" if ($hostarch eq "amd64"); | |||
| $hostarch = "arm" if ($hostarch =~ /^arm.*/); | |||
| $hostarch = "arm64" if ($hostarch eq "aarch64"); | |||
| @@ -248,6 +249,28 @@ if (($architecture eq "x86") || ($architecture eq "x86_64")) { | |||
| } | |||
| } | |||
| # Probe whether the C compiler can compile a file that includes <stdatomic.h>. | |||
| # Only attempted when $data mentions HAVE_C11; the result is left in $c11_atomics (1 = compiled, 0 = not). | |||
| $c11_atomics = 0; | |||
| if ($data =~ /HAVE_C11/) { | |||
|     eval "use File::Temp qw(tempfile)"; | |||
|     if ($@){ | |||
|         warn "could not load PERL module File::Temp, so could not check compiler compatibility with C11"; | |||
|         $c11_atomics = 0; | |||
|     } else { | |||
|         ($fh,$tmpf) = tempfile( SUFFIX => '.c' , UNLINK => 1 ); | |||
|         # Write through the file handle ($tmpf is only the file name) and close it so the | |||
|         # test program is flushed to disk before the compiler reads it. | |||
|         print $fh "#include <stdatomic.h>\nint main(void){}\n"; | |||
|         close($fh); | |||
|         $args = " -c -o $tmpf.o $tmpf"; | |||
|         my @cmd = ("$compiler_name $flags $args >/dev/null 2>/dev/null"); | |||
|         # system() returns 0 only when the compiler ran and exited successfully. | |||
|         $c11_atomics = (system(@cmd) == 0) ? 1 : 0; | |||
|         unlink("$tmpf.o"); | |||
|     } | |||
| } | |||
| $data = `$compiler_name $flags -S ctest1.c && grep globl ctest1.s | head -n 1 && rm -f ctest1.s`; | |||
| $data =~ /globl\s([_\.]*)(.*)/; | |||
| @@ -310,6 +333,7 @@ $linker_a = ""; | |||
| && ($flags !~ /advapi32/) | |||
| && ($flags !~ /shell32/) | |||
| && ($flags !~ /omp/) | |||
| && ($flags !~ /[0-9]+/) | |||
| ) { | |||
| $linker_l .= $flags . " " | |||
| } | |||
| @@ -350,6 +374,8 @@ print CONFFILE "#define __32BIT__\t1\n" if $binformat eq bin32; | |||
| print CONFFILE "#define __64BIT__\t1\n" if $binformat eq bin64; | |||
| print CONFFILE "#define FUNDERSCORE\t$need_fu\n" if $need_fu ne ""; | |||
| print CONFFILE "#define HAVE_MSA\t1\n" if $have_msa eq 1; | |||
| print CONFFILE "#define HAVE_C11\t1\n" if $c11_atomics eq 1; | |||
| if ($os eq "LINUX") { | |||
| @@ -45,11 +45,11 @@ endif () | |||
| if (DYNAMIC_ARCH) | |||
| if (ARM64) | |||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1) | |||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) | |||
| endif () | |||
| if (POWER) | |||
| set(DYNAMIC_CORE POWER6 POWER8 POWER9) | |||
| set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10) | |||
| endif () | |||
| if (X86) | |||
| @@ -76,9 +76,9 @@ if (DYNAMIC_ARCH) | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN) | |||
| endif () | |||
| if (NOT NO_AVX512) | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX) | |||
| set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE) | |||
| string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") | |||
| endif () | |||
| endif () | |||
| if (DYNAMIC_LIST) | |||
| set(DYNAMIC_CORE PRESCOTT ${DYNAMIC_LIST}) | |||
| endif () | |||
| @@ -103,3 +103,16 @@ if (${CORE} STREQUAL "SKYLAKEX") | |||
| endif () | |||
| endif () | |||
| endif () | |||
| # For a fixed COOPERLAKE target (no DYNAMIC_ARCH) with AVX512 enabled, pick the -march flag | |||
| # from the version the C compiler reports via -dumpversion. | |||
| if ("${CORE}" STREQUAL "COOPERLAKE") | |||
|   if (NOT DYNAMIC_ARCH) | |||
|     if (NOT NO_AVX512) | |||
|       # Strip the trailing newline so the version string compares cleanly. | |||
|       execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) | |||
|       # Quoted so that an empty result does not turn the condition into a syntax error. | |||
|       if ("${GCC_VERSION}" VERSION_GREATER 10.1 OR "${GCC_VERSION}" VERSION_EQUAL 10.1) | |||
|         # No "=" here: set() takes the value directly, a stray "=" would become part of the flags. | |||
|         set (CCOMMON_OPT "${CCOMMON_OPT} -march=cooperlake") | |||
|       else () | |||
|         set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") | |||
|       endif() | |||
|     endif () | |||
|   endif () | |||
| endif () | |||
| @@ -21,7 +21,15 @@ | |||
| # NEED2UNDERSCORES | |||
| if (NOT NO_LAPACK) | |||
| enable_language(Fortran) | |||
| include(CheckLanguage) | |||
| check_language(Fortran) | |||
| if(CMAKE_Fortran_COMPILER) | |||
| enable_language(Fortran) | |||
| else() | |||
| message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK") | |||
| set (NOFORTRAN 1) | |||
| set (NO_LAPACK 1) | |||
| endif() | |||
| else() | |||
| include(CMakeForceCompiler) | |||
| CMAKE_FORCE_Fortran_COMPILER(gfortran GNU) | |||
| @@ -16,6 +16,7 @@ if (${F_COMPILER} STREQUAL "FLANG") | |||
| if (USE_OPENMP) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") | |||
| endif () | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive -Kieee") | |||
| endif () | |||
| if (${F_COMPILER} STREQUAL "G77") | |||
| @@ -113,6 +113,7 @@ macro(SetDefaultL1) | |||
| set(ZSUMKERNEL zsum.S) | |||
| set(QSUMKERNEL sum.S) | |||
| set(XSUMKERNEL zsum.S) | |||
| if (BUILD_HALF) | |||
| set(SHAMINKERNEL ../arm/amin.c) | |||
| set(SHAMAXKERNEL ../arm/amax.c) | |||
| set(SHMAXKERNEL ../arm/max.c) | |||
| @@ -131,6 +132,7 @@ macro(SetDefaultL1) | |||
| set(SHNRM2KERNEL ../arm/nrm2.c) | |||
| set(SHSUMKERNEL ../arm/sum.c) | |||
| set(SHSWAPKERNEL ../arm/swap.c) | |||
| endif () | |||
| endmacro () | |||
| macro(SetDefaultL2) | |||
| @@ -179,10 +181,11 @@ macro(SetDefaultL2) | |||
| set(XHEMV_L_KERNEL ../generic/zhemv_k.c) | |||
| set(XHEMV_V_KERNEL ../generic/zhemv_k.c) | |||
| set(XHEMV_M_KERNEL ../generic/zhemv_k.c) | |||
| if (BUILD_HALF) | |||
| set(SHGEMVNKERNEL ../arm/gemv_n.c) | |||
| set(SHGEMVTKERNEL ../arm/gemv_t.c) | |||
| set(SHGERKERNEL ../generic/ger.c) | |||
| endif () | |||
| endmacro () | |||
| macro(SetDefaultL3) | |||
| @@ -190,6 +193,7 @@ macro(SetDefaultL3) | |||
| set(DGEADD_KERNEL ../generic/geadd.c) | |||
| set(CGEADD_KERNEL ../generic/zgeadd.c) | |||
| set(ZGEADD_KERNEL ../generic/zgeadd.c) | |||
| if (BUILD_HALF) | |||
| set(SHGEADD_KERNEL ../generic/geadd.c) | |||
| set(SHGEMMKERNEL ../generic/gemmkernel_2x2.c) | |||
| set(SHGEMM_BETA ../generic/gemm_beta.c) | |||
| @@ -201,6 +205,6 @@ macro(SetDefaultL3) | |||
| set(SHGEMMITCOPYOBJ shgemm_itcopy.o) | |||
| set(SHGEMMONCOPYOBJ shgemm_oncopy.o) | |||
| set(SHGEMMOTCOPYOBJ shgemm_otcopy.o) | |||
| endif () | |||
| endmacro () | |||
| @@ -7,5 +7,5 @@ Name: OpenBLAS | |||
| Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version | |||
| Version: @OPENBLAS_VERSION@ | |||
| URL: https://github.com/xianyi/OpenBLAS | |||
| Libs: -L${libdir} -lopenblas${libsuffix} | |||
| Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix} | |||
| Cflags: -I${includedir} | |||
| @@ -8,7 +8,7 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") | |||
| set(NO_EXPRECISION 1) | |||
| endif () | |||
| if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly") | |||
| if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly|Darwin") | |||
| set(EXTRALIB "${EXTRALIB} -lm") | |||
| set(NO_EXPRECISION 1) | |||
| endif () | |||
| @@ -195,8 +195,13 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
| "#define HAVE_VFP\n" | |||
| "#define HAVE_NEON\n" | |||
| "#define ARMV8\n") | |||
| if ("${TCORE}" STREQUAL "CORTEXA57") | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 4) | |||
| else () | |||
| set(SGEMM_UNROLL_M 8) | |||
| set(SGEMM_UNROLL_N 8) | |||
| endif () | |||
| set(DGEMM_UNROLL_M 8) | |||
| set(DGEMM_UNROLL_N 4) | |||
| set(CGEMM_UNROLL_M 8) | |||
| @@ -338,6 +343,33 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "THUNDERX3T110") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define THUNDERX3T110\n" | |||
| "#define L1_CODE_SIZE\t65536\n" | |||
| "#define L1_CODE_LINESIZE\t64\n" | |||
| "#define L1_CODE_ASSOCIATIVE\t8\n" | |||
| "#define L1_DATA_SIZE\t65536\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L1_DATA_ASSOCIATIVE\t8\n" | |||
| "#define L2_SIZE\t524288\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define L2_ASSOCIATIVE\t8\n" | |||
| "#define L3_SIZE\t94371840\n" | |||
| "#define L3_LINESIZE\t64\n" | |||
| "#define L3_ASSOCIATIVE\t32\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define ARMV8\n") | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 8) | |||
| set(DGEMM_UNROLL_N 4) | |||
| set(CGEMM_UNROLL_M 8) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "TSV110") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define ARMV8\n" | |||
| @@ -420,7 +452,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
| set(ZGEMM_UNROLL_M 8) | |||
| set(ZGEMM_UNROLL_N 2) | |||
| set(SYMV_P 8) | |||
| elseif ("${TCORE}" STREQUAL "POWER9") | |||
| elseif ("${TCORE}" STREQUAL "POWER9" OR "${TCORE}" STREQUAL "POWER10") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE 32768\n" | |||
| "#define L1_DATA_LINESIZE 128\n" | |||
| @@ -492,7 +524,7 @@ else(NOT CMAKE_CROSSCOMPILING) | |||
| if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") | |||
| try_compile(GETARCH_RESULT ${GETARCH_DIR} | |||
| SOURCES ${GETARCH_SRC} | |||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${GETARCH_DIR} -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" | |||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I"${GETARCH_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" | |||
| OUTPUT_VARIABLE GETARCH_LOG | |||
| COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} | |||
| ) | |||
| @@ -520,7 +552,7 @@ execute_process(COMMAND "${PROJECT_BINARY_DIR}/${GETARCH_BIN}" 1 OUTPUT_VARIABLE | |||
| if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") | |||
| try_compile(GETARCH2_RESULT ${GETARCH2_DIR} | |||
| SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c | |||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${GETARCH2_DIR} -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" | |||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I"${GETARCH2_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" | |||
| OUTPUT_VARIABLE GETARCH2_LOG | |||
| COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} | |||
| ) | |||
| @@ -33,7 +33,7 @@ endif () | |||
| if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) | |||
| message(STATUS "Compiling a ${BINARY}-bit binary.") | |||
| set(NO_AVX 1) | |||
| if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX") | |||
| if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE") | |||
| set(TARGET "NEHALEM") | |||
| endif () | |||
| if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") | |||
| @@ -45,6 +45,18 @@ if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) | |||
| endif () | |||
| if (DEFINED TARGET) | |||
| # A COOPERLAKE target with AVX512 enabled adds -march=cooperlake to KERNEL_DEFINITIONS when the | |||
| # compiler's -dumpversion output is 10.1 or newer, and -march=skylake-avx512 otherwise. | |||
| if ("${TARGET}" STREQUAL "COOPERLAKE" AND NOT NO_AVX512) | |||
| # if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") | |||
|   # Strip the trailing newline and quote the result so an empty answer cannot break the if(). | |||
|   execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) | |||
|   if ("${GCC_VERSION}" VERSION_GREATER 10.1 OR "${GCC_VERSION}" VERSION_EQUAL 10.1) | |||
|     set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") | |||
|   else() | |||
|     set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | |||
|   endif() | |||
| # elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") | |||
| # set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") | |||
| # endif() | |||
| endif() | |||
| if (${TARGET} STREQUAL "SKYLAKEX" AND NOT NO_AVX512) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | |||
| endif() | |||
| @@ -297,6 +309,16 @@ if (USE_SIMPLE_THREADED_LEVEL3) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3") | |||
| endif () | |||
| # Outside Windows, pass MAX_STACK_ALLOC to the C compiler: 2048 when the variable is not defined, | |||
| # the given value when it is non-zero, and nothing at all when it is 0. | |||
| if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") | |||
|   if (NOT DEFINED MAX_STACK_ALLOC) | |||
|     set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=2048") | |||
|   elseif (NOT ${MAX_STACK_ALLOC} EQUAL 0) | |||
|     set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=${MAX_STACK_ALLOC}") | |||
|   endif () | |||
| endif () | |||
| if (DEFINED LIBNAMESUFFIX) | |||
| set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}") | |||
| else () | |||
| @@ -407,6 +429,14 @@ if (${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Windows | |||
| set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_COMPLEX_STRUCTURE") | |||
| endif () | |||
| if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release") | |||
| if ("${F_COMPILER}" STREQUAL "FLANG") | |||
| if (${CMAKE_Fortran_COMPILER_VERSION} VERSION_LESS_EQUAL 3) | |||
| set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -fno-unroll-loops") | |||
| endif () | |||
| endif () | |||
| endif () | |||
| if (NOT DEFINED SUFFIX) | |||
| set(SUFFIX o) | |||
| endif () | |||
| @@ -116,3 +116,10 @@ set (CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") | |||
| endif() | |||
| file(REMOVE "avx512.c" "avx512.o") | |||
| endif() | |||
| # Detect <stdatomic.h>; when it is found, add -DHAVE_C11 to the common C options. | |||
| include(CheckIncludeFile) | |||
| CHECK_INCLUDE_FILE("stdatomic.h" HAVE_C11) | |||
| if (HAVE_C11 EQUAL 1) | |||
|   # Quote the text: separate unquoted arguments to message() are concatenated without spaces. | |||
|   message (STATUS "found stdatomic.h") | |||
|   set (CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_C11") | |||
| endif() | |||
| @@ -15,12 +15,36 @@ endfunction () | |||
| # Reads a Makefile into CMake vars. | |||
| macro(ParseMakefileVars MAKEFILE_IN) | |||
| message(STATUS "Reading vars from ${MAKEFILE_IN}...") | |||
| set (IfElse 0) | |||
| set (ElseSeen 0) | |||
| file(STRINGS ${MAKEFILE_IN} makefile_contents) | |||
| foreach (makefile_line ${makefile_contents}) | |||
| #message(STATUS "parsing ${makefile_line}") | |||
| if (${IfElse} GREATER 0) | |||
| string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| # message(STATUS "ENDIF ${makefile_line}") | |||
| set (IfElse 0) | |||
| set (ElseSeen 0) | |||
| continue () | |||
| endif () | |||
| string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| # message(STATUS "ELSE ${makefile_line}") | |||
| set (ElseSeen 1) | |||
| continue () | |||
| endif() | |||
| if ( (${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR ( ${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1)) | |||
| # message(STATUS "skipping ${makefile_line}") | |||
| continue () | |||
| endif () | |||
| endif () | |||
| string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| #message(STATUS "match on ${line_match}") | |||
| set(var_name ${CMAKE_MATCH_1}) | |||
| set(var_value ${CMAKE_MATCH_2}) | |||
| # set(var_value ${CMAKE_MATCH_2}) | |||
| string(STRIP ${CMAKE_MATCH_2} var_value) | |||
| # check for Makefile variables in the string, e.g. $(TSUFFIX) | |||
| string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value}) | |||
| foreach (make_var ${make_var_matches}) | |||
| @@ -33,7 +57,31 @@ macro(ParseMakefileVars MAKEFILE_IN) | |||
| else () | |||
| string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| #message(STATUS "match on include ${line_match}") | |||
| ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1}) | |||
| else () | |||
| # message(STATUS "unmatched line ${line_match}") | |||
| # Minimal handling of "ifeq ($(VAR), VALUE)" and "ifneq ($(VAR), VALUE)" lines: | |||
| # IfElse is set to 1 when the condition holds and to 2 when it does not. | |||
| string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
|   # message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") | |||
|   # DEFINED expects the variable's name, so pass the captured name rather than its value; | |||
|   # the operands are quoted so an empty value cannot break the comparison. | |||
|   if (DEFINED ${CMAKE_MATCH_1} AND "${${CMAKE_MATCH_1}}" STREQUAL "${CMAKE_MATCH_2}") | |||
|     # message (STATUS "condition is true") | |||
|     set (IfElse 1) | |||
|   else () | |||
|     set (IfElse 2) | |||
|   endif () | |||
| else () | |||
|   string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") | |||
|   if (NOT "${line_match}" STREQUAL "") | |||
|     # message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") | |||
|     # An unset variable expands to the empty string, which differs from VALUE, so ifneq holds. | |||
|     if (NOT ( "${${CMAKE_MATCH_1}}" STREQUAL "${CMAKE_MATCH_2}")) | |||
|       # message (STATUS "condition is true") | |||
|       set (IfElse 1) | |||
|     else () | |||
|       set (IfElse 2) | |||
|     endif () | |||
|   endif () | |||
| endif () | |||
| endif () | |||
| endif () | |||
| endforeach () | |||
| @@ -360,13 +360,8 @@ typedef int blasint; | |||
| #endif | |||
| #endif | |||
| #ifdef POWER8 | |||
| #ifndef YIELDING | |||
| #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); | |||
| #endif | |||
| #endif | |||
| #ifdef POWER9 | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #ifndef YIELDING | |||
| #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); | |||
| #endif | |||
| @@ -686,7 +681,7 @@ __declspec(dllimport) int __cdecl omp_in_parallel(void); | |||
| __declspec(dllimport) int __cdecl omp_get_num_procs(void); | |||
| #endif | |||
| #if (__STDC_VERSION__ >= 201112L) | |||
| #ifdef HAVE_C11 | |||
| #if defined(C_GCC) && ( __GNUC__ < 7) | |||
| // workaround for GCC bug 65467 | |||
| #ifndef _Atomic | |||
| @@ -47,12 +47,12 @@ __global__ void cuda_dgemm_kernel(int, int, int, double *, double *, double *); | |||
| extern "C" { | |||
| #endif | |||
| extern void sgemm_kernel_direct(BLASLONG M, BLASLONG N, BLASLONG K, | |||
| void sgemm_direct(BLASLONG M, BLASLONG N, BLASLONG K, | |||
| float * A, BLASLONG strideA, | |||
| float * B, BLASLONG strideB, | |||
| float * R, BLASLONG strideR); | |||
| extern int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); | |||
| int sgemm_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); | |||
| int shgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, | |||
| @@ -94,7 +94,7 @@ REALNAME: | |||
| #endif | |||
| #define HUGE_PAGESIZE ( 4 << 20) | |||
| #define BUFFER_SIZE (16 << 20) | |||
| #define BUFFER_SIZE (16 << 21) | |||
| #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) | |||
| @@ -227,7 +227,7 @@ REALNAME: ;\ | |||
| #define SEEK_ADDRESS | |||
| #define BUFFER_SIZE ( 32 << 20) | |||
| #define BUFFER_SIZE ( 32 << 21) | |||
| #if defined(LOONGSON3A) | |||
| #define PAGESIZE (16UL << 10) | |||
| @@ -47,7 +47,7 @@ typedef struct { | |||
| int dtb_entries; | |||
| int offsetA, offsetB, align; | |||
| #if 1 | |||
| #ifdef BUILD_HALF | |||
| int shgemm_p, shgemm_q, shgemm_r; | |||
| int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn; | |||
| @@ -175,6 +175,11 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); | |||
| int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| #ifdef ARCH_X86_64 | |||
| void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); | |||
| int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); | |||
| #endif | |||
| int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); | |||
| int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| @@ -1002,12 +1007,14 @@ extern gotoblas_t *gotoblas; | |||
| #define HAVE_EX_L2 gotoblas -> exclusive_cache | |||
| #ifdef BUILD_HALF | |||
| #define SHGEMM_P gotoblas -> shgemm_p | |||
| #define SHGEMM_Q gotoblas -> shgemm_q | |||
| #define SHGEMM_R gotoblas -> shgemm_r | |||
| #define SHGEMM_UNROLL_M gotoblas -> shgemm_unroll_m | |||
| #define SHGEMM_UNROLL_N gotoblas -> shgemm_unroll_n | |||
| #define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn | |||
| #endif | |||
| #define SGEMM_P gotoblas -> sgemm_p | |||
| #define SGEMM_Q gotoblas -> sgemm_q | |||
| @@ -1086,6 +1093,7 @@ extern gotoblas_t *gotoblas; | |||
| #define HAVE_EX_L2 0 | |||
| #endif | |||
| #ifdef BUILD_HALF | |||
| #define SHGEMM_P SHGEMM_DEFAULT_P | |||
| #define SHGEMM_Q SHGEMM_DEFAULT_Q | |||
| #define SHGEMM_R SHGEMM_DEFAULT_R | |||
| @@ -1096,6 +1104,7 @@ extern gotoblas_t *gotoblas; | |||
| #else | |||
| #define SHGEMM_UNROLL_MN MAX((SHGEMM_UNROLL_M), (SHGEMM_UNROLL_N)) | |||
| #endif | |||
| #endif | |||
| #define SGEMM_P SGEMM_DEFAULT_P | |||
| #define SGEMM_Q SGEMM_DEFAULT_Q | |||
| @@ -1330,31 +1339,31 @@ extern gotoblas_t *gotoblas; | |||
| #endif | |||
| #ifndef SHGEMM_DEFAULT_R | |||
| #define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15) | |||
| #define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15UL) | |||
| #endif | |||
| #ifndef SGEMM_DEFAULT_R | |||
| #define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15) | |||
| #define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15UL) | |||
| #endif | |||
| #ifndef DGEMM_DEFAULT_R | |||
| #define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15) | |||
| #define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15UL) | |||
| #endif | |||
| #ifndef QGEMM_DEFAULT_R | |||
| #define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15) | |||
| #define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15UL) | |||
| #endif | |||
| #ifndef CGEMM_DEFAULT_R | |||
| #define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15) | |||
| #define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15UL) | |||
| #endif | |||
| #ifndef ZGEMM_DEFAULT_R | |||
| #define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15) | |||
| #define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15UL) | |||
| #endif | |||
| #ifndef XGEMM_DEFAULT_R | |||
| #define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15) | |||
| #define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15UL) | |||
| #endif | |||
| #ifndef SNUMOPT | |||
| @@ -68,7 +68,7 @@ | |||
| #endif | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #define MB __asm__ __volatile__ ("eieio":::"memory") | |||
| #define WMB __asm__ __volatile__ ("eieio":::"memory") | |||
| #define RMB __asm__ __volatile__ ("eieio":::"memory") | |||
| @@ -105,6 +105,7 @@ static void INLINE blas_lock(volatile unsigned long *address){ | |||
| " bne- 1f\n" | |||
| " stwcx. %2,0, %1\n" | |||
| " bne- 0b\n" | |||
| " isync\n" | |||
| "1: " | |||
| : "=&r"(ret) | |||
| : "r"(address), "r" (val) | |||
| @@ -272,7 +273,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| #define HAVE_PREFETCH | |||
| #endif | |||
| #if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(PPC970) | |||
| #if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(POWER10) || defined(PPC970) | |||
| #define DCBT_ARG 0 | |||
| #else | |||
| #define DCBT_ARG 8 | |||
| @@ -294,7 +295,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| #define L1_PREFETCH dcbtst | |||
| #endif | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #define L1_DUALFETCH | |||
| #define L1_PREFETCHSIZE (16 + 128 * 100) | |||
| #define L1_PREFETCH dcbtst | |||
| @@ -843,7 +844,7 @@ Lmcount$lazy_ptr: | |||
| #define BUFFER_SIZE ( 2 << 20) | |||
| #elif defined(PPC440FP2) | |||
| #define BUFFER_SIZE ( 16 << 20) | |||
| #elif defined(POWER8) || defined(POWER9) | |||
| #elif defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #define BUFFER_SIZE ( 64 << 20) | |||
| #else | |||
| #define BUFFER_SIZE ( 16 << 20) | |||
| @@ -45,6 +45,10 @@ | |||
| #define SSYMV_THREAD_U ssymv_thread_U | |||
| #define SSYMV_THREAD_L ssymv_thread_L | |||
| #define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant | |||
| #define SGEMM_DIRECT sgemm_direct | |||
| #define SGEMM_ONCOPY sgemm_oncopy | |||
| #define SGEMM_OTCOPY sgemm_otcopy | |||
| @@ -214,6 +218,14 @@ | |||
| #define SSYMV_THREAD_U ssymv_thread_U | |||
| #define SSYMV_THREAD_L ssymv_thread_L | |||
| #ifdef ARCH_X86_64 | |||
| #define SGEMM_DIRECT_PERFORMANT gotoblas -> sgemm_direct_performant | |||
| #define SGEMM_DIRECT gotoblas -> sgemm_direct | |||
| #else | |||
| #define SGEMM_DIRECT_PERFORMANT sgemm_direct_performant | |||
| #define SGEMM_DIRECT sgemm_direct | |||
| #endif | |||
| #define SGEMM_ONCOPY gotoblas -> sgemm_oncopy | |||
| #define SGEMM_OTCOPY gotoblas -> sgemm_otcopy | |||
| #define SGEMM_INCOPY gotoblas -> sgemm_incopy | |||
| @@ -132,18 +132,18 @@ extern int blas_server_avail; | |||
| static __inline int num_cpu_avail(int level) { | |||
| #ifdef USE_OPENMP | |||
| int openmp_nthreads=0; | |||
| int openmp_nthreads=omp_get_max_threads(); | |||
| #endif | |||
| #ifndef USE_OPENMP | |||
| if (blas_cpu_number == 1 | |||
| #endif | |||
| #ifdef USE_OPENMP | |||
| || omp_in_parallel() | |||
| if (openmp_nthreads == 1 || omp_in_parallel() | |||
| #endif | |||
| ) return 1; | |||
| ) return 1; | |||
| #ifdef USE_OPENMP | |||
| openmp_nthreads=omp_get_max_threads(); | |||
| if (blas_cpu_number != openmp_nthreads) { | |||
| goto_set_num_threads(openmp_nthreads); | |||
| } | |||
| @@ -80,7 +80,7 @@ static void __inline blas_lock(volatile BLASULONG *address){ | |||
| #endif | |||
| do { | |||
| while (*address) {YIELDING;}; | |||
| while (*address) {YIELDING;} | |||
| #ifndef C_MSVC | |||
| __asm__ __volatile__( | |||
| @@ -199,9 +199,9 @@ static __inline BLASLONG blas_quickdivide(BLASLONG x, BLASLONG y){ | |||
| #else | |||
| extern unsigned int blas_quick_divide_table[]; | |||
| static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||
| static __inline unsigned int blas_quickdivide(unsigned int x, unsigned int y){ | |||
| unsigned int result; | |||
| volatile unsigned int result; | |||
| if (y <= 1) return x; | |||
| @@ -215,7 +215,6 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||
| y = blas_quick_divide_table[y]; | |||
| __asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y)); | |||
| return result; | |||
| } | |||
| #endif | |||
| @@ -229,14 +228,8 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||
| #define HUGE_PAGESIZE ( 2 << 20) | |||
| #ifndef BUFFERSIZE | |||
| #if defined(SKYLAKEX) | |||
| #define BUFFER_SIZE (32 << 21) | |||
| #elif defined(HASWELL) || defined(ZEN) | |||
| #define BUFFER_SIZE (32 << 22) | |||
| #else | |||
| #define BUFFER_SIZE (32 << 20) | |||
| #endif | |||
| #else | |||
| #define BUFFER_SIZE (32 << BUFFERSIZE) | |||
| #endif | |||
| @@ -5,6 +5,14 @@ inline void pauser(){ | |||
| std::getline(std::cin, dummy); | |||
| } | |||
| /* Aborts the test run when the requested number of concurrent calls is zero. | |||
|  * Any non-zero value returns immediately; zero prints two error lines to | |||
|  * std::cout and terminates the process via exit(-1). | |||
|  * NOTE(review): the message names DGEMV although this helper is invoked from | |||
|  * more than one test main in this patch - confirm the wording is intended. */ | |||
| void FailIfThreadsAreZero(uint32_t numConcurrentThreads) { | |||
| if(numConcurrentThreads != 0) { | |||
| return; | |||
| } | |||
| std::cout<<"ERROR: Invalid parameter 0 for number of concurrent calls into OpenBLAS!"<<std::endl; | |||
| std::cout<<"CBLAS DGEMV thread safety test FAILED!"<<std::endl; | |||
| exit(-1); | |||
| } | |||
| void FillMatrices(std::vector<std::vector<double>>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){ | |||
| for(uint32_t i=0; i<numMat; i++){ | |||
| for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){ | |||
| @@ -46,6 +46,8 @@ int main(int argc, char* argv[]){ | |||
| std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n'; | |||
| std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n'; | |||
| std::cout<<"This test will need "<<(static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl; | |||
| FailIfThreadsAreZero(numConcurrentThreads); | |||
| std::cout<<"Initializing random number generator..."<<std::flush; | |||
| std::mt19937_64 PRNG = InitPRNG(); | |||
| @@ -18,7 +18,7 @@ int main(int argc, char* argv[]){ | |||
| uint32_t maxHwThreads = omp_get_max_threads(); | |||
| if (maxHwThreads < 52) | |||
| numConcurrentThreads = maxHwThreads -4; | |||
| numConcurrentThreads = maxHwThreads; | |||
| if (argc > 4){ | |||
| std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl; | |||
| @@ -47,6 +47,8 @@ int main(int argc, char* argv[]){ | |||
| std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n'; | |||
| std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n'; | |||
| std::cout<<"This test will need "<<((static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast<uint64_t>(randomMatSize)*numConcurrentThreads*8*2))/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl; | |||
| FailIfThreadsAreZero(numConcurrentThreads); | |||
| std::cout<<"Initializing random number generator..."<<std::flush; | |||
| std::mt19937_64 PRNG = InitPRNG(); | |||
| @@ -118,6 +118,7 @@ | |||
| #define CORE_ZEN 27 | |||
| #define CORE_SKYLAKEX 28 | |||
| #define CORE_DHYANA 29 | |||
| #define CORE_COOPERLAKE 30 | |||
| #define HAVE_SSE (1 << 0) | |||
| #define HAVE_SSE2 (1 << 1) | |||
| @@ -137,11 +138,12 @@ | |||
| #define HAVE_MISALIGNSSE (1 << 15) | |||
| #define HAVE_128BITFPU (1 << 16) | |||
| #define HAVE_FASTMOVU (1 << 17) | |||
| #define HAVE_AVX (1 << 18) | |||
| #define HAVE_FMA4 (1 << 19) | |||
| #define HAVE_FMA3 (1 << 20) | |||
| #define HAVE_AVX512VL (1 << 21) | |||
| #define HAVE_AVX2 (1 << 22) | |||
| #define HAVE_AVX (1 << 18) | |||
| #define HAVE_FMA4 (1 << 19) | |||
| #define HAVE_FMA3 (1 << 20) | |||
| #define HAVE_AVX512VL (1 << 21) | |||
| #define HAVE_AVX2 (1 << 22) | |||
| #define HAVE_AVX512BF16 (1 << 23) | |||
| #define CACHE_INFO_L1_I 1 | |||
| #define CACHE_INFO_L1_D 2 | |||
| @@ -218,7 +220,8 @@ typedef struct { | |||
| #define CPUTYPE_ZEN 51 | |||
| #define CPUTYPE_SKYLAKEX 52 | |||
| #define CPUTYPE_DHYANA 53 | |||
| #define CPUTYPE_COOPERLAKE 54 | |||
| #define CPUTYPE_HYGON_UNKNOWN 54 | |||
| #define CPUTYPE_HYGON_UNKNOWN 99 | |||
| #endif | |||
| @@ -40,6 +40,7 @@ | |||
| // Cavium | |||
| #define CPU_THUNDERX 7 | |||
| #define CPU_THUNDERX2T99 8 | |||
| #define CPU_THUNDERX3T110 12 | |||
| //Hisilicon | |||
| #define CPU_TSV110 9 | |||
| // Ampere | |||
| @@ -57,7 +58,8 @@ static char *cpuname[] = { | |||
| "THUNDERX2T99", | |||
| "TSV110", | |||
| "EMAG8180", | |||
| "NEOVERSEN1" | |||
| "NEOVERSEN1", | |||
| "THUNDERX3T110" | |||
| }; | |||
| static char *cpuname_lower[] = { | |||
| @@ -72,7 +74,8 @@ static char *cpuname_lower[] = { | |||
| "thunderx2t99", | |||
| "tsv110", | |||
| "emag8180", | |||
| "neoversen1" | |||
| "neoversen1", | |||
| "thunderx3t110" | |||
| }; | |||
| int get_feature(char *search) | |||
| @@ -158,6 +161,8 @@ int detect(void) | |||
| return CPU_THUNDERX; | |||
| else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0af")) | |||
| return CPU_THUNDERX2T99; | |||
| else if (strstr(cpu_implementer, "0x43") && strstr(cpu_part, "0x0b8")) | |||
| return CPU_THUNDERX3T110; | |||
| // HiSilicon | |||
| else if (strstr(cpu_implementer, "0x48") && strstr(cpu_part, "0xd01")) | |||
| return CPU_TSV110; | |||
| @@ -372,7 +377,25 @@ void get_cpuconfig(void) | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| case CPU_THUNDERX3T110: | |||
| printf("#define THUNDERX3T110 \n"); | |||
| printf("#define L1_CODE_SIZE 65536 \n"); | |||
| printf("#define L1_CODE_LINESIZE 64 \n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 8 \n"); | |||
| printf("#define L1_DATA_SIZE 32768 \n"); | |||
| printf("#define L1_DATA_LINESIZE 64 \n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 8 \n"); | |||
| printf("#define L2_SIZE 524288 \n"); | |||
| printf("#define L2_LINESIZE 64 \n"); | |||
| printf("#define L2_ASSOCIATIVE 8 \n"); | |||
| printf("#define L3_SIZE 94371840 \n"); | |||
| printf("#define L3_LINESIZE 64 \n"); | |||
| printf("#define L3_ASSOCIATIVE 32 \n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| } | |||
| get_cpucount(); | |||
| } | |||
| @@ -38,6 +38,7 @@ | |||
| #include <sys/utsname.h> | |||
| #ifdef _AIX | |||
| #include <sys/systemcfg.h> | |||
| #include <sys/vminfo.h> | |||
| #endif | |||
| #ifdef __APPLE__ | |||
| @@ -57,6 +58,7 @@ | |||
| #define CPUTYPE_PPCG4 7 | |||
| #define CPUTYPE_POWER8 8 | |||
| #define CPUTYPE_POWER9 9 | |||
| #define CPUTYPE_POWER10 10 | |||
| char *cpuname[] = { | |||
| "UNKNOWN", | |||
| @@ -68,7 +70,8 @@ char *cpuname[] = { | |||
| "CELL", | |||
| "PPCG4", | |||
| "POWER8", | |||
| "POWER9" | |||
| "POWER9", | |||
| "POWER10" | |||
| }; | |||
| char *lowercpuname[] = { | |||
| @@ -81,7 +84,8 @@ char *lowercpuname[] = { | |||
| "cell", | |||
| "ppcg4", | |||
| "power8", | |||
| "power9" | |||
| "power9", | |||
| "power10" | |||
| }; | |||
| char *corename[] = { | |||
| @@ -94,7 +98,8 @@ char *corename[] = { | |||
| "CELL", | |||
| "PPCG4", | |||
| "POWER8", | |||
| "POWER9" | |||
| "POWER9", | |||
| "POWER10" | |||
| }; | |||
| int detect(void){ | |||
| @@ -125,6 +130,7 @@ int detect(void){ | |||
| if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; | |||
| if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; | |||
| if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; | |||
| if (!strncasecmp(p, "POWER10", 7)) return CPUTYPE_POWER10; | |||
| if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; | |||
| if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; | |||
| @@ -132,34 +138,19 @@ int detect(void){ | |||
| #endif | |||
| #ifdef _AIX | |||
| FILE *infile; | |||
| char buffer[512], *p; | |||
| p = (char *)NULL; | |||
| infile = popen("prtconf|grep 'Processor Type'", "r"); | |||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||
| if (!strncmp("Pro", buffer, 3)){ | |||
| p = strchr(buffer, ':') + 2; | |||
| #if 0 | |||
| fprintf(stderr, "%s\n", p); | |||
| #endif | |||
| break; | |||
| } | |||
| } | |||
| // Cast from int to unsigned to ensure comparisons work for all bits in | |||
| // the bit mask, even the top bit | |||
| unsigned implementation = (unsigned) _system_configuration.implementation; | |||
| pclose(infile); | |||
| if (!strncasecmp(p, "POWER3", 6)) return CPUTYPE_POWER3; | |||
| if (!strncasecmp(p, "POWER4", 6)) return CPUTYPE_POWER4; | |||
| if (!strncasecmp(p, "PPC970", 6)) return CPUTYPE_PPC970; | |||
| if (!strncasecmp(p, "POWER5", 6)) return CPUTYPE_POWER5; | |||
| if (!strncasecmp(p, "POWER6", 6)) return CPUTYPE_POWER6; | |||
| if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; | |||
| if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; | |||
| if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; | |||
| if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; | |||
| if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; | |||
| return CPUTYPE_POWER5; | |||
| if (implementation >= 0x40000u) return CPUTYPE_POWER10; | |||
| else if (implementation & 0x20000) return CPUTYPE_POWER9; | |||
| else if (implementation & 0x10000) return CPUTYPE_POWER8; | |||
| else if (implementation & 0x08000) return CPUTYPE_POWER6; // POWER 7 | |||
| else if (implementation & 0x04000) return CPUTYPE_POWER6; | |||
| else if (implementation & 0x02000) return CPUTYPE_POWER5; | |||
| else if (implementation & 0x01000) return CPUTYPE_POWER4; // MPC7450 | |||
| else if (implementation & 0x00800) return CPUTYPE_POWER4; | |||
| else return CPUTYPE_POWER3; | |||
| #endif | |||
| #ifdef __APPLE__ | |||
| @@ -179,6 +170,9 @@ int detect(void){ | |||
| int id; | |||
| __asm __volatile("mfpvr %0" : "=r"(id)); | |||
| switch ( id >> 16 ) { | |||
| case 0x80: // POWER10 | |||
| return CPUTYPE_POWER10; | |||
| break; | |||
| case 0x4e: // POWER9 | |||
| return CPUTYPE_POWER9; | |||
| break; | |||
| @@ -249,6 +249,22 @@ int support_avx512(){ | |||
| #endif | |||
| } | |||
| /* Returns 1 when AVX512 is supported and CPUID leaf 7, sub-leaf 1 reports | |||
|  * EAX bit 5 (the AVX512_BF16 flag), otherwise 0. Always returns 0 when the | |||
|  * build defines NO_AVX or NO_AVX512. */ | |||
| int support_avx512_bf16(){ | |||
| #if defined(NO_AVX) || defined(NO_AVX512) | |||
| return 0; | |||
| #else | |||
| int eax, ebx, ecx, edx; | |||
| /* BF16 is only meaningful on top of AVX512 support */ | |||
| if (!support_avx512()) | |||
| return 0; | |||
| cpuid_count(7, 1, &eax, &ebx, &ecx, &edx); | |||
| /* CPUID.7.1:EAX[bit 5] indicates whether avx512_bf16 is supported */ | |||
| return ((eax & (1 << 5)) != 0) ? 1 : 0; | |||
| #endif | |||
| } | |||
| int get_vendor(void){ | |||
| int eax, ebx, ecx, edx; | |||
| @@ -335,6 +351,7 @@ int get_cputype(int gettype){ | |||
| if (support_avx()) feature |= HAVE_AVX; | |||
| if (support_avx2()) feature |= HAVE_AVX2; | |||
| if (support_avx512()) feature |= HAVE_AVX512VL; | |||
| if (support_avx512_bf16()) feature |= HAVE_AVX512BF16; | |||
| if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; | |||
| #endif | |||
| @@ -1337,6 +1354,8 @@ int get_cpuname(void){ | |||
| return CPUTYPE_NEHALEM; | |||
| case 5: | |||
| // Skylake X | |||
| if(support_avx512_bf16()) | |||
| return CPUTYPE_COOPERLAKE; | |||
| if(support_avx512()) | |||
| return CPUTYPE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| @@ -1406,6 +1425,17 @@ int get_cpuname(void){ | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| case 10: //family 6 exmodel 10 | |||
| switch (model) { | |||
| case 5: // Comet Lake H and S | |||
| case 6: // Comet Lake U | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| } | |||
| @@ -1443,10 +1473,11 @@ int get_cpuname(void){ | |||
| return CPUTYPE_OPTERON; | |||
| case 1: | |||
| case 3: | |||
| case 7: | |||
| case 10: | |||
| // case 7: | |||
| // case 10: | |||
| return CPUTYPE_BARCELONA; | |||
| case 5: | |||
| case 7: | |||
| return CPUTYPE_BOBCAT; | |||
| case 6: | |||
| switch (model) { | |||
| @@ -1496,6 +1527,8 @@ int get_cpuname(void){ | |||
| // AMD Ryzen | |||
| case 8: | |||
| // AMD Ryzen2 | |||
| default: | |||
| // Matisse/Renoir and other recent Ryzen2 | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CPUTYPE_ZEN; | |||
| @@ -1505,6 +1538,16 @@ int get_cpuname(void){ | |||
| else | |||
| return CPUTYPE_BARCELONA; | |||
| } | |||
| break; | |||
| case 10: // Zen3 | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CPUTYPE_ZEN; | |||
| #else | |||
| return CPUTYPE_SANDYBRIDGE; // Zen is closer in architecture to Sandy Bridge than to Excavator | |||
| #endif | |||
| else | |||
| return CPUTYPE_BARCELONA; | |||
| } | |||
| break; | |||
| } | |||
| @@ -1653,7 +1696,8 @@ static char *cpuname[] = { | |||
| "EXCAVATOR", | |||
| "ZEN", | |||
| "SKYLAKEX", | |||
| "DHYANA" | |||
| "DHYANA", | |||
| "COOPERLAKE" | |||
| }; | |||
| static char *lowercpuname[] = { | |||
| @@ -1709,7 +1753,8 @@ static char *lowercpuname[] = { | |||
| "excavator", | |||
| "zen", | |||
| "skylakex", | |||
| "dhyana" | |||
| "dhyana", | |||
| "cooperlake" | |||
| }; | |||
| static char *corename[] = { | |||
| @@ -1742,7 +1787,8 @@ static char *corename[] = { | |||
| "EXCAVATOR", | |||
| "ZEN", | |||
| "SKYLAKEX", | |||
| "DHYANA" | |||
| "DHYANA", | |||
| "COOPERLAKE" | |||
| }; | |||
| static char *corename_lower[] = { | |||
| @@ -1775,7 +1821,8 @@ static char *corename_lower[] = { | |||
| "excavator", | |||
| "zen", | |||
| "skylakex", | |||
| "dhyana" | |||
| "dhyana", | |||
| "cooperlake" | |||
| }; | |||
| @@ -1955,6 +2002,19 @@ int get_coretype(void){ | |||
| return CORE_NEHALEM; | |||
| } | |||
| break; | |||
| // Models 5 and 6 under this case are reported as HASWELL (SANDYBRIDGE when | |||
| // built with NO_AVX2) if AVX is usable, otherwise as NEHALEM. | |||
| case 10: | |||
| switch (model) { | |||
| case 5: // Comet Lake H and S | |||
| case 6: // Comet Lake U | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CORE_HASWELL; | |||
| #else | |||
| return CORE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| // NOTE(review): there is no break after this inner switch, so any other model | |||
| // value falls through into the following "case 5:" arm - confirm this is intended. | |||
| case 5: | |||
| switch (model) { | |||
| case 6: | |||
| @@ -1970,7 +2030,9 @@ int get_coretype(void){ | |||
| case 5: | |||
| // Skylake X | |||
| #ifndef NO_AVX512 | |||
| return CORE_SKYLAKEX; | |||
| if(support_avx512_bf16()) | |||
| return CORE_COOPERLAKE; | |||
| return CORE_SKYLAKEX; | |||
| #else | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| @@ -2083,7 +2145,7 @@ int get_coretype(void){ | |||
| return CORE_PILEDRIVER; | |||
| else | |||
| return CORE_BARCELONA; //OS don't support AVX. | |||
| case 5: // New EXCAVATOR | |||
| case 5: // New EXCAVATOR | |||
| if(support_avx()) | |||
| return CORE_EXCAVATOR; | |||
| else | |||
| @@ -2111,12 +2173,14 @@ int get_coretype(void){ | |||
| } | |||
| break; | |||
| } | |||
| } else if (exfamily == 8) { | |||
| } else if (exfamily == 8 || exfamily == 10) { | |||
| switch (model) { | |||
| case 1: | |||
| // AMD Ryzen | |||
| case 8: | |||
| // Ryzen 2 | |||
| // Ryzen 2 | |||
| default: | |||
| // Matisse,Renoir Ryzen2 models | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CORE_ZEN; | |||
| @@ -2237,6 +2301,7 @@ void get_cpuconfig(void){ | |||
| if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); | |||
| if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n"); | |||
| if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n"); | |||
| if (features & HAVE_AVX512BF16 ) printf("#define HAVE_AVX512BF16\n"); | |||
| if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); | |||
| if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); | |||
| if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); | |||
| @@ -2307,6 +2372,7 @@ void get_sse(void){ | |||
| if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); | |||
| if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n"); | |||
| if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n"); | |||
| if (features & HAVE_AVX512BF16 ) printf("HAVE_AVX512BF16=1\n"); | |||
| if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); | |||
| if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); | |||
| if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); | |||
| @@ -153,3 +153,6 @@ ARCH_ARM | |||
| ARCH_ARM64 | |||
| #endif | |||
| #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) | |||
| HAVE_C11 | |||
| #endif | |||
| @@ -19,7 +19,10 @@ ifeq ($(ARCH), MIPS) | |||
| USE_GEMM3M = 1 | |||
| endif | |||
| # The shgemm objects are added to SHBLASOBJS only when BUILD_HALF is set to 1 | |||
| ifeq ($(BUILD_HALF),1) | |||
| SHBLASOBJS += shgemm_nn.$(SUFFIX) shgemm_nt.$(SUFFIX) shgemm_tn.$(SUFFIX) shgemm_tt.$(SUFFIX) | |||
| endif | |||
| SBLASOBJS += \ | |||
| sgemm_nn.$(SUFFIX) sgemm_nt.$(SUFFIX) sgemm_tn.$(SUFFIX) sgemm_tt.$(SUFFIX) \ | |||
| strmm_LNUU.$(SUFFIX) strmm_LNUN.$(SUFFIX) strmm_LNLU.$(SUFFIX) strmm_LNLN.$(SUFFIX) \ | |||
| @@ -204,8 +207,9 @@ COMMONOBJS += gemm_thread_m.$(SUFFIX) gemm_thread_n.$(SUFFIX) gemm_thread_mn.$( | |||
| COMMONOBJS += syrk_thread.$(SUFFIX) | |||
| ifndef USE_SIMPLE_THREADED_LEVEL3 | |||
| ifeq ($(BUILD_HALF),1) | |||
| SHBLASOBJS += shgemm_thread_nn.$(SUFFIX) shgemm_thread_nt.$(SUFFIX) shgemm_thread_tn.$(SUFFIX) shgemm_thread_tt.$(SUFFIX) | |||
| endif | |||
| SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX) | |||
| DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) | |||
| QBLASOBJS += qgemm_thread_nn.$(SUFFIX) qgemm_thread_nt.$(SUFFIX) qgemm_thread_tn.$(SUFFIX) qgemm_thread_tt.$(SUFFIX) | |||
| @@ -333,7 +333,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| #else | |||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
| min_jj = min_j + js - jjs; | |||
| #ifdef SKYLAKEX | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -91,7 +91,7 @@ | |||
| #endif | |||
| typedef struct { | |||
| #if __STDC_VERSION__ >= 201112L | |||
| #ifdef HAVE_C11 | |||
| _Atomic | |||
| #else | |||
| volatile | |||
| @@ -67,7 +67,7 @@ | |||
| #endif | |||
| typedef struct { | |||
| #if __STDC_VERSION__ >= 201112L | |||
| #ifdef HAVE_C11 | |||
| _Atomic | |||
| #else | |||
| volatile | |||
| @@ -367,7 +367,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| /* Split local region of B into parts */ | |||
| for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){ | |||
| min_jj = MIN(n_to, js + div_n) - jjs; | |||
| #ifdef SKYLAKEX | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -135,7 +135,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
| min_jj = min_j + js - jjs; | |||
| #ifdef SKYLAKEX | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -205,7 +205,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
| min_jj = min_j + js - jjs; | |||
| #ifdef SKYLAKEX | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -300,7 +300,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
| min_jj = min_j + js - jjs; | |||
| #ifdef SKYLAKEX | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -370,7 +370,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
| min_jj = min_j + js - jjs; | |||
| #ifdef SKYLAKEX | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -122,7 +122,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = 0; jjs < ls - js; jjs += min_jj){ | |||
| min_jj = ls - js - jjs; | |||
| #ifdef SKYLAKEX | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -146,7 +146,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = 0; jjs < min_l; jjs += min_jj){ | |||
| min_jj = min_l - jjs; | |||
| #ifdef SKYLAKEX | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -203,7 +203,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
| min_jj = min_j + js - jjs; | |||
| #ifdef SKYLAKEX | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -258,7 +258,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = 0; jjs < min_l; jjs += min_jj){ | |||
| min_jj = min_l - jjs; | |||
| #ifdef SKYLAKEX | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -283,7 +283,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){ | |||
| min_jj = js - ls - min_l - jjs; | |||
| #ifdef SKYLAKEX | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -344,7 +344,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
| min_jj = min_j + js - jjs; | |||
| #ifdef SKYLAKEX | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -47,8 +47,10 @@ endif | |||
| endif | |||
| ifdef USE_CUDA | |||
| ifeq ($(USE_CUDA), 1) | |||
| COMMONOBJS += cuda_init.$(SUFFIX) | |||
| endif | |||
| endif | |||
| ifdef FUNCTION_PROFILE | |||
| COMMONOBJS += profile.$(SUFFIX) | |||
| @@ -141,7 +141,7 @@ typedef struct { | |||
| } thread_status_t; | |||
| #if (__STDC_VERSION__ >= 201112L) | |||
| #ifdef HAVE_C11 | |||
| #define atomic_load_queue(p) __atomic_load_n(p, __ATOMIC_RELAXED) | |||
| #define atomic_store_queue(p, v) __atomic_store_n(p, v, __ATOMIC_RELAXED) | |||
| #else | |||
| @@ -272,7 +272,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| } | |||
| } | |||
| #if defined(OS_LINUX) && !defined(NO_AFFINITY) | |||
| #if defined(OS_LINUX) && !defined(NO_AFFINITY) | |||
| int gotoblas_set_affinity(int); | |||
| int gotoblas_set_affinity2(int); | |||
| int get_node(void); | |||
| @@ -281,6 +281,8 @@ int get_node(void); | |||
| static int increased_threads = 0; | |||
| #ifdef OS_LINUX | |||
| extern int openblas_get_num_threads(void); | |||
| int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) { | |||
| const int active_threads = openblas_get_num_threads(); | |||
| @@ -602,7 +604,7 @@ int blas_thread_init(void){ | |||
| if(ret!=0){ | |||
| struct rlimit rlim; | |||
| const char *msg = strerror(ret); | |||
| fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %ld: %s\n", i+1,blas_num_threads,msg); | |||
| fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %d: %s\n", i+1,blas_num_threads,msg); | |||
| #ifdef RLIMIT_NPROC | |||
| if(0 == getrlimit(RLIMIT_NPROC, &rlim)) { | |||
| fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC " | |||
| @@ -55,7 +55,7 @@ | |||
| int blas_server_avail = 0; | |||
| static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER]; | |||
| #if __STDC_VERSION__ >= 201112L | |||
| #ifdef HAVE_C11 | |||
| static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; | |||
| #else | |||
| static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; | |||
| @@ -320,7 +320,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
| while(true) { | |||
| for(i=0; i < MAX_PARALLEL_NUMBER; i++) { | |||
| #if __STDC_VERSION__ >= 201112L | |||
| #ifdef HAVE_C11 | |||
| _Bool inuse = false; | |||
| if(atomic_compare_exchange_weak(&blas_buffer_inuse[i], &inuse, true)) { | |||
| #else | |||
| @@ -335,7 +335,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
| break; | |||
| } | |||
| #pragma omp parallel for schedule(OMP_SCHED) | |||
| #pragma omp parallel for num_threads(num) schedule(OMP_SCHED) | |||
| for (i = 0; i < num; i ++) { | |||
| #ifndef USE_SIMPLE_THREADED_LEVEL3 | |||
| @@ -345,7 +345,7 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
| exec_threads(&queue[i], buf_index); | |||
| } | |||
| #if __STDC_VERSION__ >= 201112L | |||
| #ifdef HAVE_C11 | |||
| atomic_store(&blas_buffer_inuse[buf_index], false); | |||
| #else | |||
| blas_buffer_inuse[buf_index] = false; | |||
| @@ -332,7 +332,7 @@ int support_avx512(){ | |||
| if((ebx & (1<<7)) == 0){ | |||
| ret=0; //OS does not even support AVX2 | |||
| } | |||
| if((ebx & (1<<31)) != 0){ | |||
| if((ebx & (1u<<31)) != 0){ | |||
| xgetbv(0, &eax, &edx); | |||
| if((eax & 0xe0) == 0xe0) | |||
| ret=1; //OS supports AVX512VL | |||
| @@ -618,6 +618,18 @@ static gotoblas_t *get_coretype(void){ | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| case 10: | |||
| if (model == 5 || model == 6) { | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| return NULL; | |||
| } | |||
| case 0xf: | |||
| @@ -632,7 +644,7 @@ static gotoblas_t *get_coretype(void){ | |||
| cpuid(0x80000000, &eax, &ebx, &ecx, &edx); | |||
| if ( (eax & 0xffff) >= 0x01) { | |||
| cpuid(0x80000001, &eax, &ebx, &ecx, &edx); | |||
| if ((edx & (1 << 30)) == 0 || (edx & (1 << 31)) == 0) | |||
| if ((edx & (1 << 30)) == 0 || (edx & (1u << 31)) == 0) | |||
| return NULL; | |||
| } | |||
| else | |||
| @@ -644,7 +656,7 @@ static gotoblas_t *get_coretype(void){ | |||
| if ((exfamily == 0) || (exfamily == 2)) { | |||
| if (ecx & (1 << 0)) return &gotoblas_OPTERON_SSE3; | |||
| else return &gotoblas_OPTERON; | |||
| } else if (exfamily == 5) { | |||
| } else if (exfamily == 5 || exfamily == 7) { | |||
| return &gotoblas_BOBCAT; | |||
| } else if (exfamily == 6) { | |||
| if(model == 1){ | |||
| @@ -698,7 +710,7 @@ static gotoblas_t *get_coretype(void){ | |||
| } | |||
| } | |||
| } else if (exfamily == 8) { | |||
| if (model == 1 || model == 8) { | |||
| /* if (model == 1 || model == 8) */ { | |||
| if(support_avx()) | |||
| return &gotoblas_ZEN; | |||
| else{ | |||
| @@ -706,16 +718,24 @@ static gotoblas_t *get_coretype(void){ | |||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| } else if (exfamily == 9) { | |||
| } else if (exfamily == 9) { | |||
| if(support_avx()) | |||
| return &gotoblas_ZEN; | |||
| else{ | |||
| openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); | |||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| } else if (exfamily == 10) { | |||
| if(support_avx()) | |||
| return &gotoblas_ZEN; | |||
| else{ | |||
| openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); | |||
| return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| }else { | |||
| return &gotoblas_BARCELONA; | |||
| } | |||
| } | |||
| } | |||
| @@ -764,18 +784,53 @@ char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_NORTHWOOD) return corename[ 3]; | |||
| if (gotoblas == &gotoblas_PRESCOTT) return corename[ 4]; | |||
| if (gotoblas == &gotoblas_BANIAS) return corename[ 5]; | |||
| if (gotoblas == &gotoblas_ATOM) return corename[ 6]; | |||
| if (gotoblas == &gotoblas_ATOM) | |||
| #ifdef DYNAMIC_OLDER | |||
| return corename[ 6]; | |||
| #else | |||
| return corename[10]; | |||
| #endif | |||
| if (gotoblas == &gotoblas_CORE2) return corename[ 7]; | |||
| if (gotoblas == &gotoblas_PENRYN) return corename[ 8]; | |||
| if (gotoblas == &gotoblas_DUNNINGTON) return corename[ 9]; | |||
| if (gotoblas == &gotoblas_PENRYN) | |||
| #ifdef DYNAMIC_OLDER | |||
| return corename[ 8]; | |||
| #else | |||
| return corename[7]; | |||
| #endif | |||
| if (gotoblas == &gotoblas_DUNNINGTON) | |||
| #ifdef DYNAMIC_OLDER | |||
| return corename[ 9]; | |||
| #else | |||
| return corename[7]; | |||
| #endif | |||
| if (gotoblas == &gotoblas_NEHALEM) return corename[10]; | |||
| if (gotoblas == &gotoblas_ATHLON) return corename[11]; | |||
| if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12]; | |||
| if (gotoblas == &gotoblas_OPTERON) return corename[13]; | |||
| if (gotoblas == &gotoblas_OPTERON_SSE3) | |||
| #ifdef DYNAMIC_OLDER | |||
| return corename[12]; | |||
| #else | |||
| return corename[7]; | |||
| #endif | |||
| if (gotoblas == &gotoblas_OPTERON) | |||
| #ifdef DYNAMIC_OLDER | |||
| return corename[13]; | |||
| #else | |||
| return corename[7]; | |||
| #endif | |||
| if (gotoblas == &gotoblas_BARCELONA) return corename[14]; | |||
| if (gotoblas == &gotoblas_NANO) return corename[15]; | |||
| if (gotoblas == &gotoblas_NANO) | |||
| #ifdef DYNAMIC_OLDER | |||
| return corename[15]; | |||
| #else | |||
| return corename[10]; | |||
| #endif | |||
| if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; | |||
| if (gotoblas == &gotoblas_BOBCAT) return corename[17]; | |||
| if (gotoblas == &gotoblas_BOBCAT) | |||
| #ifdef DYNAMIC_OLDER | |||
| return corename[17]; | |||
| #else | |||
| return corename[7]; | |||
| #endif | |||
| if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; | |||
| if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; | |||
| if (gotoblas == &gotoblas_HASWELL) return corename[20]; | |||
| @@ -787,6 +842,7 @@ char *gotoblas_corename(void) { | |||
| } | |||
| static gotoblas_t *force_coretype(char *coretype){ | |||
| int i ; | |||
| @@ -53,10 +53,11 @@ extern gotoblas_t gotoblas_THUNDERX2T99; | |||
| extern gotoblas_t gotoblas_TSV110; | |||
| extern gotoblas_t gotoblas_EMAG8180; | |||
| extern gotoblas_t gotoblas_NEOVERSEN1; | |||
| extern gotoblas_t gotoblas_THUNDERX3T110; | |||
| extern void openblas_warning(int verbose, const char * msg); | |||
| #define NUM_CORETYPES 11 | |||
| #define NUM_CORETYPES 12 | |||
| /* | |||
| * In case asm/hwcap.h is outdated on the build system, make sure | |||
| @@ -82,6 +83,7 @@ static char *corename[] = { | |||
| "tsv110", | |||
| "emag8180", | |||
| "neoversen1", | |||
| "thunderx3t110", | |||
| "unknown" | |||
| }; | |||
| @@ -97,6 +99,7 @@ char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_TSV110) return corename[ 8]; | |||
| if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; | |||
| if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; | |||
| if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11]; | |||
| return corename[NUM_CORETYPES]; | |||
| } | |||
| @@ -127,6 +130,7 @@ static gotoblas_t *force_coretype(char *coretype) { | |||
| case 8: return (&gotoblas_TSV110); | |||
| case 9: return (&gotoblas_EMAG8180); | |||
| case 10: return (&gotoblas_NEOVERSEN1); | |||
| case 11: return (&gotoblas_THUNDERX3T110); | |||
| } | |||
| snprintf(message, 128, "Core not found: %s\n", coretype); | |||
| openblas_warning(1, message); | |||
| @@ -190,6 +194,8 @@ static gotoblas_t *get_coretype(void) { | |||
| return &gotoblas_THUNDERX; | |||
| case 0x0af: // ThunderX2 | |||
| return &gotoblas_THUNDERX2T99; | |||
| case 0x0b8: // ThunderX3 | |||
| return &gotoblas_THUNDERX3T110; | |||
| } | |||
| break; | |||
| case 0x48: // HiSilicon | |||
| @@ -6,6 +6,13 @@ extern gotoblas_t gotoblas_POWER8; | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | |||
| extern gotoblas_t gotoblas_POWER9; | |||
| #endif | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 11) \ | |||
| || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) | |||
| #define HAVE_P10_SUPPORT 1 | |||
| #endif | |||
| #ifdef HAVE_P10_SUPPORT | |||
| extern gotoblas_t gotoblas_POWER10; | |||
| #endif | |||
| extern void openblas_warning(int verbose, const char *msg); | |||
| @@ -13,7 +20,8 @@ static char *corename[] = { | |||
| "unknown", | |||
| "POWER6", | |||
| "POWER8", | |||
| "POWER9" | |||
| "POWER9", | |||
| "POWER10" | |||
| }; | |||
| #define NUM_CORETYPES 4 | |||
| @@ -23,6 +31,9 @@ char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_POWER8) return corename[2]; | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | |||
| if (gotoblas == &gotoblas_POWER9) return corename[3]; | |||
| #endif | |||
| #ifdef HAVE_P10_SUPPORT | |||
| if (gotoblas == &gotoblas_POWER10) return corename[4]; | |||
| #endif | |||
| return corename[0]; | |||
| } | |||
| @@ -36,6 +47,10 @@ static gotoblas_t *get_coretype(void) { | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | |||
| if (__builtin_cpu_is("power9")) | |||
| return &gotoblas_POWER9; | |||
| #endif | |||
| #ifdef HAVE_P10_SUPPORT | |||
| if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")) | |||
| return &gotoblas_POWER10; | |||
| #endif | |||
| return NULL; | |||
| } | |||
| @@ -61,6 +76,9 @@ static gotoblas_t *force_coretype(char * coretype) { | |||
| case 2: return (&gotoblas_POWER8); | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | |||
| case 3: return (&gotoblas_POWER9); | |||
| #endif | |||
| #ifdef HAVE_P10_SUPPORT | |||
| case 4: return (&gotoblas_POWER10); | |||
| #endif | |||
| default: return NULL; | |||
| } | |||
| @@ -1,12 +1,58 @@ | |||
| #include "common.h" | |||
| #include <stdbool.h> | |||
| // Gate kernels for z13 and z14 on gcc version | |||
| #if (__GNUC__ == 5 && __GNUC_MINOR__ >= 2) || __GNUC__ >= 6 || \ | |||
| /* RHEL 7 since 7.3: */ \ | |||
| (__GNUC__ == 4 && __GNUC_MINOR__ == 8 && __GNUC_PATCHLEVEL__ == 5 && \ | |||
| __GNUC_RH_RELEASE__ >= 11) | |||
| #define HAVE_Z13_SUPPORT | |||
| #endif | |||
| #if __GNUC__ >= 7 | |||
| #define HAVE_Z14_SUPPORT | |||
| #endif | |||
| // Guard the use of getauxval() on glibc version >= 2.16 | |||
| #ifdef __GLIBC__ | |||
| #include <features.h> | |||
| #if __GLIBC_PREREQ(2, 16) | |||
| #include <sys/auxv.h> | |||
| #define HAVE_GETAUXVAL 1 | |||
| static unsigned long get_hwcap(void) | |||
| { | |||
| unsigned long hwcap = getauxval(AT_HWCAP); | |||
| char *maskenv; | |||
| // honor requests for not using specific CPU features in LD_HWCAP_MASK | |||
| maskenv = getenv("LD_HWCAP_MASK"); | |||
| if (maskenv) | |||
| hwcap &= strtoul(maskenv, NULL, 0); | |||
| return hwcap; | |||
| // note that a missing auxval is interpreted as no capabilities | |||
| // available, which is safe. | |||
| } | |||
| #else // __GLIBC_PREREQ(2, 16) | |||
| #warn "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16" | |||
| static unsigned long get_hwcap(void) { | |||
| // treat missing support for getauxval() as no capabilities available, | |||
| // which is safe. | |||
| return 0; | |||
| } | |||
| #endif // __GLIBC_PREREQ(2, 16) | |||
| #endif // __GLIBC | |||
| extern gotoblas_t gotoblas_ZARCH_GENERIC; | |||
| #ifdef HAVE_Z13_SUPPORT | |||
| extern gotoblas_t gotoblas_Z13; | |||
| #endif | |||
| #ifdef HAVE_Z14_SUPPORT | |||
| extern gotoblas_t gotoblas_Z14; | |||
| //extern gotoblas_t gotoblas_Z15; | |||
| //#if (!defined C_GCC) || (GCC_VERSION >= 60000) | |||
| //extern gotoblas_t gotoblas_Z14; | |||
| //#endif | |||
| #endif | |||
| #define NUM_CORETYPES 4 | |||
| @@ -16,47 +62,50 @@ static char* corename[] = { | |||
| "unknown", | |||
| "Z13", | |||
| "Z14", | |||
| // "Z15", | |||
| "ZARCH_GENERIC", | |||
| }; | |||
| char* gotoblas_corename(void) { | |||
| #ifdef HAVE_Z13_SUPPORT | |||
| if (gotoblas == &gotoblas_Z13) return corename[1]; | |||
| #endif | |||
| #ifdef HAVE_Z14_SUPPORT | |||
| if (gotoblas == &gotoblas_Z14) return corename[2]; | |||
| // if (gotoblas == &gotoblas_Z15) return corename[3]; | |||
| //#if (!defined C_GCC) || (GCC_VERSION >= 60000) | |||
| // if (gotoblas == &gotoblas_POWER9) return corename[3]; | |||
| //#endif | |||
| return corename[0]; // try generic? | |||
| #endif | |||
| if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3]; | |||
| return corename[0]; | |||
| } | |||
| // __builtin_cpu_is is not supported by zarch | |||
| /** | |||
| * Detect the fitting set of kernels by retrieving the CPU features supported by | |||
| * OS from the auxiliary value AT_HWCAP and choosing the set of kernels | |||
| * ("coretype") that exploits most of the features and can be compiled with the | |||
| * available gcc version. | |||
| * Note that we cannot use vector registers on a z13 or newer unless supported | |||
| * by the OS kernel (which needs to handle them properly during context switch). | |||
| */ | |||
| static gotoblas_t* get_coretype(void) { | |||
| FILE* infile; | |||
| char buffer[512], * p; | |||
| p = (char*)NULL; | |||
| infile = fopen("/proc/sysinfo", "r"); | |||
| while (fgets(buffer, sizeof(buffer), infile)) { | |||
| if (!strncmp("Type", buffer, 4)) { | |||
| p = strchr(buffer, ':') + 2; | |||
| #if 0 | |||
| fprintf(stderr, "%s\n", p); | |||
| #endif | |||
| break; | |||
| } | |||
| } | |||
| fclose(infile); | |||
| unsigned long hwcap __attribute__((unused)) = get_hwcap(); | |||
| if (strstr(p, "2964")) return &gotoblas_Z13; | |||
| if (strstr(p, "2965")) return &gotoblas_Z13; | |||
| if (strstr(p, "3906")) return &gotoblas_Z14; | |||
| if (strstr(p, "3907")) return &gotoblas_Z14; | |||
| if (strstr(p, "8561")) return &gotoblas_Z14; // fallback z15 to z14 | |||
| if (strstr(p, "8562")) return &gotoblas_Z14; // fallback z15 to z14 | |||
| // z14 and z15 systems: exploit Vector Facility (SIMD) and | |||
| // Vector-Enhancements Facility 1 (float SIMD instructions), if present. | |||
| #ifdef HAVE_Z14_SUPPORT | |||
| if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) | |||
| return &gotoblas_Z14; | |||
| #endif | |||
| // z13: Vector Facility (SIMD for double) | |||
| #ifdef HAVE_Z13_SUPPORT | |||
| if (hwcap & HWCAP_S390_VX) | |||
| return &gotoblas_Z13; | |||
| #endif | |||
| return NULL; // should be ZARCH_GENERIC | |||
| // fallback in case of missing compiler support, systems before z13, or | |||
| // when the OS does not advertise support for the Vector Facility (e.g., | |||
| // missing support in the OS kernel) | |||
| return &gotoblas_ZARCH_GENERIC; | |||
| } | |||
| static gotoblas_t* force_coretype(char* coretype) { | |||
| @@ -76,12 +125,13 @@ static gotoblas_t* force_coretype(char* coretype) { | |||
| switch (found) | |||
| { | |||
| #ifdef HAVE_Z13_SUPPORT | |||
| case 1: return (&gotoblas_Z13); | |||
| #endif | |||
| #ifdef HAVE_Z14_SUPPORT | |||
| case 2: return (&gotoblas_Z14); | |||
| // case 3: return (&gotoblas_Z15); | |||
| //#if (!defined C_GCC) || (GCC_VERSION >= 60000) | |||
| // case 3: return (&gotoblas_POWER9); | |||
| //#endif | |||
| #endif | |||
| case 3: return (&gotoblas_ZARCH_GENERIC); | |||
| default: return NULL; | |||
| } | |||
| snprintf(message, 128, "Core not found: %s\n", coretype); | |||
| @@ -109,9 +159,9 @@ void gotoblas_dynamic_init(void) { | |||
| if (gotoblas == NULL) | |||
| { | |||
| snprintf(coremsg, 128, "Falling back to Z14 core\n"); | |||
| snprintf(coremsg, 128, "Failed to detect system, falling back to generic z support.\n"); | |||
| openblas_warning(1, coremsg); | |||
| gotoblas = &gotoblas_Z14; | |||
| gotoblas = &gotoblas_ZARCH_GENERIC; | |||
| } | |||
| if (gotoblas && gotoblas->init) { | |||
| @@ -1095,7 +1095,7 @@ static BLASULONG base_address = 0UL; | |||
| static BLASULONG base_address = BASE_ADDRESS; | |||
| #endif | |||
| #if __STDC_VERSION__ >= 201112L | |||
| #ifdef HAVE_C11 | |||
| static _Atomic int memory_initialized = 0; | |||
| #else | |||
| static volatile int memory_initialized = 0; | |||
| @@ -2070,7 +2070,7 @@ if (!release->address) return; | |||
| if (munmap(release -> address, BUFFER_SIZE)) { | |||
| int errsv=errno; | |||
| perror("OpenBLAS : munmap failed:"); | |||
| printf("error code=%d,\trelease->address=%lx\n",errsv,release->address); | |||
| printf("error code=%d,\trelease->address=%p\n",errsv,release->address); | |||
| } | |||
| } | |||
| @@ -180,9 +180,10 @@ int get_L2_size(void){ | |||
| int eax, ebx, ecx, edx; | |||
| #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ | |||
| defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | |||
| defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ | |||
| defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX) | |||
| defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | |||
| defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ | |||
| defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || \ | |||
| defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| cpuid(0x80000006, &eax, &ebx, &ecx, &edx); | |||
| @@ -266,7 +267,9 @@ int get_L2_size(void){ | |||
| void blas_set_parameter(void){ | |||
| int factor; | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX) | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || \ | |||
| defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || \ | |||
| defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| int size = 16; | |||
| #else | |||
| int size = get_L2_size(); | |||
| @@ -30,6 +30,10 @@ ifndef BUILD_LAPACK_DEPRECATED | |||
| BUILD_LAPACK_DEPRECATED = 0 | |||
| endif | |||
| ifndef BUILD_HALF | |||
| BUILD_HALF = 0 | |||
| endif | |||
| ifeq ($(OSNAME), WINNT) | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| ifndef ONLY_CBLAS | |||
| @@ -51,6 +55,10 @@ endif | |||
| endif | |||
| endif | |||
| ifeq ($(C_COMPILER), PGI) | |||
| EXTRALIB += -pgf90libs | |||
| endif | |||
| ifneq (,$(filter 1 2,$(NOFORTRAN))) | |||
| FEXTRALIB = | |||
| endif | |||
| @@ -151,8 +159,12 @@ ifeq ($(F_COMPILER), INTEL) | |||
| -Wl,--whole-archive $< -Wl,--no-whole-archive \ | |||
| -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) | |||
| $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | |||
| else ifeq ($(F_COMPILER), FLANG) | |||
| $(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | |||
| -Wl,--whole-archive $< -Wl,--no-whole-archive \ | |||
| -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) | |||
| $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | |||
| else | |||
| ifneq ($(C_COMPILER), LSB) | |||
| $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | |||
| -Wl,--whole-archive $< -Wl,--no-whole-archive \ | |||
| @@ -234,23 +246,23 @@ static : ../$(LIBNAME) | |||
| rm -f goto.$(SUFFIX) | |||
| osx.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) | |||
| perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) | |||
| aix.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) | |||
| perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) | |||
| objcopy.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) | |||
| perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) | |||
| objconv.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) | |||
| perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) | |||
| test : linktest.c | |||
| $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. | |||
| rm -f linktest | |||
| linktest.c : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > linktest.c | |||
| perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > linktest.c | |||
| clean :: | |||
| @rm -f *.def *.dylib __.SYMDEF* *.renamed | |||
| @@ -30,7 +30,7 @@ | |||
| icamax,icamin,idamax,idamin,idmax,idmin,isamax,isamin,ismax,ismin, | |||
| izamax,izamin,lsame,samax,samin,sasum,saxpy,scabs1,scamax, | |||
| scamin,scasum,scnrm2,scopy,sdot,sdsdot,sgbmv,sgemm,sgemv,sger, | |||
| shgemm, smax,smin,snrm2, | |||
| smax,smin,snrm2, | |||
| srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap, | |||
| ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv, | |||
| strmm,strmv,strsm,strsv,zaxpy,zcopy,zdotc,zdotu,zdrot, | |||
| @@ -40,17 +40,13 @@ | |||
| ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, | |||
| xerbla, | |||
| saxpby,daxpby,caxpby,zaxpby, | |||
| somatcopy, domatcopy, comatcopy, zomatcopy, | |||
| simatcopy, dimatcopy, cimatcopy, zimatcopy, | |||
| sgeadd,dgeadd,cgeadd,zgeadd, | |||
| somatcopy, | |||
| simatcopy, | |||
| domatcopy, | |||
| dimatcopy, | |||
| comatcopy, | |||
| cimatcopy, | |||
| zomatcopy, | |||
| zimatcopy, | |||
| ssum, dsum, scsum, dzsum | |||
| ); | |||
| @halfblasobjs = (shgemm); | |||
| @cblasobjs = ( | |||
| cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, | |||
| cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, | |||
| @@ -67,7 +63,7 @@ | |||
| cblas_isamax, cblas_izamax, | |||
| cblas_sasum, cblas_saxpy, | |||
| cblas_scasum, cblas_scnrm2, cblas_scopy, cblas_sdot, cblas_sdsdot, cblas_sgbmv, cblas_sgemm, | |||
| cblas_sgemv, cblas_sger, cblas_shgemm, cblas_snrm2, cblas_srot, cblas_srotg, | |||
| cblas_sgemv, cblas_sger, cblas_snrm2, cblas_srot, cblas_srotg, | |||
| cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr, | |||
| cblas_sswap, cblas_ssymm, cblas_ssymv, cblas_ssyr2, cblas_ssyr2k, cblas_ssyr, cblas_ssyrk, | |||
| cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm, | |||
| @@ -80,9 +76,16 @@ | |||
| cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby, | |||
| cblas_somatcopy, cblas_domatcopy, cblas_comatcopy, cblas_zomatcopy, | |||
| cblas_simatcopy, cblas_dimatcopy, cblas_cimatcopy, cblas_zimatcopy, | |||
| cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd | |||
| cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd, | |||
| cblas_isamin, cblas_idamin, cblas_icamin, cblas_izamin, | |||
| cblas_ismin, cblas_idmin, cblas_icmin, cblas_izmin, | |||
| cblas_ismax, cblas_idmax, cblas_icmax, cblas_izmax, | |||
| cblas_ssum, cblas_dsum, cblas_scsum, cblas_dzsum, | |||
| cblas_xerbla | |||
| ); | |||
| @halfcblasobjs = (cblas_shgemm); | |||
| @exblasobjs = ( | |||
| qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, | |||
| qgemv,qger,qmax,qmin, | |||
| @@ -3454,6 +3457,10 @@ use File::Spec; | |||
| use File::Basename; | |||
| my $dirname = File::Spec->catfile(dirname(dirname(File::Spec->rel2abs(__FILE__))), "lapack-netlib"); | |||
| if ($ARGV[12] == 1) { | |||
| @blasobjs = (@blasobjs, @halfblasobjs); | |||
| @cblasobjs = (@cblasobjs, @halfcblasobjs); | |||
| } | |||
| if ($ARGV[8] == 1) { | |||
| #ONLY_CBLAS=1 | |||
| @underscore_objs = (@misc_underscore_objs); | |||
| @@ -3494,9 +3501,12 @@ if ($ARGV[1] eq "x86") { @underscore_objs = (@underscore_objs, @gemm3mobjs); | |||
| if ($ARGV[1] eq "ia64") { @underscore_objs = (@underscore_objs, @gemm3mobjs); }; | |||
| if ($ARGV[1] eq "MIPS") { @underscore_objs = (@underscore_objs, @gemm3mobjs); }; | |||
| if ($ARGV[4] == 0) { | |||
| @no_underscore_objs = (@cblasobjs, @misc_no_underscore_objs); | |||
| if ($ARGV[1] eq "x86_64") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); }; | |||
| if ($ARGV[1] eq "x86") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); }; | |||
| if ($ARGV[1] eq "ia64") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); }; | |||
| if ($ARGV[1] eq "MIPS") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); }; | |||
| }else{ | |||
| #NO_CBLAS=1 | |||
| @no_underscore_objs = (@misc_no_underscore_objs); | |||
| @@ -82,6 +82,9 @@ if ($compiler eq "") { | |||
| if ($compiler =~ /flang/) { | |||
| $vendor = FLANG; | |||
| $openmp = "-fopenmp"; | |||
| } elsif ($compiler =~ /pgf/) { | |||
| $vendor = PGI; | |||
| $openmp = "-mp"; | |||
| } else { | |||
| $vendor = G77; | |||
| $openmp = ""; | |||
| @@ -334,7 +337,8 @@ if ($link ne "") { | |||
| && ($flags !~ /kernel32/) | |||
| && ($flags !~ /advapi32/) | |||
| && ($flags !~ /shell32/) | |||
| && ($flags !~ /omp/) | |||
| && ($flags !~ /omp/ || ($vendor !~ /PGI/ && $flags =~ /omp/)) | |||
| && ($flags !~ /[0-9]+/) | |||
| && ($flags !~ /^\-l$/) | |||
| ) { | |||
| $linker_l .= $flags . " "; | |||
| @@ -90,11 +90,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include <sys/sysinfo.h> | |||
| #include <unistd.h> | |||
| #endif | |||
| #if defined(AIX) | |||
| #include <sys/sysinfo.h> | |||
| #endif | |||
| #if defined(__x86_64__) || defined(_M_X64) | |||
| #if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX2__)) || (defined(__clang__) && __clang_major__ >= 6)) | |||
| #else | |||
| #define NO_AVX512 | |||
| #endif | |||
| #endif | |||
| /* #define FORCE_P2 */ | |||
| /* #define FORCE_KATMAI */ | |||
| /* #define FORCE_COPPERMINE */ | |||
| @@ -360,6 +365,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #endif | |||
| #ifdef FORCE_COOPERLAKE | |||
| #ifdef NO_AVX512 | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| #define ARCHITECTURE "X86" | |||
| #define SUBARCHITECTURE "HASWELL" | |||
| #define ARCHCONFIG "-DHASWELL " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ | |||
| "-DFMA3" | |||
| #define LIBNAME "haswell" | |||
| #define CORENAME "HASWELL" | |||
| #else | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| #define ARCHITECTURE "X86" | |||
| #define SUBARCHITECTURE "COOPERLAKE" | |||
| #define ARCHCONFIG "-DCOOPERLAKE " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ | |||
| "-DFMA3 -DHAVE_AVX512VL -DHAVE_AVX512BF16 -march=cooperlake" | |||
| #define LIBNAME "cooperlake" | |||
| #define CORENAME "COOPERLAKE" | |||
| #endif | |||
| #endif | |||
| #ifdef FORCE_ATOM | |||
| #define FORCE | |||
| #define FORCE_INTEL | |||
| @@ -650,6 +685,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "POWER9" | |||
| #endif | |||
| #if defined(FORCE_POWER10) | |||
| #define FORCE | |||
| #define ARCHITECTURE "POWER" | |||
| #define SUBARCHITECTURE "POWER10" | |||
| #define SUBDIRNAME "power" | |||
| #define ARCHCONFIG "-DPOWER10 " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ | |||
| "-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||
| #define LIBNAME "power10" | |||
| #define CORENAME "POWER10" | |||
| #endif | |||
| #ifdef FORCE_PPCG4 | |||
| #define FORCE | |||
| #define ARCHITECTURE "POWER" | |||
| @@ -1156,6 +1204,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "EMAG8180" | |||
| #endif | |||
| #ifdef FORCE_THUNDERX3T110 | |||
| #define ARMV8 | |||
| #define FORCE | |||
| #define ARCHITECTURE "ARM64" | |||
| #define SUBARCHITECTURE "THUNDERX3T110" | |||
| #define SUBDIRNAME "arm64" | |||
| #define ARCHCONFIG "-DTHUNDERX3T110 " \ | |||
| "-DL1_CODE_SIZE=65536 -DL1_CODE_LINESIZE=64 -DL1_CODE_ASSOCIATIVE=8 " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 -DL1_DATA_ASSOCIATIVE=8 " \ | |||
| "-DL2_SIZE=524288 -DL2_LINESIZE=64 -DL2_ASSOCIATIVE=8 " \ | |||
| "-DL3_SIZE=94371840 -DL3_LINESIZE=64 -DL3_ASSOCIATIVE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ | |||
| "-DHAVE_VFPV4 -DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON -DARMV8" | |||
| #define LIBNAME "thunderx3t110" | |||
| #define CORENAME "THUNDERX3T110" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_ZARCH_GENERIC | |||
| #define FORCE | |||
| #define ARCHITECTURE "ZARCH" | |||
| @@ -1284,6 +1350,11 @@ static int get_num_cores(void) { | |||
| sysctl(m, 2, &count, &len, NULL, 0); | |||
| return count; | |||
| #elif defined(AIX) | |||
| //returns the number of processors which are currently online | |||
| return sysconf(_SC_NPROCESSORS_ONLN); | |||
| #else | |||
| return 2; | |||
| #endif | |||
| @@ -1362,10 +1433,12 @@ int main(int argc, char *argv[]){ | |||
| #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ | |||
| printf("__BYTE_ORDER__=__ORDER_BIG_ENDIAN__\n"); | |||
| #endif | |||
| #if defined(__BIG_ENDIAN__) && __BIG_ENDIAN__ > 0 | |||
| #elif defined(__BIG_ENDIAN__) && __BIG_ENDIAN__ > 0 | |||
| printf("__BYTE_ORDER__=__ORDER_BIG_ENDIAN__\n"); | |||
| #endif | |||
| #if defined(_CALL_ELF) && (_CALL_ELF == 2) | |||
| printf("ELF_VERSION=2\n"); | |||
| #endif | |||
| #ifdef MAKE_NB_JOBS | |||
| #if MAKE_NB_JOBS > 0 | |||
| @@ -115,7 +115,7 @@ foreach (float_type ${FLOAT_TYPES}) | |||
| GenerateNamedObjects("syr2k.c" "HEMM" "her2k" ${CBLAS_FLAG} "" "" false ${float_type}) | |||
| if (USE_GEMM3M) | |||
| GenerateNamedObjects("gemm.c" "GEMM3M" "gemm3m" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("gemm.c" "GEMM3M" "gemm3m" ${CBLAS_FLAG} "" "" false ${float_type}) | |||
| endif() | |||
| endif () | |||
| if (${float_type} STREQUAL "COMPLEX") | |||
| @@ -46,7 +46,9 @@ SBLAS3OBJS = \ | |||
| somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ | |||
| sgeadd.$(SUFFIX) | |||
| ifeq ($(BUILD_HALF),1) | |||
| SHBLAS3OBJS = shgemm.$(SUFFIX) | |||
| endif | |||
| DBLAS1OBJS = \ | |||
| daxpy.$(SUFFIX) dswap.$(SUFFIX) \ | |||
| @@ -278,7 +280,9 @@ CSBLAS3OBJS = \ | |||
| cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ | |||
| cblas_sgeadd.$(SUFFIX) | |||
| ifeq ($(BUILD_HALF),1) | |||
| CSHBLAS3OBJS = cblas_shgemm.$(SUFFIX) | |||
| endif | |||
| CDBLAS1OBJS = \ | |||
| cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ | |||
| @@ -363,7 +367,7 @@ CZBLAS3OBJS += cblas_zgemm3m.$(SUFFIX) | |||
| endif | |||
| ifndef NO_CBLAS | |||
| ifneq ($(NO_CBLAS), 1) | |||
| override CFLAGS += -I. | |||
| @@ -1214,8 +1218,10 @@ zhpr2.$(SUFFIX) zhpr2.$(PSUFFIX) : zhpr2.c | |||
| xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| ifeq ($(BUILD_HALF),1) | |||
| shgemm.$(SUFFIX) shgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| endif | |||
| sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| @@ -1778,8 +1784,10 @@ cblas_zhemv.$(SUFFIX) cblas_zhemv.$(PSUFFIX) : zhemv.c | |||
| cblas_sgemm.$(SUFFIX) cblas_sgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | |||
| ifeq ($(BUILD_HALF),1) | |||
| cblas_shgemm.$(SUFFIX) cblas_shgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | |||
| endif | |||
| cblas_dgemm.$(SUFFIX) cblas_dgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | |||
| @@ -324,8 +324,8 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS | |||
| #ifdef DYNAMIC_ARCH | |||
| if (support_avx512() ) | |||
| #endif | |||
| if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && sgemm_kernel_direct_performant(m,n,k)) { | |||
| sgemm_kernel_direct(m, n, k, a, lda, b, ldb, c, ldc); | |||
| if (beta == 0 && alpha == 1.0 && order == CblasRowMajor && TransA == CblasNoTrans && TransB == CblasNoTrans && SGEMM_DIRECT_PERFORMANT(m,n,k)) { | |||
| SGEMM_DIRECT(m, n, k, a, lda, b, ldb, c, ldc); | |||
| return; | |||
| } | |||
| @@ -42,7 +42,7 @@ | |||
| #include "functable.h" | |||
| #endif | |||
| #if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) | |||
| #if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) || defined(THUNDERX3T110) | |||
| // Multithreaded swap gives performance benefits in ThunderX2T99 | |||
| #else | |||
| // Disable multi-threading as it does not show any performance | |||
| @@ -42,7 +42,7 @@ | |||
| #include "functable.h" | |||
| #endif | |||
| #if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) | |||
| #if defined(THUNDERX2T99) || defined(VULCAN) || defined(ARMV8) || defined(THUNDERX3T110) | |||
| // Multithreaded swap gives performance benefits in ThunderX2T99 | |||
| #else | |||
| // Disable multi-threading as it does not show any performance | |||
| @@ -127,17 +127,35 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| # Makefile.L3 | |||
| set(USE_TRMM false) | |||
| if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) ) | |||
| if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) OR (TARGET_CORE MATCHES COOPERLAKE)) | |||
| set(USE_TRMM true) | |||
| endif () | |||
| if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9)) | |||
| if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9) OR (TARGET_CORE MATCHES POWER10)) | |||
| set(USE_TRMM true) | |||
| endif () | |||
| set(USE_DIRECT_SGEMM false) | |||
| if (X86_64) | |||
| set(USE_DIRECT_SGEMM true) | |||
| endif() | |||
| if (USE_DIRECT_SGEMM) | |||
| # if (NOT DEFINED SGEMMDIRECTKERNEL) | |||
| set (SGEMMDIRECTKERNEL sgemm_direct_skylakex.c) | |||
| set (SGEMMDIRECTPERFORMANT sgemm_direct_performant.c) | |||
| # endif() | |||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTKERNEL}" "" "gemm_direct" false "" "" false SINGLE) | |||
| GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) | |||
| endif() | |||
| foreach (float_type SINGLE DOUBLE HALF) | |||
| string(SUBSTRING ${float_type} 0 1 float_char) | |||
| if (${float_type} STREQUAL "HALF") | |||
| set (float_char "SH") | |||
| if (NOT ${BUILD_HALF}) | |||
| continue () | |||
| else () | |||
| set (float_char "SH") | |||
| endif () | |||
| endif () | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) | |||
| endforeach() | |||
| @@ -8,8 +8,14 @@ include $(TOPDIR)/Makefile.system | |||
| ifeq ($(C_COMPILER), GCC) | |||
| GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) | |||
| GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) | |||
| endif | |||
| ifeq ($(ARCH), power) | |||
| ifeq ($(C_COMPILER), CLANG) | |||
| override CFLAGS += -fno-integrated-as | |||
| endif | |||
| endif | |||
| AVX2OPT = | |||
| ifeq ($(C_COMPILER), GCC) | |||
| # AVX2 support was added in 4.7.0 | |||
| @@ -32,7 +38,22 @@ ifdef NO_AVX2 | |||
| endif | |||
| ifdef TARGET_CORE | |||
| ifeq ($(TARGET_CORE), SKYLAKEX) | |||
| ifeq ($(TARGET_CORE), COOPERLAKE) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) | |||
| ifeq ($(GCCVERSIONGTEQ10), 1) | |||
| override CFLAGS += -march=cooperlake | |||
| else | |||
| override CFLAGS += -march=skylake-avx512 | |||
| endif | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| override CFLAGS += -fno-asynchronous-unwind-tables | |||
| endif | |||
| ifeq ($(OSNAME), WINNT) | |||
| ifeq ($(C_COMPILER), GCC) | |||
| override CFLAGS += -fno-asynchronous-unwind-tables | |||
| endif | |||
| endif | |||
| else ifeq ($(TARGET_CORE), SKYLAKEX) | |||
| override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) -march=skylake-avx512 | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| override CFLAGS += -fno-asynchronous-unwind-tables | |||
| @@ -9,6 +9,10 @@ ifeq ($(ARCH), x86_64) | |||
| USE_GEMM3M = 1 | |||
| endif | |||
| ifeq ($(ARCH), x86_64) | |||
| USE_DIRECT_SGEMM = 1 | |||
| endif | |||
| ifeq ($(ARCH), ia64) | |||
| USE_GEMM3M = 1 | |||
| endif | |||
| @@ -39,18 +43,28 @@ ifeq ($(CORE), SKYLAKEX) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(CORE), COOPERLAKE) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(CORE), ZEN) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(CORE), POWER8) | |||
| ifeq ($(BINARY64),1) | |||
| USE_TRMM = 1 | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), POWER9) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(CORE), POWER10) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(ARCH), zarch) | |||
| USE_TRMM = 1 | |||
| endif | |||
| @@ -59,7 +73,15 @@ ifeq ($(CORE), Z14) | |||
| USE_TRMM = 1 | |||
| endif | |||
| #ifndef SHGEMMKERNEL | |||
| ifdef USE_DIRECT_SGEMM | |||
| ifndef SGEMMDIRECTKERNEL | |||
| SGEMMDIRECTKERNEL = sgemm_direct_skylakex.c | |||
| SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c | |||
| endif | |||
| endif | |||
| ifeq ($(BUILD_HALF), 1) | |||
| ifndef SHGEMMKERNEL | |||
| SHGEMM_BETA = ../generic/gemm_beta.c | |||
| SHGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| SHGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||
| @@ -70,18 +92,25 @@ SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| #endif | |||
| endif | |||
| SHKERNELOBJS += \ | |||
| shgemm_kernel$(TSUFFIX).$(SUFFIX) \ | |||
| $(SHGEMMINCOPYOBJ) $(SHGEMMITCOPYOBJ) \ | |||
| $(SHGEMMONCOPYOBJ) $(SHGEMMOTCOPYOBJ) | |||
| endif | |||
| SKERNELOBJS += \ | |||
| sgemm_kernel$(TSUFFIX).$(SUFFIX) \ | |||
| $(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \ | |||
| $(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ) | |||
| ifdef USE_DIRECT_SGEMM | |||
| SKERNELOBJS += \ | |||
| sgemm_direct$(TSUFFIX).$(SUFFIX) \ | |||
| sgemm_direct_performant$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| DKERNELOBJS += \ | |||
| dgemm_kernel$(TSUFFIX).$(SUFFIX) \ | |||
| $(DGEMMINCOPYOBJ) $(DGEMMITCOPYOBJ) \ | |||
| @@ -110,7 +139,9 @@ XKERNELOBJS += \ | |||
| $(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \ | |||
| $(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ) | |||
| ifeq ($(BUILD_HALF),1) | |||
| SHBLASOBJS += $(SHKERNELOBJS) | |||
| endif | |||
| SBLASOBJS += $(SKERNELOBJS) | |||
| DBLASOBJS += $(DKERNELOBJS) | |||
| QBLASOBJS += $(QKERNELOBJS) | |||
| @@ -118,7 +149,10 @@ CBLASOBJS += $(CKERNELOBJS) | |||
| ZBLASOBJS += $(ZKERNELOBJS) | |||
| XBLASOBJS += $(XKERNELOBJS) | |||
| ifeq ($(BUILD_HALF),1) | |||
| SHBLASOBJS += shgemm_beta$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| SBLASOBJS += \ | |||
| sgemm_beta$(TSUFFIX).$(SUFFIX) \ | |||
| strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ | |||
| @@ -461,11 +495,13 @@ ZBLASOBJS += \ | |||
| zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ | |||
| zgeadd_k$(TSUFFIX).$(SUFFIX) | |||
| ifeq ($(BUILD_HALF), 1) | |||
| SHGEMMINCOPYOBJ_P = $(SHGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| SHGEMMITCOPYOBJ_P = $(SHGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| SHGEMMONCOPYOBJ_P = $(SHGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| SHGEMMOTCOPYOBJ_P = $(SHGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| endif | |||
| SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| SGEMMONCOPYOBJ_P = $(SGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| @@ -491,8 +527,10 @@ XGEMMITCOPYOBJ_P = $(XGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| ifeq ($(BUILD_HALF),1) | |||
| $(KDIR)shgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| $(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
| @@ -512,12 +550,16 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) | |||
| $(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) | |||
| $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ | |||
| ifeq ($(BUILD_HALF), 1) | |||
| $(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY) | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| $(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmotcopy.s | |||
| $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmotcopy.s | |||
| m4 shgemmotcopy.s > shgemmotcopy_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmotcopy_nomacros.s -o $@ | |||
| rm shgemmotcopy.s shgemmotcopy_nomacros.s | |||
| @@ -532,7 +574,7 @@ $(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY) | |||
| $(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmitcopy.s | |||
| $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmitcopy.s | |||
| m4 shgemmitcopy.s > shgemmitcopy_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmitcopy_nomacros.s -o $@ | |||
| rm shgemmitcopy.s shgemmitcopy_nomacros.s | |||
| @@ -540,6 +582,7 @@ else | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| endif | |||
| endif | |||
| $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) | |||
| @@ -547,7 +590,7 @@ $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) | |||
| $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s | |||
| $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s | |||
| m4 sgemmotcopy.s > sgemmotcopy_nomacros.s | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ | |||
| rm sgemmotcopy.s sgemmotcopy_nomacros.s | |||
| @@ -563,7 +606,7 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) | |||
| $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s | |||
| $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s | |||
| m4 sgemmitcopy.s > sgemmitcopy_nomacros.s | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ | |||
| rm sgemmitcopy.s sgemmitcopy_nomacros.s | |||
| @@ -575,7 +618,7 @@ endif | |||
| $(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s | |||
| $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_ncopy.s | |||
| m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@ | |||
| rm dgemm_ncopy.s dgemm_ncopy_nomacros.s | |||
| @@ -593,7 +636,7 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY) | |||
| $(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s | |||
| $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_itcopy.s | |||
| m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@ | |||
| rm dgemm_itcopy.s dgemm_itcopy_nomacros.s | |||
| @@ -636,7 +679,7 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY) | |||
| $(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s | |||
| $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -S $< -o - > cgemm_itcopy.s | |||
| m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@ | |||
| rm cgemm_itcopy.s cgemm_itcopy_nomacros.s | |||
| @@ -659,7 +702,7 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY) | |||
| $(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s | |||
| $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > zgemm_itcopy.s | |||
| m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@ | |||
| rm zgemm_itcopy.s zgemm_itcopy_nomacros.s | |||
| @@ -691,7 +734,7 @@ endif | |||
| $(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s | |||
| $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemm_kernel$(TSUFFIX).s | |||
| m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@ | |||
| rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s | |||
| @@ -699,19 +742,29 @@ else | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| ifdef USE_DIRECT_SGEMM | |||
| $(KDIR)sgemm_direct_performant$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTPERFORMANT) | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
| $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL) | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| ifeq ($(BUILD_HALF), 1) | |||
| $(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemm_kernel$(TSUFFIX).s | |||
| $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemm_kernel$(TSUFFIX).s | |||
| m4 shgemm_kernel$(TSUFFIX).s > shgemm_kernel$(TSUFFIX)_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemm_kernel$(TSUFFIX)_nomacros.s -o $@ | |||
| rm shgemm_kernel$(TSUFFIX).s shgemm_kernel$(TSUFFIX)_nomacros.s | |||
| else | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| endif | |||
| $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s | |||
| $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_kernel$(TSUFFIX).s | |||
| m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@ | |||
| rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s | |||
| @@ -724,7 +777,7 @@ $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEP | |||
| $(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s | |||
| $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNN $< -o - > cgemm_kernel_n.s | |||
| m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@ | |||
| rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s | |||
| @@ -734,7 +787,7 @@ endif | |||
| $(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s | |||
| $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCN $< -o - > cgemm_kernel_l.s | |||
| m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@ | |||
| rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s | |||
| @@ -744,7 +797,7 @@ endif | |||
| $(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s | |||
| $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s | |||
| m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ | |||
| rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s | |||
| @@ -754,7 +807,7 @@ endif | |||
| $(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s | |||
| $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCC $< -o - > cgemm_kernel_b.s | |||
| m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@ | |||
| rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s | |||
| @@ -764,7 +817,7 @@ endif | |||
| $(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s | |||
| $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNN $< -o - > zgemm_kernel_n.s | |||
| m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ | |||
| rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s | |||
| @@ -774,7 +827,7 @@ endif | |||
| $(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s | |||
| $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCN $< -o - > zgemm_kernel_l.s | |||
| m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@ | |||
| rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s | |||
| @@ -784,7 +837,7 @@ endif | |||
| $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s | |||
| $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNC $< -o - > zgemm_kernel_r.s | |||
| m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@ | |||
| rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s | |||
| @@ -794,7 +847,7 @@ endif | |||
| $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s | |||
| $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCC $< -o - > zgemm_kernel_b.s | |||
| m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ | |||
| rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s | |||
| @@ -818,7 +871,7 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD | |||
| ifdef USE_TRMM | |||
| $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > strmmkernel_ln.s | |||
| m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@ | |||
| rm strmmkernel_ln.s strmmkernel_ln_nomacros.s | |||
| @@ -828,7 +881,7 @@ endif | |||
| $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > strmmkernel_lt.s | |||
| m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@ | |||
| rm strmmkernel_lt.s strmmkernel_lt_nomacros.s | |||
| @@ -838,7 +891,7 @@ endif | |||
| $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > strmmkernel_rn.s | |||
| m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@ | |||
| rm strmmkernel_rn.s strmmkernel_rn_nomacros.s | |||
| @@ -848,7 +901,7 @@ endif | |||
| $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s | |||
| m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ | |||
| rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s | |||
| @@ -858,7 +911,7 @@ endif | |||
| $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > dtrmm_kernel_ln.s | |||
| m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@ | |||
| rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s | |||
| @@ -868,7 +921,7 @@ endif | |||
| $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > dtrmm_kernel_lt.s | |||
| m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@ | |||
| rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s | |||
| @@ -878,7 +931,7 @@ endif | |||
| $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > dtrmm_kernel_rn.s | |||
| m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@ | |||
| rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s | |||
| @@ -888,7 +941,7 @@ endif | |||
| $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > dtrmm_kernel_rt.s | |||
| m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@ | |||
| rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s | |||
| @@ -910,7 +963,7 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) | |||
| $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_ln.s | |||
| m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@ | |||
| rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s | |||
| @@ -920,7 +973,7 @@ endif | |||
| $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_lt.s | |||
| m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@ | |||
| rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s | |||
| @@ -930,7 +983,7 @@ endif | |||
| $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lr.s | |||
| m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@ | |||
| rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s | |||
| @@ -940,7 +993,7 @@ endif | |||
| $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lc.s | |||
| m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@ | |||
| rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s | |||
| @@ -950,7 +1003,7 @@ endif | |||
| $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rn.s | |||
| m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@ | |||
| rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s | |||
| @@ -960,7 +1013,7 @@ endif | |||
| $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rt.s | |||
| m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@ | |||
| rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s | |||
| @@ -970,7 +1023,7 @@ endif | |||
| $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_rr.s | |||
| m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@ | |||
| rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s | |||
| @@ -980,7 +1033,7 @@ endif | |||
| $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_RC.s | |||
| m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@ | |||
| rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s | |||
| @@ -990,7 +1043,7 @@ endif | |||
| $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_ln.s | |||
| m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ | |||
| rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s | |||
| @@ -1000,7 +1053,7 @@ endif | |||
| $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_lt.s | |||
| m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ | |||
| rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s | |||
| @@ -1010,7 +1063,7 @@ endif | |||
| $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lr.s | |||
| m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ | |||
| rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s | |||
| @@ -1020,7 +1073,7 @@ endif | |||
| $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lc.s | |||
| m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ | |||
| rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s | |||
| @@ -1030,7 +1083,7 @@ endif | |||
| $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rn.s | |||
| m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ | |||
| rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s | |||
| @@ -1040,7 +1093,7 @@ endif | |||
| $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rt.s | |||
| m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ | |||
| rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s | |||
| @@ -1050,7 +1103,7 @@ endif | |||
| $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rr.s | |||
| m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ | |||
| rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s | |||
| @@ -1060,7 +1113,7 @@ endif | |||
| $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rc.s | |||
| m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ | |||
| rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s | |||
| @@ -1080,7 +1133,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) | |||
| $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s | |||
| m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ | |||
| rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s | |||
| @@ -1214,7 +1267,7 @@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DT | |||
| $(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o dtrsm_kernel_lt.s | |||
| $(CC) $(CFLAGS) -S -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o - > dtrsm_kernel_lt.s | |||
| m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s | |||
| $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@ | |||
| rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s | |||
| @@ -2325,8 +2378,10 @@ $(KDIR)xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_ | |||
| # Compile $(SGEMM_BETA) with $(PFLAGS) as single-precision real (-UDOUBLE -UCOMPLEX) into the $(PSUFFIX) object. | |||
| $(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) | |||
| $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
| # The shgemm beta object is only given a rule when BUILD_HALF is 1; it is compiled with -DHALF on top of the single-precision real flags. | |||
| ifeq ($(BUILD_HALF),1) | |||
| $(KDIR)shgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) | |||
| $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| # Compile $(DGEMM_BETA) with $(PFLAGS) as double-precision real (-DDOUBLE -UCOMPLEX) into the $(PSUFFIX) object. | |||
| $(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) | |||
| $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ | |||
| @@ -2343,6 +2398,8 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) | |||
| # Compile $(XGEMM_BETA) with $(PFLAGS) as extended-precision complex (-DXDOUBLE -DCOMPLEX) into the $(PSUFFIX) object. | |||
| $(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) | |||
| $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ | |||
| ifeq ($(BUILD_HALF), 1) | |||
| $(SHGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMONCOPY) | |||
| $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| @@ -2357,6 +2414,8 @@ $(SHGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMITCOPY) | |||
| $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| endif | |||
| # Compile $(SGEMMONCOPY) with $(PFLAGS) as single-precision real (-UDOUBLE -UCOMPLEX). | |||
| $(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY) | |||
| $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
| @@ -2373,7 +2432,7 @@ $(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY) | |||
| endif | |||
| # Compile $(DGEMMONCOPY) with $(PFLAGS) as double-precision real (-DDOUBLE -UCOMPLEX). | |||
| # The target must be spelled $(DGEMMONCOPYOBJ_P); a misspelled variable reference here would leave the rule without a usable target. | |||
| $(DGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY) | |||
| $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ | |||
| $(DGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMOTCOPY) | |||
| @@ -2461,8 +2520,11 @@ endif | |||
| endif | |||
| # The shgemm kernel object is only given a rule when BUILD_HALF is 1; it is compiled with -DHALF and also depends on $(SHGEMMDEPEND). | |||
| ifeq ($(BUILD_HALF), 1) | |||
| $(KDIR)shgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) | |||
| $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| # Compile $(SGEMMKERNEL) with $(PFLAGS) as single-precision real; only the first prerequisite ($<) is compiled, $(SGEMMDEPEND) just triggers rebuilds. | |||
| $(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) | |||
| $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
| @@ -2481,7 +2543,7 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM | |||
| $(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(PFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s | |||
| $(CC) $(PFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s | |||
| m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s | |||
| $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ | |||
| rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s | |||
| @@ -2527,7 +2589,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) | |||
| # On AIX the compiler output strmm_kernel_rt.s is passed through m4, and the macro-expanded | |||
| # strmm_kernel_rt_nomacros.s is what gets compiled into the object; both temporaries are removed afterwards. | |||
| # NOTE(review): the steps that produce the .s file use $(CFLAGS) while the final compile uses $(PFLAGS) - confirm this is intended. | |||
| $(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s | |||
| m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s | |||
| $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ | |||
| rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s | |||
| @@ -48,10 +48,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| dot[0]=0.0; | |||
| dot[1]=0.0; | |||
| #if !defined(__PPC__) | |||
| CREAL(result) = 0.0 ; | |||
| CIMAG(result) = 0.0 ; | |||
| #else | |||
| result = OPENBLAS_MAKE_COMPLEX_FLOAT(0.0,0.0); | |||
| #endif | |||
| if ( n < 1 ) return(result); | |||
| inc_x2 = 2 * inc_x ; | |||
| @@ -71,8 +73,12 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| i++ ; | |||
| } | |||
| CREAL(result) = dot[0]; | |||
| #if !defined(__POWER__) | |||
| CREAL(result) = dot[0]; | |||
| CIMAG(result) = dot[1]; | |||
| #else | |||
| result = OPENBLAS_MAKE_COMPLEX_FLOAT(dot[0],dot[1]); | |||
| #endif | |||
| return(result); | |||
| } | |||
| @@ -1,3 +1,187 @@ | |||
| include $(KERNELDIR)/KERNEL.ARMV8 | |||
| # AMIN, MAX, MIN and their I*-prefixed variants use the C sources in ../arm. | |||
| SAMINKERNEL = ../arm/amin.c | |||
| DAMINKERNEL = ../arm/amin.c | |||
| CAMINKERNEL = ../arm/zamin.c | |||
| ZAMINKERNEL = ../arm/zamin.c | |||
| SMAXKERNEL = ../arm/max.c | |||
| DMAXKERNEL = ../arm/max.c | |||
| SMINKERNEL = ../arm/min.c | |||
| DMINKERNEL = ../arm/min.c | |||
| ISAMINKERNEL = ../arm/iamin.c | |||
| IDAMINKERNEL = ../arm/iamin.c | |||
| ICAMINKERNEL = ../arm/izamin.c | |||
| IZAMINKERNEL = ../arm/izamin.c | |||
| ISMAXKERNEL = ../arm/imax.c | |||
| IDMAXKERNEL = ../arm/imax.c | |||
| ISMINKERNEL = ../arm/imin.c | |||
| IDMINKERNEL = ../arm/imin.c | |||
| # Every TRSM kernel variable (S/D/C/Z, each with LN/LT/RN/RT) points at the matching C source in ../generic. | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| # The kernels below name assembly (.S) sources without a directory prefix | |||
| # (presumably resolved relative to $(KERNELDIR) by the including makefile - confirm). | |||
| # Several precisions share one source file (e.g. S and D both use amax.S). | |||
| SAMAXKERNEL = amax.S | |||
| DAMAXKERNEL = amax.S | |||
| CAMAXKERNEL = zamax.S | |||
| ZAMAXKERNEL = zamax.S | |||
| SAXPYKERNEL = axpy.S | |||
| DAXPYKERNEL = axpy.S | |||
| CAXPYKERNEL = zaxpy.S | |||
| ZAXPYKERNEL = zaxpy.S | |||
| SROTKERNEL = rot.S | |||
| DROTKERNEL = rot.S | |||
| CROTKERNEL = zrot.S | |||
| ZROTKERNEL = zrot.S | |||
| SSCALKERNEL = scal.S | |||
| DSCALKERNEL = scal.S | |||
| CSCALKERNEL = zscal.S | |||
| ZSCALKERNEL = zscal.S | |||
| SGEMVNKERNEL = gemv_n.S | |||
| DGEMVNKERNEL = gemv_n.S | |||
| CGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| SGEMVTKERNEL = gemv_t.S | |||
| DGEMVTKERNEL = gemv_t.S | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| SASUMKERNEL = asum.S | |||
| DASUMKERNEL = asum.S | |||
| CASUMKERNEL = casum.S | |||
| ZASUMKERNEL = zasum.S | |||
| SCOPYKERNEL = copy.S | |||
| DCOPYKERNEL = copy.S | |||
| CCOPYKERNEL = copy.S | |||
| ZCOPYKERNEL = copy.S | |||
| SSWAPKERNEL = swap.S | |||
| DSWAPKERNEL = swap.S | |||
| CSWAPKERNEL = swap.S | |||
| ZSWAPKERNEL = swap.S | |||
| ISAMAXKERNEL = iamax.S | |||
| IDAMAXKERNEL = iamax.S | |||
| ICAMAXKERNEL = izamax.S | |||
| IZAMAXKERNEL = izamax.S | |||
| SNRM2KERNEL = nrm2.S | |||
| DNRM2KERNEL = nrm2.S | |||
| CNRM2KERNEL = znrm2.S | |||
| ZNRM2KERNEL = znrm2.S | |||
| DDOTKERNEL = dot.S | |||
| SDOTKERNEL = dot.S | |||
| CDOTKERNEL = zdot.S | |||
| ZDOTKERNEL = zdot.S | |||
| DSDOTKERNEL = dot.S | |||
| DGEMM_BETA = dgemm_beta.S | |||
| SGEMM_BETA = sgemm_beta.S | |||
| # SGEMM/STRMM kernel selection: the 8x8 unroll uses the *_cortexa53.S sources, | |||
| # every other unroll uses the un-suffixed source named after its MxN size. | |||
| ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) | |||
| SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S | |||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S | |||
| else | |||
| SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
| endif | |||
| # The "I" copy routines and their objects are only defined when UNROLL_M differs from UNROLL_N. | |||
| # tcopy has an assembly source only for UNROLL_M == 16 and ncopy only for UNROLL_M == 4; | |||
| # other sizes use the C sources in ../generic. | |||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||
| ifeq ($(SGEMM_UNROLL_M), 16) | |||
| SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S | |||
| else | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||
| endif | |||
| ifeq ($(SGEMM_UNROLL_M), 4) | |||
| SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S | |||
| else | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||
| endif | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| # The "O" copy routines are always defined and always use assembly sources sized by UNROLL_N. | |||
| SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S | |||
| SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # DGEMM/DTRMM kernels are always the assembly sources named after the MxN unroll. | |||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| # "I" copy routines exist only when UNROLL_M differs from UNROLL_N; assembly for UNROLL_M == 8, generic C otherwise. | |||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||
| ifeq ($(DGEMM_UNROLL_M), 8) | |||
| DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S | |||
| DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S | |||
| else | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||
| endif | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| # "O" copy routines: assembly for UNROLL_N == 4, generic C otherwise. | |||
| ifeq ($(DGEMM_UNROLL_N), 4) | |||
| DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| else | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||
| endif | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # CGEMM/CTRMM kernels are assembly sources named after the MxN unroll; all copy routines use the | |||
| # zgemm_* C sources in ../generic. The "I" copies are only defined when UNROLL_M differs from UNROLL_N. | |||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # ZGEMM/ZTRMM kernels are assembly sources named after the MxN unroll; all copy routines use the | |||
| # zgemm_* C sources in ../generic. The "I" copies are only defined when UNROLL_M differs from UNROLL_N. | |||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| @@ -0,0 +1,184 @@ | |||
| # AMIN, MAX, MIN and their I*-prefixed variants use the C sources in ../arm. | |||
| SAMINKERNEL = ../arm/amin.c | |||
| DAMINKERNEL = ../arm/amin.c | |||
| CAMINKERNEL = ../arm/zamin.c | |||
| ZAMINKERNEL = ../arm/zamin.c | |||
| SMAXKERNEL = ../arm/max.c | |||
| DMAXKERNEL = ../arm/max.c | |||
| SMINKERNEL = ../arm/min.c | |||
| DMINKERNEL = ../arm/min.c | |||
| ISAMINKERNEL = ../arm/iamin.c | |||
| IDAMINKERNEL = ../arm/iamin.c | |||
| ICAMINKERNEL = ../arm/izamin.c | |||
| IZAMINKERNEL = ../arm/izamin.c | |||
| ISMAXKERNEL = ../arm/imax.c | |||
| IDMAXKERNEL = ../arm/imax.c | |||
| ISMINKERNEL = ../arm/imin.c | |||
| IDMINKERNEL = ../arm/imin.c | |||
| # Every TRSM kernel variable (S/D/C/Z, each with LN/LT/RN/RT) points at the matching C source in ../generic. | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| # AMAX, AXPY, ROT, SCAL and GEMV kernels use un-prefixed assembly sources; | |||
| # the only ThunderX2T99-specific entry in this group is DAXPYKERNEL. | |||
| SAMAXKERNEL = amax.S | |||
| DAMAXKERNEL = amax.S | |||
| CAMAXKERNEL = zamax.S | |||
| ZAMAXKERNEL = zamax.S | |||
| SAXPYKERNEL = axpy.S | |||
| DAXPYKERNEL = daxpy_thunderx2t99.S | |||
| CAXPYKERNEL = zaxpy.S | |||
| ZAXPYKERNEL = zaxpy.S | |||
| SROTKERNEL = rot.S | |||
| DROTKERNEL = rot.S | |||
| CROTKERNEL = zrot.S | |||
| ZROTKERNEL = zrot.S | |||
| SSCALKERNEL = scal.S | |||
| DSCALKERNEL = scal.S | |||
| CSCALKERNEL = zscal.S | |||
| ZSCALKERNEL = zscal.S | |||
| SGEMVNKERNEL = gemv_n.S | |||
| DGEMVNKERNEL = gemv_n.S | |||
| CGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| SGEMVTKERNEL = gemv_t.S | |||
| DGEMVTKERNEL = gemv_t.S | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| # STRMM kernel is the assembly source named after the MxN unroll; all SGEMM copy routines use the | |||
| # C sources in ../generic. The "I" copies are only defined when UNROLL_M differs from UNROLL_N. | |||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c | |||
| SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # DTRMM kernel is the assembly source named after the MxN unroll. | |||
| DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| # "I" copy routines exist only when UNROLL_M differs from UNROLL_N; assembly for UNROLL_M == 8, generic C otherwise. | |||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||
| ifeq ($(DGEMM_UNROLL_M), 8) | |||
| DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S | |||
| DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S | |||
| else | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||
| endif | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| # "O" copy routines: assembly for UNROLL_N == 4, generic C otherwise. | |||
| ifeq ($(DGEMM_UNROLL_N), 4) | |||
| DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| else | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||
| endif | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # CTRMM kernel is the assembly source named after the MxN unroll; all CGEMM copy routines use the | |||
| # zgemm_* C sources in ../generic. The "I" copies are only defined when UNROLL_M differs from UNROLL_N. | |||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # ZTRMM kernel is the assembly source named after the MxN unroll; all ZGEMM copy routines use the | |||
| # zgemm_* C sources in ../generic. The "I" copies are only defined when UNROLL_M differs from UNROLL_N. | |||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # ASUM, COPY, SWAP, IAMAX, NRM2 and DOT kernels use the *_thunderx2t99 sources | |||
| # (C for most, assembly for SWAP). DSDOTKERNEL is the exception and stays on dot.S. | |||
| SASUMKERNEL = sasum_thunderx2t99.c | |||
| DASUMKERNEL = dasum_thunderx2t99.c | |||
| CASUMKERNEL = casum_thunderx2t99.c | |||
| ZASUMKERNEL = zasum_thunderx2t99.c | |||
| SCOPYKERNEL = copy_thunderx2t99.c | |||
| DCOPYKERNEL = copy_thunderx2t99.c | |||
| CCOPYKERNEL = copy_thunderx2t99.c | |||
| ZCOPYKERNEL = copy_thunderx2t99.c | |||
| SSWAPKERNEL = swap_thunderx2t99.S | |||
| DSWAPKERNEL = swap_thunderx2t99.S | |||
| CSWAPKERNEL = swap_thunderx2t99.S | |||
| ZSWAPKERNEL = swap_thunderx2t99.S | |||
| ISAMAXKERNEL = iamax_thunderx2t99.c | |||
| IDAMAXKERNEL = iamax_thunderx2t99.c | |||
| ICAMAXKERNEL = izamax_thunderx2t99.c | |||
| IZAMAXKERNEL = izamax_thunderx2t99.c | |||
| SNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| CNRM2KERNEL = scnrm2_thunderx2t99.c | |||
| # The two commented-out assignments below name the *_fast variant of the double-precision NRM2 source; | |||
| # the active assignments that follow use the non-fast file instead. | |||
| #DNRM2KERNEL = dznrm2_thunderx2t99_fast.c | |||
| #ZNRM2KERNEL = dznrm2_thunderx2t99_fast.c | |||
| DNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| ZNRM2KERNEL = dznrm2_thunderx2t99.c | |||
| DDOTKERNEL = dot_thunderx2t99.c | |||
| SDOTKERNEL = dot_thunderx2t99.c | |||
| CDOTKERNEL = zdot_thunderx2t99.c | |||
| ZDOTKERNEL = zdot_thunderx2t99.c | |||
| DSDOTKERNEL = dot.S | |||
| # GEMM kernels are assigned only for one specific unroll combination per precision | |||
| # (D: 8x4, S: 16x4, C: 8x4, Z: 4x4). | |||
| # NOTE(review): for any other unroll size this section assigns nothing - presumably a default | |||
| # comes from elsewhere in the build; confirm. | |||
| ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4) | |||
| DGEMMKERNEL = dgemm_kernel_8x4_thunderx2t99.S | |||
| endif | |||
| ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4) | |||
| SGEMMKERNEL = sgemm_kernel_16x4_thunderx2t99.S | |||
| endif | |||
| ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4) | |||
| CGEMMKERNEL = cgemm_kernel_8x4_thunderx2t99.S | |||
| endif | |||
| ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4) | |||
| ZGEMMKERNEL = zgemm_kernel_4x4_thunderx2t99.S | |||
| endif | |||
| @@ -98,11 +98,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add X, X, #128 | |||
| .endm | |||
| /* | |||
| * KERNEL_F16_L1CACHE: one 128-byte step over X and Y with no prefetch | |||
| * instructions. There is no need to do software prefetches if the vector | |||
| * fits into the L1 cache. | |||
| * | |||
| * Each pair of 16-byte registers loaded from Y is updated in place as | |||
| * Y += X * v0.d[0] (two double lanes per register) and stored back, so one | |||
| * expansion handles 16 doubles. X and Y are then advanced by 128 bytes. | |||
| * v0.d[0] is presumably the AXPY scalar (alpha) - confirm against the | |||
| * function prologue. | |||
| */ | |||
| .macro KERNEL_F16_L1CACHE | |||
| ldp q4, q5, [X] | |||
| ldp q16, q17, [Y] | |||
| ldp q6, q7, [X, #32] | |||
| ldp q18, q19, [Y, #32] | |||
| fmla v16.2d, v4.2d, v0.d[0] | |||
| fmla v17.2d, v5.2d, v0.d[0] | |||
| stp q16, q17, [Y] | |||
| ldp q20, q21, [X, #64] | |||
| ldp q24, q25, [Y, #64] | |||
| fmla v18.2d, v6.2d, v0.d[0] | |||
| fmla v19.2d, v7.2d, v0.d[0] | |||
| stp q18, q19, [Y, #32] | |||
| ldp q22, q23, [X, #96] | |||
| ldp q26, q27, [Y, #96] | |||
| fmla v24.2d, v20.2d, v0.d[0] | |||
| fmla v25.2d, v21.2d, v0.d[0] | |||
| stp q24, q25, [Y, #64] | |||
| fmla v26.2d, v22.2d, v0.d[0] | |||
| fmla v27.2d, v23.2d, v0.d[0] | |||
| stp q26, q27, [Y, #96] | |||
| add Y, Y, #128 | |||
| add X, X, #128 | |||
| .endm | |||
| /* KERNEL_F32: two back-to-back expansions of KERNEL_F16. */ | |||
| .macro KERNEL_F32 | |||
| .rept 2 | |||
| KERNEL_F16 | |||
| .endr | |||
| .endm | |||
| /* KERNEL_F32_L1CACHE: two back-to-back expansions of KERNEL_F16_L1CACHE. */ | |||
| .macro KERNEL_F32_L1CACHE | |||
| .rept 2 | |||
| KERNEL_F16_L1CACHE | |||
| .endr | |||
| .endm | |||
| .macro INIT_S | |||
| lsl INC_X, INC_X, #3 | |||
| lsl INC_Y, INC_Y, #3 | |||
| @@ -138,6 +185,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp I, xzr | |||
| beq .Ldaxpy_kernel_F1 | |||
| cmp N, #2048 | |||
| ble .Ldaxpy_kernel_F32_L1CACHE | |||
| .align 5 | |||
| .Ldaxpy_kernel_F32: | |||
| @@ -145,6 +195,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| subs I, I, #1 | |||
| bne .Ldaxpy_kernel_F32 | |||
| b .Ldaxpy_kernel_F1 | |||
| .align 5 | |||
| .Ldaxpy_kernel_F32_L1CACHE: | |||
| KERNEL_F32_L1CACHE | |||
| subs I, I, #1 | |||
| bne .Ldaxpy_kernel_F32_L1CACHE | |||
| .Ldaxpy_kernel_F1: | |||
| @@ -0,0 +1,562 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| /* | |||
| * Symbolic names for the general-purpose registers used in this file. | |||
| * M, N, A00, LDA and B00 sit in x0-x4 (presumably the incoming arguments: | |||
| * row count, column count, source matrix, leading dimension, destination | |||
| * buffer - confirm against the caller). A01-A08 are the eight source | |||
| * pointers read by the COPYnx8 macros; I, J, K and TEMP1/TEMP2 are scratch. | |||
| */ | |||
| #define M x0 | |||
| #define N x1 | |||
| #define A00 x2 | |||
| #define LDA x3 | |||
| #define B00 x4 | |||
| #define A01 x5 | |||
| #define A02 x6 | |||
| #define A03 x7 | |||
| #define A04 x8 | |||
| #define A05 x9 | |||
| #define A06 x10 | |||
| #define A07 x11 | |||
| #define A08 x12 | |||
| #define I x13 | |||
| #define J x14 | |||
| #define K x15 | |||
| #define TEMP1 x16 | |||
| #define TEMP2 x17 | |||
| /************************************************************************************** | |||
| * Macro definitions | |||
| **************************************************************************************/ | |||
| /* | |||
| * SAVE_REGS: reserve 11 * 16 bytes on the stack and store d8-d17 and | |||
| * x18-x28 into that area. Must be paired with RESTORE_REGS, which uses the | |||
| * same slot layout. | |||
| */ | |||
| .macro SAVE_REGS | |||
| add sp, sp, #-(11 * 16) | |||
| stp d8, d9, [sp, #(0 * 16)] | |||
| stp d10, d11, [sp, #(1 * 16)] | |||
| stp d12, d13, [sp, #(2 * 16)] | |||
| stp d14, d15, [sp, #(3 * 16)] | |||
| stp d16, d17, [sp, #(4 * 16)] | |||
| stp x18, x19, [sp, #(5 * 16)] | |||
| stp x20, x21, [sp, #(6 * 16)] | |||
| stp x22, x23, [sp, #(7 * 16)] | |||
| stp x24, x25, [sp, #(8 * 16)] | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| .endm | |||
| /* | |||
| * RESTORE_REGS: reload d8-d17 and x18-x28 from the slots written by | |||
| * SAVE_REGS, then release the 11 * 16 byte stack area. | |||
| */ | |||
| .macro RESTORE_REGS | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| ldp d12, d13, [sp, #(2 * 16)] | |||
| ldp d14, d15, [sp, #(3 * 16)] | |||
| ldp d16, d17, [sp, #(4 * 16)] | |||
| ldp x18, x19, [sp, #(5 * 16)] | |||
| ldp x20, x21, [sp, #(6 * 16)] | |||
| ldp x22, x23, [sp, #(7 * 16)] | |||
| ldp x24, x25, [sp, #(8 * 16)] | |||
| ldp x26, x27, [sp, #(9 * 16)] | |||
| ldr x28, [sp, #(10 * 16)] | |||
| add sp, sp, #(11*16) | |||
| .endm | |||
| /* | |||
| * COPY4x8: read four 32-bit elements from each of the eight source pointers | |||
| * A01-A08 (post-incrementing each by 16 bytes) and write them to B00 | |||
| * regrouped by element index: v8/v9 collect element 0 of sources 1-4 / 5-8, | |||
| * v10/v11 element 1, v12/v13 element 2, v14/v15 element 3. | |||
| * B00 advances by 128 bytes in total. | |||
| */ | |||
| .macro COPY4x8 | |||
| ldr q0, [A01], #16 | |||
| ldr q1, [A02], #16 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v10.s[0], v0.s[1] | |||
| ins v12.s[0], v0.s[2] | |||
| ins v14.s[0], v0.s[3] | |||
| ins v8.s[1], v1.s[0] | |||
| ins v10.s[1], v1.s[1] | |||
| ins v12.s[1], v1.s[2] | |||
| ins v14.s[1], v1.s[3] | |||
| ldr q2, [A03], #16 | |||
| ldr q3, [A04], #16 | |||
| ins v8.s[2], v2.s[0] | |||
| ins v10.s[2], v2.s[1] | |||
| ins v12.s[2], v2.s[2] | |||
| ins v14.s[2], v2.s[3] | |||
| ins v8.s[3], v3.s[0] | |||
| ins v10.s[3], v3.s[1] | |||
| ins v12.s[3], v3.s[2] | |||
| ins v14.s[3], v3.s[3] | |||
| ldr q4, [A05], #16 | |||
| ldr q5, [A06], #16 | |||
| ins v9.s[0], v4.s[0] | |||
| ins v11.s[0], v4.s[1] | |||
| ins v13.s[0], v4.s[2] | |||
| ins v15.s[0], v4.s[3] | |||
| ins v9.s[1], v5.s[0] | |||
| ins v11.s[1], v5.s[1] | |||
| ins v13.s[1], v5.s[2] | |||
| ins v15.s[1], v5.s[3] | |||
| ldr q6, [A07], #16 | |||
| ldr q7, [A08], #16 | |||
| ins v9.s[2], v6.s[0] | |||
| ins v11.s[2], v6.s[1] | |||
| ins v13.s[2], v6.s[2] | |||
| ins v15.s[2], v6.s[3] | |||
| ins v9.s[3], v7.s[0] | |||
| ins v11.s[3], v7.s[1] | |||
| ins v13.s[3], v7.s[2] | |||
| ins v15.s[3], v7.s[3] | |||
| /* Store order: element 0 (v8,v9), element 1 (v10,v11), then elements 2 and 3. */ | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 | |||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [B00], #64 | |||
| .endm | |||
| // COPY2x8: read 2 consecutive singles from each of A01..A08 (each | |||
| // post-incremented by 8 bytes) and store them at B00 as two runs of eight | |||
| // words: element 0 of A01..A08, then element 1 of A01..A08 (64 bytes). | |||
| // B00 is post-incremented. | |||
| .macro COPY2x8 | |||
| ldr d0, [A01], #8 | |||
| ldr d1, [A02], #8 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v10.s[0], v0.s[1] | |||
| ins v8.s[1], v1.s[0] | |||
| ins v10.s[1], v1.s[1] | |||
| ldr d2, [A03], #8 | |||
| ldr d3, [A04], #8 | |||
| ins v8.s[2], v2.s[0] | |||
| ins v10.s[2], v2.s[1] | |||
| ins v8.s[3], v3.s[0] | |||
| ins v10.s[3], v3.s[1] | |||
| ldr d4, [A05], #8 | |||
| ldr d5, [A06], #8 | |||
| ins v9.s[0], v4.s[0] | |||
| ins v11.s[0], v4.s[1] | |||
| ins v9.s[1], v5.s[0] | |||
| ins v11.s[1], v5.s[1] | |||
| ldr d6, [A07], #8 | |||
| ldr d7, [A08], #8 | |||
| ins v9.s[2], v6.s[0] | |||
| ins v11.s[2], v6.s[1] | |||
| ins v9.s[3], v7.s[0] | |||
| ins v11.s[3], v7.s[1] | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 | |||
| .endm | |||
| // COPY1x8: read one single from each of A01..A08 (each post-incremented | |||
| // by 4 bytes), gather them into v8 (A01..A04) and v9 (A05..A08), and | |||
| // store the eight words (32 bytes) at B00, which is post-incremented. | |||
| .macro COPY1x8 | |||
| ldr s0, [A01], #4 | |||
| ldr s1, [A02], #4 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v8.s[1], v1.s[0] | |||
| ldr s2, [A03], #4 | |||
| ldr s3, [A04], #4 | |||
| ins v8.s[2], v2.s[0] | |||
| ins v8.s[3], v3.s[0] | |||
| ldr s4, [A05], #4 | |||
| ldr s5, [A06], #4 | |||
| ins v9.s[0], v4.s[0] | |||
| ins v9.s[1], v5.s[0] | |||
| ldr s6, [A07], #4 | |||
| ldr s7, [A08], #4 | |||
| ins v9.s[2], v6.s[0] | |||
| ins v9.s[3], v7.s[0] | |||
| st1 {v8.4s, v9.4s}, [B00], #32 | |||
| .endm | |||
| // COPY4x4: read 4 consecutive singles from each of A01..A04 (each | |||
| // post-incremented by 16 bytes) and transpose them: v8..v11 receive | |||
| // element 0..3 respectively, each holding the values from A01..A04 in | |||
| // order. The 16 words (64 bytes) are stored at B00 (post-incremented). | |||
| .macro COPY4x4 | |||
| ldr q0, [A01], #16 | |||
| ldr q1, [A02], #16 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v9.s[0], v0.s[1] | |||
| ins v10.s[0], v0.s[2] | |||
| ins v11.s[0], v0.s[3] | |||
| ins v8.s[1], v1.s[0] | |||
| ins v9.s[1], v1.s[1] | |||
| ins v10.s[1], v1.s[2] | |||
| ins v11.s[1], v1.s[3] | |||
| ldr q2, [A03], #16 | |||
| ldr q3, [A04], #16 | |||
| ins v8.s[2], v2.s[0] | |||
| ins v9.s[2], v2.s[1] | |||
| ins v10.s[2], v2.s[2] | |||
| ins v11.s[2], v2.s[3] | |||
| ins v8.s[3], v3.s[0] | |||
| ins v9.s[3], v3.s[1] | |||
| ins v10.s[3], v3.s[2] | |||
| ins v11.s[3], v3.s[3] | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 | |||
| .endm | |||
| // COPY2x4: read 2 consecutive singles from each of A01..A04 (each | |||
| // post-incremented by 8 bytes); v8 collects element 0 and v9 element 1 | |||
| // of A01..A04. The 8 words (32 bytes) are stored at B00 (post-incremented). | |||
| .macro COPY2x4 | |||
| ldr d0, [A01], #8 | |||
| ldr d1, [A02], #8 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v9.s[0], v0.s[1] | |||
| ins v8.s[1], v1.s[0] | |||
| ins v9.s[1], v1.s[1] | |||
| ldr d2, [A03], #8 | |||
| ldr d3, [A04], #8 | |||
| ins v8.s[2], v2.s[0] | |||
| ins v9.s[2], v2.s[1] | |||
| ins v8.s[3], v3.s[0] | |||
| ins v9.s[3], v3.s[1] | |||
| st1 {v8.4s, v9.4s}, [B00], #32 | |||
| .endm | |||
| // COPY1x4: read one single from each of A01..A04 (each post-incremented | |||
| // by 4 bytes), pack them into v8 and store the 4 words (16 bytes) at B00, | |||
| // which is post-incremented. | |||
| .macro COPY1x4 | |||
| ldr s0, [A01], #4 | |||
| ldr s1, [A02], #4 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v8.s[1], v1.s[0] | |||
| ldr s2, [A03], #4 | |||
| ldr s3, [A04], #4 | |||
| ins v8.s[2], v2.s[0] | |||
| ins v8.s[3], v3.s[0] | |||
| st1 {v8.4s}, [B00], #16 | |||
| .endm | |||
| // COPY4x2: read 4 consecutive singles from each of A01 and A02 (each | |||
| // post-incremented by 16 bytes). Lanes 0 and 1 of v8..v11 receive element | |||
| // 0..3 of A01 and A02 respectively; only those low 64 bits of each | |||
| // register are stored (8 words, 32 bytes) at B00 (post-incremented). | |||
| .macro COPY4x2 | |||
| ldr q0, [A01], #16 | |||
| ldr q1, [A02], #16 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v9.s[0], v0.s[1] | |||
| ins v10.s[0], v0.s[2] | |||
| ins v11.s[0], v0.s[3] | |||
| ins v8.s[1], v1.s[0] | |||
| ins v9.s[1], v1.s[1] | |||
| ins v10.s[1], v1.s[2] | |||
| ins v11.s[1], v1.s[3] | |||
| st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00], #32 | |||
| .endm | |||
| // COPY2x2: read 2 consecutive singles from each of A01 and A02 (each | |||
| // post-incremented by 8 bytes) and store them at B00 as | |||
| // A01[0], A02[0], A01[1], A02[1] (16 bytes). B00 is post-incremented. | |||
| .macro COPY2x2 | |||
| ldr d0, [A01], #8 | |||
| ldr d1, [A02], #8 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v9.s[0], v0.s[1] | |||
| ins v8.s[1], v1.s[0] | |||
| ins v9.s[1], v1.s[1] | |||
| st1 {v8.2s, v9.2s}, [B00], #16 | |||
| .endm | |||
| // COPY1x2: read one single from each of A01 and A02 (each post-incremented | |||
| // by 4 bytes) and store the pair (8 bytes) at B00, which is post-incremented. | |||
| .macro COPY1x2 | |||
| ldr s0, [A01], #4 | |||
| ldr s1, [A02], #4 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v8.s[1], v1.s[0] | |||
| st1 {v8.2s}, [B00], #8 | |||
| .endm | |||
| // COPY1x1: copy a single 4-byte element from [A01] to [B00]; both | |||
| // pointers are post-incremented by 4. | |||
| .macro COPY1x1 | |||
| ldr s0, [A01], #4 | |||
| str s0, [B00], #4 | |||
| .endm | |||
| /************************************************************************************** | |||
| * End of macro definitions | |||
| **************************************************************************************/ | |||
| // Routine body. Walks the source in groups of 8, then 4, 2 and 1 streams | |||
| // spaced LDA apart (counted by N), and for each group copies M elements | |||
| // per stream into the packed buffer at B00 using the COPYmxn macros | |||
| // (4 elements at a time, then a 2-element and a 1-element tail). | |||
| // Returns 0 in x0. | |||
| // Note on the "tst ...; ble ..." pairs below: every tested mask is a small | |||
| // positive constant, so the result is never negative and ble is taken | |||
| // exactly when the masked bits are all zero. | |||
| PROLOGUE | |||
| .align 5 | |||
| SAVE_REGS | |||
| lsl LDA, LDA, #2 // LDA = LDA * SIZE | |||
| .Lsgemm_ncopy_L8_BEGIN: | |||
| asr J, N, #3 // J = N / 8 | |||
| cmp J, #0 | |||
| ble .Lsgemm_ncopy_L4_BEGIN | |||
| .align 5 | |||
| .Lsgemm_ncopy_L8_M4_BEGIN: | |||
| // Set up the eight stream pointers for this group and step A00 past it. | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A03, A02, LDA | |||
| add A04, A03, LDA | |||
| add A05, A04, LDA | |||
| add A06, A05, LDA | |||
| add A07, A06, LDA | |||
| add A08, A07, LDA | |||
| add A00, A08, LDA | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble .Lsgemm_ncopy_L8_M4_40 | |||
| // Warm-up passes: for each of the eight streams, load one word every | |||
| // 64 bytes (M / 16 steps, but at least one because the count is tested | |||
| // after the load). The loaded value in s0 is discarded; presumably the | |||
| // purpose is to pull the stream into cache before the copy loop. | |||
| asr K, M, #4 // K = M / 16(cacheline) | |||
| mov TEMP1, A01 | |||
| .align 5 | |||
| .Lsgemm_tcopy_L8_warnup_1: | |||
| ldr s0, [TEMP1], #64 | |||
| subs K, K, #1 | |||
| bgt .Lsgemm_tcopy_L8_warnup_1 | |||
| asr K, M, #4 // K = M / 16(cacheline) | |||
| mov TEMP1, A02 | |||
| .align 5 | |||
| .Lsgemm_tcopy_L8_warnup_2: | |||
| ldr s0, [TEMP1], #64 | |||
| subs K, K, #1 | |||
| bgt .Lsgemm_tcopy_L8_warnup_2 | |||
| asr K, M, #4 // K = M / 16(cacheline) | |||
| mov TEMP1, A03 | |||
| .align 5 | |||
| .Lsgemm_tcopy_L8_warnup_3: | |||
| ldr s0, [TEMP1], #64 | |||
| subs K, K, #1 | |||
| bgt .Lsgemm_tcopy_L8_warnup_3 | |||
| asr K, M, #4 // K = M / 16(cacheline) | |||
| mov TEMP1, A04 | |||
| .align 5 | |||
| .Lsgemm_tcopy_L8_warnup_4: | |||
| ldr s0, [TEMP1], #64 | |||
| subs K, K, #1 | |||
| bgt .Lsgemm_tcopy_L8_warnup_4 | |||
| asr K, M, #4 // K = M / 16(cacheline) | |||
| mov TEMP1, A05 | |||
| .align 5 | |||
| .Lsgemm_tcopy_L8_warnup_5: | |||
| ldr s0, [TEMP1], #64 | |||
| subs K, K, #1 | |||
| bgt .Lsgemm_tcopy_L8_warnup_5 | |||
| asr K, M, #4 // K = M / 16(cacheline) | |||
| mov TEMP1, A06 | |||
| .align 5 | |||
| .Lsgemm_tcopy_L8_warnup_6: | |||
| ldr s0, [TEMP1], #64 | |||
| subs K, K, #1 | |||
| bgt .Lsgemm_tcopy_L8_warnup_6 | |||
| asr K, M, #4 // K = M / 16(cacheline) | |||
| mov TEMP1, A07 | |||
| .align 5 | |||
| .Lsgemm_tcopy_L8_warnup_7: | |||
| ldr s0, [TEMP1], #64 | |||
| subs K, K, #1 | |||
| bgt .Lsgemm_tcopy_L8_warnup_7 | |||
| asr K, M, #4 // K = M / 16(cacheline) | |||
| mov TEMP1, A08 | |||
| .align 5 | |||
| .Lsgemm_tcopy_L8_warnup_8: | |||
| ldr s0, [TEMP1], #64 | |||
| subs K, K, #1 | |||
| bgt .Lsgemm_tcopy_L8_warnup_8 | |||
| .align 5 | |||
| // Main copy loop for the 8-stream group: 4 elements per stream per pass. | |||
| .Lsgemm_ncopy_L8_M4_20: | |||
| COPY4x8 | |||
| subs I, I, #1 | |||
| bne .Lsgemm_ncopy_L8_M4_20 | |||
| .Lsgemm_ncopy_L8_M4_40: | |||
| // Tail: 2 more elements per stream if bit 1 of M is set. | |||
| and I, M, #2 | |||
| cmp I, #0 | |||
| ble .Lsgemm_ncopy_L8_M4_60 | |||
| COPY2x8 | |||
| .Lsgemm_ncopy_L8_M4_60: | |||
| // Tail: 1 more element per stream if bit 0 of M is set. | |||
| and I, M, #1 | |||
| cmp I, #0 | |||
| ble .Lsgemm_ncopy_L8_M4_END | |||
| COPY1x8 | |||
| .Lsgemm_ncopy_L8_M4_END: | |||
| subs J , J, #1 // j-- | |||
| bne .Lsgemm_ncopy_L8_M4_BEGIN | |||
| /*********************************************************************************************/ | |||
| .Lsgemm_ncopy_L4_BEGIN: | |||
| // Nothing left when N is a multiple of 8; otherwise handle the | |||
| // 4-, 2- and 1-stream remainders selected by bits 2, 1 and 0 of N. | |||
| tst N, #7 | |||
| ble .Lsgemm_ncopy_L999 | |||
| tst N, #4 | |||
| ble .Lsgemm_ncopy_L2_BEGIN | |||
| .Lsgemm_ncopy_L4_M4_BEGIN: | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A03, A02, LDA | |||
| add A04, A03, LDA | |||
| add A00, A04, LDA | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble .Lsgemm_ncopy_L4_M4_40 | |||
| .align 5 | |||
| .Lsgemm_ncopy_L4_M4_20: | |||
| COPY4x4 | |||
| subs I, I, #1 | |||
| bne .Lsgemm_ncopy_L4_M4_20 | |||
| .Lsgemm_ncopy_L4_M4_40: | |||
| and I, M, #2 | |||
| cmp I, #0 | |||
| ble .Lsgemm_ncopy_L4_M4_60 | |||
| COPY2x4 | |||
| .Lsgemm_ncopy_L4_M4_60: | |||
| and I, M, #1 | |||
| cmp I, #0 | |||
| ble .Lsgemm_ncopy_L4_M4_END | |||
| COPY1x4 | |||
| .Lsgemm_ncopy_L4_M4_END: | |||
| /*********************************************************************************************/ | |||
| .Lsgemm_ncopy_L2_BEGIN: | |||
| tst N, #2 | |||
| ble .Lsgemm_ncopy_L1_BEGIN | |||
| .Lsgemm_ncopy_L2_M4_BEGIN: | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A00, A02, LDA | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble .Lsgemm_ncopy_L2_M4_40 | |||
| .align 5 | |||
| .Lsgemm_ncopy_L2_M4_20: | |||
| COPY4x2 | |||
| subs I , I , #1 | |||
| bne .Lsgemm_ncopy_L2_M4_20 | |||
| .Lsgemm_ncopy_L2_M4_40: | |||
| and I, M, #2 | |||
| cmp I, #0 | |||
| ble .Lsgemm_ncopy_L2_M4_60 | |||
| COPY2x2 | |||
| .Lsgemm_ncopy_L2_M4_60: | |||
| and I, M, #1 | |||
| cmp I, #0 | |||
| ble .Lsgemm_ncopy_L2_M4_END | |||
| COPY1x2 | |||
| .Lsgemm_ncopy_L2_M4_END: | |||
| .Lsgemm_ncopy_L1_BEGIN: | |||
| tst N, #1 | |||
| ble .Lsgemm_ncopy_L999 | |||
| .Lsgemm_ncopy_L1_M1_BEGIN: | |||
| // Last single stream: plain element-by-element copy of M values. | |||
| mov A01, A00 | |||
| mov I, M | |||
| cmp I, #0 | |||
| ble .Lsgemm_ncopy_L1_M1_END | |||
| .align 5 | |||
| .Lsgemm_ncopy_L1_M1_20: | |||
| COPY1x1 | |||
| subs I, I, #1 | |||
| bne .Lsgemm_ncopy_L1_M1_20 | |||
| .Lsgemm_ncopy_L1_M1_END: | |||
| .Lsgemm_ncopy_L999: | |||
| mov x0, #0 | |||
| RESTORE_REGS | |||
| ret | |||
| EPILOGUE | |||
| @@ -0,0 +1,707 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| // Symbolic register names used throughout this file. | |||
| // x0-x4 carry the routine arguments under the names M, N, A, LDA and B. | |||
| #define M x0 | |||
| #define N x1 | |||
| #define A x2 | |||
| #define LDA x3 | |||
| #define B x4 | |||
| // M8 = byte distance between consecutive 8-wide groups in the output. | |||
| #define M8 x5 | |||
| // Source stream pointers (one per line of the current group). | |||
| #define A01 x6 | |||
| #define A02 x7 | |||
| #define A03 x8 | |||
| #define A04 x9 | |||
| #define A05 x10 | |||
| #define A06 x11 | |||
| #define A07 x12 | |||
| #define A08 x13 | |||
| // Destination cursors: B00 for the 8-wide groups, B01/B02/B03 for the | |||
| // 4-, 2- and 1-wide remainder regions. | |||
| #define B01 x14 | |||
| #define B02 x15 | |||
| #define B03 x16 | |||
| #define B04 x17 | |||
| #define B00 x22 | |||
| // Loop counters and scratch. I is kept in x21 (callee-saved and spilled by | |||
| // SAVE_REGS) instead of x18: x18 is the AAPCS64 platform register and is | |||
| // reserved on some arm64 ABIs (e.g. Apple and Windows), so it should not | |||
| // be used as an ordinary counter. | |||
| #define I x21 | |||
| #define J x19 | |||
| #define TEMP1 x20 | |||
| // Prefetch distance in bytes ahead of each source pointer. | |||
| #define A_PREFETCH 256 | |||
| /************************************************************************************** | |||
| * Macro definitions | |||
| **************************************************************************************/ | |||
| // SAVE_REGS: open an 11*16-byte stack frame and spill d8-d17 and x18-x28 | |||
| // into it (two registers per 16-byte slot, x28 alone in the last slot). | |||
| // RESTORE_REGS reads the same slots back. | |||
| .macro SAVE_REGS | |||
| add sp, sp, #-(11 * 16) | |||
| stp d8, d9, [sp, #(0 * 16)] | |||
| stp d10, d11, [sp, #(1 * 16)] | |||
| stp d12, d13, [sp, #(2 * 16)] | |||
| stp d14, d15, [sp, #(3 * 16)] | |||
| stp d16, d17, [sp, #(4 * 16)] | |||
| stp x18, x19, [sp, #(5 * 16)] | |||
| stp x20, x21, [sp, #(6 * 16)] | |||
| stp x22, x23, [sp, #(7 * 16)] | |||
| stp x24, x25, [sp, #(8 * 16)] | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| .endm | |||
| // RESTORE_REGS: reload d8-d17 and x18-x28 from the slots written by | |||
| // SAVE_REGS, then release the 11*16-byte stack frame. | |||
| .macro RESTORE_REGS | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| ldp d12, d13, [sp, #(2 * 16)] | |||
| ldp d14, d15, [sp, #(3 * 16)] | |||
| ldp d16, d17, [sp, #(4 * 16)] | |||
| ldp x18, x19, [sp, #(5 * 16)] | |||
| ldp x20, x21, [sp, #(6 * 16)] | |||
| ldp x22, x23, [sp, #(7 * 16)] | |||
| ldp x24, x25, [sp, #(8 * 16)] | |||
| ldp x26, x27, [sp, #(9 * 16)] | |||
| ldr x28, [sp, #(10 * 16)] | |||
| add sp, sp, #(11*16) | |||
| .endm | |||
| /*************************************************************************************************************************/ | |||
| // COPY8x8: copy 8 consecutive singles (32 bytes) from each of the eight | |||
| // source pointers A01..A08 into one contiguous 256-byte tile at B00, in | |||
| // pointer order. Each source pointer advances by 32 bytes and B00 advances | |||
| // by M8. TEMP1 is clobbered (used as the running store address). | |||
| .macro COPY8x8 | |||
| // Prefetch A_PREFETCH bytes ahead on every source stream. | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A05, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A06, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A07, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A08, #A_PREFETCH] | |||
| ldp q0, q1, [A01] | |||
| ldp q2, q3, [A02] | |||
| add A01, A01, #32 | |||
| add A02, A02, #32 | |||
| st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] | |||
| add TEMP1, B00, #64 | |||
| ldp q4, q5, [A03] | |||
| ldp q6, q7, [A04] | |||
| add A03, A03, #32 | |||
| add A04, A04, #32 | |||
| st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] | |||
| add TEMP1, TEMP1, #64 | |||
| ldp q8, q9, [A05] | |||
| ldp q10, q11, [A06] | |||
| add A05, A05, #32 | |||
| add A06, A06, #32 | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] | |||
| add TEMP1, TEMP1, #64 | |||
| ldp q12, q13, [A07] | |||
| ldp q14, q15, [A08] | |||
| add A07, A07, #32 | |||
| add A08, A08, #32 | |||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] | |||
| // TEMP1 is not read again after this last store, so it is not advanced. | |||
| add B00, B00, M8 | |||
| .endm | |||
| // COPY4x8: copy 4 consecutive singles (16 bytes) from each of A01..A08 | |||
| // into 128 contiguous bytes at B01, in pointer order. Each source pointer | |||
| // advances by 16 bytes and B01 by 128. Prefetching is disabled here (the | |||
| // prfm lines are commented out). | |||
| .macro COPY4x8 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A05, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A06, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A07, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A08, #A_PREFETCH] | |||
| ldr q0, [A01] | |||
| ldr q1, [A02] | |||
| ldr q2, [A03] | |||
| ldr q3, [A04] | |||
| add A01, A01, #16 | |||
| add A02, A02, #16 | |||
| add A03, A03, #16 | |||
| add A04, A04, #16 | |||
| st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] | |||
| add B01, B01, #64 | |||
| ldr q4, [A05] | |||
| ldr q5, [A06] | |||
| ldr q6, [A07] | |||
| ldr q7, [A08] | |||
| add A05, A05, #16 | |||
| add A06, A06, #16 | |||
| add A07, A07, #16 | |||
| add A08, A08, #16 | |||
| st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] | |||
| add B01, B01, #64 | |||
| .endm | |||
| // COPY2x8: copy 2 consecutive singles (8 bytes) from each of A01..A08 | |||
| // into 64 contiguous bytes at B02, in pointer order. Each source pointer | |||
| // advances by 8 bytes and B02 by 64. | |||
| .macro COPY2x8 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A05, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A06, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A07, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A08, #A_PREFETCH] | |||
| ldr d0, [A01] | |||
| ldr d1, [A02] | |||
| ldr d2, [A03] | |||
| ldr d3, [A04] | |||
| add A01, A01, #8 | |||
| add A02, A02, #8 | |||
| add A03, A03, #8 | |||
| add A04, A04, #8 | |||
| stp d0, d1, [B02] | |||
| add B02, B02, #16 | |||
| stp d2, d3, [B02] | |||
| add B02, B02, #16 | |||
| ldr d4, [A05] | |||
| ldr d5, [A06] | |||
| ldr d6, [A07] | |||
| ldr d7, [A08] | |||
| add A05, A05, #8 | |||
| add A06, A06, #8 | |||
| add A07, A07, #8 | |||
| add A08, A08, #8 | |||
| stp d4, d5, [B02] | |||
| add B02, B02, #16 | |||
| stp d6, d7, [B02] | |||
| add B02, B02, #16 | |||
| .endm | |||
| // COPY1x8: copy one single (4 bytes) from each of the eight source | |||
| // pointers A01..A08 into 32 contiguous bytes at B03, in pointer order. | |||
| // Each source pointer advances by 4 bytes and B03 by 32. | |||
| .macro COPY1x8 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A05, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A06, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A07, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A08, #A_PREFETCH] | |||
| ldr s0, [A01] | |||
| ldr s1, [A02] | |||
| ldr s2, [A03] | |||
| ldr s3, [A04] | |||
| add A01, A01, #4 | |||
| add A02, A02, #4 | |||
| add A03, A03, #4 | |||
| add A04, A04, #4 | |||
| stp s0, s1, [B03] | |||
| add B03, B03, #8 | |||
| stp s2, s3, [B03] | |||
| add B03, B03, #8 | |||
| ldr s4, [A05] | |||
| ldr s5, [A06] | |||
| ldr s6, [A07] | |||
| ldr s7, [A08] | |||
| // Step past exactly the one element consumed, as done for A01..A04. | |||
| // Only 4 bytes are valid per stream here, so no wider load may be | |||
| // issued on these pointers (an 8-byte load would read past the | |||
| // element being copied). | |||
| add A05, A05, #4 | |||
| add A06, A06, #4 | |||
| add A07, A07, #4 | |||
| add A08, A08, #4 | |||
| stp s4, s5, [B03] | |||
| add B03, B03, #8 | |||
| stp s6, s7, [B03] | |||
| add B03, B03, #8 | |||
| .endm | |||
| /*************************************************************************************************************************/ | |||
| // COPY8x4: copy 8 consecutive singles (32 bytes) from each of the four | |||
| // source pointers A01..A04 into one contiguous 128-byte tile at B00, in | |||
| // pointer order. Each source pointer advances by 32 bytes and B00 advances | |||
| // by M8. TEMP1 is clobbered (used as the second store address). | |||
| .macro COPY8x4 | |||
| // Prefetch A_PREFETCH bytes ahead on every source stream. | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldp q0, q1, [A01] | |||
| ldp q2, q3, [A02] | |||
| add A01, A01, #32 | |||
| add A02, A02, #32 | |||
| st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] | |||
| add TEMP1, B00, #64 | |||
| ldp q4, q5, [A03] | |||
| ldp q6, q7, [A04] | |||
| add A03, A03, #32 | |||
| add A04, A04, #32 | |||
| st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] | |||
| // TEMP1 is not read again after this last store, so it is not advanced. | |||
| add B00, B00, M8 | |||
| .endm | |||
| // COPY4x4: copy 4 consecutive singles (16 bytes) from each of A01..A04 | |||
| // into 64 contiguous bytes at B01, in pointer order. Each source pointer | |||
| // advances by 16 bytes and B01 by 64. | |||
| .macro COPY4x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr q0, [A01] | |||
| ldr q1, [A02] | |||
| ldr q2, [A03] | |||
| ldr q3, [A04] | |||
| add A01, A01, #16 | |||
| add A02, A02, #16 | |||
| add A03, A03, #16 | |||
| add A04, A04, #16 | |||
| st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] | |||
| add B01, B01, #64 | |||
| .endm | |||
| // COPY2x4: copy 2 consecutive singles (8 bytes) from each of A01..A04 | |||
| // into 32 contiguous bytes at B02, in pointer order. Each source pointer | |||
| // advances by 8 bytes and B02 by 32. | |||
| .macro COPY2x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr d0, [A01] | |||
| ldr d1, [A02] | |||
| ldr d2, [A03] | |||
| ldr d3, [A04] | |||
| add A01, A01, #8 | |||
| add A02, A02, #8 | |||
| add A03, A03, #8 | |||
| add A04, A04, #8 | |||
| stp d0, d1, [B02] | |||
| add B02, B02, #16 | |||
| stp d2, d3, [B02] | |||
| add B02, B02, #16 | |||
| .endm | |||
| // COPY1x4: copy one single (4 bytes) from each of A01..A04 into 16 | |||
| // contiguous bytes at B03, in pointer order. Each source pointer advances | |||
| // by 4 bytes and B03 by 16. | |||
| .macro COPY1x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr s0, [A01] | |||
| ldr s1, [A02] | |||
| ldr s2, [A03] | |||
| ldr s3, [A04] | |||
| add A01, A01, #4 | |||
| add A02, A02, #4 | |||
| add A03, A03, #4 | |||
| add A04, A04, #4 | |||
| stp s0, s1, [B03] | |||
| add B03, B03, #8 | |||
| stp s2, s3, [B03] | |||
| add B03, B03, #8 | |||
| .endm | |||
| /*************************************************************************************************************************/ | |||
| // COPY8x2: copy 8 consecutive singles (32 bytes) from each of A01 and A02 | |||
| // into one contiguous 64-byte tile at B00 (A01 data first). Each source | |||
| // pointer advances by 32 bytes and B00 advances by M8. | |||
| .macro COPY8x2 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ld1 {v0.4s, v1.4s}, [A01] | |||
| ld1 {v2.4s, v3.4s}, [A02] | |||
| add A01, A01, #32 | |||
| add A02, A02, #32 | |||
| st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] | |||
| add B00, B00, M8 | |||
| .endm | |||
| // COPY4x2: copy 4 consecutive singles (16 bytes) from each of A01 and A02 | |||
| // into 32 contiguous bytes at B01 (A01 data first). Each source pointer | |||
| // advances by 16 bytes and B01 by 32. | |||
| .macro COPY4x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr q0, [A01] | |||
| ldr q1, [A02] | |||
| add A01, A01, #16 | |||
| add A02, A02, #16 | |||
| stp q0, q1, [B01] | |||
| add B01, B01, #32 | |||
| .endm | |||
| // COPY2x2: copy 2 consecutive singles (8 bytes) from each of A01 and A02 | |||
| // into 16 contiguous bytes at B02 (A01 data first). Each source pointer | |||
| // advances by 8 bytes and B02 by 16. | |||
| .macro COPY2x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr d0, [A01] | |||
| ldr d1, [A02] | |||
| add A01, A01, #8 | |||
| add A02, A02, #8 | |||
| stp d0, d1, [B02] | |||
| add B02, B02, #16 | |||
| .endm | |||
| // COPY1x2: copy one single (4 bytes) from each of A01 and A02 into 8 | |||
| // contiguous bytes at B03 (A01 value first). Each source pointer advances | |||
| // by 4 bytes and B03 by 8. | |||
| .macro COPY1x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr s0, [A01] | |||
| ldr s1, [A02] | |||
| add A01, A01, #4 | |||
| add A02, A02, #4 | |||
| stp s0, s1, [B03] | |||
| add B03, B03, #8 | |||
| .endm | |||
| /*************************************************************************************************************************/ | |||
| // COPY8x1: copy 8 consecutive singles (32 bytes) from A01 to B00. | |||
| // A01 advances by 32 bytes and B00 advances by M8. | |||
| .macro COPY8x1 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldp q0, q1, [A01] | |||
| add A01, A01, #32 | |||
| stp q0, q1, [B00] | |||
| add B00, B00, M8 | |||
| .endm | |||
| // COPY4x1: copy 4 consecutive singles (16 bytes) from A01 to B01; both | |||
| // pointers advance by 16 bytes. | |||
| .macro COPY4x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr q0, [A01] | |||
| add A01, A01, #16 | |||
| str q0, [B01] | |||
| add B01, B01, #16 | |||
| .endm | |||
| // COPY2x1: copy 2 consecutive singles (8 bytes) from A01 to B02; both | |||
| // pointers advance by 8 bytes. | |||
| .macro COPY2x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr d0, [A01] | |||
| add A01, A01, #8 | |||
| str d0, [B02] | |||
| add B02, B02, #8 | |||
| .endm | |||
| // COPY1x1: copy one single (4 bytes) from A01 to B03; both pointers | |||
| // advance by 4 bytes. | |||
| .macro COPY1x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr s0, [A01] | |||
| add A01, A01, #4 | |||
| str s0, [B03] | |||
| add B03, B03, #4 | |||
| .endm | |||
| /************************************************************************************** | |||
| * End of macro definitions | |||
| **************************************************************************************/ | |||
| // Routine body. The source is walked in groups of 8, then 4, 2 and 1 | |||
| // lines spaced LDA apart (counted by M). Within each group, N elements per | |||
| // line are copied: 8 at a time into the tile at B00 (which then jumps | |||
| // ahead by M8), followed by the 4-, 2- and 1-element tails, which go to | |||
| // the separate regions tracked by B01, B02 and B03. Returns 0 in x0. | |||
| // Note on the "tst ...; ble ..." pairs below: every tested mask is a small | |||
| // positive constant, so the result is never negative and ble is taken | |||
| // exactly when the masked bits are all zero. | |||
| PROLOGUE | |||
| .align 5 | |||
| SAVE_REGS | |||
| lsl LDA, LDA, #2 // LDA = LDA * SIZE | |||
| lsl TEMP1, M, #2 // TEMP1 = M * SIZE | |||
| // Start of the tail regions in the output: | |||
| // B01 = B + (N rounded down to a multiple of 8) * M * SIZE | |||
| // B02 = B + (N rounded down to a multiple of 4) * M * SIZE | |||
| // B03 = B + (N rounded down to a multiple of 2) * M * SIZE | |||
| and B01 , N , #-8 | |||
| and B02 , N , #-4 | |||
| and B03 , N , #-2 | |||
| mul B01, B01, TEMP1 | |||
| mul B02, B02, TEMP1 | |||
| mul B03, B03, TEMP1 | |||
| add B01 , B01, B | |||
| add B02 , B02, B | |||
| add B03 , B03, B | |||
| lsl M8, M, #5 // M8 = M * 8 * SIZE | |||
| .Lsgemm_tcopy_L8_BEGIN: | |||
| asr J, M, #3 // J = M / 8 | |||
| cmp J, #0 | |||
| ble .Lsgemm_tcopy_L4_BEGIN | |||
| .align 5 | |||
| .Lsgemm_tcopy_L8_M8_BEGIN: | |||
| // Set up the eight line pointers for this group and step A past it. | |||
| mov A01, A | |||
| add A02, A01, LDA | |||
| add A03, A02, LDA | |||
| add A04, A03, LDA | |||
| add A05, A04, LDA | |||
| add A06, A05, LDA | |||
| add A07, A06, LDA | |||
| add A08, A07, LDA | |||
| add A, A08, LDA | |||
| mov B00, B | |||
| add B, B00, #256 // B = B + 8 * 8 * SIZE | |||
| asr I, N, #3 // I = N / 8 | |||
| cmp I, #0 | |||
| ble .Lsgemm_tcopy_L8_M8_40 | |||
| .align 5 | |||
| .Lsgemm_tcopy_L8_M8_20: | |||
| COPY8x8 | |||
| subs I , I , #1 | |||
| bne .Lsgemm_tcopy_L8_M8_20 | |||
| .Lsgemm_tcopy_L8_M8_40: | |||
| // Tails of this group, selected by bits 2, 1 and 0 of N. | |||
| tst N , #4 | |||
| ble .Lsgemm_tcopy_L8_M8_60 | |||
| COPY4x8 | |||
| .Lsgemm_tcopy_L8_M8_60: | |||
| tst N , #2 | |||
| ble .Lsgemm_tcopy_L8_M8_80 | |||
| COPY2x8 | |||
| .Lsgemm_tcopy_L8_M8_80: | |||
| tst N, #1 | |||
| ble .Lsgemm_tcopy_L8_M8_END | |||
| COPY1x8 | |||
| .Lsgemm_tcopy_L8_M8_END: | |||
| subs J, J, #1 // j-- | |||
| bne .Lsgemm_tcopy_L8_M8_BEGIN | |||
| /*********************************************************************************************/ | |||
| .Lsgemm_tcopy_L4_BEGIN: | |||
| // Nothing left when M is a multiple of 8; otherwise handle the | |||
| // 4-, 2- and 1-line remainders selected by bits 2, 1 and 0 of M. | |||
| tst M, #7 | |||
| ble .Lsgemm_tcopy_L999 | |||
| tst M, #4 | |||
| ble .Lsgemm_tcopy_L2_BEGIN | |||
| .Lsgemm_tcopy_L4_M8_BEGIN: | |||
| mov A01, A | |||
| add A02, A01, LDA | |||
| add A03, A02, LDA | |||
| add A04, A03, LDA | |||
| add A, A04, LDA | |||
| mov B00, B | |||
| add B, B00, #128 // B = B + 4 * 8 * SIZE | |||
| asr I, N, #3 // I = N / 8 | |||
| cmp I, #0 | |||
| ble .Lsgemm_tcopy_L4_M8_40 | |||
| .align 5 | |||
| .Lsgemm_tcopy_L4_M8_20: | |||
| COPY8x4 | |||
| subs I , I , #1 | |||
| bne .Lsgemm_tcopy_L4_M8_20 | |||
| .Lsgemm_tcopy_L4_M8_40: | |||
| tst N , #4 | |||
| ble .Lsgemm_tcopy_L4_M8_60 | |||
| COPY4x4 | |||
| .Lsgemm_tcopy_L4_M8_60: | |||
| tst N , #2 | |||
| ble .Lsgemm_tcopy_L4_M8_80 | |||
| COPY2x4 | |||
| .Lsgemm_tcopy_L4_M8_80: | |||
| tst N , #1 | |||
| ble .Lsgemm_tcopy_L4_M8_END | |||
| COPY1x4 | |||
| .Lsgemm_tcopy_L4_M8_END: | |||
| /*********************************************************************************************/ | |||
| .Lsgemm_tcopy_L2_BEGIN: | |||
| tst M, #3 | |||
| ble .Lsgemm_tcopy_L999 | |||
| tst M, #2 | |||
| ble .Lsgemm_tcopy_L1_BEGIN | |||
| .Lsgemm_tcopy_L2_M16_BEGIN: | |||
| mov A01, A | |||
| add A02, A01, LDA | |||
| add A, A02, LDA | |||
| mov B00, B | |||
| add B, B00, #64 // B = B + 2 * 8 * SIZE | |||
| asr I, N, #3 // I = N / 8 | |||
| cmp I, #0 | |||
| ble .Lsgemm_tcopy_L2_M8_40 | |||
| .align 5 | |||
| .Lsgemm_tcopy_L2_M8_20: | |||
| COPY8x2 | |||
| subs I , I , #1 | |||
| bne .Lsgemm_tcopy_L2_M8_20 | |||
| .Lsgemm_tcopy_L2_M8_40: | |||
| tst N , #4 | |||
| ble .Lsgemm_tcopy_L2_M8_60 | |||
| COPY4x2 | |||
| .Lsgemm_tcopy_L2_M8_60: | |||
| tst N , #2 | |||
| ble .Lsgemm_tcopy_L2_M8_80 | |||
| COPY2x2 | |||
| .Lsgemm_tcopy_L2_M8_80: | |||
| tst N , #1 | |||
| ble .Lsgemm_tcopy_L2_M8_END | |||
| COPY1x2 | |||
| .Lsgemm_tcopy_L2_M8_END: | |||
| /*********************************************************************************************/ | |||
| .Lsgemm_tcopy_L1_BEGIN: | |||
| tst M, #1 | |||
| ble .Lsgemm_tcopy_L999 | |||
| .Lsgemm_tcopy_L1_M16_BEGIN: | |||
| // Last single line: A and B are not advanced because nothing follows. | |||
| mov A01, A // A01 = A | |||
| mov B00, B | |||
| asr I, N, #3 // I = N / 8 | |||
| cmp I, #0 | |||
| ble .Lsgemm_tcopy_L1_M8_40 | |||
| .align 5 | |||
| .Lsgemm_tcopy_L1_M8_20: | |||
| COPY8x1 | |||
| subs I , I , #1 | |||
| bne .Lsgemm_tcopy_L1_M8_20 | |||
| .Lsgemm_tcopy_L1_M8_40: | |||
| tst N , #4 | |||
| ble .Lsgemm_tcopy_L1_M8_60 | |||
| COPY4x1 | |||
| .Lsgemm_tcopy_L1_M8_60: | |||
| tst N , #2 | |||
| ble .Lsgemm_tcopy_L1_M8_80 | |||
| COPY2x1 | |||
| .Lsgemm_tcopy_L1_M8_80: | |||
| tst N , #1 | |||
| ble .Lsgemm_tcopy_L1_M8_END | |||
| COPY1x1 | |||
| .Lsgemm_tcopy_L1_M8_END: | |||
| .Lsgemm_tcopy_L999: | |||
| mov x0, #0 // set return value | |||
| RESTORE_REGS | |||
| ret | |||
| EPILOGUE | |||
| @@ -39,24 +39,24 @@ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG i, j; | |||
| FLOAT *aoffset; | |||
| FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; | |||
| FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; | |||
| FLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; | |||
| FLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; | |||
| FLOAT *boffset; | |||
| FLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| FLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| FLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| FLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| FLOAT ctemp17, ctemp18, ctemp19, ctemp20; | |||
| FLOAT ctemp21, ctemp22, ctemp23, ctemp24; | |||
| FLOAT ctemp25, ctemp26, ctemp27, ctemp28; | |||
| FLOAT ctemp29, ctemp30, ctemp31, ctemp32; | |||
| IFLOAT *aoffset; | |||
| IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; | |||
| IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; | |||
| IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; | |||
| IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; | |||
| IFLOAT *boffset; | |||
| IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; | |||
| IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; | |||
| IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; | |||
| IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; | |||
| aoffset = a; | |||
| boffset = b; | |||
| @@ -39,30 +39,30 @@ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG i, j; | |||
| FLOAT *aoffset; | |||
| FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; | |||
| FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; | |||
| FLOAT *boffset; | |||
| FLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| FLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| FLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| FLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| FLOAT ctemp17, ctemp18, ctemp19, ctemp20; | |||
| FLOAT ctemp21, ctemp22, ctemp23, ctemp24; | |||
| FLOAT ctemp25, ctemp26, ctemp27, ctemp28; | |||
| FLOAT ctemp29, ctemp30, ctemp31, ctemp32; | |||
| FLOAT ctemp33, ctemp34, ctemp35, ctemp36; | |||
| FLOAT ctemp37, ctemp38, ctemp39, ctemp40; | |||
| FLOAT ctemp41, ctemp42, ctemp43, ctemp44; | |||
| FLOAT ctemp45, ctemp46, ctemp47, ctemp48; | |||
| FLOAT ctemp49, ctemp50, ctemp51, ctemp52; | |||
| FLOAT ctemp53, ctemp54, ctemp55, ctemp56; | |||
| FLOAT ctemp57, ctemp58, ctemp59, ctemp60; | |||
| FLOAT ctemp61, ctemp62, ctemp63, ctemp64; | |||
| IFLOAT *aoffset; | |||
| IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; | |||
| IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; | |||
| IFLOAT *boffset; | |||
| IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; | |||
| IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; | |||
| IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; | |||
| IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; | |||
| IFLOAT ctemp33, ctemp34, ctemp35, ctemp36; | |||
| IFLOAT ctemp37, ctemp38, ctemp39, ctemp40; | |||
| IFLOAT ctemp41, ctemp42, ctemp43, ctemp44; | |||
| IFLOAT ctemp45, ctemp46, ctemp47, ctemp48; | |||
| IFLOAT ctemp49, ctemp50, ctemp51, ctemp52; | |||
| IFLOAT ctemp53, ctemp54, ctemp55, ctemp56; | |||
| IFLOAT ctemp57, ctemp58, ctemp59, ctemp60; | |||
| IFLOAT ctemp61, ctemp62, ctemp63, ctemp64; | |||
| aoffset = a; | |||
| @@ -39,22 +39,22 @@ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG i, j; | |||
| FLOAT *aoffset; | |||
| FLOAT *aoffset1, *aoffset2; | |||
| FLOAT *boffset; | |||
| FLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| FLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| FLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| FLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| FLOAT ctemp17, ctemp18, ctemp19, ctemp20; | |||
| FLOAT ctemp21, ctemp22, ctemp23, ctemp24; | |||
| FLOAT ctemp25, ctemp26, ctemp27, ctemp28; | |||
| FLOAT ctemp29, ctemp30, ctemp31, ctemp32; | |||
| IFLOAT *aoffset; | |||
| IFLOAT *aoffset1, *aoffset2; | |||
| IFLOAT *boffset; | |||
| IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; | |||
| IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; | |||
| IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; | |||
| IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; | |||
| aoffset = a; | |||
| boffset = b; | |||
| @@ -39,32 +39,32 @@ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG i, j; | |||
| FLOAT *aoffset; | |||
| FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; | |||
| FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; | |||
| FLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; | |||
| FLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| FLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| FLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| FLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| FLOAT ctemp17, ctemp18, ctemp19, ctemp20; | |||
| FLOAT ctemp21, ctemp22, ctemp23, ctemp24; | |||
| FLOAT ctemp25, ctemp26, ctemp27, ctemp28; | |||
| FLOAT ctemp29, ctemp30, ctemp31, ctemp32; | |||
| FLOAT ctemp33, ctemp34, ctemp35, ctemp36; | |||
| FLOAT ctemp37, ctemp38, ctemp39, ctemp40; | |||
| FLOAT ctemp41, ctemp42, ctemp43, ctemp44; | |||
| FLOAT ctemp45, ctemp46, ctemp47, ctemp48; | |||
| FLOAT ctemp49, ctemp50, ctemp51, ctemp52; | |||
| FLOAT ctemp53, ctemp54, ctemp55, ctemp56; | |||
| FLOAT ctemp57, ctemp58, ctemp59, ctemp60; | |||
| FLOAT ctemp61, ctemp62, ctemp63, ctemp64; | |||
| IFLOAT *aoffset; | |||
| IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; | |||
| IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; | |||
| IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; | |||
| IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; | |||
| IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; | |||
| IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; | |||
| IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; | |||
| IFLOAT ctemp33, ctemp34, ctemp35, ctemp36; | |||
| IFLOAT ctemp37, ctemp38, ctemp39, ctemp40; | |||
| IFLOAT ctemp41, ctemp42, ctemp43, ctemp44; | |||
| IFLOAT ctemp45, ctemp46, ctemp47, ctemp48; | |||
| IFLOAT ctemp49, ctemp50, ctemp51, ctemp52; | |||
| IFLOAT ctemp53, ctemp54, ctemp55, ctemp56; | |||
| IFLOAT ctemp57, ctemp58, ctemp59, ctemp60; | |||
| IFLOAT ctemp61, ctemp62, ctemp63, ctemp64; | |||
| aoffset = a; | |||
| boffset = b; | |||
| @@ -0,0 +1,225 @@ | |||
| ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) | |||
| include $(KERNELDIR)/KERNEL.POWER8 | |||
| else | |||
| #SGEMM_BETA = ../generic/gemm_beta.c | |||
| #DGEMM_BETA = ../generic/gemm_beta.c | |||
| #CGEMM_BETA = ../generic/zgemm_beta.c | |||
| #ZGEMM_BETA = ../generic/zgemm_beta.c | |||
| SHGEMM_BETA = ../generic/gemm_beta.c | |||
| SHGEMMKERNEL = shgemm_kernel_power10.c | |||
| SHGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SHGEMMITCOPY = ../generic/gemm_tcopy_16.c | |||
| SHGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
| SHGEMMOTCOPY = ../generic/gemm_tcopy_8.c | |||
| SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| STRMMKERNEL = sgemm_kernel_power10.c | |||
| DTRMMKERNEL = dgemm_kernel_power10.c | |||
| CTRMMKERNEL = cgemm_kernel_power10.S | |||
| ZTRMMKERNEL = zgemm_kernel_power10.S | |||
| SGEMMKERNEL = sgemm_kernel_power10.c | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SGEMMITCOPY = sgemm_tcopy_16_power8.S | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
| SGEMMOTCOPY = sgemm_tcopy_8_power8.S | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = dgemm_kernel_power10.c | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| DGEMMITCOPY = dgemm_tcopy_16_power8.S | |||
| DGEMMONCOPY = dgemm_ncopy_4_power8.S | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = cgemm_kernel_power10.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_8.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_power10.S | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
| ZGEMMITCOPY = zgemm_tcopy_8_power8.S | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| # TODO: CGEMM3MKERNEL should use 4x4 block sizes. | |||
| #CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S | |||
| #ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S | |||
| #Pure C for other kernels | |||
| #SAMAXKERNEL = ../arm/amax.c | |||
| #DAMAXKERNEL = ../arm/amax.c | |||
| #CAMAXKERNEL = ../arm/zamax.c | |||
| #ZAMAXKERNEL = ../arm/zamax.c | |||
| # | |||
| #SAMINKERNEL = ../arm/amin.c | |||
| #DAMINKERNEL = ../arm/amin.c | |||
| #CAMINKERNEL = ../arm/zamin.c | |||
| #ZAMINKERNEL = ../arm/zamin.c | |||
| # | |||
| #SMAXKERNEL = ../arm/max.c | |||
| #DMAXKERNEL = ../arm/max.c | |||
| # | |||
| #SMINKERNEL = ../arm/min.c | |||
| #DMINKERNEL = ../arm/min.c | |||
| # | |||
| ifneq ($(GCCVERSIONGTEQ9),1) | |||
| ISAMAXKERNEL = isamax_power9.S | |||
| else | |||
| ISAMAXKERNEL = isamax.c | |||
| endif | |||
| IDAMAXKERNEL = idamax.c | |||
| ifneq ($(GCCVERSIONGTEQ9),1) | |||
| ICAMAXKERNEL = icamax_power9.S | |||
| else | |||
| ICAMAXKERNEL = icamax.c | |||
| endif | |||
| IZAMAXKERNEL = izamax.c | |||
| # | |||
| ifneq ($(GCCVERSIONGTEQ9),1) | |||
| ISAMINKERNEL = isamin_power9.S | |||
| else | |||
| ISAMINKERNEL = isamin.c | |||
| endif | |||
| IDAMINKERNEL = idamin.c | |||
| ifneq ($(GCCVERSIONGTEQ9),1) | |||
| ICAMINKERNEL = icamin_power9.S | |||
| else | |||
| ICAMINKERNEL = icamin.c | |||
| endif | |||
| IZAMINKERNEL = izamin.c | |||
| # | |||
| #ISMAXKERNEL = ../arm/imax.c | |||
| #IDMAXKERNEL = ../arm/imax.c | |||
| # | |||
| #ISMINKERNEL = ../arm/imin.c | |||
| #IDMINKERNEL = ../arm/imin.c | |||
| # | |||
| SASUMKERNEL = sasum.c | |||
| DASUMKERNEL = dasum.c | |||
| CASUMKERNEL = casum.c | |||
| ZASUMKERNEL = zasum.c | |||
| # | |||
| SAXPYKERNEL = saxpy.c | |||
| DAXPYKERNEL = daxpy.c | |||
| ifneq ($(GCCVERSIONGTEQ9),1) | |||
| CAXPYKERNEL = caxpy_power9.S | |||
| else | |||
| CAXPYKERNEL = caxpy.c | |||
| endif | |||
| ZAXPYKERNEL = zaxpy.c | |||
| # | |||
| SCOPYKERNEL = scopy.c | |||
| DCOPYKERNEL = dcopy.c | |||
| CCOPYKERNEL = ccopy.c | |||
| ZCOPYKERNEL = zcopy.c | |||
| # | |||
| SDOTKERNEL = sdot.c | |||
| DDOTKERNEL = ddot.c | |||
| DSDOTKERNEL = sdot.c | |||
| ifneq ($(GCCVERSIONGTEQ9),1) | |||
| CDOTKERNEL = cdot_power9.S | |||
| else | |||
| CDOTKERNEL = cdot.c | |||
| endif | |||
| ZDOTKERNEL = zdot.c | |||
| # | |||
| SNRM2KERNEL = ../arm/nrm2.c | |||
| DNRM2KERNEL = ../arm/nrm2.c | |||
| CNRM2KERNEL = ../arm/znrm2.c | |||
| ZNRM2KERNEL = ../arm/znrm2.c | |||
| # | |||
| SROTKERNEL = srot.c | |||
| DROTKERNEL = drot.c | |||
| CROTKERNEL = crot.c | |||
| ZROTKERNEL = zrot.c | |||
| # | |||
| SSCALKERNEL = sscal.c | |||
| DSCALKERNEL = dscal.c | |||
| CSCALKERNEL = zscal.c | |||
| ZSCALKERNEL = zscal.c | |||
| # | |||
| SSWAPKERNEL = sswap.c | |||
| DSWAPKERNEL = dswap.c | |||
| CSWAPKERNEL = cswap.c | |||
| ZSWAPKERNEL = zswap.c | |||
| # | |||
| SGEMVNKERNEL = sgemv_n.c | |||
| DGEMVNKERNEL = dgemv_n_power10.c | |||
| CGEMVNKERNEL = cgemv_n.c | |||
| ZGEMVNKERNEL = zgemv_n_4.c | |||
| # | |||
| SGEMVTKERNEL = sgemv_t.c | |||
| DGEMVTKERNEL = dgemv_t_power10.c | |||
| CGEMVTKERNEL = cgemv_t.c | |||
| ZGEMVTKERNEL = zgemv_t_4.c | |||
| #SSYMV_U_KERNEL = ../generic/symv_k.c | |||
| #SSYMV_L_KERNEL = ../generic/symv_k.c | |||
| #DSYMV_U_KERNEL = ../generic/symv_k.c | |||
| #DSYMV_L_KERNEL = ../generic/symv_k.c | |||
| #QSYMV_U_KERNEL = ../generic/symv_k.c | |||
| #QSYMV_L_KERNEL = ../generic/symv_k.c | |||
| #CSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| #CSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| #ZSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| #ZSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| #XSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| #XSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| #ZHEMV_U_KERNEL = ../generic/zhemv_k.c | |||
| #ZHEMV_L_KERNEL = ../generic/zhemv_k.c | |||
| LSAME_KERNEL = ../generic/lsame.c | |||
| SCABS_KERNEL = ../generic/cabs.c | |||
| DCABS_KERNEL = ../generic/cabs.c | |||
| QCABS_KERNEL = ../generic/cabs.c | |||
| #Dump kernel | |||
| CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
| ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
| endif | |||
| @@ -1,3 +1,44 @@ | |||
| # Big-endian 32-bit (AIX) builds are supported through the POWER6 GEMM kernels; no separate TRMM kernel is set. | |||
| ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) | |||
| SGEMMKERNEL = gemm_kernel_power6.S | |||
| SGEMMINCOPY = | |||
| SGEMMITCOPY = | |||
| SGEMMONCOPY = gemm_ncopy_4.S | |||
| SGEMMOTCOPY = gemm_tcopy_4.S | |||
| SGEMMINCOPYOBJ = | |||
| SGEMMITCOPYOBJ = | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMKERNEL = gemm_kernel_power6.S | |||
| DGEMMINCOPY = | |||
| DGEMMITCOPY = | |||
| DGEMMONCOPY = gemm_ncopy_4.S | |||
| DGEMMOTCOPY = gemm_tcopy_4.S | |||
| DGEMMINCOPYOBJ = | |||
| DGEMMITCOPYOBJ = | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = zgemm_kernel_power6.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_2.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_2.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_power6.S | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| else | |||
| #SGEMM_BETA = ../generic/gemm_beta.c | |||
| #DGEMM_BETA = ../generic/gemm_beta.c | |||
| #CGEMM_BETA = ../generic/zgemm_beta.c | |||
| @@ -12,7 +53,7 @@ SGEMMKERNEL = sgemm_kernel_16x8_power8.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SGEMMITCOPY = sgemm_tcopy_16_power8.S | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
| SGEMMOTCOPY = sgemm_tcopy_8_power8.S | |||
| SGEMMOTCOPY = sgemm_tcopy_8_power8.S | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| @@ -47,16 +88,24 @@ ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) | |||
| DTRSMKERNEL_LN = trsm_kernel_power6_LN.S | |||
| DTRSMKERNEL_LT = trsm_kernel_power6_LT.S | |||
| DTRSMKERNEL_RN = trsm_kernel_power6_LT.S | |||
| DTRSMKERNEL_RT = trsm_kernel_power6_RT.S | |||
| else | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| endif | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| @@ -153,6 +202,10 @@ ZASUMKERNEL = zasum.c | |||
| # | |||
| SAXPYKERNEL = saxpy.c | |||
| DAXPYKERNEL = daxpy.c | |||
| # | |||
| ifeq ($(__BYTE_ORDER__)$(BINARY32),__ORDER_BIG_ENDIAN__1) | |||
| CAXPYKERNEL = zaxpy.S | |||
| else | |||
| ifneq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) | |||
| ifneq ($(GCCVERSIONGTEQ9),1) | |||
| CAXPYKERNEL = caxpy_power8.S | |||
| @@ -162,6 +215,7 @@ endif | |||
| else | |||
| CAXPYKERNEL = caxpy.c | |||
| endif | |||
| endif | |||
| # | |||
| ZAXPYKERNEL = zaxpy.c | |||
| # | |||
| @@ -232,3 +286,10 @@ QCABS_KERNEL = ../generic/cabs.c | |||
| #Dump kernel | |||
| CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
| ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
| ifeq ($(__BYTE_ORDER__)$(ELF_VERSION),__ORDER_BIG_ENDIAN__2) | |||
| IDAMAXKERNEL = ../arm/iamax.c | |||
| IDAMINKERNEL = ../arm/iamin.c | |||
| IZAMAXKERNEL = ../arm/izamax.c | |||
| IZAMINKERNEL = ../arm/izamin.c | |||
| endif | |||
| @@ -16,7 +16,7 @@ SGEMMKERNEL = sgemm_kernel_power9.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SGEMMITCOPY = sgemm_tcopy_16_power8.S | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
| SGEMMOTCOPY = sgemm_tcopy_8_power8.S | |||
| SGEMMOTCOPY = sgemm_tcopy_8_power8.S | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| @@ -20,8 +20,10 @@ ZAXPYKERNEL = zaxpy_ppc440.S | |||
| SDOTKERNEL = dot_ppc440.S | |||
| DDOTKERNEL = dot_ppc440.S | |||
| CDOTKERNEL = zdot_ppc440.S | |||
| ZDOTKERNEL = zdot_ppc440.S | |||
| #CDOTKERNEL = zdot_ppc440.S | |||
| #ZDOTKERNEL = zdot_ppc440.S | |||
| CDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = ../arm/zdot.c | |||
| ISAMAXKERNEL = iamax_ppc440.S | |||
| IDAMAXKERNEL = iamax_ppc440.S | |||
| @@ -52,8 +54,11 @@ ZNRM2KERNEL = znrm2_ppc440.S | |||
| SROTKERNEL = rot_ppc440.S | |||
| DROTKERNEL = rot_ppc440.S | |||
| CROTKERNEL = zrot_ppc440.S | |||
| ZROTKERNEL = zrot_ppc440.S | |||
| #CROTKERNEL = zrot_ppc440.S | |||
| #ZROTKERNEL = zrot_ppc440.S | |||
| CROTKERNEL = ../arm/zrot.c | |||
| ZROTKERNEL = ../arm/zrot.c | |||
| SSCALKERNEL = scal_ppc440.S | |||
| DSCALKERNEL = scal_ppc440.S | |||
| @@ -78,13 +83,18 @@ DGEMMINCOPYOBJ = | |||
| DGEMMITCOPYOBJ = | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = zgemm_kernel_altivec_g4.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_8.c | |||
| #CGEMMKERNEL = zgemm_kernel_altivec_g4.S | |||
| #CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
| #CGEMMITCOPY = ../generic/zgemm_tcopy_8.c | |||
| CGEMMKERNEL = zgemm_kernel.S | |||
| CGEMMINCOPY = | |||
| CGEMMONCOPY = | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMINCOPYOBJ = | |||
| #cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = | |||
| #cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_g4.S | |||
| @@ -46,9 +46,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "casum_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| #ifndef HAVE_KERNEL_16 | |||
| @@ -35,9 +35,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #if defined(__VEC__) || defined(__ALTIVEC__) | |||
| #include "ccopy_microk_power8.c" | |||
| #endif | |||
| #endif | |||
| #ifndef HAVE_KERNEL_32 | |||
| @@ -23,6 +23,9 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #if !defined(__VEC__) || !defined(__ALTIVEC__) | |||
| #include "../arm/zdot.c" | |||
| #else | |||
| #include "common.h" | |||
| #ifndef HAVE_KERNEL_8 | |||
| @@ -168,3 +171,4 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA | |||
| return (result); | |||
| } | |||
| #endif | |||
| @@ -424,7 +424,7 @@ L999: | |||
| lwz r16, 204(SP) | |||
| lwz r15, 208(SP) | |||
| lwz r14, 212(SP) | |||
| addi r11, 224 | |||
| addi r11, SP, 224 | |||
| #endif | |||
| lvx v20, r11, r0 | |||
| addi r11, r11, 16 | |||
| @@ -459,4 +459,4 @@ L999: | |||
| blr | |||
| EPILOGUE | |||
| #endif^ | |||
| #endif | |||
| @@ -0,0 +1,286 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2020, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define LOAD ld | |||
| #define STACKSIZE (512 ) | |||
| #define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||
| #define M r3 | |||
| #define N r4 | |||
| #define K r5 | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r6 | |||
| #define OFFSET r7 | |||
| #define alpha_r vs51 | |||
| #define alpha_i vs55 | |||
| #define save_permute_1 vs59 | |||
| #define permute_mask vs63 | |||
| #define o0 0 | |||
| #define T1 r11 | |||
| #define T2 r12 | |||
| #define T3 r14 | |||
| #define T4 r15 | |||
| #define T5 r16 | |||
| #define T6 r17 | |||
| #define L r18 | |||
| #define T7 r19 | |||
| #define T8 r20 | |||
| #define TEMP_REG r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define T9 r27 | |||
| #define T10 r28 | |||
| #define PRE r29 | |||
| #define T12 r30 | |||
| #define T13 r31 | |||
| #include "cgemm_macros_power10.S" | |||
| .equ perm_const1, 0x0405060700010203 | |||
| .equ perm_const2, 0x0c0d0e0f08090a0b | |||
| .equ save_permute_12, 0x0c0d0e0f1c1d1e1f | |||
| .equ save_permute_11, 0x0405060714151617 | |||
| #ifndef NEEDPARAM | |||
| PROLOGUE | |||
| PROFCODE | |||
| addi SP, SP, -STACKSIZE | |||
| mflr r0 | |||
| stfd f14, 0(SP) | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| std r31, 144(SP) | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| stxv vs52, 288(SP) | |||
| stxv vs53, 304(SP) | |||
| stxv vs54, 320(SP) | |||
| stxv vs55, 336(SP) | |||
| stxv vs56, 352(SP) | |||
| stxv vs57, 368(SP) | |||
| stxv vs58, 384(SP) | |||
| stxv vs59, 400(SP) | |||
| stxv vs60, 416(SP) | |||
| stxv vs61, 432(SP) | |||
| stxv vs62, 448(SP) | |||
| stxv vs63, 464(SP) | |||
| std r0, FLINK_SAVE(SP) | |||
| ld LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #ifdef TRMMKERNEL | |||
| ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||
| #endif | |||
| slwi LDC, LDC, ZBASE_SHIFT | |||
| /* alpha_r and alpha_i are read from vs1 and vs2; convert each to single precision and splat it across the vector */ | |||
| xscvdpspn alpha_r,vs1 | |||
| xscvdpspn alpha_i,vs2 | |||
| xxspltw alpha_r,alpha_r,0 | |||
| xxspltw alpha_i,alpha_i,0 | |||
| /*load reverse permute mask for big endian | |||
| uint128 = 0x0c0d0e0f08090a0b0405060700010203 | |||
| */ | |||
| lis T2, perm_const2@highest | |||
| lis T1, perm_const1@highest | |||
| lis T3, save_permute_12@highest | |||
| lis T4, save_permute_11@highest | |||
| ori T2, T2, perm_const2@higher | |||
| ori T1, T1, perm_const1@higher | |||
| ori T3, T3, save_permute_12@higher | |||
| ori T4, T4, save_permute_11@higher | |||
| rldicr T2, T2, 32, 31 | |||
| rldicr T1, T1, 32, 31 | |||
| rldicr T3, T3, 32, 31 | |||
| rldicr T4, T4, 32, 31 | |||
| oris T2, T2, perm_const2@h | |||
| oris T1, T1, perm_const1@h | |||
| oris T3, T3, save_permute_12@h | |||
| oris T4, T4, save_permute_11@h | |||
| ori T2, T2, perm_const2@l | |||
| ori T1, T1, perm_const1@l | |||
| ori T3, T3, save_permute_12@l | |||
| ori T4, T4, save_permute_11@l | |||
| li r0,0 | |||
| li PRE,512 | |||
| #if defined(CC) || defined(CR) || defined(RC) || defined(RR) | |||
| /* for the CC, CR, RC and RR variants, negate alpha up front because the result is formed by addition, i.e. -1*(a+b) */ | |||
| xvnegsp alpha_r,alpha_r | |||
| xvnegsp alpha_i,alpha_i | |||
| #endif | |||
| mtvsrdd permute_mask,T2,T1 | |||
| mtvsrdd save_permute_1,T3,T4 | |||
| /* the mask built above is a reverse permute; swap its two doublewords (xxpermdi ..., 2) to turn it into the inner permute */ | |||
| xxpermdi permute_mask, permute_mask, permute_mask,2 | |||
| #include "cgemm_logic_power10.S" | |||
| .L999: | |||
| lfd f14, 0(SP) | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| ld r0, FLINK_SAVE(SP) | |||
| lxv vs52, 288(SP) | |||
| lxv vs53, 304(SP) | |||
| lxv vs54, 320(SP) | |||
| lxv vs55, 336(SP) | |||
| lxv vs56, 352(SP) | |||
| lxv vs57, 368(SP) | |||
| lxv vs58, 384(SP) | |||
| lxv vs59, 400(SP) | |||
| mtlr r0 | |||
| lxv vs60, 416(SP) | |||
| lxv vs61, 432(SP) | |||
| lxv vs62, 448(SP) | |||
| lxv vs63, 464(SP) | |||
| addi SP, SP, STACKSIZE | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||