| @@ -190,3 +190,27 @@ steps: | |||
| - make -C ctest $COMMON_FLAGS | |||
| - make -C utest $COMMON_FLAGS | |||
| - make -C cpp_thread_test dgemm_tester | |||
| --- | |||
| kind: pipeline | |||
| name: arm64_gcc10 | |||
| platform: | |||
| os: linux | |||
| arch: arm64 | |||
| steps: | |||
| - name: Build and Test | |||
| image: ubuntu:20.04 | |||
| environment: | |||
| CC: gcc-10 | |||
| FC: gfortran-10 | |||
| COMMON_FLAGS: 'TARGET=ARMV8 DYNAMIC_ARCH=1' | |||
| commands: | |||
| - echo "MAKE_FLAGS:= $COMMON_FLAGS" | |||
| - apt-get update -y | |||
| - apt-get install -y make $CC gfortran-10 perl python g++ | |||
| - $CC --version | |||
| - make QUIET_MAKE=1 $COMMON_FLAGS | |||
| - make -C utest $COMMON_FLAGS | |||
| - make -C test $COMMON_FLAGS | |||
| @@ -43,7 +43,7 @@ jobs: | |||
| - name: Update Homebrew | |||
| if: github.event_name != 'pull_request' | |||
| run: brew update || true | |||
| - name: Install prerequisites | |||
| run: brew install --fetch-HEAD --HEAD --only-dependencies --keep-tmp openblas | |||
| @@ -89,5 +89,7 @@ build.* | |||
| *.swp | |||
| benchmark/*.goto | |||
| benchmark/smallscaling | |||
| .vscode | |||
| CMakeCache.txt | |||
| CMakeFiles/* | |||
| .vscode | |||
| @@ -1,33 +1,38 @@ | |||
| # XXX: Precise is already deprecated, new default is Trusty. | |||
| # https://blog.travis-ci.com/2017-07-11-trusty-as-default-linux-is-coming | |||
| dist: precise | |||
| dist: focal | |||
| sudo: true | |||
| language: c | |||
| matrix: | |||
| include: | |||
| - &test-ubuntu | |||
| os: linux | |||
| # os: linux | |||
| compiler: gcc | |||
| addons: | |||
| apt: | |||
| packages: | |||
| - gfortran | |||
| # before_script: &common-before | |||
| # - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" | |||
| # script: | |||
| # - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| # - make -C test $COMMON_FLAGS $BTYPE | |||
| # - make -C ctest $COMMON_FLAGS $BTYPE | |||
| # - make -C utest $COMMON_FLAGS $BTYPE | |||
| # env: | |||
| # - TARGET_BOX=LINUX64 | |||
| # - BTYPE="BINARY=64" | |||
| # | |||
| # - <<: *test-ubuntu | |||
| os: linux-ppc64le | |||
| before_script: &common-before | |||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" | |||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" | |||
| script: | |||
| - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| - make -C test $COMMON_FLAGS $BTYPE | |||
| - make -C ctest $COMMON_FLAGS $BTYPE | |||
| - make -C utest $COMMON_FLAGS $BTYPE | |||
| env: | |||
| - TARGET_BOX=LINUX64 | |||
| - BTYPE="BINARY=64" | |||
| - <<: *test-ubuntu | |||
| os: linux-ppc64le | |||
| before_script: | |||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=POWER8 NUM_THREADS=32" | |||
| env: | |||
| # for matrix annotation only | |||
| - TARGET_BOX=PPC64LE_LINUX | |||
| @@ -55,38 +60,38 @@ matrix: | |||
| - TARGET_BOX=IBMZ_LINUX | |||
| - BTYPE="BINARY=64 USE_OPENMP=0 CC=clang" | |||
| - <<: *test-ubuntu | |||
| env: | |||
| - TARGET_BOX=LINUX64 | |||
| - BTYPE="BINARY=64 USE_OPENMP=1" | |||
| - <<: *test-ubuntu | |||
| env: | |||
| - TARGET_BOX=LINUX64 | |||
| - BTYPE="BINARY=64 INTERFACE64=1" | |||
| - <<: *test-ubuntu | |||
| compiler: clang | |||
| env: | |||
| - TARGET_BOX=LINUX64 | |||
| - BTYPE="BINARY=64 CC=clang" | |||
| - <<: *test-ubuntu | |||
| compiler: clang | |||
| env: | |||
| - TARGET_BOX=LINUX64 | |||
| - BTYPE="BINARY=64 INTERFACE64=1 CC=clang" | |||
| - <<: *test-ubuntu | |||
| addons: | |||
| apt: | |||
| packages: | |||
| - gcc-multilib | |||
| - gfortran-multilib | |||
| env: | |||
| - TARGET_BOX=LINUX32 | |||
| - BTYPE="BINARY=32" | |||
| # - <<: *test-ubuntu | |||
| # env: | |||
| # - TARGET_BOX=LINUX64 | |||
| # - BTYPE="BINARY=64 USE_OPENMP=1" | |||
| # | |||
| # - <<: *test-ubuntu | |||
| # env: | |||
| # - TARGET_BOX=LINUX64 | |||
| # - BTYPE="BINARY=64 INTERFACE64=1" | |||
| # | |||
| # - <<: *test-ubuntu | |||
| # compiler: clang | |||
| # env: | |||
| # - TARGET_BOX=LINUX64 | |||
| # - BTYPE="BINARY=64 CC=clang" | |||
| # | |||
| # - <<: *test-ubuntu | |||
| # compiler: clang | |||
| # env: | |||
| # - TARGET_BOX=LINUX64 | |||
| # - BTYPE="BINARY=64 INTERFACE64=1 CC=clang" | |||
| # | |||
| # - <<: *test-ubuntu | |||
| # addons: | |||
| # apt: | |||
| # packages: | |||
| # - gcc-multilib | |||
| # - gfortran-multilib | |||
| # env: | |||
| # - TARGET_BOX=LINUX32 | |||
| # - BTYPE="BINARY=32" | |||
| # | |||
| - os: linux | |||
| arch: ppc64le | |||
| dist: bionic | |||
| @@ -121,47 +126,47 @@ matrix: | |||
| # for matrix annotation only | |||
| - TARGET_BOX=PPC64LE_LINUX_P9 | |||
| - os: linux | |||
| compiler: gcc | |||
| addons: | |||
| apt: | |||
| packages: | |||
| - binutils-mingw-w64-x86-64 | |||
| - gcc-mingw-w64-x86-64 | |||
| - gfortran-mingw-w64-x86-64 | |||
| before_script: *common-before | |||
| script: | |||
| - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| env: | |||
| - TARGET_BOX=WIN64 | |||
| - BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" | |||
| # - os: linux | |||
| # compiler: gcc | |||
| # addons: | |||
| # apt: | |||
| # packages: | |||
| # - binutils-mingw-w64-x86-64 | |||
| # - gcc-mingw-w64-x86-64 | |||
| # - gfortran-mingw-w64-x86-64 | |||
| # before_script: *common-before | |||
| # script: | |||
| # - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| # env: | |||
| # - TARGET_BOX=WIN64 | |||
| # - BTYPE="BINARY=64 HOSTCC=gcc CC=x86_64-w64-mingw32-gcc FC=x86_64-w64-mingw32-gfortran" | |||
| # | |||
| # Build & test on Alpine Linux inside chroot, i.e. on system with musl libc. | |||
| # These jobs need sudo, so Travis runs them on VM-based infrastructure | |||
| # which is slower than container-based infrastructure used for jobs | |||
| # that don't require sudo. | |||
| - &test-alpine | |||
| os: linux | |||
| dist: trusty | |||
| sudo: true | |||
| language: minimal | |||
| before_install: | |||
| - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ | |||
| && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1" | |||
| - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } | |||
| install: | |||
| - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' | |||
| before_script: *common-before | |||
| script: | |||
| # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size. | |||
| - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types" | |||
| - alpine make -C test $COMMON_FLAGS $BTYPE | |||
| - alpine make -C ctest $COMMON_FLAGS $BTYPE | |||
| - alpine make -C utest $COMMON_FLAGS $BTYPE | |||
| env: | |||
| - TARGET_BOX=LINUX64_MUSL | |||
| - BTYPE="BINARY=64" | |||
| # - &test-alpine | |||
| # os: linux | |||
| # dist: trusty | |||
| # sudo: true | |||
| # language: minimal | |||
| # before_install: | |||
| # - "wget 'https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.9.0/alpine-chroot-install' \ | |||
| # && echo 'e5dfbbdc0c4b3363b99334510976c86bfa6cb251 alpine-chroot-install' | sha1sum -c || exit 1" | |||
| # - alpine() { /alpine/enter-chroot -u "$USER" "$@"; } | |||
| # install: | |||
| # - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' | |||
| # before_script: *common-before | |||
| # script: | |||
| # # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size. | |||
| # - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| # CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types" | |||
| # - alpine make -C test $COMMON_FLAGS $BTYPE | |||
| # - alpine make -C ctest $COMMON_FLAGS $BTYPE | |||
| # - alpine make -C utest $COMMON_FLAGS $BTYPE | |||
| # env: | |||
| # - TARGET_BOX=LINUX64_MUSL | |||
| # - BTYPE="BINARY=64" | |||
| # XXX: This job segfaults in TESTS OF THE COMPLEX LEVEL 3 BLAS, | |||
| # but only on Travis CI, cannot reproduce it elsewhere. | |||
| @@ -171,89 +176,98 @@ matrix: | |||
| # - TARGET_BOX=LINUX64_MUSL | |||
| # - BTYPE="BINARY=64 USE_OPENMP=1" | |||
| - <<: *test-alpine | |||
| env: | |||
| - TARGET_BOX=LINUX64_MUSL | |||
| - BTYPE="BINARY=64 INTERFACE64=1" | |||
| # - <<: *test-alpine | |||
| # env: | |||
| # - TARGET_BOX=LINUX64_MUSL | |||
| # - BTYPE="BINARY=64 INTERFACE64=1" | |||
| # | |||
| # # Build with the same flags as Alpine does in its OpenBLAS package. | |||
| # - <<: *test-alpine | |||
| # env: | |||
| # - TARGET_BOX=LINUX64_MUSL | |||
| # - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2" | |||
| # Build with the same flags as Alpine does in its OpenBLAS package. | |||
| - <<: *test-alpine | |||
| env: | |||
| - TARGET_BOX=LINUX64_MUSL | |||
| - BTYPE="BINARY=64 NO_AFFINITY=1 USE_OPENMP=0 NO_LAPACK=0 TARGET=CORE2" | |||
| # - &test-cmake | |||
| # os: linux | |||
| # compiler: clang | |||
| # addons: | |||
| # apt: | |||
| # packages: | |||
| # - gfortran | |||
| # - cmake | |||
| # dist: trusty | |||
| # sudo: true | |||
| # before_script: | |||
| # - COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32" | |||
| # script: | |||
| # - mkdir build | |||
| # - CONFIG=Release | |||
| # - cmake -Bbuild -H. $CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG | |||
| # - cmake --build build --config $CONFIG -- -j2 | |||
| # env: | |||
| # - CMAKE=1 | |||
| # - <<: *test-cmake | |||
| # env: | |||
| # - CMAKE=1 CMAKE_ARGS="-DNOFORTRAN=1" | |||
| # - <<: *test-cmake | |||
| # compiler: gcc | |||
| # env: | |||
| # - CMAKE=1 | |||
| - &test-cmake | |||
| os: linux | |||
| compiler: clang | |||
| addons: | |||
| apt: | |||
| packages: | |||
| - gfortran | |||
| - cmake | |||
| dist: trusty | |||
| sudo: true | |||
| before_script: | |||
| - COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32" | |||
| script: | |||
| - mkdir build | |||
| - CONFIG=Release | |||
| - cmake -Bbuild -H. $CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG | |||
| - cmake --build build --config $CONFIG -- -j2 | |||
| env: | |||
| - CMAKE=1 | |||
| - <<: *test-cmake | |||
| env: | |||
| - CMAKE=1 CMAKE_ARGS="-DNOFORTRAN=1" | |||
| - <<: *test-cmake | |||
| compiler: gcc | |||
| env: | |||
| - CMAKE=1 | |||
| - &test-macos | |||
| os: osx | |||
| osx_image: xcode11.5 | |||
| before_script: | |||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" | |||
| script: | |||
| - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| env: | |||
| - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9" | |||
| # - &test-macos | |||
| # os: osx | |||
| # osx_image: xcode11.5 | |||
| # before_script: | |||
| # - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" | |||
| # script: | |||
| # - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| # env: | |||
| # - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-9" | |||
| # | |||
| # - <<: *test-macos | |||
| # osx_image: xcode12 | |||
| # before_script: | |||
| # - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" | |||
| # - brew update | |||
| # script: | |||
| # - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| # env: | |||
| # - BTYPE="TARGET=HASWELL USE_OPENMP=1 BINARY=64 INTERFACE64=1 CC=gcc-10 FC=gfortran-10" | |||
| # | |||
| # - <<: *test-macos | |||
| # osx_image: xcode12 | |||
| # before_script: | |||
| # - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" | |||
| # - brew update | |||
| # script: | |||
| # - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| # env: | |||
| # - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" | |||
| - <<: *test-macos | |||
| osx_image: xcode12 | |||
| before_script: | |||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" | |||
| - brew update | |||
| - brew install gcc@10 | |||
| script: | |||
| - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| env: | |||
| - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" | |||
| # - <<: *test-macos | |||
| # osx_image: xcode10 | |||
| # env: | |||
| # - BTYPE="TARGET=NEHALEM BINARY=32 NOFORTRAN=1" | |||
| - <<: *test-macos | |||
| osx_image: xcode11.5 | |||
| before_script: | |||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" | |||
| - brew update | |||
| env: | |||
| # - <<: *test-macos | |||
| # osx_image: xcode11.5 | |||
| # before_script: | |||
| # - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" | |||
| # - brew update | |||
| # env: | |||
| # - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" | |||
| # - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch arm64 -miphoneos-version-min=10.0" | |||
| - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" | |||
| - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0" | |||
| - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" | |||
| - <<: *test-macos | |||
| osx_image: xcode11.5 | |||
| env: | |||
| # - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" | |||
| # - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" | |||
| - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" | |||
| - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1" | |||
| - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1" | |||
| # - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" | |||
| # - CFLAGS="-O2 -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch arm64 -miphoneos-version-min=10.0" | |||
| # - BTYPE="TARGET=ARMV8 BINARY=64 HOSTCC=clang NOFORTRAN=1" | |||
| # - <<: *test-macos | |||
| # osx_image: xcode11.5 | |||
| # env: | |||
| ## - CC="/Applications/Xcode-10.1.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" | |||
| ## - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-10.1.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS12.1.sdk -arch armv7 -miphoneos-version-min=5.1" | |||
| # - CC="/Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang" | |||
| # - CFLAGS="-O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode-11.5.GM.Seed.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS13.5.sdk -arch armv7 -miphoneos-version-min=5.1" | |||
| # - BTYPE="TARGET=ARMV7 HOSTCC=clang NOFORTRAN=1" | |||
| - &test-graviton2 | |||
| os: linux | |||
| @@ -3,10 +3,13 @@ | |||
| ## | |||
| cmake_minimum_required(VERSION 2.8.5) | |||
| project(OpenBLAS C ASM) | |||
| set(OpenBLAS_MAJOR_VERSION 0) | |||
| set(OpenBLAS_MINOR_VERSION 3) | |||
| set(OpenBLAS_PATCH_VERSION 12.dev) | |||
| set(OpenBLAS_PATCH_VERSION 20) | |||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | |||
| # Adhere to GNU filesystem layout conventions | |||
| @@ -14,54 +17,74 @@ include(GNUInstallDirs) | |||
| include(CMakePackageConfigHelpers) | |||
| if(MSVC AND NOT DEFINED NOFORTRAN) | |||
| set(NOFORTRAN ON) | |||
| endif() | |||
| ####### | |||
| if(MSVC) | |||
| option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) | |||
| option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS)" ON) | |||
| endif() | |||
| option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) | |||
| option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF) | |||
| option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) | |||
| option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF) | |||
| option(USE_LOCKING "Use locks even in single-threaded builds to make them callable from multiple threads" OFF) | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") | |||
| option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) | |||
| option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) | |||
| else() | |||
| set(NO_AFFINITY 1) | |||
| set(NO_AFFINITY 1) | |||
| endif() | |||
| option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF) | |||
| option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF) | |||
| option(BUILD_STATIC_LIBS "Build static library" OFF) | |||
| if(NOT BUILD_STATIC_LIBS AND NOT BUILD_SHARED_LIBS) | |||
| set(BUILD_STATIC_LIBS ON CACHE BOOL "Build static library" FORCE) | |||
| endif() | |||
| # When both library flavours are requested under MSVC, warn and force | |||
| # BUILD_SHARED_LIBS off in the cache so that only the static library is built. | |||
| if((BUILD_STATIC_LIBS AND BUILD_SHARED_LIBS) AND MSVC) | |||
| message(WARNING "Could not enable both BUILD_STATIC_LIBS and BUILD_SHARED_LIBS with MSVC, Disable BUILD_SHARED_LIBS") | |||
| # The help string describes the variable being set (BUILD_SHARED_LIBS), not the static option. | |||
| set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build shared library" FORCE) | |||
| endif() | |||
| # Add a prefix or suffix to all exported symbol names in the shared library. | |||
| # Avoids conflicts with other BLAS libraries, especially when using | |||
| # 64 bit integer interfaces in OpenBLAS. | |||
| set(SYMBOLPREFIX "" CACHE STRING "Add a prefix to all exported symbol names in the shared library to avoid conflicts with other BLAS libraries" ) | |||
| set(SYMBOLSUFFIX "" CACHE STRING "Add a suffix to all exported symbol names in the shared library, e.g. _64 for INTERFACE64 builds" ) | |||
| ####### | |||
| if(BUILD_WITHOUT_LAPACK) | |||
| set(NO_LAPACK 1) | |||
| set(NO_LAPACKE 1) | |||
| set(NO_LAPACK 1) | |||
| set(NO_LAPACKE 1) | |||
| endif() | |||
| if(BUILD_WITHOUT_CBLAS) | |||
| set(NO_CBLAS 1) | |||
| set(NO_CBLAS 1) | |||
| endif() | |||
| ####### | |||
| if(MSVC AND MSVC_STATIC_CRT) | |||
| set(CompilerFlags | |||
| CMAKE_CXX_FLAGS | |||
| CMAKE_CXX_FLAGS_DEBUG | |||
| CMAKE_CXX_FLAGS_RELEASE | |||
| CMAKE_C_FLAGS | |||
| CMAKE_C_FLAGS_DEBUG | |||
| CMAKE_C_FLAGS_RELEASE | |||
| ) | |||
| foreach(CompilerFlag ${CompilerFlags}) | |||
| string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") | |||
| endforeach() | |||
| set(CompilerFlags | |||
| CMAKE_CXX_FLAGS | |||
| CMAKE_CXX_FLAGS_DEBUG | |||
| CMAKE_CXX_FLAGS_RELEASE | |||
| CMAKE_C_FLAGS | |||
| CMAKE_C_FLAGS_DEBUG | |||
| CMAKE_C_FLAGS_RELEASE | |||
| ) | |||
| foreach(CompilerFlag ${CompilerFlags}) | |||
| string(REPLACE "/MD" "/MT" ${CompilerFlag} "${${CompilerFlag}}") | |||
| endforeach() | |||
| endif() | |||
| message(WARNING "CMake support is experimental. It does not yet support all build options and may not produce the same Makefiles that OpenBLAS ships with.") | |||
| @@ -95,7 +118,7 @@ endif () | |||
| # set which float types we want to build for | |||
| if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16) | |||
| # if none are defined, build for all | |||
| # set(BUILD_BFLOAT16 true) | |||
| # set(BUILD_BFLOAT16 true) | |||
| set(BUILD_SINGLE true) | |||
| set(BUILD_DOUBLE true) | |||
| set(BUILD_COMPLEX true) | |||
| @@ -129,7 +152,7 @@ endif () | |||
| if (BUILD_BFLOAT16) | |||
| message(STATUS "Building Half Precision") | |||
| list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing | |||
| # list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing | |||
| endif () | |||
| if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN") | |||
| @@ -140,9 +163,10 @@ endif () | |||
| set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||
| set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||
| if(MSVC) | |||
| set( CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib/Debug) | |||
| set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib/Release) | |||
| set( CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib/Debug) | |||
| set( CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib/Release) | |||
| endif () | |||
| # get obj vars into format that add_library likes: $<TARGET_OBJECTS:objlib> (see http://www.cmake.org/cmake/help/v3.0/command/add_library.html) | |||
| set(TARGET_OBJS "") | |||
| foreach (SUBDIR ${SUBDIRS}) | |||
| @@ -180,12 +204,63 @@ if (${DYNAMIC_ARCH}) | |||
| endif () | |||
| # add objects to the openblas lib | |||
| add_library(${OpenBLAS_LIBNAME} ${LA_SOURCES} ${LAPACKE_SOURCES} ${RELA_SOURCES} ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | |||
| target_include_directories(${OpenBLAS_LIBNAME} INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>) | |||
| if(NOT NO_LAPACK) | |||
| add_library(LAPACK OBJECT ${LA_SOURCES}) | |||
| list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACK>") | |||
| endif() | |||
| if(NOT NO_LAPACKE) | |||
| add_library(LAPACKE OBJECT ${LAPACKE_SOURCES}) | |||
| list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:LAPACKE>") | |||
| endif() | |||
| if(BUILD_RELAPACK) | |||
| add_library(RELAPACK OBJECT ${RELA_SOURCES}) | |||
| list(APPEND TARGET_OBJS "$<TARGET_OBJECTS:RELAPACK>") | |||
| endif() | |||
| set(OpenBLAS_LIBS "") | |||
| if(BUILD_STATIC_LIBS) | |||
| add_library(${OpenBLAS_LIBNAME}_static STATIC ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | |||
| target_include_directories(${OpenBLAS_LIBNAME}_static INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>) | |||
| list(APPEND OpenBLAS_LIBS ${OpenBLAS_LIBNAME}_static) | |||
| endif() | |||
| if(BUILD_SHARED_LIBS) | |||
| add_library(${OpenBLAS_LIBNAME}_shared SHARED ${TARGET_OBJS} ${OpenBLAS_DEF_FILE}) | |||
| target_include_directories(${OpenBLAS_LIBNAME}_shared INTERFACE $<INSTALL_INTERFACE:include/openblas${SUFFIX64}>) | |||
| list(APPEND OpenBLAS_LIBS ${OpenBLAS_LIBNAME}_shared) | |||
| endif() | |||
| if(BUILD_STATIC_LIBS) | |||
| add_library(${OpenBLAS_LIBNAME} ALIAS ${OpenBLAS_LIBNAME}_static) | |||
| else() | |||
| add_library(${OpenBLAS_LIBNAME} ALIAS ${OpenBLAS_LIBNAME}_shared) | |||
| endif() | |||
| set_target_properties(${OpenBLAS_LIBS} PROPERTIES OUTPUT_NAME ${OpenBLAS_LIBNAME}) | |||
| # Android needs to explicitly link against libm | |||
| if(ANDROID) | |||
| target_link_libraries(${OpenBLAS_LIBNAME} m) | |||
| if(BUILD_STATIC_LIBS) | |||
| target_link_libraries(${OpenBLAS_LIBNAME}_static m) | |||
| endif() | |||
| if(BUILD_SHARED_LIBS) | |||
| target_link_libraries(${OpenBLAS_LIBNAME}_shared m) | |||
| endif() | |||
| endif() | |||
| if (APPLE AND DYNAMIC_ARCH AND BUILD_SHARED_LIBS) | |||
| set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||
| if (NOT NOFORTRAN) | |||
| set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1) | |||
| set (CMAKE_Fortran_CREATE_SHARED_LIBRARY | |||
| "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " | |||
| "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " | |||
| "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '" | |||
| "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'" | |||
| "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'") | |||
| else () | |||
| set (CMAKE_C_CREATE_SHARED_LIBRARY | |||
| "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ar -ru libopenblas.a && exit 0' " | |||
| "sh -c 'ar -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' " | |||
| "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'") | |||
| endif () | |||
| endif() | |||
| # Handle MSVC exports | |||
| @@ -194,21 +269,21 @@ if(MSVC AND BUILD_SHARED_LIBS) | |||
| include("${PROJECT_SOURCE_DIR}/cmake/export.cmake") | |||
| else() | |||
| # Creates verbose .def file (51KB vs 18KB) | |||
| set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true) | |||
| set_target_properties(${OpenBLAS_LIBNAME}_shared PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS true) | |||
| endif() | |||
| endif() | |||
| # Set output for libopenblas | |||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d") | |||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES EXPORT_NAME "OpenBLAS") | |||
| set_target_properties( ${OpenBLAS_LIBS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) | |||
| set_target_properties( ${OpenBLAS_LIBS} PROPERTIES LIBRARY_OUTPUT_NAME_DEBUG "${OpenBLAS_LIBNAME}_d") | |||
| set_target_properties( ${OpenBLAS_LIBS} PROPERTIES EXPORT_NAME "OpenBLAS") | |||
| foreach (OUTPUTCONFIG ${CMAKE_CONFIGURATION_TYPES}) | |||
| string( TOUPPER ${OUTPUTCONFIG} OUTPUTCONFIG ) | |||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) | |||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) | |||
| set_target_properties( ${OpenBLAS_LIBNAME} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) | |||
| set_target_properties( ${OpenBLAS_LIBS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) | |||
| set_target_properties( ${OpenBLAS_LIBS} PROPERTIES LIBRARY_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) | |||
| set_target_properties( ${OpenBLAS_LIBS} PROPERTIES ARCHIVE_OUTPUT_DIRECTORY_${OUTPUTCONFIG} ${PROJECT_BINARY_DIR}/lib/${OUTPUTCONFIG} ) | |||
| endforeach() | |||
| enable_testing() | |||
| @@ -217,10 +292,17 @@ if (USE_THREAD) | |||
| # Add threading library to linker | |||
| find_package(Threads) | |||
| if (THREADS_HAVE_PTHREAD_ARG) | |||
| set_property(TARGET ${OpenBLAS_LIBNAME} PROPERTY COMPILE_OPTIONS "-pthread") | |||
| set_property(TARGET ${OpenBLAS_LIBNAME} PROPERTY INTERFACE_COMPILE_OPTIONS "-pthread") | |||
| set_target_properties(${OpenBLAS_LIBS} PROPERTIES | |||
| COMPILE_OPTIONS "-pthread" | |||
| INTERFACE_COMPILE_OPTIONS "-pthread" | |||
| ) | |||
| endif() | |||
| if(BUILD_STATIC_LIBS) | |||
| target_link_libraries(${OpenBLAS_LIBNAME}_static ${CMAKE_THREAD_LIBS_INIT}) | |||
| endif() | |||
| if(BUILD_SHARED_LIBS) | |||
| target_link_libraries(${OpenBLAS_LIBNAME}_shared ${CMAKE_THREAD_LIBS_INIT}) | |||
| endif() | |||
| target_link_libraries(${OpenBLAS_LIBNAME} ${CMAKE_THREAD_LIBS_INIT}) | |||
| endif() | |||
| #if (MSVC OR NOT NOFORTRAN) | |||
| @@ -229,104 +311,116 @@ if (NOT NO_CBLAS) | |||
| add_subdirectory(utest) | |||
| endif() | |||
| if (NOT MSVC AND NOT NOFORTRAN) | |||
| if (NOT NOFORTRAN) | |||
| # Build test and ctest | |||
| add_subdirectory(test) | |||
| if(NOT NO_CBLAS) | |||
| add_subdirectory(ctest) | |||
| endif() | |||
| add_subdirectory(lapack-netlib/TESTING) | |||
| if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) | |||
| add_subdirectory(cpp_thread_test) | |||
| endif() | |||
| if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) | |||
| add_subdirectory(cpp_thread_test) | |||
| endif() | |||
| endif() | |||
| set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES | |||
| set_target_properties(${OpenBLAS_LIBS} PROPERTIES | |||
| VERSION ${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION} | |||
| SOVERSION ${OpenBLAS_MAJOR_VERSION} | |||
| ) | |||
| if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) | |||
| if (NOT MSVC) | |||
| target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition") | |||
| target_link_libraries(${OpenBLAS_LIBNAME}_shared "-Wl,-allow-multiple-definition") | |||
| else() | |||
| set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE") | |||
| set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE") | |||
| endif() | |||
| endif() | |||
| if (BUILD_SHARED_LIBS AND NOT ${SYMBOLPREFIX}${SYMBOLSUFFIX} STREQUAL "") | |||
| if (NOT DEFINED ARCH) | |||
| set(ARCH_IN "x86_64") | |||
| else() | |||
| set(ARCH_IN ${ARCH}) | |||
| endif() | |||
| if (NOT DEFINED ARCH) | |||
| set(ARCH_IN "x86_64") | |||
| else() | |||
| set(ARCH_IN ${ARCH}) | |||
| endif() | |||
| if (${CORE} STREQUAL "generic") | |||
| set(ARCH_IN "GENERIC") | |||
| endif () | |||
| if (${CORE} STREQUAL "generic") | |||
| set(ARCH_IN "GENERIC") | |||
| endif () | |||
| if (NOT DEFINED EXPRECISION) | |||
| set(EXPRECISION_IN 0) | |||
| else() | |||
| set(EXPRECISION_IN ${EXPRECISION}) | |||
| endif() | |||
| if (NOT DEFINED EXPRECISION) | |||
| set(EXPRECISION_IN 0) | |||
| else() | |||
| set(EXPRECISION_IN ${EXPRECISION}) | |||
| endif() | |||
| if (NOT DEFINED NO_CBLAS) | |||
| set(NO_CBLAS_IN 0) | |||
| else() | |||
| set(NO_CBLAS_IN ${NO_CBLAS}) | |||
| endif() | |||
| if (NOT DEFINED NO_CBLAS) | |||
| set(NO_CBLAS_IN 0) | |||
| else() | |||
| set(NO_CBLAS_IN ${NO_CBLAS}) | |||
| endif() | |||
| if (NOT DEFINED NO_LAPACK) | |||
| set(NO_LAPACK_IN 0) | |||
| else() | |||
| set(NO_LAPACK_IN ${NO_LAPACK}) | |||
| endif() | |||
| if (NOT DEFINED NO_LAPACK) | |||
| set(NO_LAPACK_IN 0) | |||
| else() | |||
| set(NO_LAPACK_IN ${NO_LAPACK}) | |||
| endif() | |||
| if (NOT DEFINED NO_LAPACKE) | |||
| set(NO_LAPACKE_IN 0) | |||
| else() | |||
| set(NO_LAPACKE_IN ${NO_LAPACKE}) | |||
| endif() | |||
| if (NOT DEFINED NO_LAPACKE) | |||
| set(NO_LAPACKE_IN 0) | |||
| else() | |||
| set(NO_LAPACKE_IN ${NO_LAPACKE}) | |||
| endif() | |||
| if (NOT DEFINED NEED2UNDERSCORES) | |||
| set(NEED2UNDERSCORES_IN 0) | |||
| else() | |||
| set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES}) | |||
| endif() | |||
| if (NOT DEFINED NEED2UNDERSCORES) | |||
| set(NEED2UNDERSCORES_IN 0) | |||
| else() | |||
| set(NEED2UNDERSCORES_IN ${NEED2UNDERSCORES}) | |||
| endif() | |||
| if (NOT DEFINED ONLY_CBLAS) | |||
| set(ONLY_CBLAS_IN 0) | |||
| else() | |||
| set(ONLY_CBLAS_IN ${ONLY_CBLAS}) | |||
| endif() | |||
| if (NOT DEFINED ONLY_CBLAS) | |||
| set(ONLY_CBLAS_IN 0) | |||
| else() | |||
| set(ONLY_CBLAS_IN ${ONLY_CBLAS}) | |||
| endif() | |||
| if (NOT DEFINED BU) | |||
| set(BU _) | |||
| endif() | |||
| if (NOT DEFINED BU) | |||
| set(BU _) | |||
| endif() | |||
| if (NOT ${SYMBOLPREFIX} STREQUAL "") | |||
| message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") | |||
| endif() | |||
| if (NOT ${SYMBOLSUFFIX} STREQUAL "") | |||
| message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") | |||
| endif() | |||
| add_custom_command(TARGET ${OpenBLAS_LIBNAME} POST_BUILD | |||
| COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def | |||
| COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so | |||
| COMMENT "renaming symbols" | |||
| ) | |||
| if (NOT ${SYMBOLPREFIX} STREQUAL "") | |||
| message(STATUS "adding prefix ${SYMBOLPREFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") | |||
| endif() | |||
| if (NOT ${SYMBOLSUFFIX} STREQUAL "") | |||
| message(STATUS "adding suffix ${SYMBOLSUFFIX} to names of exported symbols in ${OpenBLAS_LIBNAME}") | |||
| endif() | |||
| add_custom_command(TARGET ${OpenBLAS_LIBNAME}_shared POST_BUILD | |||
| COMMAND perl ${PROJECT_SOURCE_DIR}/exports/gensymbol "objcopy" "${ARCH}" "${BU}" "${EXPRECISION_IN}" "${NO_CBLAS_IN}" "${NO_LAPACK_IN}" "${NO_LAPACKE_IN}" "${NEED2UNDERSCORES_IN}" "${ONLY_CBLAS_IN}" \"${SYMBOLPREFIX}\" \"${SYMBOLSUFFIX}\" "${BUILD_LAPACK_DEPRECATED}" > ${PROJECT_BINARY_DIR}/objcopy.def | |||
| COMMAND objcopy -v --redefine-syms ${PROJECT_BINARY_DIR}/objcopy.def ${PROJECT_BINARY_DIR}/lib/lib${OpenBLAS_LIBNAME}.so | |||
| COMMENT "renaming symbols" | |||
| ) | |||
| endif() | |||
| # Install project | |||
| # Install libraries | |||
| install(TARGETS ${OpenBLAS_LIBNAME} | |||
| EXPORT "OpenBLAS${SUFFIX64}Targets" | |||
| RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} | |||
| ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} | |||
| LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) | |||
| if(BUILD_SHARED_LIBS AND BUILD_STATIC_LIBS) | |||
| install(TARGETS ${OpenBLAS_LIBNAME}_shared | |||
| EXPORT "OpenBLAS${SUFFIX64}Targets" | |||
| RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} | |||
| ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} | |||
| LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) | |||
| install(TARGETS ${OpenBLAS_LIBNAME}_static | |||
| ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} | |||
| LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) | |||
| else() | |||
| install(TARGETS ${OpenBLAS_LIBS} | |||
| EXPORT "OpenBLAS${SUFFIX64}Targets" | |||
| RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} | |||
| ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} | |||
| LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ) | |||
| endif() | |||
| # Install headers | |||
| set(CMAKE_INSTALL_INCLUDEDIR ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) | |||
| @@ -362,36 +456,41 @@ if(NOT NOFORTRAN) | |||
| endif() | |||
| # Generate the installable cblas.h from the source-tree cblas.h: every "common" | |||
| # becomes "openblas_config", and names starting with cblas/openblas/goto receive | |||
| # SYMBOLPREFIX / SYMBOLSUFFIX when those are non-empty. openblas_complex names | |||
| # are changed back afterwards so they stay unprefixed and unsuffixed. | |||
| # NOTE(review): the whole generation sequence appears twice in this listing; | |||
| # the second pass re-reads the source header and rewrites the same output file. | |||
| if(NOT NO_CBLAS) | |||
| message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") | |||
| set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) | |||
| file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) | |||
| string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| if (NOT ${SYMBOLPREFIX} STREQUAL "") | |||
| string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
| string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
| string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| endif() | |||
| if (NOT ${SYMBOLSUFFIX} STREQUAL "") | |||
| string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
| string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
| string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| endif() | |||
| file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") | |||
| install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
| message (STATUS "Generating cblas.h in ${CMAKE_INSTALL_INCLUDEDIR}") | |||
| set(CBLAS_H ${CMAKE_BINARY_DIR}/generated/cblas.h) | |||
| file(READ ${CMAKE_CURRENT_SOURCE_DIR}/cblas.h CBLAS_H_CONTENTS) | |||
| string(REPLACE "common" "openblas_config" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| if (NOT ${SYMBOLPREFIX} STREQUAL "") | |||
| string(REPLACE " cblas" " ${SYMBOLPREFIX}cblas" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
| string(REPLACE " openblas" " ${SYMBOLPREFIX}openblas" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| string (REPLACE " ${SYMBOLPREFIX}openblas_complex" " openblas_complex" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
| string(REPLACE " goto" " ${SYMBOLPREFIX}goto" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| endif() | |||
| if (NOT ${SYMBOLSUFFIX} STREQUAL "") | |||
| string(REGEX REPLACE "(cblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
| string(REGEX REPLACE "(openblas[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| string(REGEX REPLACE "(openblas_complex[^ ]*)${SYMBOLSUFFIX}" "\\1" CBLAS_H_CONTENTS "${CBLAS_H_CONTENTS_NEW}") | |||
| string(REGEX REPLACE "(goto[^ (]*)" "\\1${SYMBOLSUFFIX}" CBLAS_H_CONTENTS_NEW "${CBLAS_H_CONTENTS}") | |||
| endif() | |||
| file(WRITE ${CBLAS_H} "${CBLAS_H_CONTENTS_NEW}") | |||
| install (FILES ${CBLAS_H} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
| endif() | |||
| if(NOT NO_LAPACKE) | |||
| message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}") | |||
| add_dependencies( ${OpenBLAS_LIBNAME} genlapacke) | |||
| FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h") | |||
| install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
| ADD_CUSTOM_TARGET(genlapacke | |||
| COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" | |||
| ) | |||
| install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) | |||
| message (STATUS "Copying LAPACKE header files to ${CMAKE_INSTALL_INCLUDEDIR}") | |||
| if(BUILD_STATIC_LIBS) | |||
| add_dependencies( ${OpenBLAS_LIBNAME}_static genlapacke) | |||
| endif() | |||
| if(BUILD_SHARED_LIBS) | |||
| add_dependencies( ${OpenBLAS_LIBNAME}_shared genlapacke) | |||
| endif() | |||
| FILE(GLOB_RECURSE INCLUDE_FILES "${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/*.h") | |||
| install (FILES ${INCLUDE_FILES} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) | |||
| ADD_CUSTOM_TARGET(genlapacke | |||
| COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_SOURCE_DIR}/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h.in "${CMAKE_BINARY_DIR}/lapacke_mangling.h" | |||
| ) | |||
| install (FILES ${CMAKE_BINARY_DIR}/lapacke_mangling.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/openblas${SUFFIX64}) | |||
| endif() | |||
| # Install pkg-config files | |||
| @@ -416,4 +515,3 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${PN}ConfigVersion.cmake | |||
| install(EXPORT "${PN}${SUFFIX64}Targets" | |||
| NAMESPACE "${PN}${SUFFIX64}::" | |||
| DESTINATION ${CMAKECONFIG_INSTALL_DIR}) | |||
| @@ -194,3 +194,16 @@ In chronological order: | |||
| * PingTouGe Semiconductor Co., Ltd. | |||
| * [2020-10] Add RISC-V Vector (0.7.1) support. Optimize BLAS kernels for Xuantie C910 | |||
| * River Dillon <oss@outerpassage.net> | |||
| * [2021-07-10] fix compilation with musl libc | |||
| * Bine Brank <https://github.com/binebrank> | |||
| * [2021-10-27] Add vector-length-agnostic DGEMM kernels for Arm SVE | |||
| * [2021-11-20] Vector-length-agnostic Arm SVE copy routines for DGEMM, DTRMM, DSYMM | |||
| * [2021-11-12] SVE kernels for SGEMM, STRMM and corresponding SVE copy functions | |||
| * [2022-01-06] SVE kernels for CGEMM, ZGEMM, CTRMM, ZTRMM and corresponding SVE copy functions | |||
| * [2022-01-18] SVE kernels and copy functions for TRSM | |||
| * Ilya Kurdyukov <https://github.com/ilyakurdyukov> | |||
| * [2021-02-21] Add basic support for the Elbrus E2000 architecture | |||
| @@ -1,4 +1,340 @@ | |||
| OpenBLAS ChangeLog | |||
| ==================================================================== | |||
| Version 0.3.20 | |||
| 20-Feb-2022 | |||
| general: | |||
| - some code cleanup, with added casts etc. | |||
| - fixed obtaining the cpu count with OpenMP and OMP_PROC_BIND unset | |||
| - fixed pivot index calculation by ?LASWP for negative increments other than one | |||
| - fixed input argument check in LAPACK ?GEQRT2 | |||
| - improved the check for a Fortran compiler in CMAKE builds | |||
| - disabled building OpenBLAS' optimized versions of LAPACK complex SPMV,SPR,SYMV,SYR with NO_LAPACK=1 | |||
| - fixed building of LAPACK on certain distributed filesystems with parallel gmake | |||
| - fixed building the shared library on MacOS with classic flang | |||
| x86_64: | |||
| - fixed cross-compilation with CMAKE for CORE2 target | |||
| - fixed miscompilation of AVX512 code in DYNAMIC_ARCH builds | |||
| - added support for the "incidental" AVX512 hardware in Alder Lake when enabled in BIOS | |||
| E2K: | |||
| - add new architecture (Russian Elbrus E2000 family) | |||
| SPARC: | |||
| - fix IMIN/IMAX | |||
| ARMV8: | |||
| - added SVE-enabled CGEMM and ZGEMM kernels for ARMV8SVE and A64FX | |||
| - added support for Neoverse N2 and V1 cpus | |||
| MIPS,MIPS64: | |||
| - fixed autodetection of MSA capability | |||
| LOONGARCH64: | |||
| - added an optimized DGEMM kernel | |||
| ==================================================================== | |||
| Version 0.3.19 | |||
| 19-Dec-2021 | |||
| general: | |||
| - reverted unsafe TRSV/ZRSV optimizations introduced in 0.3.16 | |||
| - fixed a potential thread race in the thread buffer reallocation routines | |||
| that were introduced in 0.3.18 | |||
| - fixed miscounting of thread pool size on Linux with OMP_PROC_BIND=TRUE | |||
| - fixed CBLAS interfaces for CSROT/ZDROT and CROTG/ZROTG | |||
| - made automatic library suffix for CMAKE builds with INTERFACE64 available | |||
| to CBLAS-only builds | |||
| x86_64: | |||
| - DYNAMIC_ARCH builds now fall back to the cpu with most similar capabilities | |||
| when an unknown CPUID is encountered, instead of defaulting to Prescott | |||
| - added cpu detection for Intel Alder Lake | |||
| - added cpu detection for Intel Sapphire Rapids | |||
| - added an optimized SBGEMM kernel for Sapphire Rapids | |||
| - fixed DYNAMIC_ARCH builds on OSX with CMAKE | |||
| - worked around DYNAMIC_ARCH builds made on Sandybridge failing on SkylakeX | |||
| - fixed missing thread initialization for static builds on Windows/MSVC | |||
| - fixed an excessive read in ZSYMV | |||
| POWER: | |||
| - added support for POWER10 in big-endian mode | |||
| - added support for building with CMAKE | |||
| - added optimized SGEMM and DGEMM kernels for small matrix sizes | |||
| ARMV8: | |||
| - added basic support and cputype detection for Fujitsu A64FX | |||
| - added a generic ARMV8SVE target | |||
| - added SVE-enabled SGEMM and DGEMM kernels for ARMV8SVE and A64FX | |||
| - added optimized CGEMM and ZGEMM kernels for Cortex A53 and A55 cpus | |||
| - fixed cpuid detection for Apple M1 and improved performance | |||
| - improved compiler flag setting in CMAKE builds | |||
| RISCV64: | |||
| - fixed improper initialization in CSCAL/ZSCAL for strided access patterns | |||
| MIPS: | |||
| - added a GENERIC target for MIPS32 | |||
| - added support for cross-compiling to MIPS32 on x86_64 using CMAKE | |||
| MIPS64: | |||
| - fixed misdetection of MSA capability | |||
| ==================================================================== | |||
| Version 0.3.18 | |||
| 02-Oct-2021 | |||
| general: | |||
| - when the build-time number of preconfigured threads is exceeded | |||
| at runtime (typically by an external program calling BLAS functions | |||
| from a larger number of threads in parallel), OpenBLAS will now | |||
| allocate an auxiliary control structure for up to 512 additional | |||
| threads instead of aborting | |||
| - added support for Loongson's LoongArch64 cpu architecture | |||
| - fixed building OpenBLAS with CMAKE and -DBUILD_BFLOAT16=ON | |||
| - added support for building OpenBLAS as a CMAKE subproject | |||
| - added support for building for Windows/ARM64 targets with clang | |||
| - improved support for building with the IBM xlf compiler | |||
| - imported Reference-LAPACK PR 625 (out-of-bounds reads in ?LARRV) | |||
| - imported Reference-LAPACK PR 597 for testsuite compatibility with | |||
| LLVM's libomp | |||
| x86_64: | |||
| - added SkylakeX S/DGEMM kernels for small problem sizes (M*N*K<=1000000) | |||
| - added optimized SBGEMM for Intel Cooper Lake | |||
| - reinstated the performance patch for AVX512 SGEMV_T with a proper fix | |||
| - added a workaround for a gcc11 tree-vectorizer bug that caused spurious | |||
| failures in the test programs for complex BLAS3 when compiling at -O3 | |||
| (the default for cmake "release" builds) | |||
| - added support for runtime cpu count detection under Haiku OS | |||
| - worked around a long-standing miscompilation issue of the Haswell DGEMV_T | |||
| kernel with gcc that could produce NaN output in some corner cases | |||
| POWER: | |||
| - improved performance of DASUM on POWER10 | |||
| ARMV8: | |||
| - fixed crashes (use of reserved register x18) on Apple M1 under OSX | |||
| - fixed building with gcc releases earlier than 5.1 | |||
| MIPS: | |||
| - fixed building under BSD | |||
| MIPS64: | |||
| - fixed building under BSD | |||
| ==================================================================== | |||
| Version 0.3.17 | |||
| 15-Jul-2021 | |||
| common: | |||
| - reverted the optimization of SGEMV_N/DGEMV_N for small input sizes | |||
| and consecutive arguments as it led to stack overflows on x86_64 | |||
| with some operating systems (notably OSX and Windows) | |||
| x86_64: | |||
| - reverted the performance patch for SGEMV_T on AVX512 as it caused | |||
| wrong results in some applications | |||
| SPARC: | |||
| - fixed compilation with compilers other than gcc | |||
| ==================================================================== | |||
| Version 0.3.16 | |||
| 11-Jul-2021 | |||
| common: | |||
| - drastically reduced the stack size requirements for running the LAPACK | |||
| testsuite (Reference-LAPACK PR 553) | |||
| - fixed spurious test failures in the LAPACK testsuite (Reference-LAPACK | |||
| PR 564) | |||
| - expressly setting DYNAMIC_ARCH=0 no longer enables dynamic_arch mode | |||
| - improved performance of xGER, xSPR, xSPR2, xSYR, xSYR2, xTRSV, SGEMV_N | |||
| and DGEMV_N, for small input sizes and consecutive arguments | |||
| - improved performance of xGETRF, xPOTRF and xPOTRI for small input sizes | |||
| by disabling multithreading | |||
| - fixed installing with BSD versions of the "install" utility | |||
| RISCV: | |||
| - fixed the implementation of xIMIN | |||
| - improved the performance of DSDOT | |||
| - fixed linking of the tests on C910V with current vendor gcc | |||
| POWER: | |||
| - fixed SBGEMM computation for some odd value inputs | |||
| - fixed compilation for PPCG4, PPC970, POWER3, POWER4 and POWER5 | |||
| x86_64: | |||
| - improved performance of SGEMV_N and SGEMV_T for small N on AVX512-capable cpus | |||
| - worked around a miscompilation of ZGEMM/ZTRMM on Sandybridge with old gcc | |||
| versions | |||
| - fixed compilation with MS Visual Studio versions older than 2017 | |||
| - fixed macro name collision with winnt.h from the latest Win10 SDK | |||
| - added cpu type autodetection for Intel Ice Lake SP | |||
| - fixed cpu type autodetection for Intel Tiger Lake | |||
| - added cpu type autodetection for recent Centaur/Zhaoxin models | |||
| - fixed compilation with musl libc | |||
| ARM64: | |||
| - fixed compilation with gcc/gfortran on the Apple M1 | |||
| - fixed linking of the tests on FreeBSD | |||
| - fixed missing restore of a register in the recently rewritten DNRM2 kernel | |||
| for ThunderX2 and Neoverse N1 that could cause spurious failures in e.g. | |||
| DGEEV | |||
| - added compiler optimization flags for the EMAG8180 | |||
| - added initial support for Cortex A55 | |||
| ARM: | |||
| - fixed linking of the tests on FreeBSD | |||
| ==================================================================== | |||
| Version 0.3.15 | |||
| 2-May-2021 | |||
| common: | |||
| - imported improvements and bugfixes from Reference-LAPACK 3.9.1 | |||
| - imported LAPACKE interface fixes from Reference-LAPACK PRs 534 + 537 | |||
| - fixed a problem in the cpu detection of 0.3.14 that prevented cross-compilation | |||
| - fixed a sequence problem in the generation of softlinks to the library in GMAKE | |||
| RISC V: | |||
| - fixed compilation on RISCV (missing entry in getarch) | |||
| - fixed a potential division by zero in CROTG and ZROTG | |||
| POWER: | |||
| - fixed LAPACK testsuite failures seen with the NVIDIA HPC compiler | |||
| - improved CGEMM, DGEMM and ZGEMM performance on POWER10 | |||
| - added an optimized ZGEMV kernel for POWER10 | |||
| - fixed a potential division by zero in CROTG and ZROTG | |||
| x86_64: | |||
| - added support for Intel Control-flow Enforcement Technology (CET) | |||
| - reverted the DOMATCOPY_RT code to the generic C version | |||
| - fixed a bug in the AVX512 SGEMM kernel introduced in 0.3.14 | |||
| - fixed misapplication of -msse flag to non-SSE cpus in DYNAMIC_ARCH | |||
| - added support for compilation of the benchmarks on older OSX versions | |||
| - fix propagation of the NO_AVX512 option in CMAKE builds | |||
| - fix compilation of the AVX512 SGEMM kernel with clang-cl on Windows | |||
| - fixed compilation of the CTESTs with INTERFACE64=1 (random faults on OSX) | |||
| - corrected the Haswell DROT kernel to require AVX2/FMA3 rather than AVX512 | |||
| ARM: | |||
| - fixed a potential division by zero in CROTG and ZROTG | |||
| - fixed a potential overflow in IMATCOPY/ZIMATCOPY and the CTESTs | |||
| ARM64: | |||
| - fixed spurious reads outside the array in the SGEMM tcopy macro | |||
| - fixed a potential division by zero in CROTG and ZROTG | |||
| - fixed a segmentation fault in DYNAMIC_ARCH builds (reappeared in 0.3.14) | |||
| MIPS: | |||
| - fixed a potential division by zero in CROTG and ZROTG | |||
| - fixed a potential overflow in IMATCOPY/ZIMATCOPY and the CTESTs | |||
| MIPS64: | |||
| - fixed a potential division by zero in CROTG and ZROTG | |||
| SPARC: | |||
| - fixed a potential division by zero in CROTG and ZROTG | |||
| ==================================================================== | |||
| Version 0.3.14 | |||
| 17-Mar-2021 | |||
| common: | |||
| * Fixed a race condition on thread shutdown in non-OpenMP builds | |||
| * Fixed custom BUFFERSIZE option getting ignored in gmake builds | |||
| * Fixed CMAKE compilation of the TRMM kernels for GENERIC platforms | |||
| * Added CBLAS interfaces for CROTG, ZROTG, CSROT and ZDROT | |||
| * Improved performance of OMATCOPY_RT across all platforms | |||
| * Changed perl scripts to use env instead of a hardcoded /usr/bin/perl | |||
| * Fixed potential misreading of the GCC compiler version in the build scripts | |||
| * Fixed convergence problems in LAPACK complex GGEV/GGES (Reference-LAPACK #477) | |||
| * Reduced the stacksize requirements for running the LAPACK testsuite (Reference-LAPACK #335) | |||
| RISCV: | |||
| * Fixed compilation on RISCV (missing entry in getarch) | |||
| POWER: | |||
| * Fixed compilation for DYNAMIC_ARCH with clang and with old gcc versions | |||
| * Added support for compilation on FreeBSD/ppc64le | |||
| * Added optimized POWER10 kernels for SSCAL, DSCAL, CSCAL, ZSCAL | |||
| * Added optimized POWER10 kernels for SROT, DROT, CDOT, SASUM, DASUM | |||
| * Improved SSWAP, DSWAP, CSWAP, ZSWAP performance on POWER10 | |||
| * Improved SCOPY and CCOPY performance on POWER10 | |||
| * Improved SGEMM and DGEMM performance on POWER10 | |||
| * Added support for compilation with the NVIDIA HPC compiler | |||
| x86_64: | |||
| * Added an optimized bfloat16 GEMM kernel for Cooperlake | |||
| * Added CPUID autodetection for Intel Rocket Lake and Tiger Lake cpus | |||
| * Improved the performance of SASUM,DASUM,SROT,DROT on AMD Ryzen cpus | |||
| * Added support for compilation with the NAG Fortran compiler | |||
| * Fixed recognition of the AMD AOCC compiler | |||
| * Fixed compilation for DYNAMIC_ARCH with clang on Windows | |||
| * Added support for running the BLAS/CBLAS tests on Windows | |||
| * Fixed signatures of the tls callback functions for Windows x64 | |||
| * Fixed various issues with fma intrinsics support handling | |||
| ARM: | |||
| * Added support for embedded Cortex M targets via a new option EMBEDDED | |||
| ARMV8: | |||
| * Fixed the THUNDERX2T99 and NEOVERSEN1 DNRM2/ZNRM2 kernels for inputs with Inf | |||
| * Added support for the DYNAMIC_LIST option | |||
| * Added support for compilation with the NVIDIA HPC compiler | |||
| * Added support for compiling with the NAG Fortran compiler | |||
| ==================================================================== | |||
| Version 0.3.13 | |||
| 12-Dec-2020 | |||
| common: | |||
| * Added a generic bfloat16 SBGEMV kernel | |||
| * Fixed a potentially severe memory leak after fork in OpenMP builds | |||
| that was introduced in 0.3.12 | |||
| * Added detection of the Fujitsu Fortran compiler | |||
| * Added detection of the (e)gfortran compiler on OpenBSD | |||
| * Added support for overriding the default name of the library independently | |||
| from symbol suffixing in the gmake builds (already supported in cmake) | |||
| RISCV: | |||
| * Added a RISC V port optimized for C910V | |||
| POWER: | |||
| * Added optimized POWER10 kernels for SAXPY, CAXPY, SDOT, DDOT and DGEMV_N | |||
| * Improved DGEMM performance on POWER10 | |||
| * Improved STRSM and DTRSM performance on POWER9 and POWER10 | |||
| * Fixed segmentation faults in DYNAMIC_ARCH builds | |||
| * Fixed compilation with the PGI compiler | |||
| x86: | |||
| * Fixed compilation of kernels that require SSE2 intrinsics since 0.3.12 | |||
| x86_64: | |||
| * Added an optimized bfloat16 SBGEMV kernel for SkylakeX and Cooperlake | |||
| * Improved the performance of SASUM and DASUM kernels through parallelization | |||
| * Improved the performance of SROT and DROT kernels | |||
| * Improved the performance of multithreaded xSYRK | |||
| * Fixed OpenMP builds that use the LLVM Clang compiler together with GNU gfortran | |||
| (where linking of both the LLVM libomp and GNU libgomp could lead to lockups or | |||
| wrong results) | |||
| * Fixed miscompilations by old gcc 4.6 | |||
| * Fixed misdetection of AVX2 capability in some Sandybridge cpus | |||
| * Fixed lockups in builds combining DYNAMIC_ARCH with TARGET=GENERIC on OpenBSD | |||
| ARM64: | |||
| * Fixed segmentation faults in DYNAMIC_ARCH builds | |||
| MIPS: | |||
| * Improved kernels for Loongson 3R3 ("3A") and 3R4 ("3B") models, including MSA | |||
| * Fixed bugs in the MSA kernels for CGEMM, CTRMM, CGEMV and ZGEMV | |||
| * Added handling of zero increments in the MSA kernels for SSWAP and DSWAP | |||
| * Added DYNAMIC_ARCH support for MIPS64 (currently Loongson3R3/3R4 only) | |||
| SPARC: | |||
| * Fixed building 32 and 64 bit SPARC kernels with the SolarisStudio compilers | |||
| ==================================================================== | |||
| Version 0.3.12 | |||
| 24-Oct-2020 | |||
| @@ -32,7 +32,7 @@ export NOFORTRAN | |||
| export NO_LAPACK | |||
| endif | |||
| LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast,$(LAPACK_FFLAGS)) | |||
| LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS)) | |||
| SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test | |||
| @@ -59,6 +59,9 @@ endif | |||
| @$(CC) --version > /dev/null 2>&1;\ | |||
| if [ $$? -eq 0 ]; then \ | |||
| cverinfo=`$(CC) --version | sed -n '1p'`; \ | |||
| if [ -z "$${cverinfo}" ]; then \ | |||
| cverinfo=`$(CC) --version | sed -n '2p'`; \ | |||
| fi; \ | |||
| echo " C compiler ... $(C_COMPILER) (cmd & version : $${cverinfo})";\ | |||
| else \ | |||
| echo " C compiler ... $(C_COMPILER) (command line : $(CC))";\ | |||
| @@ -67,6 +70,9 @@ ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| @$(FC) --version > /dev/null 2>&1;\ | |||
| if [ $$? -eq 0 ]; then \ | |||
| fverinfo=`$(FC) --version | sed -n '1p'`; \ | |||
| if [ -z "$${fverinfo}" ]; then \ | |||
| fverinfo=`$(FC) --version | sed -n '2p'`; \ | |||
| fi; \ | |||
| echo " Fortran compiler ... $(F_COMPILER) (cmd & version : $${fverinfo})";\ | |||
| else \ | |||
| echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))";\ | |||
| @@ -161,7 +167,6 @@ ifeq ($(NO_SHARED), 1) | |||
| $(error OpenBLAS: neither static nor shared are enabled.) | |||
| endif | |||
| endif | |||
| @-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||
| @for d in $(SUBDIRS) ; \ | |||
| do if test -d $$d; then \ | |||
| $(MAKE) -C $$d $(@F) || exit 1 ; \ | |||
| @@ -190,6 +195,7 @@ endif | |||
| ifdef USE_THREAD | |||
| @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last | |||
| endif | |||
| @-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||
| @touch lib.grd | |||
| prof : prof_blas prof_lapack | |||
| @@ -263,7 +269,7 @@ prof_lapack : lapack_prebuild | |||
| lapack_prebuild : | |||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "override FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| @@ -1,80 +1,234 @@ | |||
| ifneq ($(C_COMPILER), PGI) | |||
| ifeq ($(C_COMPILER), CLANG) | |||
| ISCLANG=1 | |||
| endif | |||
| ifneq (1, $(filter 1,$(GCCVERSIONGT4) $(ISCLANG))) | |||
| CCOMMON_OPT += -march=armv8-a | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a | |||
| endif | |||
| else | |||
| # Generic targets: ARMV8 gets the plain armv8-a baseline, ARMV8SVE the same | |||
| # baseline with the +sve extension. The Fortran flags are skipped when | |||
| # F_COMPILER is NAG (presumably it does not accept -march - TODO confirm). | |||
| ifeq ($(CORE), ARMV8) | |||
| CCOMMON_OPT += -march=armv8-a | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), ARMV8SVE) | |||
| CCOMMON_OPT += -march=armv8-a+sve | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a+sve | |||
| endif | |||
| endif | |||
| # Cortex-A53/A57/A72/A73: armv8-a baseline plus the matching -mtune value, | |||
| # with no compiler version checks. Fortran flags are skipped for NAG. | |||
| ifeq ($(CORE), CORTEXA53) | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), CORTEXA57) | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a57 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a57 | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), CORTEXA72) | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), CORTEXA73) | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a73 | |||
| endif | |||
| endif | |||
| # -mtune=neoverse-n1 is only available in GCC>=9, so compilers that do not | |||
| # report GCCVERSIONGTEQ9 fall back to Cortex-A72 tunings | |||
| ifeq ($(CORE), NEOVERSEN1) | |||
| ifeq ($(GCCVERSIONGTEQ7), 1) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | |||
| ifeq ($(GCCVERSIONGTEQ9), 1) | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=neoverse-n1 | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | |||
| endif | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||
| endif | |||
| endif | |||
| endif | |||
| # Neoverse-V1: -mtune=neoverse-v1 is only available in GCC>=9.4, so it is used | |||
| # when GCCVERSIONGTEQ9 is set together with GCCMINORVERSIONGTEQ4 or | |||
| # GCCVERSIONGTEQ10. Other GCC 9 builds keep armv8.4-a but use -mtune=native; | |||
| # everything else falls back to Cortex-A72 tunings (armv8.2-a when the | |||
| # GCCVERSIONGTEQ7/ISCLANG check passes, plain armv8-a otherwise). | |||
| ifeq ($(CORE), NEOVERSEV1) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | |||
| ifeq ($(GCCVERSIONGTEQ9), 1) | |||
| ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) | |||
| CCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1 | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8.4-a -mtune=native | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.4-a -mtune=native | |||
| endif | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | |||
| endif | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||
| endif | |||
| endif | |||
| endif | |||
| # Neoverse-N2: -mtune=neoverse-n2 is only available in GCC>=9.4, so it is used | |||
| # when GCCVERSIONGTEQ9 is set together with GCCMINORVERSIONGTEQ4 or | |||
| # GCCVERSIONGTEQ10. Other GCC 9 builds keep armv8.5-a but use -mtune=native; | |||
| # everything else falls back to Cortex-A72 tunings (armv8.2-a when the | |||
| # GCCVERSIONGTEQ7/ISCLANG check passes, plain armv8-a otherwise). | |||
| ifeq ($(CORE), NEOVERSEN2) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | |||
| ifeq ($(GCCVERSIONGTEQ9), 1) | |||
| ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ10))) | |||
| CCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.5-a -mtune=neoverse-n2 | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8.5-a -mtune=native | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.5-a -mtune=native | |||
| endif | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72 | |||
| endif | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72 | |||
| endif | |||
| endif | |||
| endif | |||
| # Cortex-A55: -mtune=cortex-a55 is only available in GCC>=8.1, so it is used | |||
| # only when GCCVERSIONGTEQ8 is 1; otherwise Cortex-A53 tunings are used, with | |||
| # armv8.2-a when the GCCVERSIONGTEQ7/ISCLANG check passes and armv8-a if not. | |||
| ifeq ($(CORE), CORTEXA55) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ7) $(ISCLANG))) | |||
| ifeq ($(GCCVERSIONGTEQ8), 1) | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a55 | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a53 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a53 | |||
| endif | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a -mtune=cortex-a53 | |||
| endif | |||
| endif | |||
| endif | |||
| # ThunderX, Falkor and ThunderX2T99: fixed -march/-mtune pairs with no | |||
| # compiler version checks. Fortran flags are skipped for NAG. | |||
| ifeq ($(CORE), THUNDERX) | |||
| CCOMMON_OPT += -march=armv8-a -mtune=thunderx | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a -mtune=thunderx | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), FALKOR) | |||
| CCOMMON_OPT += -march=armv8-a -mtune=falkor | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8-a -mtune=falkor | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), THUNDERX2T99) | |||
| CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||
| endif | |||
| endif | |||
| # ThunderX3T110: use the dedicated armv8.3-a/thunderx3t110 flags when | |||
| # GCCVERSIONGTEQ10 is 1; otherwise reuse the ThunderX2T99 flags. | |||
| ifeq ($(CORE), THUNDERX3T110) | |||
| ifeq ($(GCCVERSIONGTEQ10), 1) | |||
| CCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.3-a -mtune=thunderx3t110 | |||
| endif | |||
| else | |||
| CCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.1-a -mtune=thunderx2t99 | |||
| endif | |||
| endif | |||
| endif | |||
| # Vortex: armv8.3-a code generation, no -mtune; the Fortran flags are | |||
| # left alone when the Fortran compiler is NAG. | |||
| ifeq ($(CORE), VORTEX) | |||
|   ifneq ($(F_COMPILER), NAG) | |||
|     FCOMMON_OPT += -march=armv8.3-a | |||
|   endif | |||
|   CCOMMON_OPT += -march=armv8.3-a | |||
| endif | |||
| ifeq ($(GCCVERSIONGTEQ9), 1) | |||
| ifeq (1, $(filter 1,$(GCCVERSIONGTEQ9) $(ISCLANG))) | |||
| ifeq ($(CORE), TSV110) | |||
| CCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=armv8.2-a -mtune=tsv110 | |||
| endif | |||
| endif | |||
| endif | |||
| # eMAG 8180: the emag tuning is only applied when GCCVERSIONGTEQ9 is 1; | |||
| # otherwise no core-specific flags are added here. | |||
| ifeq ($(CORE), EMAG8180) | |||
|   ifeq ($(GCCVERSIONGTEQ9), 1) | |||
|     ifneq ($(F_COMPILER), NAG) | |||
|       FCOMMON_OPT += -march=armv8-a -mtune=emag | |||
|     endif | |||
|     CCOMMON_OPT += -march=armv8-a -mtune=emag | |||
|   endif | |||
| endif | |||
| # A64FX: armv8.2-a with SVE, tuned for a64fx, applied only for GCC >= 11 | |||
| # or clang; otherwise no core-specific flags are added here. | |||
| ifeq ($(CORE), A64FX) | |||
|   ifeq (1, $(filter 1,$(GCCVERSIONGTEQ11) $(ISCLANG))) | |||
|     ifneq ($(F_COMPILER), NAG) | |||
|       FCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx | |||
|     endif | |||
|     CCOMMON_OPT += -march=armv8.2-a+sve -mtune=a64fx | |||
|   endif | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -0,0 +1 @@ | |||
| COPT = -Wall -O2 # -DGEMMTEST | |||
| @@ -74,17 +74,17 @@ endif | |||
| ifneq ($(OSNAME), AIX) | |||
| ifndef NO_LAPACKE | |||
| @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h" | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" | |||
| @-install -pm644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" | |||
| @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapack.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapack.h" | |||
| @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h" | |||
| @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_config.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h" | |||
| @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_mangling_with_flags.h.in "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h" | |||
| @-install -m644 $(NETLIB_LAPACK_DIR)/LAPACKE/include/lapacke_utils.h "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h" | |||
| endif | |||
| # Install the static library | |||
| ifneq ($(NO_STATIC),1) | |||
| @echo Copying the static library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| @install -pm644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @install -m644 $(LIBNAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) | |||
| endif | |||
| @@ -92,7 +92,7 @@ endif | |||
| ifneq ($(NO_SHARED),1) | |||
| @echo Copying the shared library to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) | |||
| ifeq ($(OSNAME), $(filter $(OSNAME),Linux SunOS Android Haiku FreeBSD DragonFly)) | |||
| @install -pm755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @install -m755 $(LIBSONAME) "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" | |||
| @cd "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so ; \ | |||
| ln -fs $(LIBSONAME) $(LIBPREFIX).so.$(MAJOR_VERSION) | |||
| @@ -0,0 +1,3 @@ | |||
| ifdef BINARY64 | |||
| else | |||
| endif | |||
| @@ -10,9 +10,15 @@ USE_OPENMP = 1 | |||
| endif | |||
| # POWER10: nothing is added when the C compiler is PGI. Otherwise the C | |||
| # flags target power10 with VSX, and the Fortran flags use either the | |||
| # GNU-style options or, for the IBM compiler, -qrecur -qnosave. | |||
| ifeq ($(CORE), POWER10) | |||
|   ifneq ($(C_COMPILER), PGI) | |||
|     CCOMMON_OPT += -Ofast -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math | |||
|     ifneq ($(F_COMPILER), IBM) | |||
|       FCOMMON_OPT += -O2 -frecursive -mcpu=power10 -mtune=power10 -fno-fast-math | |||
|     else | |||
|       FCOMMON_OPT += -O2 -qrecur -qnosave | |||
|     endif | |||
|   endif | |||
| endif | |||
| ifeq ($(CORE), POWER9) | |||
| ifneq ($(C_COMPILER), PGI) | |||
| @@ -31,7 +37,11 @@ else | |||
| CCOMMON_OPT += -fast -Mvect=simd -Mcache_align | |||
| endif | |||
| ifneq ($(F_COMPILER), PGI) | |||
| ifeq ($(F_COMPILER), IBM) | |||
| FCOMMON_OPT += -O2 -qrecur -qnosave | |||
| else | |||
| FCOMMON_OPT += -O2 -frecursive -fno-fast-math | |||
| endif | |||
| ifeq ($(C_COMPILER), GCC) | |||
| ifneq ($(GCCVERSIONGT4), 1) | |||
| $(warning your compiler is too old to fully support POWER9, getting a newer version of gcc is recommended) | |||
| @@ -55,7 +65,11 @@ CCOMMON_OPT += -fast -Mvect=simd -Mcache_align | |||
| endif | |||
| ifneq ($(F_COMPILER), PGI) | |||
| ifeq ($(OSNAME), AIX) | |||
| ifeq ($(F_COMPILER), IBM) | |||
| FCOMMON_OPT += -O2 -qrecur -qnosave | |||
| else | |||
| FCOMMON_OPT += -O1 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math | |||
| endif | |||
| else | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=power8 -mtune=power8 -fno-fast-math | |||
| endif | |||
| @@ -3,6 +3,10 @@ | |||
| export BINARY | |||
| export USE_OPENMP | |||
| # When DYNAMIC_ARCH is set (non-empty), append -DDYNAMIC_ARCH to the host | |||
| # compiler flags. "override" lets the append take effect even when | |||
| # HOST_CFLAGS was given on the make command line. | |||
| ifdef DYNAMIC_ARCH | |||
|   override HOST_CFLAGS += -DDYNAMIC_ARCH | |||
| endif | |||
| ifdef TARGET_CORE | |||
| TARGET_MAKE = Makefile_kernel.conf | |||
| TARGET_CONF = config_kernel.h | |||
| @@ -3,7 +3,7 @@ | |||
| # | |||
| # This library's version | |||
| VERSION = 0.3.12.dev | |||
| VERSION = 0.3.20 | |||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | |||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | |||
| @@ -9,11 +9,10 @@ ifndef TOPDIR | |||
| TOPDIR = . | |||
| endif | |||
| # If ARCH is not set, we use the host system's architecture for getarch compile options. | |||
| ifndef ARCH | |||
| # We always need the host system's architecture for the getarch compile options, especially when cross-compiling. | |||
| HOSTARCH := $(shell uname -m) | |||
| else | |||
| HOSTARCH = $(ARCH) | |||
| ifeq ($(HOSTARCH), amd64) | |||
| HOSTARCH=x86_64 | |||
| endif | |||
| # Catch conflicting usage of ARCH in some BSD environments | |||
| @@ -21,6 +20,8 @@ ifeq ($(ARCH), amd64) | |||
| override ARCH=x86_64 | |||
| else ifeq ($(ARCH), powerpc64) | |||
| override ARCH=power | |||
| else ifeq ($(ARCH), powerpc64le) | |||
| override ARCH=power | |||
| else ifeq ($(ARCH), powerpc) | |||
| override ARCH=power | |||
| else ifeq ($(ARCH), i386) | |||
| @@ -31,6 +32,10 @@ else ifeq ($(ARCH), armv7) | |||
| override ARCH=arm | |||
| else ifeq ($(ARCH), aarch64) | |||
| override ARCH=arm64 | |||
| else ifeq ($(ARCH), mipsel) | |||
| override ARCH=mips | |||
| else ifeq ($(ARCH), mips64el) | |||
| override ARCH=mips64 | |||
| else ifeq ($(ARCH), zarch) | |||
| override ARCH=zarch | |||
| endif | |||
| @@ -96,7 +101,7 @@ GETARCH_FLAGS += -DUSER_TARGET | |||
| ifeq ($(TARGET), GENERIC) | |||
| ifeq ($(DYNAMIC_ARCH), 1) | |||
| override NO_EXPRECISION=1 | |||
| export NO_EXPRECiSION | |||
| export NO_EXPRECISION | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -113,6 +118,9 @@ endif | |||
| ifeq ($(TARGET), COOPERLAKE) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| ifeq ($(TARGET), SAPPHIRERAPIDS) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| ifeq ($(TARGET), SANDYBRIDGE) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| @@ -137,8 +145,13 @@ endif | |||
| ifeq ($(TARGET), POWER8) | |||
| GETARCH_FLAGS := -DFORCE_POWER6 | |||
| endif | |||
| ifeq ($(TARGET), POWER9) | |||
| GETARCH_FLAGS := -DFORCE_POWER6 | |||
| endif | |||
| ifeq ($(TARGET), POWER10) | |||
| GETARCH_FLAGS := -DFORCE_POWER6 | |||
| endif | |||
| endif | |||
| #TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1. | |||
| # | |||
| @@ -158,6 +171,9 @@ endif | |||
| ifeq ($(TARGET_CORE), COOPERLAKE) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| ifeq ($(TARGET_CORE), SANDYBRIDGE) | |||
| GETARCH_FLAGS := -DFORCE_NEHALEM | |||
| endif | |||
| @@ -181,7 +197,7 @@ endif | |||
| # On x86_64 build getarch with -march=native unless the host compiler is PGI (pgcc) or NVIDIA HPC (nvc). This is required to detect AVX512 support in getarch. | |||
| ifeq ($(HOSTARCH), x86_64) | |||
| ifeq ($(findstring pgcc,$(HOSTCC)),) | |||
| ifeq ($(findstring pgcc,$(HOSTCC))$(findstring nvc,$(HOSTCC)),) | |||
| GETARCH_FLAGS += -march=native | |||
| endif | |||
| endif | |||
| @@ -242,12 +258,26 @@ else | |||
| ONLY_CBLAS = 0 | |||
| endif | |||
| # Small-matrix optimisation: switched on for every x86_64 build and for | |||
| # the POWER10 core, then forwarded to the C code as -DSMALL_MATRIX_OPT. | |||
| ifeq ($(ARCH), x86_64) | |||
|   SMALL_MATRIX_OPT = 1 | |||
| else | |||
|   ifeq ($(CORE), POWER10) | |||
|     SMALL_MATRIX_OPT = 1 | |||
|   endif | |||
| endif | |||
| ifeq ($(SMALL_MATRIX_OPT), 1) | |||
|   CCOMMON_OPT += -DSMALL_MATRIX_OPT | |||
| endif | |||
| # This operation is expensive, so it should be executed only once. | |||
| ifndef GOTOBLAS_MAKEFILE | |||
| export GOTOBLAS_MAKEFILE = 1 | |||
| # Determine if the assembler is GNU Assembler | |||
| HAVE_GAS := $(shell $(AS) -v < /dev/null 2>&1 | grep GNU 2>&1 >/dev/null ; echo $$?) | |||
| GETARCH_FLAGS += -DHAVE_GAS=$(HAVE_GAS) | |||
| # Generating Makefile.conf and config.h | |||
| DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) | |||
| DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" HOST_CFLAGS="$(GETARCH_FLAGS)" CFLAGS="$(CFLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) DYNAMIC_ARCH=$(DYNAMIC_ARCH) TARGET_CORE=$(TARGET_CORE) ONLY_CBLAS=$(ONLY_CBLAS) TARGET=$(TARGET) all) | |||
| ifndef TARGET_CORE | |||
| include $(TOPDIR)/Makefile.conf | |||
| @@ -293,7 +323,7 @@ else | |||
| SMP = 1 | |||
| endif | |||
| else | |||
| ifeq ($(NUM_THREAD), 1) | |||
| ifeq ($(NUM_THREADS), 1) | |||
| SMP = | |||
| else | |||
| SMP = 1 | |||
| @@ -331,6 +361,7 @@ GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) | |||
| GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) | |||
| GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) | |||
| GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) | |||
| GCCVERSIONGTEQ8 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 8) | |||
| GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) | |||
| GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11) | |||
| GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) | |||
| @@ -343,6 +374,7 @@ else | |||
| endif | |||
| GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1) | |||
| GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2) | |||
| GCCMINORVERSIONGTEQ4 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 4) | |||
| GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) | |||
| endif | |||
| @@ -378,6 +410,12 @@ ifeq ($(OSNAME), AIX) | |||
| EXTRALIB += -lm | |||
| endif | |||
| # On FreeBSD, link libm explicitly when ARCH is arm or arm64. | |||
| ifeq ($(OSNAME), FreeBSD) | |||
|   ifeq ($(ARCH), $(filter $(ARCH),arm arm64)) | |||
|     EXTRALIB += -lm | |||
|   endif | |||
| endif | |||
| ifeq ($(OSNAME), WINNT) | |||
| NEED_PIC = 0 | |||
| NO_EXPRECISION = 1 | |||
| @@ -617,12 +655,24 @@ DYNAMIC_CORE += CORTEXA57 | |||
| DYNAMIC_CORE += CORTEXA72 | |||
| DYNAMIC_CORE += CORTEXA73 | |||
| DYNAMIC_CORE += NEOVERSEN1 | |||
| DYNAMIC_CORE += NEOVERSEV1 | |||
| DYNAMIC_CORE += NEOVERSEN2 | |||
| DYNAMIC_CORE += CORTEXA55 | |||
| DYNAMIC_CORE += FALKOR | |||
| DYNAMIC_CORE += THUNDERX | |||
| DYNAMIC_CORE += THUNDERX2T99 | |||
| DYNAMIC_CORE += TSV110 | |||
| DYNAMIC_CORE += EMAG8180 | |||
| DYNAMIC_CORE += THUNDERX3T110 | |||
| # A user-supplied DYNAMIC_LIST replaces the default core list (ARMV8 is | |||
| # always kept) and yields one -DDYN_<core> define per listed core. | |||
| ifdef DYNAMIC_LIST | |||
|   XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_ARMV8 $(foreach listed,$(DYNAMIC_LIST),-DDYN_$(listed)) | |||
|   override DYNAMIC_CORE = ARMV8 $(DYNAMIC_LIST) | |||
| endif | |||
| endif | |||
| ifeq ($(ARCH), mips64) | |||
| DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4 | |||
| endif | |||
| ifeq ($(ARCH), zarch) | |||
| @@ -659,6 +709,7 @@ endif | |||
| endif # ARCH zarch | |||
| ifeq ($(ARCH), power) | |||
| ifneq ($(C_COMPILER), PGI) | |||
| DYNAMIC_CORE = POWER6 | |||
| DYNAMIC_CORE += POWER8 | |||
| ifneq ($(C_COMPILER), GCC) | |||
| @@ -672,7 +723,7 @@ DYNAMIC_CORE += POWER9 | |||
| else | |||
| # No comma after the function name: "$(info, ...)" is parsed as a reference to an undefined variable, so nothing was ever printed. | |||
| $(info OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) | |||
| endif | |||
| LDVERSIONGTEQ35 := $(shell expr `ld --version | head -1 | cut -f2 -d "." | cut -f1 -d "-"` >= 35) | |||
| LDVERSIONGTEQ35 := $(shell expr `$(CC) -Wl,--version 2> /dev/null | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35) | |||
| ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11) | |||
| DYNAMIC_CORE += POWER10 | |||
| CCOMMON_OPT += -DHAVE_P10_SUPPORT | |||
| @@ -685,6 +736,10 @@ else | |||
| # No comma after the function name: "$(info, ...)" is parsed as a reference to an undefined variable, so nothing was ever printed. | |||
| $(info OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) | |||
| endif | |||
| endif | |||
| else | |||
| DYNAMIC_CORE = POWER8 | |||
| DYNAMIC_CORE += POWER9 | |||
| endif | |||
| endif | |||
| # If DYNAMIC_CORE is not set, DYNAMIC_ARCH cannot do anything, so force it to empty | |||
| @@ -756,6 +811,11 @@ NO_BINARY_MODE = 1 | |||
| BINARY_DEFINED = 1 | |||
| endif | |||
| ifeq ($(ARCH), loongarch64) | |||
| NO_BINARY_MODE = 1 | |||
| BINARY_DEFINED = 1 | |||
| endif | |||
| # | |||
| # C Compiler dependent settings | |||
| @@ -787,14 +847,9 @@ CCOMMON_OPT += -mabi=32 | |||
| BINARY_DEFINED = 1 | |||
| endif | |||
| ifeq ($(CORE), LOONGSON3A) | |||
| CCOMMON_OPT += -march=mips64 | |||
| FCOMMON_OPT += -march=mips64 | |||
| endif | |||
| ifeq ($(CORE), LOONGSON3B) | |||
| CCOMMON_OPT += -march=mips64 | |||
| FCOMMON_OPT += -march=mips64 | |||
| ifeq ($(CORE), $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) | |||
| CCOMMON_OPT += -march=loongson3a | |||
| FCOMMON_OPT += -march=loongson3a | |||
| endif | |||
| ifeq ($(CORE), MIPS24K) | |||
| @@ -831,6 +886,13 @@ ifeq ($(OSNAME), AIX) | |||
| BINARY_DEFINED = 1 | |||
| endif | |||
| # LoongArch64 / LOONGSON3R5: select the loongarch64 architecture and the | |||
| # lp64 ABI for both the C and the Fortran compiler. | |||
| ifeq ($(ARCH), loongarch64) | |||
|   ifeq ($(CORE), LOONGSON3R5) | |||
|     FCOMMON_OPT += -march=loongarch64 -mabi=lp64 | |||
|     CCOMMON_OPT += -march=loongarch64 -mabi=lp64 | |||
|   endif | |||
| endif | |||
| endif | |||
| ifndef BINARY_DEFINED | |||
| @@ -848,9 +910,29 @@ endif | |||
| endif | |||
| ifeq ($(C_COMPILER), PGI) | |||
| PGCVERSIONGT20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 20) | |||
| PGCVERSIONEQ20 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 20) | |||
| PGCMINORVERSIONGE11 := $(shell expr `$(CC) --version|sed -n "2p" |cut -d "-" -f 1 |sed -e "s/[^0-9.]//g" |cut -c 4-5` \>= 11) | |||
| PGCVERSIONCHECK := $(PGCVERSIONGT20)$(PGCVERSIONEQ20)$(PGCMINORVERSIONGE11) | |||
| ifeq ($(PGCVERSIONCHECK), $(filter $(PGCVERSIONCHECK), 100 101 011)) | |||
| NEWPGI := 1 | |||
| PGCVERSIONGT21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` \> 21) | |||
| PGCVERSIONEQ21 := $(shell expr `$(CC) --version|sed -n "2p" |sed -e "s/[^0-9.]//g" |cut -d "." -f 1` == 21) | |||
| PGCVERSIONCHECK2 := $(PGCVERSIONGT21)$(PGCVERSIONEQ21)$(PGCMINORVERSIONGE11) | |||
| ifeq ($(PGCVERSIONCHECK2), $(filter $(PGCVERSIONCHECK2), 100 101 011)) | |||
| NEWPGI2 := 1 | |||
| endif | |||
| endif | |||
| ifdef BINARY64 | |||
| ifeq ($(ARCH), x86_64) | |||
| CCOMMON_OPT += -tp p7-64 -D__MMX__ -Mnollvm | |||
| ifneq ($(NEWPGI2),1) | |||
| CCOMMON_OPT += -tp p7-64 | |||
| else | |||
| CCOMMON_OPT += -tp px | |||
| endif | |||
| ifneq ($(NEWPGI),1) | |||
| CCOMMON_OPT += -D__MMX__ -Mnollvm | |||
| endif | |||
| else | |||
| ifeq ($(ARCH), power) | |||
| ifeq ($(CORE), POWER8) | |||
| @@ -862,7 +944,11 @@ endif | |||
| endif | |||
| endif | |||
| else | |||
| ifneq ($(NEWPGI2),1) | |||
| CCOMMON_OPT += -tp p7 | |||
| else | |||
| CCOMMON_OPT += -tp px | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -878,13 +964,25 @@ endif | |||
| # Fortran Compiler dependent settings | |||
| # | |||
| # NAG Fortran compiler: base options first, then -i8 when INTERFACE64 is | |||
| # set to something other than 0, then -openmp when USE_OPENMP is 1. | |||
| ifeq ($(F_COMPILER), NAG) | |||
|   FCOMMON_OPT += -dcfuns -recursive -ieee=full -w=obs -thread_safe | |||
|   ifdef INTERFACE64 | |||
|     ifneq ($(INTERFACE64), 0) | |||
|       FCOMMON_OPT += -i8 | |||
|     endif | |||
|   endif | |||
|   ifeq ($(USE_OPENMP), 1) | |||
|     FCOMMON_OPT += -openmp | |||
|   endif | |||
| endif | |||
| ifeq ($(F_COMPILER), FLANG) | |||
| CCOMMON_OPT += -DF_INTERFACE_FLANG | |||
| FCOMMON_OPT += -Mrecursive -Kieee | |||
| ifeq ($(OSNAME), Linux) | |||
| ifeq ($(ARCH), x86_64) | |||
| FLANG_VENDOR := $(shell `$(FC) --version|cut -f 1 -d "."|head -1`) | |||
| ifeq ($(FLANG_VENDOR),AOCC) | |||
| FLANG_VENDOR := $(shell $(FC) --version|head -1 |cut -f 1 -d " ") | |||
| ifeq ($(FLANG_VENDOR), AMD) | |||
| FCOMMON_OPT += -fno-unroll-loops | |||
| endif | |||
| endif | |||
| @@ -1027,21 +1125,31 @@ FCOMMON_OPT += -i8 | |||
| endif | |||
| endif | |||
| ifeq ($(ARCH), x86_64) | |||
| ifneq ($(NEWPGI2),1) | |||
| FCOMMON_OPT += -tp p7-64 | |||
| else | |||
| FCOMMON_OPT += -tp px | |||
| endif | |||
| else | |||
| ifeq ($(ARCH), power) | |||
| ifeq ($(CORE), POWER6) | |||
| $(warning NVIDIA HPC compilers do not support POWER6.) | |||
| endif | |||
| ifeq ($(CORE), POWER8) | |||
| FCOMMON_OPT += -tp pwr8 | |||
| endif | |||
| ifeq ($(CORE), POWER9) | |||
| FCOMMON_OPT += -tp pwr9 | |||
| endif | |||
| ifeq ($(CORE), POWER10) | |||
| $(warning NVIDIA HPC compilers do not support POWER10.) | |||
| endif | |||
| endif | |||
| endif | |||
| else | |||
| FCOMMON_OPT += -tp p7 | |||
| endif | |||
| FCOMMON_OPT += -Mrecursive | |||
| FCOMMON_OPT += -Mrecursive -Kieee | |||
| ifeq ($(USE_OPENMP), 1) | |||
| FCOMMON_OPT += -mp | |||
| endif | |||
| @@ -1078,11 +1186,11 @@ FCOMMON_OPT += -n32 | |||
| else | |||
| FCOMMON_OPT += -n64 | |||
| endif | |||
| ifeq ($(CORE), LOONGSON3A) | |||
| ifeq ($(CORE), LOONGSON3R3) | |||
| FCOMMON_OPT += -loongson3 -static | |||
| endif | |||
| ifeq ($(CORE), LOONGSON3B) | |||
| ifeq ($(CORE), LOONGSON3R4) | |||
| FCOMMON_OPT += -loongson3 -static | |||
| endif | |||
| @@ -1108,11 +1216,11 @@ CCOMMON_OPT += -n32 | |||
| else | |||
| CCOMMON_OPT += -n64 | |||
| endif | |||
| ifeq ($(CORE), LOONGSON3A) | |||
| ifeq ($(CORE), LOONGSON3R3) | |||
| CCOMMON_OPT += -loongson3 -static | |||
| endif | |||
| ifeq ($(CORE), LOONGSON3B) | |||
| ifeq ($(CORE), LOONGSON3R4) | |||
| CCOMMON_OPT += -loongson3 -static | |||
| endif | |||
| @@ -1180,6 +1288,8 @@ CCOMMON_OPT += -fPIC | |||
| endif | |||
| ifeq ($(F_COMPILER), SUN) | |||
| FCOMMON_OPT += -pic | |||
| else ifeq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -PIC | |||
| else | |||
| FCOMMON_OPT += -fPIC | |||
| endif | |||
| @@ -1223,10 +1333,8 @@ ifdef SMP | |||
| CCOMMON_OPT += -DSMP_SERVER | |||
| ifeq ($(ARCH), mips64) | |||
| ifneq ($(CORE), LOONGSON3B) | |||
| USE_SIMPLE_THREADED_LEVEL3 = 1 | |||
| endif | |||
| endif | |||
| ifeq ($(USE_OPENMP), 1) | |||
| # USE_SIMPLE_THREADED_LEVEL3 = 1 | |||
| @@ -1259,6 +1367,10 @@ CCOMMON_OPT += -DUSE_PAPI | |||
| EXTRALIB += -lpapi -lperfctr | |||
| endif | |||
| ifdef BUFFERSIZE | |||
| CCOMMON_OPT += -DBUFFERSIZE=$(BUFFERSIZE) | |||
| endif | |||
| ifdef DYNAMIC_THREADS | |||
| CCOMMON_OPT += -DDYNAMIC_THREADS | |||
| endif | |||
| @@ -1342,11 +1454,9 @@ endif | |||
| ifneq ($(ARCH), x86_64) | |||
| ifneq ($(ARCH), x86) | |||
| ifneq ($(CORE), LOONGSON3B) | |||
| NO_AFFINITY = 1 | |||
| endif | |||
| endif | |||
| endif | |||
| ifdef NO_AFFINITY | |||
| ifeq ($(NO_AFFINITY), 0) | |||
| @@ -1438,6 +1548,10 @@ LAPACK_FFLAGS := $(FFLAGS) | |||
| LAPACK_FPFLAGS := $(FPFLAGS) | |||
| endif | |||
| # For the NAG Fortran compiler, build LAPACK without the GCC-style x86 ISA | |||
| # switches (presumably because nagfor does not accept them - confirm). | |||
| # The flag this build system emits is -march=skylake-avx512; the historical | |||
| # -mskylake-avx512 spelling is kept in the list for backward compatibility, | |||
| # and -mfma is filtered as well. | |||
| ifeq ($(F_COMPILER),NAG) | |||
| LAPACK_FFLAGS := $(filter-out -msse3 -mssse3 -msse4.1 -mavx -mavx2 -mfma -march=skylake-avx512 -mskylake-avx512 ,$(FFLAGS)) | |||
| endif | |||
| LAPACK_CFLAGS = $(CFLAGS) | |||
| LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H | |||
| ifdef INTERFACE64 | |||
| @@ -1566,8 +1680,10 @@ export HAVE_VFP | |||
| export HAVE_VFPV3 | |||
| export HAVE_VFPV4 | |||
| export HAVE_NEON | |||
| export HAVE_MSA | |||
| export MSA_FLAGS | |||
| # Export the MSA settings to sub-makes unless NO_MSA is set. | |||
| ifndef NO_MSA | |||
|   export HAVE_MSA MSA_FLAGS | |||
| endif | |||
| export KERNELDIR | |||
| export FUNCTION_PROFILE | |||
| export TARGET_CORE | |||
| @@ -1,10 +1,21 @@ | |||
| # COMPILER_PREFIX = mingw32- | |||
| # CPU-specific compiler flags are added for every build that is not | |||
| # DYNAMIC_ARCH=1, and for DYNAMIC_ARCH builds only when TARGET_CORE is set. | |||
| ifneq ($(DYNAMIC_ARCH),1) | |||
|   ADD_CPUFLAGS = 1 | |||
| else ifdef TARGET_CORE | |||
|   ADD_CPUFLAGS = 1 | |||
| endif | |||
| # SSE: when CPU flags are enabled and HAVE_SSE is set, pass -msse to the C | |||
| # compiler, and to the Fortran compiler unless it is NAG. | |||
| ifdef ADD_CPUFLAGS | |||
|   ifdef HAVE_SSE | |||
|     ifneq ($(F_COMPILER), NAG) | |||
|       FCOMMON_OPT += -msse | |||
|     endif | |||
|     CCOMMON_OPT += -msse | |||
|   endif | |||
| endif | |||
| ifeq ($(OSNAME), Interix) | |||
| ARFLAGS = -m x86 | |||
| @@ -8,42 +8,57 @@ endif | |||
| endif | |||
| endif | |||
| # CPU-specific compiler flags are added for every build that is not | |||
| # DYNAMIC_ARCH=1, and for DYNAMIC_ARCH builds only when TARGET_CORE is set. | |||
| ifneq ($(DYNAMIC_ARCH),1) | |||
|   ADD_CPUFLAGS = 1 | |||
| else ifdef TARGET_CORE | |||
|   ADD_CPUFLAGS = 1 | |||
| endif | |||
| ifdef ADD_CPUFLAGS | |||
| # SSE3: pass -msse3 to the C compiler, and to the Fortran compiler unless | |||
| # it is NAG. | |||
| ifdef HAVE_SSE3 | |||
|   ifneq ($(F_COMPILER), NAG) | |||
|     FCOMMON_OPT += -msse3 | |||
|   endif | |||
|   CCOMMON_OPT += -msse3 | |||
| endif | |||
| # SSSE3: pass -mssse3 to the C compiler, and to the Fortran compiler | |||
| # unless it is NAG. | |||
| ifdef HAVE_SSSE3 | |||
|   ifneq ($(F_COMPILER), NAG) | |||
|     FCOMMON_OPT += -mssse3 | |||
|   endif | |||
|   CCOMMON_OPT += -mssse3 | |||
| endif | |||
| # SSE4.1: pass -msse4.1 to the C compiler, and to the Fortran compiler | |||
| # unless it is NAG. | |||
| ifdef HAVE_SSE4_1 | |||
|   ifneq ($(F_COMPILER), NAG) | |||
|     FCOMMON_OPT += -msse4.1 | |||
|   endif | |||
|   CCOMMON_OPT += -msse4.1 | |||
| endif | |||
| # AVX: skipped entirely when OLDGCC is set; otherwise pass -mavx to the C | |||
| # compiler, and to the Fortran compiler unless it is NAG. | |||
| ifndef OLDGCC | |||
|   ifdef HAVE_AVX | |||
|     ifneq ($(F_COMPILER), NAG) | |||
|       FCOMMON_OPT += -mavx | |||
|     endif | |||
|     CCOMMON_OPT += -mavx | |||
|   endif | |||
| endif | |||
| ifndef NO_AVX2 | |||
| ifdef HAVE_AVX2 | |||
| CCOMMON_OPT += -mavx2 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -mavx2 | |||
| endif | |||
| endif | |||
| # FMA3: skipped entirely when OLDGCC is set; otherwise pass -mfma to the C | |||
| # compiler. | |||
| ifndef OLDGCC | |||
| ifdef HAVE_FMA3 | |||
| CCOMMON_OPT += -mfma | |||
| # Keep this GCC-style ISA switch away from the NAG Fortran compiler, as the | |||
| # other HAVE_* blocks in this file already do. | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -mfma | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), SKYLAKEX) | |||
| ifndef DYNAMIC_ARCH | |||
| ifndef NO_AVX512 | |||
| CCOMMON_OPT += -march=skylake-avx512 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=skylake-avx512 | |||
| endif | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| CCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
| FCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
| @@ -56,17 +71,22 @@ endif | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), COOPERLAKE) | |||
| ifndef DYNAMIC_ARCH | |||
| ifndef NO_AVX512 | |||
| ifeq ($(C_COMPILER), GCC) | |||
| # cooperlake support was added in 10.1 | |||
| ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) | |||
| CCOMMON_OPT += -march=cooperlake | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=cooperlake | |||
| endif | |||
| else # gcc not support, fallback to avx512 | |||
| CCOMMON_OPT += -march=skylake-avx512 | |||
| ifneq ($(F_COMPILER), NAG) | |||
| FCOMMON_OPT += -march=skylake-avx512 | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(OSNAME), CYGWIN_NT) | |||
| CCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
| @@ -80,6 +100,34 @@ endif | |||
| endif | |||
| endif | |||
| endif | |||
| # Sapphire Rapids: nothing is added when NO_AVX512 is set. With GCC the | |||
| # -march value depends on the compiler version; on Cygwin, and on WINNT | |||
| # with GCC, asynchronous unwind tables are switched off as well. | |||
| ifeq ($(CORE), SAPPHIRERAPIDS) | |||
|   ifndef NO_AVX512 | |||
|     ifeq ($(C_COMPILER), GCC) | |||
|       # -march=sapphirerapids was added in GCC 11 | |||
|       ifeq ($(GCCVERSIONGTEQ11), 1) | |||
|         ifneq ($(F_COMPILER), NAG) | |||
|           FCOMMON_OPT += -march=sapphirerapids | |||
|         endif | |||
|         CCOMMON_OPT += -march=sapphirerapids | |||
|       else | |||
|         # older GCC: fall back to the Skylake-X AVX512 target | |||
|         ifneq ($(F_COMPILER), NAG) | |||
|           FCOMMON_OPT += -march=skylake-avx512 | |||
|         endif | |||
|         CCOMMON_OPT += -march=skylake-avx512 | |||
|       endif | |||
|     endif | |||
|     ifeq ($(OSNAME), CYGWIN_NT) | |||
|       FCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
|       CCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
|     endif | |||
|     ifeq ($(OSNAME), WINNT) | |||
|       ifeq ($(C_COMPILER), GCC) | |||
|         FCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
|         CCOMMON_OPT += -fno-asynchronous-unwind-tables | |||
|       endif | |||
|     endif | |||
|   endif | |||
| endif | |||
| ifdef HAVE_AVX2 | |||
| @@ -112,6 +160,7 @@ endif | |||
| endif | |||
| endif | |||
| endif | |||
| ifeq ($(OSNAME), Interix) | |||
| @@ -2,7 +2,7 @@ | |||
| [](https://gitter.im/xianyi/OpenBLAS?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) | |||
| Travis CI: [](https://travis-ci.org/xianyi/OpenBLAS) | |||
| Travis CI: [](https://travis-ci.com/xianyi/OpenBLAS) | |||
| AppVeyor: [](https://ci.appveyor.com/project/xianyi/openblas/branch/develop) | |||
| @@ -13,17 +13,21 @@ Drone CI: [ library based on GotoBLAS2 1.13 BSD version. | |||
| Please read the documentation on the OpenBLAS wiki pages: <https://github.com/xianyi/OpenBLAS/wiki>. | |||
| For a general introduction to the BLAS routines, please refer to the extensive documentation of their reference implementation hosted at netlib: | |||
| <https://www.netlib.org/blas>. On that site you will likewise find documentation for the reference implementation of the higher-level library LAPACK - the **L**inear **A**lgebra **Pack**age that comes included with OpenBLAS. If you are looking for a general primer or refresher on Linear Algebra, the set of six | |||
| 20-minute lecture videos by Prof. Gilbert Strang on either MIT OpenCourseWare <https://ocw.mit.edu/resources/res-18-010-a-2020-vision-of-linear-algebra-spring-2020/> or YouTube <https://www.youtube.com/playlist?list=PLUl4u3cNGP61iQEFiWLE21EJCxwmWvvek> may be helpful. | |||
| ## Binary Packages | |||
| We provide official binary packages for the following platform: | |||
| * Windows x86/x86_64 | |||
| You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/). | |||
| You can download them from [file hosting on sourceforge.net](https://sourceforge.net/projects/openblas/files/) or from the Releases section of the github project page, [https://github.com/xianyi/OpenBLAS/releases](https://github.com/xianyi/OpenBLAS/releases). | |||
| ## Installation from Source | |||
| @@ -124,6 +128,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th | |||
| - **Intel Sandy Bridge**: Optimized Level-3 and Level-2 BLAS with AVX on x86-64. | |||
| - **Intel Haswell**: Optimized Level-3 and Level-2 BLAS with AVX2 and FMA on x86-64. | |||
| - **Intel Skylake-X**: Optimized Level-3 and Level-2 BLAS with AVX512 and FMA on x86-64. | |||
| - **Intel Cooper Lake**: as Skylake-X with improved BFLOAT16 support. | |||
| - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. | |||
| - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thanks to Werner Saar) | |||
| - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. | |||
| @@ -149,6 +154,7 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th | |||
| - **ARMv8**: Basic ARMV8 with small caches, optimized Level-3 and Level-2 BLAS | |||
| - **Cortex-A53**: same as ARMV8 (different cpu specifications) | |||
| - **Cortex-A55**: same as ARMV8 (different cpu specifications) | |||
| - **Cortex A57**: Optimized Level-3 and Level-2 functions | |||
| - **Cortex A72**: same as A57 (different cpu specifications) | |||
| - **Cortex A73**: same as A57 (different cpu specifications) | |||
| @@ -174,10 +180,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th | |||
| #### RISC-V | |||
| - **C910V**: Optimized Leve-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1. | |||
| - **C910V**: Optimized Level-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1. | |||
| ```sh | |||
| make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran | |||
| ``` | |||
| (also known to work on C906) | |||
| ### Support for multiple targets in a single library | |||
| @@ -208,7 +215,8 @@ Please note that it is not possible to combine support for different architectur | |||
| - **Android**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-build-OpenBLAS-for-Android>. | |||
| - **AIX**: Supported on PPC up to POWER8 | |||
| - **Haiku**: Supported by the community. We don't actively test the library on this OS. | |||
| - **SunOS**: Supported by the community. We don't actively test the library on this OS: | |||
| - **SunOS**: Supported by the community. We don't actively test the library on this OS. | |||
| - **Cortex-M**: Supported by the community. Please read <https://github.com/xianyi/OpenBLAS/wiki/How-to-use-OpenBLAS-on-Cortex-M>. | |||
| ## Usage | |||
| @@ -23,6 +23,7 @@ HASWELL | |||
| SKYLAKEX | |||
| ATOM | |||
| COOPERLAKE | |||
| SAPPHIRERAPIDS | |||
| b)AMD CPU: | |||
| ATHLON | |||
| @@ -92,6 +93,9 @@ CORTEXA57 | |||
| CORTEXA72 | |||
| CORTEXA73 | |||
| NEOVERSEN1 | |||
| NEOVERSEV1 | |||
| NEOVERSEN2 | |||
| CORTEXA55 | |||
| EMAG8180 | |||
| FALKOR | |||
| THUNDERX | |||
| @@ -109,3 +113,9 @@ Z14 | |||
| RISCV64_GENERIC | |||
| C910V | |||
| 11.LOONGARCH64: | |||
| LOONGSON3R5 | |||
| 12. Elbrus E2000: | |||
| E2K | |||
| @@ -29,15 +29,15 @@ environment: | |||
| global: | |||
| CONDA_INSTALL_LOCN: C:\\Miniconda36-x64 | |||
| matrix: | |||
| - COMPILER: clang-cl | |||
| WITH_FORTRAN: yes | |||
| - COMPILER: clang-cl | |||
| DYNAMIC_ARCH: ON | |||
| WITH_FORTRAN: no | |||
| - COMPILER: cl | |||
| - COMPILER: MinGW64-gcc-7.2.0-mingw | |||
| DYNAMIC_ARCH: OFF | |||
| WITH_FORTRAN: ignore | |||
| # - COMPILER: clang-cl | |||
| # WITH_FORTRAN: ON | |||
| # - COMPILER: clang-cl | |||
| # DYNAMIC_ARCH: ON | |||
| # WITH_FORTRAN: OFF | |||
| # - COMPILER: cl | |||
| # - COMPILER: MinGW64-gcc-7.2.0-mingw | |||
| # DYNAMIC_ARCH: OFF | |||
| # WITH_FORTRAN: ignore | |||
| - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 | |||
| COMPILER: MinGW-gcc-6.3.0-32 | |||
| - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015 | |||
| @@ -46,13 +46,10 @@ environment: | |||
| install: | |||
| - if [%COMPILER%]==[clang-cl] call %CONDA_INSTALL_LOCN%\Scripts\activate.bat | |||
| - if [%COMPILER%]==[clang-cl] conda update --yes -n base conda | |||
| - if [%COMPILER%]==[clang-cl] conda config --add channels conda-forge --force | |||
| - if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake | |||
| - if [%WITH_FORTRAN%]==[no] conda install --yes --quiet ninja | |||
| - if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet -c isuruf kitware-ninja | |||
| - if [%WITH_FORTRAN%]==[yes] conda install --yes --quiet flang | |||
| - if [%COMPILER%]==[clang-cl] conda config --set auto_update_conda false | |||
| - if [%COMPILER%]==[clang-cl] conda install --yes --quiet clangdev cmake ninja flang=11.0.1 | |||
| - if [%COMPILER%]==[clang-cl] call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat" x64 | |||
| - if [%COMPILER%]==[clang-cl] set "LIB=%CONDA_INSTALL_LOCN%\Library\lib;%LIB%" | |||
| - if [%COMPILER%]==[clang-cl] set "CPATH=%CONDA_INSTALL_LOCN%\Library\include;%CPATH%" | |||
| @@ -68,15 +65,14 @@ before_build: | |||
| - if [%COMPILER%]==[MinGW64-gcc-7.2.0-mingw] cmake -G "MinGW Makefiles" -DNOFORTRAN=1 .. | |||
| - if [%COMPILER%]==[MinGW-gcc-6.3.0-32] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. | |||
| - if [%COMPILER%]==[MinGW-gcc-5.3.0] cmake -G "MSYS Makefiles" -DNOFORTRAN=1 .. | |||
| - if [%WITH_FORTRAN%]==[no] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DMSVC_STATIC_CRT=ON .. | |||
| - if [%WITH_FORTRAN%]==[yes] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. | |||
| - if [%WITH_FORTRAN%]==[OFF] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_MT=mt -DMSVC_STATIC_CRT=ON .. | |||
| - if [%WITH_FORTRAN%]==[ON] cmake -G "Ninja" -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_C_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DBUILD_WITHOUT_LAPACK=no -DNOFORTRAN=0 .. | |||
| - if [%USE_OPENMP%]==[ON] cmake -DUSE_OPENMP=ON .. | |||
| - if [%DYNAMIC_ARCH%]==[ON] cmake -DDYNAMIC_ARCH=ON -DDYNAMIC_LIST='CORE2;NEHALEM;SANDYBRIDGE;BULLDOZER;HASWELL' .. | |||
| build_script: | |||
| - cmake --build . | |||
| test_script: | |||
| - echo Running Test | |||
| - cd utest | |||
| - openblas_utest | |||
| - ctest -j2 | |||
| @@ -4,14 +4,22 @@ trigger: | |||
| branches: | |||
| include: | |||
| - develop | |||
| resources: | |||
| containers: | |||
| - container: oneapi-hpckit | |||
| image: intel/oneapi-hpckit:latest | |||
| options: '-v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/libsudo_util.so.0:/usr/lib/sudo/libsudo_util.so.0 -v /usr/lib/sudo/sudoers.so:/usr/lib/sudo/sudoers.so' | |||
| - container: oneapi-basekit | |||
| image: intel/oneapi-basekit:latest | |||
| options: '-v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/libsudo_util.so.0:/usr/lib/sudo/libsudo_util.so.0 -v /usr/lib/sudo/sudoers.so:/usr/lib/sudo/sudoers.so' | |||
| jobs: | |||
| # manylinux1 is useful to test because the | |||
| # standard Docker container uses an old version | |||
| # of gcc / glibc | |||
| - job: manylinux1_gcc | |||
| pool: | |||
| vmImage: 'ubuntu-16.04' | |||
| vmImage: 'ubuntu-latest' | |||
| steps: | |||
| - script: | | |||
| echo "FROM quay.io/pypa/manylinux1_x86_64 | |||
| @@ -27,7 +35,7 @@ jobs: | |||
| displayName: Run manylinux1 docker build | |||
| - job: Intel_SDE_skx | |||
| pool: | |||
| vmImage: 'ubuntu-16.04' | |||
| vmImage: 'ubuntu-latest' | |||
| steps: | |||
| - script: | | |||
| # at the time of writing the available Azure Ubuntu vm image | |||
| @@ -67,5 +75,189 @@ jobs: | |||
| cd utest | |||
| dir | |||
| openblas_utest.exe | |||
| - job: Windows_mingw_gmake | |||
| pool: | |||
| vmImage: 'windows-latest' | |||
| steps: | |||
| - script: | | |||
| mingw32-make CC=gcc FC=gfortran DYNAMIC_ARCH=1 DYNAMIC_LIST="NEHALEM SANDYBRIDGE HASWELL" | |||
| - job: Windows_clang_cmake | |||
| pool: | |||
| vmImage: 'windows-latest' | |||
| steps: | |||
| - script: | | |||
| set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%" | |||
| set "LIB=C:\Miniconda\Library\lib;%LIB%" | |||
| set "CPATH=C:\Miniconda\Library\include;%CPATH%" | |||
| conda config --add channels conda-forge --force | |||
| conda config --set auto_update_conda false | |||
| conda install --yes ninja | |||
| call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" | |||
| mkdir build | |||
| cd build | |||
| cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DNOFORTRAN=1 -DMSVC_STATIC_CRT=ON .. | |||
| cmake --build . --config Release | |||
| ctest | |||
| - job: Windows_flang_clang | |||
| pool: | |||
| vmImage: 'windows-latest' | |||
| steps: | |||
| - script: | | |||
| set "PATH=C:\Miniconda\Scripts;C:\Miniconda\Library\bin;C:\Miniconda\Library\usr\bin;C:\Miniconda\condabin;%PATH%" | |||
| set "LIB=C:\Miniconda\Library\lib;%LIB%" | |||
| set "CPATH=C:\Miniconda\Library\include;%CPATH%" | |||
| conda config --add channels conda-forge --force | |||
| conda config --set auto_update_conda false | |||
| conda install --yes --quiet ninja flang | |||
| mkdir build | |||
| cd build | |||
| call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" | |||
| cmake -G "Ninja" -DCMAKE_C_COMPILER=clang-cl -DCMAKE_CXX_COMPILER=clang-cl -DCMAKE_Fortran_COMPILER=flang -DCMAKE_MT=mt -DCMAKE_BUILD_TYPE=Release -DMSVC_STATIC_CRT=ON .. | |||
| cmake --build . --config Release | |||
| ctest | |||
| - job: OSX_OpenMP | |||
| pool: | |||
| vmImage: 'macOS-10.15' | |||
| steps: | |||
| - script: | | |||
| brew update | |||
| make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10 | |||
| make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10 PREFIX=../blasinst install | |||
| ls -lR ../blasinst | |||
| - job: OSX_GCC_Nothreads | |||
| pool: | |||
| vmImage: 'macOS-10.15' | |||
| steps: | |||
| - script: | | |||
| brew update | |||
| make USE_THREADS=0 CC=gcc-10 FC=gfortran-10 | |||
| - job: OSX_OpenMP_Clang | |||
| pool: | |||
| vmImage: 'macOS-10.15' | |||
| variables: | |||
| LD_LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
| LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
| steps: | |||
| - script: | | |||
| brew update | |||
| brew install llvm libomp | |||
| make TARGET=CORE2 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 CC=/usr/local/opt/llvm/bin/clang FC=gfortran-10 | |||
| - job: OSX_OpenMP_Clang_cmake | |||
| pool: | |||
| vmImage: 'macOS-10.15' | |||
| variables: | |||
| LD_LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
| LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
| steps: | |||
| - script: | | |||
| brew update | |||
| brew install llvm libomp | |||
| mkdir build | |||
| cd build | |||
| cmake -DTARGET=CORE2 -DUSE_OPENMP=1 -DINTERFACE64=1 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=/usr/local/opt/llvm/bin/clang -DNOFORTRAN=1 -DNO_AVX512=1 .. | |||
| make | |||
| ctest | |||
| - job: OSX_dynarch_cmake | |||
| pool: | |||
| vmImage: 'macOS-10.15' | |||
| variables: | |||
| LD_LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
| LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
| steps: | |||
| - script: | | |||
| mkdir build | |||
| cd build | |||
| cmake -DTARGET=CORE2 -DDYNAMIC_ARCH=1 -DCMAKE_C_COMPILER=gcc-10 -DCMAKE_Fortran_COMPILER=gfortran-10 -DBUILD_SHARED_LIBS=ON .. | |||
| cmake --build . | |||
| ctest | |||
| - job: OSX_Ifort_Clang | |||
| pool: | |||
| vmImage: 'macOS-10.15' | |||
| variables: | |||
| LD_LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
| MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/irc_nas/17643/m_HPCKit_p_2021.2.0.2903_offline.dmg | |||
| LIBRARY_PATH: /usr/local/opt/llvm/lib | |||
| MACOS_FORTRAN_COMPONENTS: intel.oneapi.mac.ifort-compiler | |||
| steps: | |||
| - script: | | |||
| brew update | |||
| brew install llvm libomp | |||
| sudo mkdir -p /opt/intel | |||
| sudo chown $USER /opt/intel | |||
| displayName: prepare for cache restore | |||
| - task: Cache@2 | |||
| inputs: | |||
| path: /opt/intel/oneapi | |||
| key: '"install" | "$(MACOS_HPCKIT_URL)" | "$(MACOS_FORTRAN_COMPONENTS)"' | |||
| cacheHitVar: CACHE_RESTORED | |||
| - script: | | |||
| curl --output webimage.dmg --url $(MACOS_HPCKIT_URL) --retry 5 --retry-delay 5 | |||
| hdiutil attach webimage.dmg | |||
| sudo /Volumes/"$(basename "$(MACOS_HPCKIT_URL)" .dmg)"/bootstrapper.app/Contents/MacOS/bootstrapper -s --action install --components="$(MACOS_FORTRAN_COMPONENTS)" --eula=accept --continue-with-optional-error=yes --log-dir=. | |||
| installer_exit_code=$? | |||
| hdiutil detach /Volumes/"$(basename "$(MACOS_HPCKIT_URL)" .dmg)" -quiet | |||
| exit $installer_exit_code | |||
| displayName: install | |||
| condition: ne(variables.CACHE_RESTORED, 'true') | |||
| - script: | | |||
| source /opt/intel/oneapi/setvars.sh | |||
| make CC=/usr/local/opt/llvm/bin/clang FC=ifort | |||
| - job: OSX_NDK_ARMV7 | |||
| pool: | |||
| vmImage: 'macOS-10.15' | |||
| steps: | |||
| - script: | | |||
| brew update | |||
| brew install --cask android-ndk | |||
| export ANDROID_NDK_HOME=/usr/local/share/android-ndk | |||
| make TARGET=ARMV7 ONLY_CBLAS=1 CC=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/armv7a-linux-androideabi21-clang AR=$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/darwin-x86_64/bin/llvm-ar HOSTCC=gcc ARM_SOFTFP_ABI=1 -j4 | |||
| - job: OSX_IOS_ARMV8 | |||
| pool: | |||
| vmImage: 'macOS-11' | |||
| variables: | |||
| CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
| CFLAGS: -O2 -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch arm64 -miphoneos-version-min=10.0 | |||
| steps: | |||
| - script: | | |||
| make TARGET=ARMV8 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 | |||
| - job: OSX_IOS_ARMV7 | |||
| pool: | |||
| vmImage: 'macOS-10.15' | |||
| variables: | |||
| CC: /Applications/Xcode_12.4.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin/clang | |||
| CFLAGS: -O2 -mno-thumb -Wno-macro-redefined -isysroot /Applications/Xcode_12.4.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS14.4.sdk -arch armv7 -miphoneos-version-min=5.1 | |||
| steps: | |||
| - script: | | |||
| make TARGET=ARMV7 DYNAMIC_ARCH=1 NUM_THREADS=32 HOSTCC=clang NOFORTRAN=1 | |||
| - job: ALPINE_MUSL | |||
| pool: | |||
| vmImage: 'ubuntu-latest' | |||
| steps: | |||
| - script: | | |||
| wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.2/alpine-chroot-install \ | |||
| && echo '60c7e0b5d82e21d1a549fc9a46ba3b36688c09dc alpine-chroot-install' | sha1sum -c \ | |||
| || exit 1 | |||
| alpine() { /alpine/enter-chroot -u "$USER" "$@"; } | |||
| sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo' | |||
| alpine make DYNAMIC_ARCH=1 BINARY=64 | |||
| alpine make DYNAMIC_ARCH=1 BINARY=64 PREFIX=mytestdir install | |||
| alpine ls -l mytestdir/include | |||
| alpine echo "// tests that inclusion of openblas_config.h works with musl" >test_install.c | |||
| alpine echo "#include <openblas_config.h>" >>test_install.c | |||
| alpine echo "int main(){" >> test_install.c | |||
| alpine echo "cpu_set_t* cpu_set = NULL;}" >>test_install.c | |||
| alpine gcc -Imytestdir/include test_install.c -Lmytestdir/lib -lopenblas -lpthread -lgfortran -o test_install | |||
| @@ -3,6 +3,8 @@ | |||
| #include <time.h> | |||
| #ifdef __CYGWIN32__ | |||
| #include <sys/time.h> | |||
| #elif defined(__APPLE__) | |||
| #include <mach/mach_time.h> | |||
| #endif | |||
| #include "common.h" | |||
| @@ -74,6 +76,9 @@ static void *huge_malloc(BLASLONG size){ | |||
| #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) | |||
| struct timeval start, stop; | |||
| #elif defined(__APPLE__) | |||
| mach_timebase_info_data_t info; | |||
| uint64_t start = 0, stop = 0; | |||
| #else | |||
| struct timespec start = { 0, 0 }, stop = { 0, 0 }; | |||
| #endif | |||
| @@ -82,6 +87,9 @@ double getsec() | |||
| { | |||
| #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) | |||
| return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
| #elif defined(__APPLE__) | |||
| /* clock_gettime_nsec_np() already returns nanoseconds, so no mach timebase scaling is applied */ | |||
| return (double)(stop - start) * 1.e-9; | |||
| #else | |||
| return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9; | |||
| #endif | |||
| @@ -90,6 +98,8 @@ double getsec() | |||
| void begin() { | |||
| #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) | |||
| gettimeofday( &start, (struct timezone *)0); | |||
| #elif defined(__APPLE__) | |||
| start = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); | |||
| #else | |||
| clock_gettime(CLOCK_REALTIME, &start); | |||
| #endif | |||
| @@ -98,7 +108,9 @@ void begin() { | |||
| void end() { | |||
| #if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) | |||
| gettimeofday( &stop, (struct timezone *)0); | |||
| #elif defined(__APPLE__) | |||
| stop = clock_gettime_nsec_np(CLOCK_UPTIME_RAW); | |||
| #else | |||
| clock_gettime(CLOCK_REALTIME, &stop); | |||
| #endif | |||
| } | |||
| } | |||
| @@ -125,7 +125,7 @@ int main(int argc, char *argv[]){ | |||
| fprintf(stderr, " %6dx%d : ", (int)m,(int)n); | |||
| for(j = 0; j < m; j++){ | |||
| for(i = 0; i < n * COMPSIZE; i++){ | |||
| a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| } | |||
| @@ -162,7 +162,7 @@ int main(int argc, char *argv[]){ | |||
| fprintf(stderr, " %6dx%d : ", (int)m,(int)n); | |||
| for(j = 0; j < m; j++){ | |||
| for(i = 0; i < n * COMPSIZE; i++){ | |||
| a[(long)j + (long)i * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| } | |||
| @@ -72,13 +72,17 @@ int main(int argc, char *argv[]){ | |||
| FLOAT *a,*work; | |||
| FLOAT wkopt[4]; | |||
| blasint *ipiv; | |||
| blasint m, i, j, info,lwork; | |||
| blasint m, i, j, l, info,lwork; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| int loops = 1; | |||
| double time1; | |||
| double time1,timeg; | |||
| char *p; | |||
| char btest = 'I'; | |||
| argc--;argv++; | |||
| @@ -86,6 +90,9 @@ int main(int argc, char *argv[]){ | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| if ((p = getenv("OPENBLAS_TEST"))) btest=*p; | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d\n", from, to, step); | |||
| @@ -124,32 +131,41 @@ int main(int argc, char *argv[]){ | |||
| fprintf(stderr, " SIZE FLops Time Lwork\n"); | |||
| for(m = from; m <= to; m += step){ | |||
| timeg = 0.; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| GETRF (&m, &m, a, &m, ipiv, &info); | |||
| for (l = 0; l < loops; l++) { | |||
| if (btest == 'F') begin(); | |||
| GETRF (&m, &m, a, &m, ipiv, &info); | |||
| if (btest == 'F') { | |||
| end(); | |||
| timeg += getsec(); | |||
| } | |||
| if (info) { | |||
| fprintf(stderr, "Matrix is singular .. %d\n", info); | |||
| exit(1); | |||
| } | |||
| begin(); | |||
| if (btest == 'I') begin(); | |||
| lwork = -1; | |||
| GETRI(&m, a, &m, ipiv, wkopt, &lwork, &info); | |||
| lwork = (blasint)wkopt[0]; | |||
| GETRI(&m, a, &m, ipiv, work, &lwork, &info); | |||
| end(); | |||
| if (btest == 'I') end(); | |||
| if (info) { | |||
| fprintf(stderr, "failed compute inverse matrix .. %d\n", info); | |||
| exit(1); | |||
| } | |||
| time1 = getsec(); | |||
| if (btest == 'I') | |||
| timeg += getsec(); | |||
| } // loops | |||
| time1 = timeg/(double)loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops : %10.2f Sec : %d\n", | |||
| COMPSIZE * COMPSIZE * (4.0/3.0 * (double)m * (double)m *(double)m - (double)m *(double)m + 5.0/3.0* (double)m) / time1 * 1.e-6,time1,lwork); | |||
| @@ -72,17 +72,21 @@ int main(int argc, char *argv[]){ | |||
| FLOAT *a, *b; | |||
| blasint *ipiv; | |||
| blasint m, i, j, info; | |||
| blasint m, i, j, l, info; | |||
| blasint unit = 1; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| int loops = 1; | |||
| FLOAT maxerr; | |||
| double time1, time2; | |||
| double time1, time2, timeg1,timeg2; | |||
| char *p; | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p); | |||
| argc--;argv++; | |||
| if (argc > 0) { from = atol(*argv); argc--; argv++;} | |||
| @@ -110,9 +114,9 @@ int main(int argc, char *argv[]){ | |||
| fprintf(stderr, " SIZE Residual Decompose Solve Total\n"); | |||
| for(m = from; m <= to; m += step){ | |||
| timeg1 = timeg2 = 0.; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l = 0; l < loops; l++) { | |||
| for(j = 0; j < m; j++){ | |||
| for(i = 0; i < m * COMPSIZE; i++){ | |||
| a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| @@ -138,7 +142,7 @@ int main(int argc, char *argv[]){ | |||
| exit(1); | |||
| } | |||
| time1 = getsec(); | |||
| timeg1 += getsec(); | |||
| begin(); | |||
| @@ -151,8 +155,10 @@ int main(int argc, char *argv[]){ | |||
| exit(1); | |||
| } | |||
| time2 = getsec(); | |||
| timeg2 += getsec(); | |||
| } //loops | |||
| time1=timeg1/(double)loops; | |||
| time2=timeg2/(double)loops; | |||
| maxerr = 0.; | |||
| for(i = 0; i < m; i++){ | |||
| @@ -99,14 +99,15 @@ int main(int argc, char *argv[]){ | |||
| char *p; | |||
| char btest = 'F'; | |||
| blasint m, i, j, info, uplos=0; | |||
| double flops; | |||
| blasint m, i, j, l, info, uplos=0; | |||
| double flops = 0.; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| int loops = 1; | |||
| double time1; | |||
| double time1, timeg; | |||
| argc--;argv++; | |||
| @@ -119,6 +120,8 @@ int main(int argc, char *argv[]){ | |||
| if ((p = getenv("OPENBLAS_TEST"))) btest=*p; | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p); | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Uplo = %c\n", from, to, step,*uplo[uplos]); | |||
| if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ | |||
| @@ -129,19 +132,21 @@ int main(int argc, char *argv[]){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| for(m = from; m <= to; m += step){ | |||
| for(m = from; m <= to; m += step){ | |||
| timeg=0.; | |||
| for (l = 0; l < loops; l++) { | |||
| #ifndef COMPLEX | |||
| if (uplos & 1) { | |||
| for (j = 0; j < m; j++) { | |||
| for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = 0.; | |||
| a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; | |||
| a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; | |||
| for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5; | |||
| } | |||
| } else { | |||
| for (j = 0; j < m; j++) { | |||
| for(i = 0; i < j; i++) a[(long)i + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) - 0.5; | |||
| a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; | |||
| a[(long)j + (long)j * (long)m] = ((double) rand() / (double) RAND_MAX) + 8.; | |||
| for(i = j + 1; i < m; i++) a[(long)i + (long)j * (long)m] = 0.; | |||
| } | |||
| } | |||
| @@ -192,8 +197,8 @@ int main(int argc, char *argv[]){ | |||
| exit(1); | |||
| } | |||
| time1 = getsec(); | |||
| flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6; | |||
| if ( btest == 'F') | |||
| timeg += getsec(); | |||
| if ( btest == 'S' ) | |||
| { | |||
| @@ -214,9 +219,7 @@ int main(int argc, char *argv[]){ | |||
| fprintf(stderr, "Potrs info = %d\n", info); | |||
| exit(1); | |||
| } | |||
| time1 = getsec(); | |||
| flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6; | |||
| timeg += getsec(); | |||
| } | |||
| if ( btest == 'I' ) | |||
| @@ -232,11 +235,17 @@ int main(int argc, char *argv[]){ | |||
| fprintf(stderr, "Potri info = %d\n", info); | |||
| exit(1); | |||
| } | |||
| time1 = getsec(); | |||
| flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6; | |||
| timeg += getsec(); | |||
| } | |||
| } // loops | |||
| time1 = timeg/(double)loops; | |||
| if ( btest == 'F') | |||
| flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6; | |||
| if ( btest == 'S') | |||
| flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6; | |||
| if ( btest == 'I') | |||
| flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6; | |||
| fprintf(stderr, "%8d : %10.2f MFlops : %10.3f Sec : Test=%c\n",(int)m,flops ,time1,btest); | |||
| @@ -46,14 +46,17 @@ int main(int argc, char *argv[]){ | |||
| if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; | |||
| blasint m, i, j; | |||
| blasint m, i, j, l; | |||
| blasint inc_x= 1; | |||
| blasint inc_y= 1; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| int loops = 1; | |||
| double time1; | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p); | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| @@ -85,8 +88,9 @@ int main(int argc, char *argv[]){ | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg = 0.; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for (l = 0; l < loops; l++) { | |||
| for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| @@ -107,8 +111,10 @@ int main(int argc, char *argv[]){ | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += getsec(); | |||
| } // loops | |||
| time1 = timeg/(double)loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m * (double)m / time1 * 1.e-6); | |||
| @@ -56,17 +56,20 @@ int main(int argc, char *argv[]){ | |||
| char uplo='U'; | |||
| char trans='N'; | |||
| if ((p = getenv("OPENBLAS_UPLO"))) uplo=*p; | |||
| if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; | |||
| blasint m, i, j; | |||
| blasint m, i, j, l; | |||
| int from = 1; | |||
| int to = 200; | |||
| int step = 1; | |||
| int loops = 1; | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops=atoi(p); | |||
| double time1; | |||
| double time1,timeg; | |||
| argc--;argv++; | |||
| @@ -95,9 +98,12 @@ int main(int argc, char *argv[]){ | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg = 0.; | |||
| fprintf(stderr, " %6d : ", (int)m); | |||
| for(l = 0; l < loops; l++) { | |||
| for(j = 0; j < m; j++){ | |||
| for(i = 0; i < m * COMPSIZE; i++){ | |||
| a[(long)i + (long)j * (long)m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| @@ -111,8 +117,10 @@ int main(int argc, char *argv[]){ | |||
| end(); | |||
| time1 = getsec(); | |||
| timeg += getsec(); | |||
| } //loops | |||
| time1 = timeg / (double)loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6); | |||
| @@ -1,11 +1,11 @@ | |||
| #!/usr/bin/perl | |||
| #!/usr/bin/env perl | |||
| #use File::Basename; | |||
| # use File::Temp qw(tempfile); | |||
| # Checking cross compile | |||
| $hostos = `uname -s | sed -e s/\-.*//`; chop($hostos); | |||
| $hostarch = `uname -m | sed -e s/i.86/x86/`;chop($hostarch); | |||
| $hostarch = `uname -m | sed -e s/i.86/x86/`; | |||
| $hostarch = `uname -p` if ($hostos eq "AIX" || $hostos eq "SunOS"); | |||
| chop($hostarch); | |||
| $hostarch = "x86_64" if ($hostarch eq "amd64"); | |||
| @@ -82,18 +82,20 @@ $os = Interix if ($data =~ /OS_INTERIX/); | |||
| $os = Android if ($data =~ /OS_ANDROID/); | |||
| $os = Haiku if ($data =~ /OS_HAIKU/); | |||
| $architecture = x86 if ($data =~ /ARCH_X86/); | |||
| $architecture = x86_64 if ($data =~ /ARCH_X86_64/); | |||
| $architecture = power if ($data =~ /ARCH_POWER/); | |||
| $architecture = mips if ($data =~ /ARCH_MIPS/); | |||
| $architecture = mips64 if ($data =~ /ARCH_MIPS64/); | |||
| $architecture = alpha if ($data =~ /ARCH_ALPHA/); | |||
| $architecture = sparc if ($data =~ /ARCH_SPARC/); | |||
| $architecture = ia64 if ($data =~ /ARCH_IA64/); | |||
| $architecture = arm if ($data =~ /ARCH_ARM/); | |||
| $architecture = arm64 if ($data =~ /ARCH_ARM64/); | |||
| $architecture = zarch if ($data =~ /ARCH_ZARCH/); | |||
| $architecture = riscv64 if ($data =~ /ARCH_RISCV64/); | |||
| $architecture = x86 if ($data =~ /ARCH_X86/); | |||
| $architecture = x86_64 if ($data =~ /ARCH_X86_64/); | |||
| $architecture = e2k if ($data =~ /ARCH_E2K/); | |||
| $architecture = power if ($data =~ /ARCH_POWER/); | |||
| $architecture = mips if ($data =~ /ARCH_MIPS/); | |||
| $architecture = mips64 if ($data =~ /ARCH_MIPS64/); | |||
| $architecture = alpha if ($data =~ /ARCH_ALPHA/); | |||
| $architecture = sparc if ($data =~ /ARCH_SPARC/); | |||
| $architecture = ia64 if ($data =~ /ARCH_IA64/); | |||
| $architecture = arm if ($data =~ /ARCH_ARM/); | |||
| $architecture = arm64 if ($data =~ /ARCH_ARM64/); | |||
| $architecture = zarch if ($data =~ /ARCH_ZARCH/); | |||
| $architecture = riscv64 if ($data =~ /ARCH_RISCV64/); | |||
| $architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); | |||
| $defined = 0; | |||
| @@ -123,6 +125,11 @@ if ($architecture eq "zarch") { | |||
| $binary = 64; | |||
| } | |||
| if ($architecture eq "e2k") { | |||
| $defined = 1; | |||
| $binary = 64; | |||
| } | |||
| if ($architecture eq "alpha") { | |||
| $defined = 1; | |||
| $binary = 64; | |||
| @@ -143,6 +150,11 @@ if ($architecture eq "riscv64") { | |||
| $binary = 64; | |||
| } | |||
| if ($architecture eq "loongarch64") { | |||
| $defined = 1; | |||
| $binary = 64; | |||
| } | |||
| if ($compiler eq "PGI") { | |||
| $compiler_name .= " -tp p7" if ($binary eq "32"); | |||
| $compiler_name .= " -tp p7-64" if ($binary eq "64"); | |||
| @@ -199,7 +211,7 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { | |||
| } else { | |||
| $tmpf = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); | |||
| $code = '"addvi.b $w0, $w1, 1"'; | |||
| $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; | |||
| $msa_flags = "-mmsa -mfp64 -mload-store-pairs"; | |||
| print $tmpf "#include <msa.h>\n\n"; | |||
| print $tmpf "void main(void){ __asm__ volatile($code); }\n"; | |||
| @@ -215,17 +227,19 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { | |||
| } | |||
| } | |||
| $architecture = x86 if ($data =~ /ARCH_X86/); | |||
| $architecture = x86_64 if ($data =~ /ARCH_X86_64/); | |||
| $architecture = power if ($data =~ /ARCH_POWER/); | |||
| $architecture = mips if ($data =~ /ARCH_MIPS/); | |||
| $architecture = mips64 if ($data =~ /ARCH_MIPS64/); | |||
| $architecture = alpha if ($data =~ /ARCH_ALPHA/); | |||
| $architecture = sparc if ($data =~ /ARCH_SPARC/); | |||
| $architecture = ia64 if ($data =~ /ARCH_IA64/); | |||
| $architecture = arm if ($data =~ /ARCH_ARM/); | |||
| $architecture = arm64 if ($data =~ /ARCH_ARM64/); | |||
| $architecture = zarch if ($data =~ /ARCH_ZARCH/); | |||
| $architecture = x86 if ($data =~ /ARCH_X86/); | |||
| $architecture = x86_64 if ($data =~ /ARCH_X86_64/); | |||
| $architecture = e2k if ($data =~ /ARCH_E2K/); | |||
| $architecture = power if ($data =~ /ARCH_POWER/); | |||
| $architecture = mips if ($data =~ /ARCH_MIPS/); | |||
| $architecture = mips64 if ($data =~ /ARCH_MIPS64/); | |||
| $architecture = alpha if ($data =~ /ARCH_ALPHA/); | |||
| $architecture = sparc if ($data =~ /ARCH_SPARC/); | |||
| $architecture = ia64 if ($data =~ /ARCH_IA64/); | |||
| $architecture = arm if ($data =~ /ARCH_ARM/); | |||
| $architecture = arm64 if ($data =~ /ARCH_ARM64/); | |||
| $architecture = zarch if ($data =~ /ARCH_ZARCH/); | |||
| $architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/); | |||
| $binformat = bin32; | |||
| $binformat = bin64 if ($data =~ /BINARY_64/); | |||
| @@ -125,9 +125,14 @@ void cblas_zswap(OPENBLAS_CONST blasint n, void *x, OPENBLAS_CONST blasint incx, | |||
| void cblas_srot(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s); | |||
| void cblas_drot(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s); | |||
| void cblas_csrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float c, OPENBLAS_CONST float s); | |||
| void cblas_zdrot(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double c, OPENBLAS_CONST double s); | |||
| void cblas_srotg(float *a, float *b, float *c, float *s); | |||
| void cblas_drotg(double *a, double *b, double *c, double *s); | |||
| void cblas_crotg(void *a, void *b, float *c, void *s); | |||
| void cblas_zrotg(void *a, void *b, double *c, void *s); | |||
| void cblas_srotm(OPENBLAS_CONST blasint N, float *X, OPENBLAS_CONST blasint incX, float *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST float *P); | |||
| void cblas_drotm(OPENBLAS_CONST blasint N, double *X, OPENBLAS_CONST blasint incX, double *Y, OPENBLAS_CONST blasint incY, OPENBLAS_CONST double *P); | |||
| @@ -395,6 +400,8 @@ void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPE | |||
| float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy); | |||
| void cblas_sbgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K, | |||
| OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc); | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif /* __cplusplus */ | |||
| @@ -44,7 +44,10 @@ endif () | |||
| if (DYNAMIC_ARCH) | |||
| if (ARM64) | |||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110) | |||
| set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 NEOVERSEV1 NEOVERSEN2 THUNDERX3T110) | |||
| if (DYNAMIC_LIST) | |||
| set(DYNAMIC_CORE ARMV8 ${DYNAMIC_LIST}) | |||
| endif () | |||
| endif () | |||
| if (POWER) | |||
| @@ -106,7 +109,11 @@ if (${ARCH} STREQUAL "ia64") | |||
| endif () | |||
| endif () | |||
| if (MIPS64) | |||
| if (MIPS32 OR MIPS64) | |||
| set(NO_BINARY_MODE 1) | |||
| endif () | |||
| if (LOONGARCH64) | |||
| set(NO_BINARY_MODE 1) | |||
| endif () | |||
| @@ -15,6 +15,11 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS | |||
| if (NO_BINARY_MODE) | |||
| if (MIPS32) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=32") | |||
| set(BINARY_DEFINED 1) | |||
| endif () | |||
| if (MIPS64) | |||
| if (BINARY64) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=64") | |||
| @@ -29,6 +34,15 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -march=mips64") | |||
| endif () | |||
| if (LOONGARCH64) | |||
| if (BINARY64) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64") | |||
| else () | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32") | |||
| endif () | |||
| set(BINARY_DEFINED 1) | |||
| endif () | |||
| if (CMAKE_SYSTEM_NAME STREQUAL "AIX") | |||
| set(BINARY_DEFINED 1) | |||
| endif () | |||
| @@ -117,6 +131,65 @@ if (${CORE} STREQUAL COOPERLAKE) | |||
| endif () | |||
| endif () | |||
| if (${CORE} STREQUAL SAPPHIRERAPIDS) | |||
| if (NOT DYNAMIC_ARCH) | |||
| if (NOT NO_AVX512) | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=sapphirerapids") | |||
| else () | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=skylake-avx512") | |||
| endif() | |||
| endif () | |||
| endif () | |||
| endif () | |||
| if (${CORE} STREQUAL A64FX) | |||
| if (NOT DYNAMIC_ARCH) | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 11.0 OR ${GCC_VERSION} VERSION_EQUAL 11.0) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve -mtune=a64fx") | |||
| else () | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8.2-a+sve") | |||
| endif() | |||
| endif () | |||
| endif () | |||
| if (${CORE} STREQUAL ARMV8SVE) | |||
| if (NOT DYNAMIC_ARCH) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -march=armv8-a+sve") | |||
| endif () | |||
| endif () | |||
| # POWER10, single-target build: requires a compiler reporting version 10.2 or newer; | |||
| # configuration stops with a fatal error otherwise. | |||
| if (${CORE} STREQUAL POWER10) | |||
| if (NOT DYNAMIC_ARCH) | |||
| # Strip the trailing newline of -dumpversion so it does not end up inside the error text below. | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) | |||
| # Quoted so an empty probe result reaches the FATAL_ERROR branch instead of a malformed if(). | |||
| if ("${GCC_VERSION}" VERSION_GREATER 10.2 OR "${GCC_VERSION}" VERSION_EQUAL 10.2) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math") | |||
| else () | |||
| message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10." ) | |||
| endif() | |||
| endif () | |||
| endif () | |||
| # POWER9, single-target build: use power9 flags when the compiler reports version 5.0 or newer; | |||
| # otherwise fall back to power8 flags and emit a warning. | |||
| if (${CORE} STREQUAL POWER9) | |||
| if (NOT DYNAMIC_ARCH) | |||
| # Strip the trailing newline of -dumpversion so it does not end up inside the warning text below. | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) | |||
| # Quoted so an empty probe result takes the fallback branch instead of producing a malformed if(). | |||
| if ("${GCC_VERSION}" VERSION_GREATER 5.0 OR "${GCC_VERSION}" VERSION_EQUAL 5.0) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math") | |||
| else () | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") | |||
| message(WARNING "Compiler GCC.${GCC_VERSION} does not fully support Power9.") | |||
| endif () | |||
| endif () | |||
| endif () | |||
| # POWER8 core in a single-target (non-DYNAMIC_ARCH) build: append the power8 CPU/tuning flags. | |||
| if (${CORE} STREQUAL POWER8 AND NOT DYNAMIC_ARCH) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") | |||
| endif () | |||
| if (NOT DYNAMIC_ARCH) | |||
| if (HAVE_AVX2) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -mavx2") | |||
| @@ -124,9 +197,9 @@ if (NOT DYNAMIC_ARCH) | |||
| if (HAVE_AVX) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -mavx") | |||
| endif () | |||
| if (HAVE_FMA3) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -mfma") | |||
| endif () | |||
| # if (HAVE_FMA3) | |||
| #set (CCOMMON_OPT "${CCOMMON_OPT} -mfma") | |||
| #endif () | |||
| if (HAVE_SSE) | |||
| set (CCOMMON_OPT "${CCOMMON_OPT} -msse") | |||
| endif () | |||
| @@ -20,19 +20,16 @@ | |||
| # NEEDBUNDERSCORE | |||
| # NEED2UNDERSCORES | |||
| # Probe for a Fortran compiler without failing the configure step when none is present. | |||
| # If one is found the Fortran language is enabled; otherwise NOFORTRAN and NO_LAPACK are set. | |||
| include(CheckLanguage) | |||
| check_language(Fortran) | |||
| if(CMAKE_Fortran_COMPILER) | |||
| enable_language(Fortran) | |||
| else() | |||
| # Only mention LAPACK when the user had not already disabled it. | |||
| if (NOT NO_LAPACK) | |||
| message(STATUS "No Fortran compiler found, can build only BLAS but not LAPACK") | |||
| endif() | |||
| set (NOFORTRAN 1) | |||
| set (NO_LAPACK 1) | |||
| endif() | |||
| if (NOT ONLY_CBLAS) | |||
| @@ -3,11 +3,6 @@ | |||
| ## Description: Ported from portion of OpenBLAS/Makefile.system | |||
| ## Sets Fortran related variables. | |||
| # When INTERFACE64 is enabled, define both SUFFIX64 variables (plain and underscore-prefixed). | |||
| if (INTERFACE64) | |||
| set(SUFFIX64_UNDERSCORE _64) | |||
| set(SUFFIX64 64) | |||
| endif() | |||
| if (${F_COMPILER} STREQUAL "FLANG") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_FLANG") | |||
| if (BINARY64 AND INTERFACE64) | |||
| @@ -61,6 +56,13 @@ if (${F_COMPILER} STREQUAL "GFORTRAN") | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=n32") | |||
| endif () | |||
| endif () | |||
| # LoongArch64: append the Fortran ABI flag chosen by BINARY64. | |||
| if (LOONGARCH64) | |||
| if (NOT BINARY64) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32") | |||
| else () | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64") | |||
| endif () | |||
| endif () | |||
| else () | |||
| if (BINARY64) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -m64") | |||
| @@ -97,7 +99,7 @@ endif () | |||
| if (${F_COMPILER} STREQUAL "IBM") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DF_INTERFACE_IBM") | |||
| # FCOMMON_OPT += -qarch=440 | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -qrecur") | |||
| if (BINARY64) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -q64") | |||
| if (INTERFACE64) | |||
| @@ -1,212 +1,218 @@ | |||
| # helper functions for the kernel CMakeLists.txt | |||
| # SetFallback(<variable-name> <value>) | |||
| # Assigns <value> to <variable-name> in the caller's scope, but only when no variable of that | |||
| # name is defined yet; an existing definition is left untouched. | |||
| function(SetFallback KERNEL SOURCE_PATH) | |||
| if (DEFINED ${KERNEL}) | |||
| return() | |||
| endif () | |||
| set(${KERNEL} ${SOURCE_PATH} PARENT_SCOPE) | |||
| endfunction() | |||
| # Set the default filenames for L1 objects. Most of these will be overridden by the appropriate KERNEL file. | |||
| macro(SetDefaultL1) | |||
| # Assigns a default source file to every level-1 kernel variable (plus the GEMV, CABS, LSAME, | |||
| # AXPBY and SUM entries listed below). | |||
| # NOTE(review): each variable is first assigned unconditionally with set() and is then passed to | |||
| # SetFallback() with the same value. SetFallback only assigns undefined variables, so those later | |||
| # calls change nothing. This looks like the pre- and post-change lines of a patch shown | |||
| # together - confirm which form is intended before relying on override behaviour. | |||
| set(SAMAXKERNEL amax.S) | |||
| set(DAMAXKERNEL amax.S) | |||
| set(QAMAXKERNEL amax.S) | |||
| set(CAMAXKERNEL zamax.S) | |||
| set(ZAMAXKERNEL zamax.S) | |||
| set(XAMAXKERNEL zamax.S) | |||
| set(SAMINKERNEL amin.S) | |||
| set(DAMINKERNEL amin.S) | |||
| set(QAMINKERNEL amin.S) | |||
| set(CAMINKERNEL zamin.S) | |||
| set(ZAMINKERNEL zamin.S) | |||
| set(XAMINKERNEL zamin.S) | |||
| set(SMAXKERNEL max.S) | |||
| set(DMAXKERNEL max.S) | |||
| set(QMAXKERNEL max.S) | |||
| set(SMINKERNEL min.S) | |||
| set(DMINKERNEL min.S) | |||
| set(QMINKERNEL min.S) | |||
| set(ISAMAXKERNEL iamax.S) | |||
| set(IDAMAXKERNEL iamax.S) | |||
| set(IQAMAXKERNEL iamax.S) | |||
| set(ICAMAXKERNEL izamax.S) | |||
| set(IZAMAXKERNEL izamax.S) | |||
| set(IXAMAXKERNEL izamax.S) | |||
| set(ISAMINKERNEL iamin.S) | |||
| set(IDAMINKERNEL iamin.S) | |||
| set(IQAMINKERNEL iamin.S) | |||
| set(ICAMINKERNEL izamin.S) | |||
| set(IZAMINKERNEL izamin.S) | |||
| set(IXAMINKERNEL izamin.S) | |||
| set(ISMAXKERNEL iamax.S) | |||
| set(IDMAXKERNEL iamax.S) | |||
| set(IQMAXKERNEL iamax.S) | |||
| set(ISMINKERNEL iamin.S) | |||
| set(IDMINKERNEL iamin.S) | |||
| set(IQMINKERNEL iamin.S) | |||
| set(SASUMKERNEL asum.S) | |||
| set(DASUMKERNEL asum.S) | |||
| set(CASUMKERNEL zasum.S) | |||
| set(ZASUMKERNEL zasum.S) | |||
| set(QASUMKERNEL asum.S) | |||
| set(XASUMKERNEL zasum.S) | |||
| set(SAXPYKERNEL axpy.S) | |||
| set(DAXPYKERNEL axpy.S) | |||
| set(CAXPYKERNEL zaxpy.S) | |||
| set(ZAXPYKERNEL zaxpy.S) | |||
| set(QAXPYKERNEL axpy.S) | |||
| set(XAXPYKERNEL zaxpy.S) | |||
| set(SCOPYKERNEL copy.S) | |||
| set(DCOPYKERNEL copy.S) | |||
| set(CCOPYKERNEL zcopy.S) | |||
| set(ZCOPYKERNEL zcopy.S) | |||
| set(QCOPYKERNEL copy.S) | |||
| set(XCOPYKERNEL zcopy.S) | |||
| set(SDOTKERNEL dot.S) | |||
| set(DDOTKERNEL dot.S) | |||
| set(CDOTKERNEL zdot.S) | |||
| set(ZDOTKERNEL zdot.S) | |||
| set(QDOTKERNEL dot.S) | |||
| set(XDOTKERNEL zdot.S) | |||
| set(SNRM2KERNEL nrm2.S) | |||
| set(DNRM2KERNEL nrm2.S) | |||
| set(QNRM2KERNEL nrm2.S) | |||
| set(CNRM2KERNEL znrm2.S) | |||
| set(ZNRM2KERNEL znrm2.S) | |||
| set(XNRM2KERNEL znrm2.S) | |||
| set(SROTKERNEL rot.S) | |||
| set(DROTKERNEL rot.S) | |||
| set(QROTKERNEL rot.S) | |||
| set(CROTKERNEL zrot.S) | |||
| set(ZROTKERNEL zrot.S) | |||
| set(XROTKERNEL zrot.S) | |||
| set(SSCALKERNEL scal.S) | |||
| set(DSCALKERNEL scal.S) | |||
| set(CSCALKERNEL zscal.S) | |||
| set(ZSCALKERNEL zscal.S) | |||
| set(QSCALKERNEL scal.S) | |||
| set(XSCALKERNEL zscal.S) | |||
| set(SSWAPKERNEL swap.S) | |||
| set(DSWAPKERNEL swap.S) | |||
| set(CSWAPKERNEL zswap.S) | |||
| set(ZSWAPKERNEL zswap.S) | |||
| set(QSWAPKERNEL swap.S) | |||
| set(XSWAPKERNEL zswap.S) | |||
| set(SGEMVNKERNEL gemv_n.S) | |||
| set(SGEMVTKERNEL gemv_t.S) | |||
| set(DGEMVNKERNEL gemv_n.S) | |||
| set(DGEMVTKERNEL gemv_t.S) | |||
| set(CGEMVNKERNEL zgemv_n.S) | |||
| set(CGEMVTKERNEL zgemv_t.S) | |||
| set(ZGEMVNKERNEL zgemv_n.S) | |||
| set(ZGEMVTKERNEL zgemv_t.S) | |||
| set(QGEMVNKERNEL gemv_n.S) | |||
| set(QGEMVTKERNEL gemv_t.S) | |||
| set(XGEMVNKERNEL zgemv_n.S) | |||
| set(XGEMVTKERNEL zgemv_t.S) | |||
| set(SCABS_KERNEL ../generic/cabs.c) | |||
| set(DCABS_KERNEL ../generic/cabs.c) | |||
| set(QCABS_KERNEL ../generic/cabs.c) | |||
| set(LSAME_KERNEL ../generic/lsame.c) | |||
| set(SAXPBYKERNEL ../arm/axpby.c) | |||
| set(DAXPBYKERNEL ../arm/axpby.c) | |||
| set(CAXPBYKERNEL ../arm/zaxpby.c) | |||
| set(ZAXPBYKERNEL ../arm/zaxpby.c) | |||
| set(SSUMKERNEL sum.S) | |||
| set(DSUMKERNEL sum.S) | |||
| set(CSUMKERNEL zsum.S) | |||
| set(ZSUMKERNEL zsum.S) | |||
| set(QSUMKERNEL sum.S) | |||
| set(XSUMKERNEL zsum.S) | |||
| SetFallback(SAMAXKERNEL amax.S) | |||
| SetFallback(DAMAXKERNEL amax.S) | |||
| SetFallback(QAMAXKERNEL amax.S) | |||
| SetFallback(CAMAXKERNEL zamax.S) | |||
| SetFallback(ZAMAXKERNEL zamax.S) | |||
| SetFallback(XAMAXKERNEL zamax.S) | |||
| SetFallback(SAMINKERNEL amin.S) | |||
| SetFallback(DAMINKERNEL amin.S) | |||
| SetFallback(QAMINKERNEL amin.S) | |||
| SetFallback(CAMINKERNEL zamin.S) | |||
| SetFallback(ZAMINKERNEL zamin.S) | |||
| SetFallback(XAMINKERNEL zamin.S) | |||
| SetFallback(SMAXKERNEL max.S) | |||
| SetFallback(DMAXKERNEL max.S) | |||
| SetFallback(QMAXKERNEL max.S) | |||
| SetFallback(SMINKERNEL min.S) | |||
| SetFallback(DMINKERNEL min.S) | |||
| SetFallback(QMINKERNEL min.S) | |||
| SetFallback(ISAMAXKERNEL iamax.S) | |||
| SetFallback(IDAMAXKERNEL iamax.S) | |||
| SetFallback(IQAMAXKERNEL iamax.S) | |||
| SetFallback(ICAMAXKERNEL izamax.S) | |||
| SetFallback(IZAMAXKERNEL izamax.S) | |||
| SetFallback(IXAMAXKERNEL izamax.S) | |||
| SetFallback(ISAMINKERNEL iamin.S) | |||
| SetFallback(IDAMINKERNEL iamin.S) | |||
| SetFallback(IQAMINKERNEL iamin.S) | |||
| SetFallback(ICAMINKERNEL izamin.S) | |||
| SetFallback(IZAMINKERNEL izamin.S) | |||
| SetFallback(IXAMINKERNEL izamin.S) | |||
| SetFallback(ISMAXKERNEL iamax.S) | |||
| SetFallback(IDMAXKERNEL iamax.S) | |||
| SetFallback(IQMAXKERNEL iamax.S) | |||
| SetFallback(ISMINKERNEL iamin.S) | |||
| SetFallback(IDMINKERNEL iamin.S) | |||
| SetFallback(IQMINKERNEL iamin.S) | |||
| SetFallback(SASUMKERNEL asum.S) | |||
| SetFallback(DASUMKERNEL asum.S) | |||
| SetFallback(CASUMKERNEL zasum.S) | |||
| SetFallback(ZASUMKERNEL zasum.S) | |||
| SetFallback(QASUMKERNEL asum.S) | |||
| SetFallback(XASUMKERNEL zasum.S) | |||
| SetFallback(SAXPYKERNEL axpy.S) | |||
| SetFallback(DAXPYKERNEL axpy.S) | |||
| SetFallback(CAXPYKERNEL zaxpy.S) | |||
| SetFallback(ZAXPYKERNEL zaxpy.S) | |||
| SetFallback(QAXPYKERNEL axpy.S) | |||
| SetFallback(XAXPYKERNEL zaxpy.S) | |||
| SetFallback(SCOPYKERNEL copy.S) | |||
| SetFallback(DCOPYKERNEL copy.S) | |||
| SetFallback(CCOPYKERNEL zcopy.S) | |||
| SetFallback(ZCOPYKERNEL zcopy.S) | |||
| SetFallback(QCOPYKERNEL copy.S) | |||
| SetFallback(XCOPYKERNEL zcopy.S) | |||
| SetFallback(SDOTKERNEL dot.S) | |||
| SetFallback(DDOTKERNEL dot.S) | |||
| SetFallback(CDOTKERNEL zdot.S) | |||
| SetFallback(ZDOTKERNEL zdot.S) | |||
| SetFallback(QDOTKERNEL dot.S) | |||
| SetFallback(XDOTKERNEL zdot.S) | |||
| SetFallback(SNRM2KERNEL nrm2.S) | |||
| SetFallback(DNRM2KERNEL nrm2.S) | |||
| SetFallback(QNRM2KERNEL nrm2.S) | |||
| SetFallback(CNRM2KERNEL znrm2.S) | |||
| SetFallback(ZNRM2KERNEL znrm2.S) | |||
| SetFallback(XNRM2KERNEL znrm2.S) | |||
| SetFallback(SROTKERNEL rot.S) | |||
| SetFallback(DROTKERNEL rot.S) | |||
| SetFallback(QROTKERNEL rot.S) | |||
| SetFallback(CROTKERNEL zrot.S) | |||
| SetFallback(ZROTKERNEL zrot.S) | |||
| SetFallback(XROTKERNEL zrot.S) | |||
| SetFallback(SSCALKERNEL scal.S) | |||
| SetFallback(DSCALKERNEL scal.S) | |||
| SetFallback(CSCALKERNEL zscal.S) | |||
| SetFallback(ZSCALKERNEL zscal.S) | |||
| SetFallback(QSCALKERNEL scal.S) | |||
| SetFallback(XSCALKERNEL zscal.S) | |||
| SetFallback(SSWAPKERNEL swap.S) | |||
| SetFallback(DSWAPKERNEL swap.S) | |||
| SetFallback(CSWAPKERNEL zswap.S) | |||
| SetFallback(ZSWAPKERNEL zswap.S) | |||
| SetFallback(QSWAPKERNEL swap.S) | |||
| SetFallback(XSWAPKERNEL zswap.S) | |||
| SetFallback(SGEMVNKERNEL gemv_n.S) | |||
| SetFallback(SGEMVTKERNEL gemv_t.S) | |||
| SetFallback(DGEMVNKERNEL gemv_n.S) | |||
| SetFallback(DGEMVTKERNEL gemv_t.S) | |||
| SetFallback(CGEMVNKERNEL zgemv_n.S) | |||
| SetFallback(CGEMVTKERNEL zgemv_t.S) | |||
| SetFallback(ZGEMVNKERNEL zgemv_n.S) | |||
| SetFallback(ZGEMVTKERNEL zgemv_t.S) | |||
| SetFallback(QGEMVNKERNEL gemv_n.S) | |||
| SetFallback(QGEMVTKERNEL gemv_t.S) | |||
| SetFallback(XGEMVNKERNEL zgemv_n.S) | |||
| SetFallback(XGEMVTKERNEL zgemv_t.S) | |||
| SetFallback(SCABS_KERNEL ../generic/cabs.c) | |||
| SetFallback(DCABS_KERNEL ../generic/cabs.c) | |||
| SetFallback(QCABS_KERNEL ../generic/cabs.c) | |||
| SetFallback(LSAME_KERNEL ../generic/lsame.c) | |||
| SetFallback(SAXPBYKERNEL ../arm/axpby.c) | |||
| SetFallback(DAXPBYKERNEL ../arm/axpby.c) | |||
| SetFallback(CAXPBYKERNEL ../arm/zaxpby.c) | |||
| SetFallback(ZAXPBYKERNEL ../arm/zaxpby.c) | |||
| SetFallback(SSUMKERNEL sum.S) | |||
| SetFallback(DSUMKERNEL sum.S) | |||
| SetFallback(CSUMKERNEL zsum.S) | |||
| SetFallback(ZSUMKERNEL zsum.S) | |||
| SetFallback(QSUMKERNEL sum.S) | |||
| SetFallback(XSUMKERNEL zsum.S) | |||
| # Additional defaults that apply only when BUILD_BFLOAT16 is enabled. | |||
| # SBGEMVNKERNEL and SBGEMVTKERNEL are the only names in this macro assigned solely through | |||
| # SetFallback(), so for them an existing definition is kept. | |||
| if (BUILD_BFLOAT16) | |||
| set(SHAMINKERNEL ../arm/amin.c) | |||
| set(SHAMAXKERNEL ../arm/amax.c) | |||
| set(SHMAXKERNEL ../arm/max.c) | |||
| set(SHMINKERNEL ../arm/min.c) | |||
| set(ISHAMAXKERNEL ../arm/iamax.c) | |||
| set(ISHAMINKERNEL ../arm/iamin.c) | |||
| set(ISHMAXKERNEL ../arm/imax.c) | |||
| set(ISHMINKERNEL ../arm/imin.c) | |||
| set(SHASUMKERNEL ../arm/asum.c) | |||
| set(SHAXPYKERNEL ../arm/axpy.c) | |||
| set(SHAXPBYKERNEL ../arm/axpby.c) | |||
| set(SHCOPYKERNEL ../arm/copy.c) | |||
| set(SBDOTKERNEL ../x86_64/sbdot.c) | |||
| set(SHROTKERNEL ../arm/rot.c) | |||
| set(SHSCALKERNEL ../arm/scal.c) | |||
| set(SHNRM2KERNEL ../arm/nrm2.c) | |||
| set(SHSUMKERNEL ../arm/sum.c) | |||
| set(SHSWAPKERNEL ../arm/swap.c) | |||
| set(TOBF16KERNEL ../x86_64/tobf16.c) | |||
| set(BF16TOKERNEL ../x86_64/bf16to.c) | |||
| SetFallback(SHAMINKERNEL ../arm/amin.c) | |||
| SetFallback(SHAMAXKERNEL ../arm/amax.c) | |||
| SetFallback(SHMAXKERNEL ../arm/max.c) | |||
| SetFallback(SHMINKERNEL ../arm/min.c) | |||
| SetFallback(ISHAMAXKERNEL ../arm/iamax.c) | |||
| SetFallback(ISHAMINKERNEL ../arm/iamin.c) | |||
| SetFallback(ISHMAXKERNEL ../arm/imax.c) | |||
| SetFallback(ISHMINKERNEL ../arm/imin.c) | |||
| SetFallback(SHASUMKERNEL ../arm/asum.c) | |||
| SetFallback(SHAXPYKERNEL ../arm/axpy.c) | |||
| SetFallback(SHAXPBYKERNEL ../arm/axpby.c) | |||
| SetFallback(SHCOPYKERNEL ../arm/copy.c) | |||
| SetFallback(SBDOTKERNEL ../x86_64/sbdot.c) | |||
| SetFallback(SHROTKERNEL ../arm/rot.c) | |||
| SetFallback(SHSCALKERNEL ../arm/scal.c) | |||
| SetFallback(SHNRM2KERNEL ../arm/nrm2.c) | |||
| SetFallback(SHSUMKERNEL ../arm/sum.c) | |||
| SetFallback(SHSWAPKERNEL ../arm/swap.c) | |||
| SetFallback(TOBF16KERNEL ../x86_64/tobf16.c) | |||
| SetFallback(BF16TOKERNEL ../x86_64/bf16to.c) | |||
| SetFallback(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) | |||
| SetFallback(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) | |||
| endif () | |||
| endmacro () | |||
| macro(SetDefaultL2) | |||
| # Assigns a default source file to every level-2 kernel variable (GEMV, GER, SYMV, HEMV). | |||
| # NOTE(review): as written, every name is assigned unconditionally with set() and then passed to | |||
| # SetFallback() with the same value, which makes the SetFallback() calls no-ops. This looks like | |||
| # pre- and post-change lines of a patch shown together - confirm which form is intended. | |||
| # NOTE(review): the SGEMVNKERNEL/SGEMVTKERNEL defaults here (../arm/gemv_*.c) differ from the | |||
| # gemv_*.S defaults in SetDefaultL1, so with fallback-only semantics the result would depend on | |||
| # which macro runs first - verify against the caller. | |||
| set(SGEMVNKERNEL ../arm/gemv_n.c) | |||
| set(SGEMVTKERNEL ../arm/gemv_t.c) | |||
| set(DGEMVNKERNEL gemv_n.S) | |||
| set(DGEMVTKERNEL gemv_t.S) | |||
| set(CGEMVNKERNEL zgemv_n.S) | |||
| set(CGEMVTKERNEL zgemv_t.S) | |||
| set(ZGEMVNKERNEL zgemv_n.S) | |||
| set(ZGEMVTKERNEL zgemv_t.S) | |||
| set(QGEMVNKERNEL gemv_n.S) | |||
| set(QGEMVTKERNEL gemv_t.S) | |||
| set(XGEMVNKERNEL zgemv_n.S) | |||
| set(XGEMVTKERNEL zgemv_t.S) | |||
| set(SGERKERNEL ../generic/ger.c) | |||
| set(DGERKERNEL ../generic/ger.c) | |||
| set(QGERKERNEL ../generic/ger.c) | |||
| set(CGERUKERNEL ../generic/zger.c) | |||
| set(CGERCKERNEL ../generic/zger.c) | |||
| set(ZGERUKERNEL ../generic/zger.c) | |||
| set(ZGERCKERNEL ../generic/zger.c) | |||
| set(XGERUKERNEL ../generic/zger.c) | |||
| set(XGERCKERNEL ../generic/zger.c) | |||
| set(SSYMV_U_KERNEL ../generic/symv_k.c) | |||
| set(SSYMV_L_KERNEL ../generic/symv_k.c) | |||
| set(DSYMV_U_KERNEL ../generic/symv_k.c) | |||
| set(DSYMV_L_KERNEL ../generic/symv_k.c) | |||
| set(QSYMV_U_KERNEL ../generic/symv_k.c) | |||
| set(QSYMV_L_KERNEL ../generic/symv_k.c) | |||
| set(CSYMV_U_KERNEL ../generic/zsymv_k.c) | |||
| set(CSYMV_L_KERNEL ../generic/zsymv_k.c) | |||
| set(ZSYMV_U_KERNEL ../generic/zsymv_k.c) | |||
| set(ZSYMV_L_KERNEL ../generic/zsymv_k.c) | |||
| set(XSYMV_U_KERNEL ../generic/zsymv_k.c) | |||
| set(XSYMV_L_KERNEL ../generic/zsymv_k.c) | |||
| set(CHEMV_U_KERNEL ../generic/zhemv_k.c) | |||
| set(CHEMV_L_KERNEL ../generic/zhemv_k.c) | |||
| set(CHEMV_V_KERNEL ../generic/zhemv_k.c) | |||
| set(CHEMV_M_KERNEL ../generic/zhemv_k.c) | |||
| set(ZHEMV_U_KERNEL ../generic/zhemv_k.c) | |||
| set(ZHEMV_L_KERNEL ../generic/zhemv_k.c) | |||
| set(ZHEMV_V_KERNEL ../generic/zhemv_k.c) | |||
| set(ZHEMV_M_KERNEL ../generic/zhemv_k.c) | |||
| set(XHEMV_U_KERNEL ../generic/zhemv_k.c) | |||
| set(XHEMV_L_KERNEL ../generic/zhemv_k.c) | |||
| set(XHEMV_V_KERNEL ../generic/zhemv_k.c) | |||
| set(XHEMV_M_KERNEL ../generic/zhemv_k.c) | |||
| SetFallback(SGEMVNKERNEL ../arm/gemv_n.c) | |||
| SetFallback(SGEMVTKERNEL ../arm/gemv_t.c) | |||
| SetFallback(DGEMVNKERNEL gemv_n.S) | |||
| SetFallback(DGEMVTKERNEL gemv_t.S) | |||
| SetFallback(CGEMVNKERNEL zgemv_n.S) | |||
| SetFallback(CGEMVTKERNEL zgemv_t.S) | |||
| SetFallback(ZGEMVNKERNEL zgemv_n.S) | |||
| SetFallback(ZGEMVTKERNEL zgemv_t.S) | |||
| SetFallback(QGEMVNKERNEL gemv_n.S) | |||
| SetFallback(QGEMVTKERNEL gemv_t.S) | |||
| SetFallback(XGEMVNKERNEL zgemv_n.S) | |||
| SetFallback(XGEMVTKERNEL zgemv_t.S) | |||
| SetFallback(SGERKERNEL ../generic/ger.c) | |||
| SetFallback(DGERKERNEL ../generic/ger.c) | |||
| SetFallback(QGERKERNEL ../generic/ger.c) | |||
| SetFallback(CGERUKERNEL ../generic/zger.c) | |||
| SetFallback(CGERCKERNEL ../generic/zger.c) | |||
| SetFallback(ZGERUKERNEL ../generic/zger.c) | |||
| SetFallback(ZGERCKERNEL ../generic/zger.c) | |||
| SetFallback(XGERUKERNEL ../generic/zger.c) | |||
| SetFallback(XGERCKERNEL ../generic/zger.c) | |||
| SetFallback(SSYMV_U_KERNEL ../generic/symv_k.c) | |||
| SetFallback(SSYMV_L_KERNEL ../generic/symv_k.c) | |||
| SetFallback(DSYMV_U_KERNEL ../generic/symv_k.c) | |||
| SetFallback(DSYMV_L_KERNEL ../generic/symv_k.c) | |||
| SetFallback(QSYMV_U_KERNEL ../generic/symv_k.c) | |||
| SetFallback(QSYMV_L_KERNEL ../generic/symv_k.c) | |||
| SetFallback(CSYMV_U_KERNEL ../generic/zsymv_k.c) | |||
| SetFallback(CSYMV_L_KERNEL ../generic/zsymv_k.c) | |||
| SetFallback(ZSYMV_U_KERNEL ../generic/zsymv_k.c) | |||
| SetFallback(ZSYMV_L_KERNEL ../generic/zsymv_k.c) | |||
| SetFallback(XSYMV_U_KERNEL ../generic/zsymv_k.c) | |||
| SetFallback(XSYMV_L_KERNEL ../generic/zsymv_k.c) | |||
| SetFallback(CHEMV_U_KERNEL ../generic/zhemv_k.c) | |||
| SetFallback(CHEMV_L_KERNEL ../generic/zhemv_k.c) | |||
| SetFallback(CHEMV_V_KERNEL ../generic/zhemv_k.c) | |||
| SetFallback(CHEMV_M_KERNEL ../generic/zhemv_k.c) | |||
| SetFallback(ZHEMV_U_KERNEL ../generic/zhemv_k.c) | |||
| SetFallback(ZHEMV_L_KERNEL ../generic/zhemv_k.c) | |||
| SetFallback(ZHEMV_V_KERNEL ../generic/zhemv_k.c) | |||
| SetFallback(ZHEMV_M_KERNEL ../generic/zhemv_k.c) | |||
| SetFallback(XHEMV_U_KERNEL ../generic/zhemv_k.c) | |||
| SetFallback(XHEMV_L_KERNEL ../generic/zhemv_k.c) | |||
| SetFallback(XHEMV_V_KERNEL ../generic/zhemv_k.c) | |||
| SetFallback(XHEMV_M_KERNEL ../generic/zhemv_k.c) | |||
| # Additional level-2 defaults that apply only when BUILD_BFLOAT16 is enabled. | |||
| if (BUILD_BFLOAT16) | |||
| set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) | |||
| set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) | |||
| set(SHGERKERNEL ../generic/ger.c) | |||
| SetFallback(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) | |||
| SetFallback(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) | |||
| SetFallback(SHGERKERNEL ../generic/ger.c) | |||
| endif () | |||
| endmacro () | |||
| macro(SetDefaultL3) | |||
| # Assigns default source files for the GEADD kernels and, when BUILD_BFLOAT16 is enabled, for the | |||
| # SBGEMM kernel, beta and copy routines together with their object-file names. | |||
| # NOTE(review): every name is assigned unconditionally with set() and then passed to | |||
| # SetFallback() with the same value, which makes the SetFallback() calls no-ops. This looks like | |||
| # pre- and post-change lines of a patch shown together - confirm which form is intended. | |||
| set(SGEADD_KERNEL ../generic/geadd.c) | |||
| set(DGEADD_KERNEL ../generic/geadd.c) | |||
| set(CGEADD_KERNEL ../generic/zgeadd.c) | |||
| set(ZGEADD_KERNEL ../generic/zgeadd.c) | |||
| SetFallback(SGEADD_KERNEL ../generic/geadd.c) | |||
| SetFallback(DGEADD_KERNEL ../generic/geadd.c) | |||
| SetFallback(CGEADD_KERNEL ../generic/zgeadd.c) | |||
| SetFallback(ZGEADD_KERNEL ../generic/zgeadd.c) | |||
| if (BUILD_BFLOAT16) | |||
| set(SHGEADD_KERNEL ../generic/geadd.c) | |||
| set(SBGEMMKERNEL ../generic/gemmkernel_2x2.c) | |||
| set(SBGEMM_BETA ../generic/gemm_beta.c) | |||
| set(SBGEMMINCOPY ../generic/gemm_ncopy_2.c) | |||
| set(SBGEMMITCOPY ../generic/gemm_tcopy_2.c) | |||
| set(SBGEMMONCOPY ../generic/gemm_ncopy_2.c) | |||
| set(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c) | |||
| set(SBGEMMINCOPYOBJ sbgemm_incopy.o) | |||
| set(SBGEMMITCOPYOBJ sbgemm_itcopy.o) | |||
| set(SBGEMMONCOPYOBJ sbgemm_oncopy.o) | |||
| set(SBGEMMOTCOPYOBJ sbgemm_otcopy.o) | |||
| SetFallback(SHGEADD_KERNEL ../generic/geadd.c) | |||
| SetFallback(SBGEMMKERNEL ../generic/gemmkernel_2x2.c) | |||
| SetFallback(SBGEMM_BETA ../generic/gemm_beta.c) | |||
| SetFallback(SBGEMMINCOPY ../generic/gemm_ncopy_2.c) | |||
| SetFallback(SBGEMMITCOPY ../generic/gemm_tcopy_2.c) | |||
| SetFallback(SBGEMMONCOPY ../generic/gemm_ncopy_2.c) | |||
| SetFallback(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c) | |||
| SetFallback(SBGEMMINCOPYOBJ sbgemm_incopy.o) | |||
| SetFallback(SBGEMMITCOPYOBJ sbgemm_itcopy.o) | |||
| SetFallback(SBGEMMONCOPYOBJ sbgemm_oncopy.o) | |||
| SetFallback(SBGEMMOTCOPYOBJ sbgemm_otcopy.o) | |||
| endif () | |||
| endmacro () | |||
| @@ -66,7 +66,7 @@ set(SLASRC | |||
| slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f | |||
| slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f | |||
| slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f | |||
| slarf.f slarfb.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f | |||
| slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f | |||
| slarrv.f slartv.f | |||
| slarz.f slarzb.f slarzt.f slasy2.f | |||
| slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f | |||
| @@ -112,14 +112,14 @@ set(SLASRC | |||
| sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f | |||
| stpqrt.f stpqrt2.f stpmqrt.f stprfb.f | |||
| sgelqt.f sgelqt3.f sgemlqt.f | |||
| sgetsls.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f | |||
| sgetsls.f sgetsqrhrt.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f | |||
| sgelq.f slaswlq.f slamswlq.f sgemlq.f | |||
| stplqt.f stplqt2.f stpmlqt.f | |||
| ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f | |||
| ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f | |||
| ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f | |||
| sgesvdq.f slaorhr_col_getrfnp.f | |||
| slaorhr_col_getrfnp2.f sorgtsqr.f sorhr_col.f ) | |||
| slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f ) | |||
| set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f | |||
| sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f | |||
| @@ -171,7 +171,7 @@ set(CLASRC | |||
| claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f | |||
| claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f | |||
| claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f | |||
| clarf.f clarfb.f clarfg.f clarfgp.f clarft.f | |||
| clarf.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f | |||
| clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f clartv.f | |||
| clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f | |||
| clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f | |||
| @@ -209,14 +209,14 @@ set(CLASRC | |||
| cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f | |||
| ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f | |||
| cgelqt.f cgelqt3.f cgemlqt.f | |||
| cgetsls.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f | |||
| cgetsls.f cgetsqrhrt.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f | |||
| cgelq.f claswlq.f clamswlq.f cgemlq.f | |||
| ctplqt.f ctplqt2.f ctpmlqt.f | |||
| chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f | |||
| cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f | |||
| chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f | |||
| cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f | |||
| cungtsqr.f cunhr_col.f ) | |||
| cungtsqr.f cungtsqr_row.f cunhr_col.f ) | |||
| set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f | |||
| cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f | |||
| @@ -253,7 +253,7 @@ set(DLASRC | |||
| dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f | |||
| dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f | |||
| dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f | |||
| dlarf.f dlarfb.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f | |||
| dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f | |||
| dlargv.f dlarrv.f dlartv.f | |||
| dlarz.f dlarzb.f dlarzt.f dlasy2.f | |||
| dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f | |||
| @@ -300,14 +300,14 @@ set(DLASRC | |||
| dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f | |||
| dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f | |||
| dgelqt.f dgelqt3.f dgemlqt.f | |||
| dgetsls.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f | |||
| dgetsls.f dgetsqrhrt.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f | |||
| dgelq.f dlaswlq.f dlamswlq.f dgemlq.f | |||
| dtplqt.f dtplqt2.f dtpmlqt.f | |||
| dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f | |||
| dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f | |||
| dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f | |||
| dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f | |||
| dlaorhr_col_getrfnp2.f dorgtsqr.f dorhr_col.f ) | |||
| dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f ) | |||
| set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f | |||
| dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f | |||
| @@ -360,7 +360,7 @@ set(ZLASRC | |||
| zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f | |||
| zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f | |||
| zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f | |||
| zlarcm.f zlarf.f zlarfb.f | |||
| zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f | |||
| zlarfg.f zlarfgp.f zlarft.f | |||
| zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f | |||
| zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f | |||
| @@ -402,13 +402,13 @@ set(ZLASRC | |||
| ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f | |||
| ztplqt.f ztplqt2.f ztpmlqt.f | |||
| zgelqt.f zgelqt3.f zgemlqt.f | |||
| zgetsls.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f | |||
| zgetsls.f zgetsqrhrt.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f | |||
| zgelq.f zlaswlq.f zlamswlq.f zgemlq.f | |||
| zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f | |||
| zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f | |||
| zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f | |||
| zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f | |||
| zungtsqr.f zunhr_col.f) | |||
| zungtsqr.f zungtsqr_row.f zunhr_col.f) | |||
| set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f | |||
| zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f | |||
| @@ -114,6 +114,8 @@ set(CSRC | |||
| lapacke_cgetrs_work.c | |||
| lapacke_cgetsls.c | |||
| lapacke_cgetsls_work.c | |||
| lapacke_cgetsqrhrt.c | |||
| lapacke_cgetsqrhrt_work.c | |||
| lapacke_cggbak.c | |||
| lapacke_cggbak_work.c | |||
| lapacke_cggbal.c | |||
| @@ -590,6 +592,8 @@ set(CSRC | |||
| lapacke_cungrq_work.c | |||
| lapacke_cungtr.c | |||
| lapacke_cungtr_work.c | |||
| lapacke_cungtsqr_row.c | |||
| lapacke_cungtsqr_row_work.c | |||
| lapacke_cunmbr.c | |||
| lapacke_cunmbr_work.c | |||
| lapacke_cunmhr.c | |||
| @@ -735,6 +739,8 @@ set(DSRC | |||
| lapacke_dgetrs_work.c | |||
| lapacke_dgetsls.c | |||
| lapacke_dgetsls_work.c | |||
| lapacke_dgetsqrhrt.c | |||
| lapacke_dgetsqrhrt_work.c | |||
| lapacke_dggbak.c | |||
| lapacke_dggbak_work.c | |||
| lapacke_dggbal.c | |||
| @@ -862,6 +868,8 @@ set(DSRC | |||
| lapacke_dorgrq_work.c | |||
| lapacke_dorgtr.c | |||
| lapacke_dorgtr_work.c | |||
| lapacke_dorgtsqr_row.c | |||
| lapacke_dorgtsqr_row_work.c | |||
| lapacke_dormbr.c | |||
| lapacke_dormbr_work.c | |||
| lapacke_dormhr.c | |||
| @@ -1309,6 +1317,8 @@ set(SSRC | |||
| lapacke_sgetrs_work.c | |||
| lapacke_sgetsls.c | |||
| lapacke_sgetsls_work.c | |||
| lapacke_sgetsqrhrt.c | |||
| lapacke_sgetsqrhrt_work.c | |||
| lapacke_sggbak.c | |||
| lapacke_sggbak_work.c | |||
| lapacke_sggbal.c | |||
| @@ -1435,6 +1445,8 @@ set(SSRC | |||
| lapacke_sorgrq_work.c | |||
| lapacke_sorgtr.c | |||
| lapacke_sorgtr_work.c | |||
| lapacke_sorgtsqr_row.c | |||
| lapacke_sorgtsqr_row_work.c | |||
| lapacke_sormbr.c | |||
| lapacke_sormbr_work.c | |||
| lapacke_sormhr.c | |||
| @@ -1877,6 +1889,8 @@ set(ZSRC | |||
| lapacke_zgetrs_work.c | |||
| lapacke_zgetsls.c | |||
| lapacke_zgetsls_work.c | |||
| lapacke_zgetsqrhrt.c | |||
| lapacke_zgetsqrhrt_work.c | |||
| lapacke_zggbak.c | |||
| lapacke_zggbak_work.c | |||
| lapacke_zggbal.c | |||
| @@ -2351,6 +2365,8 @@ set(ZSRC | |||
| lapacke_zungrq_work.c | |||
| lapacke_zungtr.c | |||
| lapacke_zungtr_work.c | |||
| lapacke_zungtsqr_row.c | |||
| lapacke_zungtsqr_row_work.c | |||
| lapacke_zunmbr.c | |||
| lapacke_zunmbr_work.c | |||
| lapacke_zunmhr.c | |||
| @@ -2499,6 +2515,5 @@ foreach (Utils_FILE ${Utils_SRC}) | |||
| endforeach () | |||
| # Location of the LAPACKE public headers inside the bundled LAPACK tree. | |||
| set(lapacke_include_dir "${NETLIB_LAPACK_DIR}/LAPACKE/include") | |||
| # Copy the mangling template verbatim (COPYONLY: no variable substitution) to lapacke_mangling.h | |||
| # in that same include directory. | |||
| configure_file("${lapacke_include_dir}/lapacke_mangling_with_flags.h.in" "${lapacke_include_dir}/lapacke_mangling.h" COPYONLY) | |||
| # Add the LAPACKE include directory to the header search path. | |||
| include_directories(${lapacke_include_dir}) | |||
| # Apply LAPACK_CFLAGS as compile flags to every file listed in LAPACKE_SOURCES. | |||
| set_source_files_properties(${LAPACKE_SOURCES} PROPERTIES COMPILE_FLAGS "${LAPACK_CFLAGS}") | |||
| @@ -127,6 +127,10 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
| "#define DLOCAL_BUFFER_SIZE\t16384\n" | |||
| "#define CLOCAL_BUFFER_SIZE\t16384\n" | |||
| "#define ZLOCAL_BUFFER_SIZE\t16384\n") | |||
| set(HAVE_SSE 1) | |||
| set(HAVE_SSE2 1) | |||
| set(HAVE_SSE3 1) | |||
| set(HAVE_SSSE3 1) | |||
| set(SGEMM_UNROLL_M 8) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 4) | |||
| @@ -177,7 +181,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53") | |||
| elseif ("${TCORE}" STREQUAL "CORTEXA57" OR "${TCORE}" STREQUAL "CORTEXA53" OR "${TCORE}" STREQUAL "CORTEXA55") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_CODE_SIZE\t32768\n" | |||
| "#define L1_CODE_LINESIZE\t64\n" | |||
| @@ -237,6 +241,61 @@ endif () | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "NEOVERSEN1") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_CODE_SIZE\t65536\n" | |||
| "#define L1_CODE_LINESIZE\t64\n" | |||
| "#define L1_CODE_ASSOCIATIVE\t4\n" | |||
| "#define L1_DATA_SIZE\t65536\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L1_DATA_ASSOCIATIVE\t4\n" | |||
| "#define L2_SIZE\t1048576\n\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define L2_ASSOCIATIVE\t8\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t48\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define HAVE_VFPV4\n" | |||
| "#define HAVE_VFPV3\n" | |||
| "#define HAVE_VFP\n" | |||
| "#define HAVE_NEON\n" | |||
| "#define ARMV8\n") | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 8) | |||
| set(DGEMM_UNROLL_N 4) | |||
| set(CGEMM_UNROLL_M 8) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "NEOVERSEV1") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_CODE_SIZE\t65536\n" | |||
| "#define L1_CODE_LINESIZE\t64\n" | |||
| "#define L1_CODE_ASSOCIATIVE\t4\n" | |||
| "#define L1_DATA_SIZE\t65536\n" | |||
| "#define L1_DATA_LINESIZE\t64\n" | |||
| "#define L1_DATA_ASSOCIATIVE\t4\n" | |||
| "#define L2_SIZE\t1048576\n\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define L2_ASSOCIATIVE\t8\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t48\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define HAVE_VFPV4\n" | |||
| "#define HAVE_VFPV3\n" | |||
| "#define HAVE_VFP\n" | |||
| "#define HAVE_NEON\n" | |||
| "#define HAVE_SVE\n" | |||
| "#define ARMV8\n") | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 4) | |||
| set(DGEMM_UNROLL_M 8) | |||
| set(DGEMM_UNROLL_N 4) | |||
| set(CGEMM_UNROLL_M 8) | |||
| set(CGEMM_UNROLL_N 4) | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "NEOVERSEN2") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_CODE_SIZE\t65536\n" | |||
| "#define L1_CODE_LINESIZE\t64\n" | |||
| @@ -246,13 +305,14 @@ endif () | |||
| "#define L1_DATA_ASSOCIATIVE\t2\n" | |||
| "#define L2_SIZE\t1048576\n\n" | |||
| "#define L2_LINESIZE\t64\n" | |||
| "#define L2_ASSOCIATIVE\t16\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t64\n" | |||
| "#define L2_ASSOCIATIVE\t8\n" | |||
| "#define DTB_DEFAULT_ENTRIES\t48\n" | |||
| "#define DTB_SIZE\t4096\n" | |||
| "#define HAVE_VFPV4\n" | |||
| "#define HAVE_VFPV3\n" | |||
| "#define HAVE_VFP\n" | |||
| "#define HAVE_NEON\n" | |||
| "#define HAVE_SVE\n" | |||
| "#define ARMV8\n") | |||
| set(SGEMM_UNROLL_M 16) | |||
| set(SGEMM_UNROLL_N 4) | |||
| @@ -416,7 +476,7 @@ endif () | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "VORTEX") | |||
| elseif ("${TCORE}" STREQUAL "VORTEX") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define ARMV8\n" | |||
| "#define L1_CODE_SIZE\t32768\n" | |||
| @@ -439,6 +499,34 @@ elseif ("${TCORE}" STREQUAL "VORTEX") | |||
| set(ZGEMM_UNROLL_M 4) | |||
| set(ZGEMM_UNROLL_N 4) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "P5600") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L2_SIZE 1048576\n" | |||
| "#define DTB_SIZE 4096\n" | |||
| "#define DTB_DEFAULT_ENTRIES 64\n") | |||
| set(SGEMM_UNROLL_M 2) | |||
| set(SGEMM_UNROLL_N 2) | |||
| set(DGEMM_UNROLL_M 2) | |||
| set(DGEMM_UNROLL_N 2) | |||
| set(CGEMM_UNROLL_M 2) | |||
| set(CGEMM_UNROLL_N 2) | |||
| set(ZGEMM_UNROLL_M 2) | |||
| set(ZGEMM_UNROLL_N 2) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" MATCHES "MIPS") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L2_SIZE 262144\n" | |||
| "#define DTB_SIZE 4096\n" | |||
| "#define DTB_DEFAULT_ENTRIES 64\n") | |||
| set(SGEMM_UNROLL_M 2) | |||
| set(SGEMM_UNROLL_N 2) | |||
| set(DGEMM_UNROLL_M 2) | |||
| set(DGEMM_UNROLL_N 2) | |||
| set(CGEMM_UNROLL_M 2) | |||
| set(CGEMM_UNROLL_N 2) | |||
| set(ZGEMM_UNROLL_M 2) | |||
| set(ZGEMM_UNROLL_N 2) | |||
| set(SYMV_P 16) | |||
| elseif ("${TCORE}" STREQUAL "POWER6") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE 32768\n" | |||
| @@ -33,15 +33,18 @@ endif () | |||
| if (DEFINED BINARY AND DEFINED TARGET AND BINARY EQUAL 32) | |||
| message(STATUS "Compiling a ${BINARY}-bit binary.") | |||
| set(NO_AVX 1) | |||
| if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE") | |||
| if (${TARGET} STREQUAL "HASWELL" OR ${TARGET} STREQUAL "SANDYBRIDGE" OR ${TARGET} STREQUAL "SKYLAKEX" OR ${TARGET} STREQUAL "COOPERLAKE" OR ${TARGET} STREQUAL "SAPPHIRERAPIDS") | |||
| set(TARGET "NEHALEM") | |||
| endif () | |||
| if (${TARGET} STREQUAL "BULLDOZER" OR ${TARGET} STREQUAL "PILEDRIVER" OR ${TARGET} STREQUAL "ZEN") | |||
| set(TARGET "BARCELONA") | |||
| endif () | |||
| if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53") | |||
| if (${TARGET} STREQUAL "ARMV8" OR ${TARGET} STREQUAL "CORTEXA57" OR ${TARGET} STREQUAL "CORTEXA53" OR ${TARGET} STREQUAL "CORTEXA55") | |||
| set(TARGET "ARMV7") | |||
| endif () | |||
| if (${TARGET} STREQUAL "POWER8" OR ${TARGET} STREQUAL "POWER9" OR ${TARGET} STREQUAL "POWER10") | |||
| set(TARGET "POWER6") | |||
| endif () | |||
| endif () | |||
| @@ -102,6 +105,18 @@ if (CMAKE_C_COMPILER STREQUAL loongcc) | |||
| set(GETARCH_FLAGS "${GETARCH_FLAGS} -static") | |||
| endif () | |||
| # POWER builds: set NO_WARMUP and pass a HAVE_GAS value on to getarch via GETARCH_FLAGS. | |||
| if (POWER) | |||
| set(NO_WARMUP 1) | |||
| # HAVE_GAS defaults to 1 and is reset to 0 when the ASM compiler is GNU or Clang. | |||
| # NOTE(review): 0 presumably means "GNU assembler available" (shell exit-status style) - confirm against getarch. | |||
| set(HAVE_GAS 1) | |||
| if (CMAKE_ASM_COMPILER_ID STREQUAL "GNU") | |||
| set(HAVE_GAS 0) | |||
| elseif (CMAKE_ASM_COMPILER_ID STREQUAL "Clang") | |||
| # Clang is additionally told not to use its integrated assembler. | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -fno-integrated-as") | |||
| set(HAVE_GAS 0) | |||
| endif () | |||
| set(GETARCH_FLAGS "${GETARCH_FLAGS} -DHAVE_GAS=${HAVE_GAS}") | |||
| endif () | |||
| # If Fortran is not used, only CBLAS will be compiled. | |||
| if (ONLY_CBLAS) | |||
| set(NO_LAPACK 1) | |||
| @@ -148,16 +163,36 @@ endif () | |||
| include("${PROJECT_SOURCE_DIR}/cmake/prebuild.cmake") | |||
| if (DEFINED TARGET) | |||
| if (${TARGET} STREQUAL COOPERLAKE AND NOT NO_AVX512) | |||
| # if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| if (${GCC_VERSION} VERSION_GREATER 10.1 OR ${GCC_VERSION} VERSION_EQUAL 10.1) | |||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 10.09) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") | |||
| else() | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | |||
| endif() | |||
| # elseif (${CMAKE_C_COMPILER_ID} STREQUAL "CLANG") | |||
| # set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") | |||
| # endif() | |||
| elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") | |||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 8.99) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=cooperlake") | |||
| else() | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | |||
| endif() | |||
| endif() | |||
| endif() | |||
| # SAPPHIRERAPIDS with AVX512 allowed: append -march=sapphirerapids to KERNEL_DEFINITIONS when the | |||
| # compiler version is high enough, otherwise fall back to -march=skylake-avx512. | |||
| # Compilers other than GNU, Clang and AppleClang get no -march flag from this block. | |||
| if (${TARGET} STREQUAL SAPPHIRERAPIDS AND NOT NO_AVX512) | |||
| if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU") | |||
| # NOTE(review): GCC_VERSION is filled here but not read anywhere in this block; | |||
| # the comparison below uses CMAKE_C_COMPILER_VERSION instead. | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION) | |||
| # Strict comparison: a compiler reporting exactly 11.0 takes the fallback branch. | |||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 11.0) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids") | |||
| else() | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | |||
| endif() | |||
| elseif (${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") | |||
| # Strict comparison: a compiler reporting exactly 12.0 takes the fallback branch. | |||
| if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 12.0) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=sapphirerapids") | |||
| else() | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | |||
| endif() | |||
| endif() | |||
| endif() | |||
| if (${TARGET} STREQUAL SKYLAKEX AND NOT NO_AVX512) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -march=skylake-avx512") | |||
| @@ -182,11 +217,11 @@ if (DEFINED TARGET) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mavx2") | |||
| endif() | |||
| endif() | |||
| if (DEFINED HAVE_FMA3) | |||
| if (NOT NO_AVX2) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma") | |||
| endif() | |||
| endif() | |||
| # if (DEFINED HAVE_FMA3) | |||
| # if (NOT NO_AVX2) | |||
| # set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mfma") | |||
| # endif() | |||
| # endif() | |||
| if (DEFINED HAVE_SSE) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse") | |||
| endif() | |||
| @@ -202,6 +237,27 @@ if (DEFINED TARGET) | |||
| if (DEFINED HAVE_SSE4_1) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -msse4.1") | |||
| endif() | |||
| # POWER10: require a compiler whose -dumpversion is at least 10.2; otherwise stop configuring. | |||
| if (${TARGET} STREQUAL POWER10) | |||
| # Strip the trailing newline printed by -dumpversion so it does not end up in the | |||
| # version comparison or in the diagnostic text below. | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) | |||
| # Quoted so that an empty result (compiler could not be run) reaches the error branch | |||
| # instead of producing a malformed if() expression. | |||
| if ("${GCC_VERSION}" VERSION_GREATER 10.2 OR "${GCC_VERSION}" VERSION_EQUAL 10.2) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power10 -mtune=power10 -mvsx -fno-fast-math") | |||
| else () | |||
| message(FATAL_ERROR "Compiler GCC.${GCC_VERSION} does not support Power10.") | |||
| endif() | |||
| endif() | |||
| # POWER9: use power9 tuning when -dumpversion reports at least 5.0, otherwise fall back to | |||
| # power8 flags and warn. | |||
| if (${TARGET} STREQUAL POWER9) | |||
| # Strip the trailing newline printed by -dumpversion so it does not end up in the | |||
| # version comparison or in the warning text below. | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpversion OUTPUT_VARIABLE GCC_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) | |||
| # Quoted so that an empty result (compiler could not be run) takes the fallback branch | |||
| # instead of producing a malformed if() expression. | |||
| if ("${GCC_VERSION}" VERSION_GREATER 5.0 OR "${GCC_VERSION}" VERSION_EQUAL 5.0) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power9 -mtune=power9 -mvsx -fno-fast-math") | |||
| else () | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") | |||
| message(WARNING "Compiler GCC.${GCC_VERSION} does not support fully Power9.") | |||
| endif() | |||
| endif() | |||
| if (${TARGET} STREQUAL POWER8) | |||
| set (KERNEL_DEFINITIONS "${KERNEL_DEFINITIONS} -mcpu=power8 -mtune=power8 -mvsx -fno-fast-math") | |||
| endif() | |||
| endif() | |||
| if (DEFINED BINARY) | |||
| message(STATUS "Compiling a ${BINARY}-bit binary.") | |||
| @@ -219,6 +275,11 @@ include("${PROJECT_SOURCE_DIR}/cmake/arch.cmake") | |||
| # C Compiler dependent settings | |||
| include("${PROJECT_SOURCE_DIR}/cmake/cc.cmake") | |||
| if (INTERFACE64) | |||
| set(SUFFIX64 64) | |||
| set(SUFFIX64_UNDERSCORE _64) | |||
| endif() | |||
| if (NOT NOFORTRAN) | |||
| # Fortran Compiler dependent settings | |||
| include("${PROJECT_SOURCE_DIR}/cmake/fc.cmake") | |||
| @@ -233,6 +294,11 @@ if (BINARY64) | |||
| endif () | |||
| endif () | |||
| if(EMBEDDED) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DOS_EMBEDDED") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -mthumb -mcpu=cortex-m4 -mfloat-abi=hard -mfpu=fpv4-sp-d16") | |||
| endif() | |||
| if (NEED_PIC) | |||
| if (${CMAKE_C_COMPILER} STREQUAL "IBM") | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -qpic=large") | |||
| @@ -249,8 +315,15 @@ if (NEED_PIC) | |||
| endif() | |||
| endif () | |||
| # Turn SMALL_MATRIX_OPT on by default for x86_64 builds and for the POWER10 core. | |||
| # CORE is quoted so that an unset or empty CORE compares as an empty string instead of | |||
| # collapsing the condition into the malformed "X86_64 OR STREQUAL POWER10". | |||
| if (X86_64 OR "${CORE}" STREQUAL "POWER10") | |||
| set(SMALL_MATRIX_OPT TRUE) | |||
| endif () | |||
| if (SMALL_MATRIX_OPT) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DSMALL_MATRIX_OPT") | |||
| endif () | |||
| if (DYNAMIC_ARCH) | |||
| if (X86 OR X86_64 OR ARM64 OR PPC) | |||
| if (X86 OR X86_64 OR ARM64 OR POWER) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") | |||
| if (DYNAMIC_OLDER) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") | |||
| @@ -290,6 +363,10 @@ if (NO_AVX2) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX2") | |||
| endif () | |||
| if (NO_AVX512) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") | |||
| endif () | |||
| if (USE_THREAD) | |||
| # USE_SIMPLE_THREADED_LEVEL3 = 1 | |||
| # NO_AFFINITY = 1 | |||
| @@ -449,6 +526,9 @@ endif() | |||
| if (BUILD_COMPLEX16) | |||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16") | |||
| endif() | |||
| if (BUILD_BFLOAT16) | |||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_BFLOAT16") | |||
| endif() | |||
| if(NOT MSVC) | |||
| set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}") | |||
| endif() | |||
| @@ -20,11 +20,11 @@ endif() | |||
| if(CMAKE_COMPILER_IS_GNUCC AND WIN32) | |||
| if(MINGW) | |||
| execute_process(COMMAND ${CMAKE_C_COMPILER} -dumpmachine | |||
| OUTPUT_VARIABLE OPENBLAS_GCC_TARGET_MACHINE | |||
| OUTPUT_VARIABLE OPENBLAS_MINGW_TARGET_MACHINE | |||
| OUTPUT_STRIP_TRAILING_WHITESPACE) | |||
| if(OPENBLAS_GCC_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64") | |||
| if(OPENBLAS_MINGW_TARGET_MACHINE MATCHES "amd64|x86_64|AMD64") | |||
| set(MINGW64 1) | |||
| endif() | |||
| endif() | |||
| @@ -35,9 +35,11 @@ if(CMAKE_CL_64 OR MINGW64) | |||
| elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING)) | |||
| set(X86 1) | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*") | |||
| set(PPC 1) | |||
| set(POWER 1) | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*") | |||
| set(MIPS64 1) | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64.*") | |||
| set(LOONGARCH64 1) | |||
| elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "amd64.*|x86_64.*|AMD64.*") | |||
| if (NOT BINARY) | |||
| if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") | |||
| @@ -71,6 +73,8 @@ elseif (${CMAKE_CROSSCOMPILING}) | |||
| else () | |||
| set(X86 1) | |||
| endif() | |||
| elseif (${TARGET} STREQUAL "P5600" OR ${TARGET} MATCHES "MIPS.*") | |||
| set(MIPS32 1) | |||
| elseif (${TARGET} STREQUAL "ARMV7") | |||
| set(ARM 1) | |||
| else() | |||
| @@ -84,8 +88,12 @@ if (X86_64) | |||
| set(ARCH "x86_64") | |||
| elseif(X86) | |||
| set(ARCH "x86") | |||
| elseif(PPC) | |||
| elseif(POWER) | |||
| set(ARCH "power") | |||
| elseif(MIPS32) | |||
| set(ARCH "mips") | |||
| elseif(MIPS64) | |||
| set(ARCH "mips64") | |||
| elseif(ARM) | |||
| set(ARCH "arm") | |||
| elseif(ARM64) | |||
| @@ -95,7 +103,7 @@ else() | |||
| endif () | |||
| if (NOT BINARY) | |||
| if (X86_64 OR ARM64 OR PPC OR MIPS64) | |||
| if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64) | |||
| set(BINARY 64) | |||
| else () | |||
| set(BINARY 32) | |||
| @@ -15,35 +15,83 @@ endfunction () | |||
| # Reads a Makefile into CMake vars. | |||
| macro(ParseMakefileVars MAKEFILE_IN) | |||
| message(STATUS "Reading vars from ${MAKEFILE_IN}...") | |||
| set (IfElse 0) | |||
| set (ElseSeen 0) | |||
| set (C_COMPILER ${CMAKE_C_COMPILER_ID}) | |||
| set (IfElse 0) | |||
| set (ElseSeen 0) | |||
| set (SkipIfs 0) | |||
| set (SkipElse 0) | |||
| file(STRINGS ${MAKEFILE_IN} makefile_contents) | |||
| foreach (makefile_line ${makefile_contents}) | |||
| #message(STATUS "parsing ${makefile_line}") | |||
| #message(STATUS "parsing ${makefile_line}") | |||
| # Skip the entire scope of the else statement given that the if statement that precedes it has the valid condition. | |||
| # The variable SkipIfs is used to identify which endif statement closes the scope of the else statement. | |||
| if (${SkipElse} EQUAL 1) | |||
| #message(STATUS "skipping ${makefile_line}") | |||
| string(REGEX MATCH "(ifeq|ifneq|ifdef|ifndef) .*$" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| MATH(EXPR SkipIfs "${SkipIfs}+1") | |||
| endif () | |||
| string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| if (${SkipIfs} EQUAL 0) | |||
| set (SkipElse 0) | |||
| else () | |||
| MATH(EXPR SkipIfs "${SkipIfs}-1") | |||
| endif () | |||
| endif () | |||
| continue () | |||
| endif () | |||
| # The variable IfElse is greater than 0 if and only if the previously parsed line is an if statement. | |||
| if (${IfElse} GREATER 0) | |||
| # If the current scope is the one that has to be skipped, the if/endif/else statements | |||
| # along with it till the endif that closes the current scope have to be ignored as well. | |||
| string(REGEX MATCH "(ifeq|ifneq|ifdef|ifndef) .*$" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| if ((${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR (${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1)) | |||
| #message(STATUS "skipping ${makefile_line}") | |||
| MATH(EXPR SkipIfs "${SkipIfs}+1") | |||
| continue () | |||
| endif () | |||
| endif () | |||
| string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| # message(STATUS "ENDIF ${makefile_line}") | |||
| set (IfElse 0) | |||
| set (ElseSeen 0) | |||
| if (${SkipIfs} EQUAL 0) | |||
| #message(STATUS "ENDIF ${makefile_line}") | |||
| set (IfElse 0) | |||
| set (ElseSeen 0) | |||
| else () | |||
| #message(STATUS "skipping ${makefile_line}") | |||
| MATH(EXPR SkipIfs "${SkipIfs}-1") | |||
| endif () | |||
| continue () | |||
| endif () | |||
| string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| # message(STATUS "ELSE ${makefile_line}") | |||
| set (ElseSeen 1) | |||
| continue () | |||
| endif() | |||
| if ( (${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR ( ${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1)) | |||
| # message(STATUS "skipping ${makefile_line}") | |||
| continue () | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| if (${SkipIfs} EQUAL 0) | |||
| #message(STATUS "ELSE ${makefile_line}") | |||
| set (ElseSeen 1) | |||
| else () | |||
| #message(STATUS "skipping ${makefile_line}") | |||
| endif () | |||
| continue () | |||
| endif() | |||
| # Skip the lines that are not part of the path that has to be taken. | |||
| if ((${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR (${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1) OR (${SkipIfs} GREATER 0)) | |||
| #message(STATUS "skipping ${makefile_line}") | |||
| continue () | |||
| endif () | |||
| endif () | |||
| endif () | |||
| # Skip commented lines (the ones that start with '#') | |||
| string(REGEX MATCH "[ \t]*\\#.*$" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| #message(STATUS "skipping ${makefile_line}") | |||
| continue () | |||
| endif () | |||
| string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| #message(STATUS "match on ${line_match}") | |||
| #message(STATUS "match on ${line_match}") | |||
| set(var_name ${CMAKE_MATCH_1}) | |||
| # set(var_value ${CMAKE_MATCH_2}) | |||
| #set(var_value ${CMAKE_MATCH_2}) | |||
| string(STRIP ${CMAKE_MATCH_2} var_value) | |||
| # check for Makefile variables in the string, e.g. $(TSUFFIX) | |||
| string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value}) | |||
| @@ -54,36 +102,93 @@ macro(ParseMakefileVars MAKEFILE_IN) | |||
| string(REPLACE "$(${make_var})" "${${make_var}}" var_value ${var_value}) | |||
| endforeach () | |||
| set(${var_name} ${var_value}) | |||
| else () | |||
| string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| #message(STATUS "match on include ${line_match}") | |||
| ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1}) | |||
| continue () | |||
| endif () | |||
| # Include a new file to be parsed | |||
| string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| #message(STATUS "match on include ${line_match}") | |||
| ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1}) | |||
| continue () | |||
| endif () | |||
| # The if statement that precedes this else has the path taken | |||
| # Thus, this else statement has to be skipped. | |||
| string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| #message(STATUS "skipping ${makefile_line}") | |||
| set (SkipElse 1) | |||
| continue() | |||
| endif() | |||
| # Example 1: ifdef HAVE_MSA | |||
| # Example 2: ifndef ZNRM2KERNEL | |||
| string(REGEX MATCH "(ifdef|ifndef) ([0-9_A-Z]+)" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| #message(STATUS "${CMAKE_MATCH_1} first: ${CMAKE_MATCH_2}") | |||
| set (ElseSeen 0) | |||
| if (${CMAKE_MATCH_2}) | |||
| if (${CMAKE_MATCH_1} STREQUAL "ifdef") | |||
| #message (STATUS "condition is true") | |||
| set (IfElse 1) | |||
| else () | |||
| set (IfElse 2) | |||
| endif () | |||
| else () | |||
| # message(STATUS "unmatched line ${line_match}") | |||
| string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| # message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") | |||
| if (DEFINED ${${CMAKE_MATCH_1}} AND ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2}) | |||
| # message (STATUS "condition is true") | |||
| set (IfElse 1) | |||
| else () | |||
| set (IfElse 2) | |||
| endif () | |||
| if (${CMAKE_MATCH_1} STREQUAL "ifdef") | |||
| set (IfElse 2) | |||
| else () | |||
| string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| # message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}") | |||
| if (NOT ( ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})) | |||
| # message (STATUS "condition is true") | |||
| set (IfElse 1) | |||
| else () | |||
| set (IfElse 2) | |||
| endif () | |||
| endif () | |||
| #message (STATUS "condition is true") | |||
| set (IfElse 1) | |||
| endif () | |||
| endif () | |||
| continue () | |||
| endif () | |||
| # Example 1: ifeq ($(SGEMM_UNROLL_M), 16) | |||
| # Example 2: ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) | |||
| # Example 3: ifeq ($(__BYTE_ORDER__)$(ELF_VERSION),__ORDER_BIG_ENDIAN__2) | |||
| # Ignore the second group since (?:...) does not work on cmake | |||
| string(REGEX MATCH "ifeq \\(\\$\\(([0-9_A-Z]+)\\)(([0-9_A-Za-z]*)\\$\\(([0-9_A-Z]+)\\))?,[ \t]*([0-9_A-Za-z]+)\\)" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| #message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_3} third: ${CMAKE_MATCH_4} fourth: ${CMAKE_MATCH_5}") | |||
| if (DEFINED ${CMAKE_MATCH_1}) | |||
| if (DEFINED ${CMAKE_MATCH_4}) | |||
| set (STR ${${CMAKE_MATCH_1}}${CMAKE_MATCH_3}${${CMAKE_MATCH_4}}) | |||
| else () | |||
| set (STR ${${CMAKE_MATCH_1}}) | |||
| endif () | |||
| if (${STR} STREQUAL ${CMAKE_MATCH_5}) | |||
| #message (STATUS "condition is true") | |||
| set (IfElse 1) | |||
| continue () | |||
| endif () | |||
| endif () | |||
| set (IfElse 2) | |||
| continue () | |||
| endif () | |||
| # Example 1 (Group 3): ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||
| # Example 2 (Group 4): ifneq ($(C_COMPILER), PGI) | |||
| string(REGEX MATCH "ifneq \\(\\$\\(([0-9_A-Z]+)\\),[ \t]*(\\$\\(([0-9_A-Z]+)\\)|([0-9_A-Z]+))\\)" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| #message(STATUS "IFNEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_3} third: ${CMAKE_MATCH_4}") | |||
| set (ElseSeen 0) | |||
| set (HasValidGroup 0) | |||
| if (DEFINED ${CMAKE_MATCH_3}) | |||
| set (HasValidGroup 1) | |||
| set (STR ${${CMAKE_MATCH_3}}) | |||
| elseif (NOT ${CMAKE_MATCH_4} STREQUAL "") | |||
| set (HasValidGroup 1) | |||
| set (STR ${CMAKE_MATCH_4}) | |||
| endif () | |||
| if (DEFINED ${CMAKE_MATCH_1} AND ${HasValidGroup} EQUAL 1) | |||
| if (NOT (${${CMAKE_MATCH_1}} STREQUAL ${STR})) | |||
| #message (STATUS "condition is true") | |||
| set (IfElse 1) | |||
| continue () | |||
| endif () | |||
| endif () | |||
| set (IfElse 2) | |||
| continue () | |||
| endif () | |||
| #message(STATUS "unmatched line ${line_match}") | |||
| endforeach () | |||
| endmacro () | |||
| @@ -154,31 +259,31 @@ endfunction () | |||
| # STRING - compiles only the given type (e.g. DOUBLE) | |||
| function(GenerateNamedObjects sources_in) | |||
| if (DEFINED ARGV1) | |||
| if (${ARGC} GREATER 1) | |||
| set(defines_in ${ARGV1}) | |||
| endif () | |||
| if (DEFINED ARGV2 AND NOT "${ARGV2}" STREQUAL "") | |||
| if (${ARGC} GREATER 2 AND NOT "${ARGV2}" STREQUAL "") | |||
| set(name_in ${ARGV2}) | |||
| # strip off extension for kernel files that pass in the object name. | |||
| get_filename_component(name_in ${name_in} NAME_WE) | |||
| endif () | |||
| if (DEFINED ARGV3) | |||
| if (${ARGC} GREATER 3) | |||
| set(use_cblas ${ARGV3}) | |||
| else () | |||
| set(use_cblas false) | |||
| endif () | |||
| if (DEFINED ARGV4) | |||
| if (${ARGC} GREATER 4) | |||
| set(replace_last_with ${ARGV4}) | |||
| endif () | |||
| if (DEFINED ARGV5) | |||
| if (${ARGC} GREATER 5) | |||
| set(append_with ${ARGV5}) | |||
| endif () | |||
| if (DEFINED ARGV6) | |||
| if (${ARGC} GREATER 6) | |||
| set(no_float_type ${ARGV6}) | |||
| else () | |||
| set(no_float_type false) | |||
| @@ -193,7 +298,7 @@ function(GenerateNamedObjects sources_in) | |||
| set(real_only false) | |||
| set(complex_only false) | |||
| set(mangle_complex_sources false) | |||
| if (DEFINED ARGV7 AND NOT "${ARGV7}" STREQUAL "") | |||
| if (${ARGC} GREATER 7 AND NOT "${ARGV7}" STREQUAL "") | |||
| if (${ARGV7} EQUAL 1) | |||
| set(real_only true) | |||
| elseif (${ARGV7} EQUAL 2) | |||
| @@ -251,6 +356,19 @@ function(GenerateNamedObjects sources_in) | |||
| # now add the object and set the defines | |||
| set(obj_defines ${defines_in}) | |||
| list(FIND obj_defines "RC" def_idx) | |||
| if (${def_idx} GREATER -1) | |||
| # list(REMOVE_AT ${obj_defines} ${def_idx}) | |||
| list (REMOVE_ITEM obj_defines "RC") | |||
| list(APPEND obj_defines "RC=RC") | |||
| endif () | |||
| list(FIND obj_defines "CR" def_idx) | |||
| if (${def_idx} GREATER -1) | |||
| # list(REMOVE_AT ${obj_defines} ${def_idx}) | |||
| list (REMOVE_ITEM obj_defines "CR") | |||
| list(APPEND obj_defines "CR=CR") | |||
| endif () | |||
| if (use_cblas) | |||
| set(obj_name "cblas_${obj_name}") | |||
| list(APPEND obj_defines "CBLAS") | |||
| @@ -295,7 +413,15 @@ function(GenerateNamedObjects sources_in) | |||
| configure_file(${new_source_file}.tmp ${new_source_file} COPYONLY) | |||
| file(REMOVE ${new_source_file}.tmp) | |||
| list(APPEND SRC_LIST_OUT ${new_source_file}) | |||
| message (STATUS ${new_source_file}) | |||
| # When HAVE_FMA3 is defined, attach -mfma as a per-source COMPILE_OPTIONS property to the | |||
| # generated sources whose path matches one of the two patterns below. | |||
| if (DEFINED HAVE_FMA3) | |||
| # NOTE(review): "(s|d?)" also matches the empty string, so this pattern is effectively | |||
| # "rot_k.*c" and selects every path containing rot_k - confirm whether "(s|d)" was intended. | |||
| if ( ${new_source_file} MATCHES "(s|d?)rot_k.*c") | |||
| set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma") | |||
| endif () | |||
| if ( ${new_source_file} MATCHES "dgemv_t_k.*c") | |||
| set_source_files_properties(${new_source_file} PROPERTIES COMPILE_OPTIONS "-mfma") | |||
| endif () | |||
| endif () | |||
| endforeach () | |||
| endforeach () | |||
| @@ -318,17 +444,17 @@ endfunction () | |||
| function(GenerateCombinationObjects sources_in defines_in absent_codes_in all_defines_in replace_scheme) | |||
| set(alternate_name_in "") | |||
| if (DEFINED ARGV5) | |||
| if (${ARGC} GREATER 5) | |||
| set(alternate_name_in ${ARGV5}) | |||
| endif () | |||
| set(no_float_type false) | |||
| if (DEFINED ARGV6) | |||
| if (${ARGC} GREATER 6) | |||
| set(no_float_type ${ARGV6}) | |||
| endif () | |||
| set(complex_filename_scheme "") | |||
| if (DEFINED ARGV7) | |||
| if (${ARGC} GREATER 7) | |||
| set(complex_filename_scheme ${ARGV7}) | |||
| endif () | |||
| @@ -122,7 +122,7 @@ extern "C" { | |||
| #define ATOM GOTO_ATOM | |||
| #undef GOTO_ATOM | |||
| #endif | |||
| #else | |||
| #elif !defined(OS_EMBEDDED) | |||
| #include <sys/mman.h> | |||
| #ifndef NO_SYSV_IPC | |||
| #include <sys/shm.h> | |||
| @@ -134,6 +134,9 @@ extern "C" { | |||
| #if defined(SMP) || defined(USE_LOCKING) | |||
| #include <pthread.h> | |||
| #endif | |||
| #else | |||
| #include <time.h> | |||
| #include <math.h> | |||
| #endif | |||
| #if defined(OS_SUNOS) | |||
| @@ -413,6 +416,15 @@ please https://github.com/xianyi/OpenBLAS/issues/246 | |||
| #include "common_alpha.h" | |||
| #endif | |||
| /* On x86/x86_64 builds with __CET__ defined, include <cet.h> when the compiler offers | |||
| __has_include and reports the header as present. */ | |||
| #if (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(__CET__) && defined(__has_include) | |||
| #if __has_include(<cet.h>) | |||
| #include <cet.h> | |||
| #endif | |||
| #endif | |||
| /* If nothing above defined _CET_ENDBR, define it as empty so that uses of the macro | |||
| expand to nothing. */ | |||
| #ifndef _CET_ENDBR | |||
| #define _CET_ENDBR | |||
| #endif | |||
| #ifdef ARCH_X86 | |||
| #include "common_x86.h" | |||
| #endif | |||
| @@ -437,7 +449,7 @@ please https://github.com/xianyi/OpenBLAS/issues/246 | |||
| #include "common_mips.h" | |||
| #endif | |||
| #ifdef ARCH_RISCV64 | |||
| #include "common_riscv64.h" | |||
| #endif | |||
| @@ -458,6 +470,14 @@ please https://github.com/xianyi/OpenBLAS/issues/246 | |||
| #include "common_zarch.h" | |||
| #endif | |||
| #ifdef ARCH_LOONGARCH64 | |||
| #include "common_loongarch64.h" | |||
| #endif | |||
| #ifdef ARCH_E2K | |||
| #include "common_e2k.h" | |||
| #endif | |||
| #ifndef ASSEMBLER | |||
| #ifdef OS_WINDOWSSTORE | |||
| typedef char env_var_t[MAX_PATH]; | |||
| @@ -488,10 +508,12 @@ static inline unsigned long long rpcc(void){ | |||
| struct timespec ts; | |||
| clock_gettime(CLOCK_MONOTONIC, &ts); | |||
| return (unsigned long long)ts.tv_sec * 1000000000ull + ts.tv_nsec; | |||
| #else | |||
| #elif !defined(OS_EMBEDDED) | |||
| struct timeval tv; | |||
| gettimeofday(&tv,NULL); | |||
| return (unsigned long long)tv.tv_sec * 1000000000ull + tv.tv_usec * 1000; | |||
| #else | |||
| return 0; | |||
| #endif | |||
| } | |||
| #define RPCC_DEFINED | |||
| @@ -521,6 +543,10 @@ static void __inline blas_lock(volatile BLASULONG *address){ | |||
| #include "common_linux.h" | |||
| #endif | |||
| #ifdef OS_EMBEDDED | |||
| #define DTB_DEFAULT_ENTRIES 64 | |||
| #endif | |||
| #define MMAP_ACCESS (PROT_READ | PROT_WRITE) | |||
| #ifdef __NetBSD__ | |||
| @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define INLINE inline | |||
| #ifdef F_INTERFACE_FLANG | |||
| #if defined( F_INTERFACE_FLANG) || defined(F_INTERFACE_PGI) | |||
| #define RETURN_BY_STACK | |||
| #else | |||
| #define RETURN_BY_COMPLEX | |||
| @@ -120,7 +120,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| .text ; | |||
| .p2align 2 ; | |||
| .global REALNAME ; | |||
| #ifndef __APPLE__ | |||
| #if !defined(__APPLE__) && !defined(_WIN32) | |||
| .type REALNAME, %function ; | |||
| #endif | |||
| REALNAME: | |||
| @@ -232,6 +232,8 @@ | |||
| #define CGEADD_K cgeadd_k | |||
| #define CGEMM_SMALL_MATRIX_PERMIT cgemm_small_matrix_permit | |||
| #else | |||
| #define CAMAX_K gotoblas -> camax_k | |||
| @@ -426,8 +428,51 @@ | |||
| #define CGEADD_K gotoblas -> cgeadd_k | |||
| #define CGEMM_SMALL_MATRIX_PERMIT gotoblas -> cgemm_small_matrix_permit | |||
| #endif | |||
| #define CGEMM_SMALL_KERNEL_NN FUNC_OFFSET(cgemm_small_kernel_nn) | |||
| #define CGEMM_SMALL_KERNEL_NT FUNC_OFFSET(cgemm_small_kernel_nt) | |||
| #define CGEMM_SMALL_KERNEL_NR FUNC_OFFSET(cgemm_small_kernel_nr) | |||
| #define CGEMM_SMALL_KERNEL_NC FUNC_OFFSET(cgemm_small_kernel_nc) | |||
| #define CGEMM_SMALL_KERNEL_TN FUNC_OFFSET(cgemm_small_kernel_tn) | |||
| #define CGEMM_SMALL_KERNEL_TT FUNC_OFFSET(cgemm_small_kernel_tt) | |||
| #define CGEMM_SMALL_KERNEL_TR FUNC_OFFSET(cgemm_small_kernel_tr) | |||
| #define CGEMM_SMALL_KERNEL_TC FUNC_OFFSET(cgemm_small_kernel_tc) | |||
| #define CGEMM_SMALL_KERNEL_RN FUNC_OFFSET(cgemm_small_kernel_rn) | |||
| #define CGEMM_SMALL_KERNEL_RT FUNC_OFFSET(cgemm_small_kernel_rt) | |||
| #define CGEMM_SMALL_KERNEL_RR FUNC_OFFSET(cgemm_small_kernel_rr) | |||
| #define CGEMM_SMALL_KERNEL_RC FUNC_OFFSET(cgemm_small_kernel_rc) | |||
| #define CGEMM_SMALL_KERNEL_CN FUNC_OFFSET(cgemm_small_kernel_cn) | |||
| #define CGEMM_SMALL_KERNEL_CT FUNC_OFFSET(cgemm_small_kernel_ct) | |||
| #define CGEMM_SMALL_KERNEL_CR FUNC_OFFSET(cgemm_small_kernel_cr) | |||
| #define CGEMM_SMALL_KERNEL_CC FUNC_OFFSET(cgemm_small_kernel_cc) | |||
| #define CGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(cgemm_small_kernel_b0_nn) | |||
| #define CGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(cgemm_small_kernel_b0_nt) | |||
| #define CGEMM_SMALL_KERNEL_B0_NR FUNC_OFFSET(cgemm_small_kernel_b0_nr) | |||
| #define CGEMM_SMALL_KERNEL_B0_NC FUNC_OFFSET(cgemm_small_kernel_b0_nc) | |||
| #define CGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(cgemm_small_kernel_b0_tn) | |||
| #define CGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(cgemm_small_kernel_b0_tt) | |||
| #define CGEMM_SMALL_KERNEL_B0_TR FUNC_OFFSET(cgemm_small_kernel_b0_tr) | |||
| #define CGEMM_SMALL_KERNEL_B0_TC FUNC_OFFSET(cgemm_small_kernel_b0_tc) | |||
| #define CGEMM_SMALL_KERNEL_B0_RN FUNC_OFFSET(cgemm_small_kernel_b0_rn) | |||
| #define CGEMM_SMALL_KERNEL_B0_RT FUNC_OFFSET(cgemm_small_kernel_b0_rt) | |||
| #define CGEMM_SMALL_KERNEL_B0_RR FUNC_OFFSET(cgemm_small_kernel_b0_rr) | |||
| #define CGEMM_SMALL_KERNEL_B0_RC FUNC_OFFSET(cgemm_small_kernel_b0_rc) | |||
| #define CGEMM_SMALL_KERNEL_B0_CN FUNC_OFFSET(cgemm_small_kernel_b0_cn) | |||
| #define CGEMM_SMALL_KERNEL_B0_CT FUNC_OFFSET(cgemm_small_kernel_b0_ct) | |||
| #define CGEMM_SMALL_KERNEL_B0_CR FUNC_OFFSET(cgemm_small_kernel_b0_cr) | |||
| #define CGEMM_SMALL_KERNEL_B0_CC FUNC_OFFSET(cgemm_small_kernel_b0_cc) | |||
| #define CGEMM_NN cgemm_nn | |||
| #define CGEMM_CN cgemm_cn | |||
| #define CGEMM_TN cgemm_tn | |||
| @@ -157,6 +157,8 @@ | |||
| #define DIMATCOPY_K_RT dimatcopy_k_rt | |||
| #define DGEADD_K dgeadd_k | |||
| #define DGEMM_SMALL_MATRIX_PERMIT dgemm_small_matrix_permit | |||
| #else | |||
| #define DAMAX_K gotoblas -> damax_k | |||
| @@ -281,8 +283,21 @@ | |||
| #define DGEADD_K gotoblas -> dgeadd_k | |||
| #define DGEMM_SMALL_MATRIX_PERMIT gotoblas -> dgemm_small_matrix_permit | |||
| #endif | |||
| #define DGEMM_SMALL_KERNEL_NN FUNC_OFFSET(dgemm_small_kernel_nn) | |||
| #define DGEMM_SMALL_KERNEL_NT FUNC_OFFSET(dgemm_small_kernel_nt) | |||
| #define DGEMM_SMALL_KERNEL_TN FUNC_OFFSET(dgemm_small_kernel_tn) | |||
| #define DGEMM_SMALL_KERNEL_TT FUNC_OFFSET(dgemm_small_kernel_tt) | |||
| #define DGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(dgemm_small_kernel_b0_nn) | |||
| #define DGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(dgemm_small_kernel_b0_nt) | |||
| #define DGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(dgemm_small_kernel_b0_tn) | |||
| #define DGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(dgemm_small_kernel_b0_tt) | |||
| #define DGEMM_NN dgemm_nn | |||
| #define DGEMM_CN dgemm_tn | |||
| #define DGEMM_TN dgemm_tn | |||
| @@ -0,0 +1,64 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011-2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #ifndef COMMON_E2K | |||
| #define COMMON_E2K | |||
| #ifdef ASSEMBLER | |||
| #error | |||
| #endif | |||
| #define MB do { __asm__ __volatile__("": : :"memory"); } while (0) | |||
| #define WMB do { __asm__ __volatile__("": : :"memory"); } while (0) | |||
| #define RMB | |||
| #define INLINE __attribute__((__always_inline__)) inline | |||
| static inline int blas_quickdivide(blasint x, blasint y) { | |||
| return x / y; | |||
| } | |||
| #ifndef PAGESIZE | |||
| #define PAGESIZE ( 4 << 10) | |||
| #endif | |||
| #define HUGE_PAGESIZE ( 2 << 20) | |||
| #ifndef BUFFERSIZE | |||
| #define BUFFER_SIZE (32 << 20) | |||
| #else | |||
| #define BUFFER_SIZE (32 << BUFFERSIZE) | |||
| #endif | |||
| #define SEEK_ADDRESS | |||
| #endif | |||
| @@ -709,6 +709,13 @@ int BLASFUNC(cpotrf)(char *, blasint *, float *, blasint *, blasint *); | |||
| int BLASFUNC(zpotrf)(char *, blasint *, double *, blasint *, blasint *); | |||
| int BLASFUNC(xpotrf)(char *, blasint *, xdouble *, blasint *, blasint *); | |||
| /* potri entry points, one per precision prefix (s, d, q, c, z, x). All share the shape (char *, blasint *, T *, blasint *, blasint *), with T = float for s/c, double for d/z and xdouble for q/x. NOTE(review): presumably the LAPACK xPOTRI argument order (uplo, n, a, lda, info); the parameters are unnamed here, so confirm against the implementation. */ | |||
| int BLASFUNC(spotri)(char *, blasint *, float *, blasint *, blasint *); | |||
| int BLASFUNC(dpotri)(char *, blasint *, double *, blasint *, blasint *); | |||
| int BLASFUNC(qpotri)(char *, blasint *, xdouble *, blasint *, blasint *); | |||
| int BLASFUNC(cpotri)(char *, blasint *, float *, blasint *, blasint *); | |||
| int BLASFUNC(zpotri)(char *, blasint *, double *, blasint *, blasint *); | |||
| int BLASFUNC(xpotri)(char *, blasint *, xdouble *, blasint *, blasint *); | |||
| int BLASFUNC(spotrs)(char *, blasint *, blasint *, float *, blasint *, float *, blasint *, blasint *); | |||
| int BLASFUNC(dpotrs)(char *, blasint *, blasint *, double *, blasint *, double *, blasint *, blasint *); | |||
| int BLASFUNC(qpotrs)(char *, blasint *, blasint *, xdouble *, blasint *, xdouble *, blasint *, blasint *); | |||
| @@ -515,6 +515,129 @@ int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xidouble *, xidouble *, xidouble | |||
| int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); | |||
| #endif | |||
| /* Prototypes for the small-matrix GEMM path; declared only when SMALL_MATRIX_OPT is defined. Kernel name suffixes give the transa/transb pair. NOTE(review): presumably n = no transpose, t = transpose, r = conjugate, c = conjugate transpose; confirm against the dispatch code. */ | |||
| #ifdef SMALL_MATRIX_OPT | |||
| /* Real kernels with beta: (m, n, k, A, lda, alpha, B, ldb, beta, C, ldc). The sbgemm variants take bfloat16 A/B with float alpha, beta and C. Each *_small_matrix_permit takes the transpose flags, the dimensions and the scalars and returns int; presumably non-zero means the small path may be used (TODO confirm). */ | |||
| int sbgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); | |||
| int sbgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||
| int sbgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||
| int sbgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||
| int sbgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||
| int sgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); | |||
| int sgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||
| int sgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||
| int sgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||
| int sgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||
| int dgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double beta); | |||
| int dgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); | |||
| int dgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); | |||
| int dgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); | |||
| int dgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); | |||
| /* Real b0 kernels: same argument lists as above with the beta parameter removed. */ | |||
| int sbgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int sbgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int sbgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int sbgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int sgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int sgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int sgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int sgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int dgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int dgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int dgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int dgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| /* Complex kernels with beta: alpha and beta are each passed as two scalars (alpha0, alpha1 / beta0, beta1), presumably the real and imaginary parts (TODO confirm). A, B and C are pointers to the base real type. Sixteen suffix pairs over {n, t, r, c} are declared per precision. */ | |||
| int cgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha0, float alpha1, float beta0, float beta1); | |||
| int cgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int zgemm_small_matrix_permit(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha0, double alpha1, double beta0, double beta1); | |||
| int zgemm_small_kernel_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| /* Complex b0 kernels: same argument lists as the complex kernels above with the beta0/beta1 parameters removed. */ | |||
| int cgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int cgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_b0_nn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_b0_nt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_b0_nr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_b0_nc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_b0_tn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_b0_tt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_b0_tr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_b0_tc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_b0_rn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_b0_rt(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_b0_rr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_b0_rc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_b0_cn(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_b0_ct(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_b0_cr(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int zgemm_small_kernel_b0_cc(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| #endif | |||
| int cgemm_kernel_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); | |||
| int cgemm_kernel_l(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); | |||
| int cgemm_kernel_r(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); | |||
| @@ -75,18 +75,10 @@ static inline int my_mbind(void *addr, unsigned long len, int mode, | |||
| // https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482 | |||
| return 0; | |||
| #else | |||
| #if defined (LOONGSON3B) | |||
| #if defined (__64BIT__) | |||
| return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); | |||
| #else | |||
| return 0; //NULL Implementation on Loongson 3B 32bit. | |||
| #endif | |||
| #else | |||
| //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 | |||
| // unsigned long null_nodemask=0; | |||
| return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); | |||
| #endif | |||
| #endif | |||
| } | |||
| static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { | |||
| @@ -0,0 +1,199 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011-2020, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| /* */ | |||
| /* Redistribution and use in source and binary forms, with or */ | |||
| /* without modification, are permitted provided that the following */ | |||
| /* conditions are met: */ | |||
| /* */ | |||
| /* 1. Redistributions of source code must retain the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer. */ | |||
| /* */ | |||
| /* 2. Redistributions in binary form must reproduce the above */ | |||
| /* copyright notice, this list of conditions and the following */ | |||
| /* disclaimer in the documentation and/or other materials */ | |||
| /* provided with the distribution. */ | |||
| /* */ | |||
| /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ | |||
| /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ | |||
| /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ | |||
| /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ | |||
| /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ | |||
| /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ | |||
| /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ | |||
| /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ | |||
| /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ | |||
| /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ | |||
| /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ | |||
| /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ | |||
| /* POSSIBILITY OF SUCH DAMAGE. */ | |||
| /* */ | |||
| /* The views and conclusions contained in the software and */ | |||
| /* documentation are those of the authors and should not be */ | |||
| /* interpreted as representing official policies, either expressed */ | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| /* Platform definitions guarded by COMMON_LOONGARCH64: memory barriers, a C-side division helper, assembler mnemonic aliases selected by DOUBLE, function prologue/epilogue macros, and page/buffer sizes. */ | |||
| #ifndef COMMON_LOONGARCH64 | |||
| #define COMMON_LOONGARCH64 | |||
| /* All three barrier flavours expand to the same __sync_synchronize() builtin call. */ | |||
| #define MB __sync_synchronize() | |||
| #define WMB __sync_synchronize() | |||
| #define RMB __sync_synchronize() | |||
| #define INLINE inline | |||
| /* ---- C side (ASSEMBLER not defined) ---- */ | |||
| #ifndef ASSEMBLER | |||
| /* Returns the integer quotient x / y. No guard is applied here, so y must be non-zero. */ | |||
| static inline int blas_quickdivide(blasint x, blasint y){ | |||
| return x / y; | |||
| } | |||
| /* GET_IMAGE(res) copies floating-point register $f2 into res, using fmov.d when DOUBLE is defined and fmov.s otherwise. NOTE(review): presumably $f2 carries the imaginary part of a complex return value; confirm against the kernels. */ | |||
| #ifdef DOUBLE | |||
| #define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory") | |||
| #else | |||
| #define GET_IMAGE(res) __asm__ __volatile__("fmov.s %0, $f2" : "=f"(res) : : "memory") | |||
| #endif | |||
| /* Defined empty on this platform. */ | |||
| #define GET_IMAGE_CANCEL | |||
| /* ---- Assembler side (ASSEMBLER defined) ---- */ | |||
| #else | |||
| /* Precision-neutral mnemonic aliases: the .d forms when DOUBLE is defined, the .s forms otherwise. CMOVT maps to fsel in both cases; MTC is movgr2fr.d or movgr2fr.w. */ | |||
| #ifdef DOUBLE | |||
| #define LD fld.d | |||
| #define ST fst.d | |||
| #define MADD fmadd.d | |||
| #define NMADD fnmadd.d | |||
| #define MSUB fmsub.d | |||
| #define NMSUB fnmsub.d | |||
| #define ADD fadd.d | |||
| #define SUB fsub.d | |||
| #define MUL fmul.d | |||
| #define MOV fmov.d | |||
| #define CMOVT fsel | |||
| #define MTC movgr2fr.d | |||
| #define FABS fabs.d | |||
| #define CMPEQ fcmp.ceq.d | |||
| #define CMPLE fcmp.cle.d | |||
| #define CMPLT fcmp.clt.d | |||
| #define NEG fneg.d | |||
| #else | |||
| #define LD fld.s | |||
| #define ST fst.s | |||
| #define MADD fmadd.s | |||
| #define NMADD fnmadd.s | |||
| #define MSUB fmsub.s | |||
| #define NMSUB fnmsub.s | |||
| #define ADD fadd.s | |||
| #define SUB fsub.s | |||
| #define MUL fmul.s | |||
| #define MOV fmov.s | |||
| #define CMOVT fsel | |||
| #define MTC movgr2fr.w | |||
| #define FABS fabs.s | |||
| #define CMPEQ fcmp.ceq.s | |||
| #define CMPLE fcmp.cle.s | |||
| #define CMPLT fcmp.clt.s | |||
| #define NEG fneg.s | |||
| #endif /* defined(DOUBLE) */ | |||
| /* Integer load/store aliases. LDINT is ld.d only when both __64BIT__ and USE64BITINT are defined, otherwise ld.w. LDARG/SDARG are the .d forms whenever __64BIT__ is defined and the .w forms otherwise. */ | |||
| #if defined(__64BIT__) && defined(USE64BITINT) | |||
| #define LDINT ld.d | |||
| #define LDARG ld.d | |||
| #define SDARG st.d | |||
| #elif defined(__64BIT__) && !defined(USE64BITINT) | |||
| #define LDINT ld.w | |||
| #define LDARG ld.d | |||
| #define SDARG st.d | |||
| #else | |||
| #define LDINT ld.w | |||
| #define LDARG ld.w | |||
| #define SDARG st.w | |||
| #endif | |||
| /* REALNAME is the exported symbol: ASMFNAME when F_INTERFACE is defined, ASMNAME otherwise (both defined elsewhere). */ | |||
| #ifndef F_INTERFACE | |||
| #define REALNAME ASMNAME | |||
| #else | |||
| #define REALNAME ASMFNAME | |||
| #endif /* defined(F_INTERFACE) */ | |||
| #if defined(ASSEMBLER) && !defined(NEEDPARAM) | |||
| /* PROLOGUE opens a global function symbol in .text with .align 5. The last line deliberately carries no trailing backslash: a continuation there would splice the following preprocessor directive into the macro body. */ | |||
| #define PROLOGUE \ | |||
| .text ;\ | |||
| .align 5 ;\ | |||
| .globl REALNAME ;\ | |||
| .type REALNAME, @function ;\ | |||
| REALNAME: ; | |||
| /* GNUSTACK emits the .note.GNU-stack section directive on Linux/ELF targets and expands to nothing elsewhere. */ | |||
| #if defined(__linux__) && defined(__ELF__) | |||
| #define GNUSTACK .section .note.GNU-stack,"",@progbits | |||
| #else | |||
| #define GNUSTACK | |||
| #endif /* defined(__linux__) && defined(__ELF__) */ | |||
| /* EPILOGUE emits ".end REALNAME" followed by GNUSTACK. */ | |||
| #define EPILOGUE \ | |||
| .end REALNAME ;\ | |||
| GNUSTACK | |||
| /* Defined empty on this platform. */ | |||
| #define PROFCODE | |||
| /* MOVT(dst, src, cc): bceqz skips to local label 1 when cc is zero; otherwise add.d computes dst = src + $r0. NOTE(review): this is a conditional register move given that $r0 is the zero register; confirm against the ISA manual. */ | |||
| #define MOVT(dst, src, cc) \ | |||
| bceqz cc, 1f; \ | |||
| add.d dst, src, $r0; \ | |||
| 1: | |||
| #endif /* defined(ASSEMBLER) && !defined(NEEDPARAM) */ | |||
| #endif /* defined(ASSEMBLER) */ | |||
| /* Defined empty: acts as a feature flag only; its consumers are not visible in this header. */ | |||
| #define SEEK_ADDRESS | |||
| /* Sizes: BUFFER_SIZE = 32 << 20 (32 MiB), PAGESIZE = FIXED_PAGESIZE = 16UL << 10 (16 KiB), HUGE_PAGESIZE = 2 << 20 (2 MiB). */ | |||
| #define BUFFER_SIZE ( 32 << 20) | |||
| #define PAGESIZE (16UL << 10) | |||
| #define FIXED_PAGESIZE (16UL << 10) | |||
| #define HUGE_PAGESIZE ( 2 << 20) | |||
| /* START_ADDRESS and MAX_CPU_NUMBER are defined elsewhere. */ | |||
| #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) | |||
| /* Fall back to MAP_ANON when MAP_ANONYMOUS has not been defined. */ | |||
| #ifndef MAP_ANONYMOUS | |||
| #define MAP_ANONYMOUS MAP_ANON | |||
| #endif | |||
| #endif | |||
| @@ -644,6 +644,17 @@ | |||
| #define GEADD_K DGEADD_K | |||
| #define GEMM_SMALL_MATRIX_PERMIT DGEMM_SMALL_MATRIX_PERMIT | |||
| #define GEMM_SMALL_KERNEL_NN DGEMM_SMALL_KERNEL_NN | |||
| #define GEMM_SMALL_KERNEL_NT DGEMM_SMALL_KERNEL_NT | |||
| #define GEMM_SMALL_KERNEL_TN DGEMM_SMALL_KERNEL_TN | |||
| #define GEMM_SMALL_KERNEL_TT DGEMM_SMALL_KERNEL_TT | |||
| #define GEMM_SMALL_KERNEL_B0_NN DGEMM_SMALL_KERNEL_B0_NN | |||
| #define GEMM_SMALL_KERNEL_B0_NT DGEMM_SMALL_KERNEL_B0_NT | |||
| #define GEMM_SMALL_KERNEL_B0_TN DGEMM_SMALL_KERNEL_B0_TN | |||
| #define GEMM_SMALL_KERNEL_B0_TT DGEMM_SMALL_KERNEL_B0_TT | |||
| #elif defined(BFLOAT16) | |||
| #define D_TO_BF16_K SBDTOBF16_K | |||
| @@ -931,6 +942,18 @@ | |||
| #define GEADD_K SGEADD_K | |||
| #define GEMM_SMALL_MATRIX_PERMIT SBGEMM_SMALL_MATRIX_PERMIT | |||
| #define GEMM_SMALL_KERNEL_NN SBGEMM_SMALL_KERNEL_NN | |||
| #define GEMM_SMALL_KERNEL_NT SBGEMM_SMALL_KERNEL_NT | |||
| #define GEMM_SMALL_KERNEL_TN SBGEMM_SMALL_KERNEL_TN | |||
| #define GEMM_SMALL_KERNEL_TT SBGEMM_SMALL_KERNEL_TT | |||
| #define GEMM_SMALL_KERNEL_B0_NN SBGEMM_SMALL_KERNEL_B0_NN | |||
| #define GEMM_SMALL_KERNEL_B0_NT SBGEMM_SMALL_KERNEL_B0_NT | |||
| #define GEMM_SMALL_KERNEL_B0_TN SBGEMM_SMALL_KERNEL_B0_TN | |||
| #define GEMM_SMALL_KERNEL_B0_TT SBGEMM_SMALL_KERNEL_B0_TT | |||
| #endif | |||
| #else | |||
| @@ -1236,6 +1259,19 @@ | |||
| #define IMATCOPY_K_RT SIMATCOPY_K_RT | |||
| #define GEADD_K SGEADD_K | |||
| #define GEMM_SMALL_MATRIX_PERMIT SGEMM_SMALL_MATRIX_PERMIT | |||
| #define GEMM_SMALL_KERNEL_NN SGEMM_SMALL_KERNEL_NN | |||
| #define GEMM_SMALL_KERNEL_NT SGEMM_SMALL_KERNEL_NT | |||
| #define GEMM_SMALL_KERNEL_TN SGEMM_SMALL_KERNEL_TN | |||
| #define GEMM_SMALL_KERNEL_TT SGEMM_SMALL_KERNEL_TT | |||
| #define GEMM_SMALL_KERNEL_B0_NN SGEMM_SMALL_KERNEL_B0_NN | |||
| #define GEMM_SMALL_KERNEL_B0_NT SGEMM_SMALL_KERNEL_B0_NT | |||
| #define GEMM_SMALL_KERNEL_B0_TN SGEMM_SMALL_KERNEL_B0_TN | |||
| #define GEMM_SMALL_KERNEL_B0_TT SGEMM_SMALL_KERNEL_B0_TT | |||
| #endif | |||
| #else | |||
| #ifdef XDOUBLE | |||
| @@ -2063,6 +2099,48 @@ | |||
| #define GEADD_K ZGEADD_K | |||
| #define GEMM_SMALL_MATRIX_PERMIT ZGEMM_SMALL_MATRIX_PERMIT | |||
| #define GEMM_SMALL_KERNEL_NN ZGEMM_SMALL_KERNEL_NN | |||
| #define GEMM_SMALL_KERNEL_NT ZGEMM_SMALL_KERNEL_NT | |||
| #define GEMM_SMALL_KERNEL_NR ZGEMM_SMALL_KERNEL_NR | |||
| #define GEMM_SMALL_KERNEL_NC ZGEMM_SMALL_KERNEL_NC | |||
| #define GEMM_SMALL_KERNEL_TN ZGEMM_SMALL_KERNEL_TN | |||
| #define GEMM_SMALL_KERNEL_TT ZGEMM_SMALL_KERNEL_TT | |||
| #define GEMM_SMALL_KERNEL_TR ZGEMM_SMALL_KERNEL_TR | |||
| #define GEMM_SMALL_KERNEL_TC ZGEMM_SMALL_KERNEL_TC | |||
| #define GEMM_SMALL_KERNEL_RN ZGEMM_SMALL_KERNEL_RN | |||
| #define GEMM_SMALL_KERNEL_RT ZGEMM_SMALL_KERNEL_RT | |||
| #define GEMM_SMALL_KERNEL_RR ZGEMM_SMALL_KERNEL_RR | |||
| #define GEMM_SMALL_KERNEL_RC ZGEMM_SMALL_KERNEL_RC | |||
| #define GEMM_SMALL_KERNEL_CN ZGEMM_SMALL_KERNEL_CN | |||
| #define GEMM_SMALL_KERNEL_CT ZGEMM_SMALL_KERNEL_CT | |||
| #define GEMM_SMALL_KERNEL_CR ZGEMM_SMALL_KERNEL_CR | |||
| #define GEMM_SMALL_KERNEL_CC ZGEMM_SMALL_KERNEL_CC | |||
| #define GEMM_SMALL_KERNEL_B0_NN ZGEMM_SMALL_KERNEL_B0_NN | |||
| #define GEMM_SMALL_KERNEL_B0_NT ZGEMM_SMALL_KERNEL_B0_NT | |||
| #define GEMM_SMALL_KERNEL_B0_NR ZGEMM_SMALL_KERNEL_B0_NR | |||
| #define GEMM_SMALL_KERNEL_B0_NC ZGEMM_SMALL_KERNEL_B0_NC | |||
| #define GEMM_SMALL_KERNEL_B0_TN ZGEMM_SMALL_KERNEL_B0_TN | |||
| #define GEMM_SMALL_KERNEL_B0_TT ZGEMM_SMALL_KERNEL_B0_TT | |||
| #define GEMM_SMALL_KERNEL_B0_TR ZGEMM_SMALL_KERNEL_B0_TR | |||
| #define GEMM_SMALL_KERNEL_B0_TC ZGEMM_SMALL_KERNEL_B0_TC | |||
| #define GEMM_SMALL_KERNEL_B0_RN ZGEMM_SMALL_KERNEL_B0_RN | |||
| #define GEMM_SMALL_KERNEL_B0_RT ZGEMM_SMALL_KERNEL_B0_RT | |||
| #define GEMM_SMALL_KERNEL_B0_RR ZGEMM_SMALL_KERNEL_B0_RR | |||
| #define GEMM_SMALL_KERNEL_B0_RC ZGEMM_SMALL_KERNEL_B0_RC | |||
| #define GEMM_SMALL_KERNEL_B0_CN ZGEMM_SMALL_KERNEL_B0_CN | |||
| #define GEMM_SMALL_KERNEL_B0_CT ZGEMM_SMALL_KERNEL_B0_CT | |||
| #define GEMM_SMALL_KERNEL_B0_CR ZGEMM_SMALL_KERNEL_B0_CR | |||
| #define GEMM_SMALL_KERNEL_B0_CC ZGEMM_SMALL_KERNEL_B0_CC | |||
| #else | |||
| #define AMAX_K CAMAX_K | |||
| @@ -2486,11 +2564,54 @@ | |||
| #define GEADD_K CGEADD_K | |||
| #define GEMM_SMALL_MATRIX_PERMIT CGEMM_SMALL_MATRIX_PERMIT | |||
| #define GEMM_SMALL_KERNEL_NN CGEMM_SMALL_KERNEL_NN | |||
| #define GEMM_SMALL_KERNEL_NT CGEMM_SMALL_KERNEL_NT | |||
| #define GEMM_SMALL_KERNEL_NR CGEMM_SMALL_KERNEL_NR | |||
| #define GEMM_SMALL_KERNEL_NC CGEMM_SMALL_KERNEL_NC | |||
| #define GEMM_SMALL_KERNEL_TN CGEMM_SMALL_KERNEL_TN | |||
| #define GEMM_SMALL_KERNEL_TT CGEMM_SMALL_KERNEL_TT | |||
| #define GEMM_SMALL_KERNEL_TR CGEMM_SMALL_KERNEL_TR | |||
| #define GEMM_SMALL_KERNEL_TC CGEMM_SMALL_KERNEL_TC | |||
| #define GEMM_SMALL_KERNEL_RN CGEMM_SMALL_KERNEL_RN | |||
| #define GEMM_SMALL_KERNEL_RT CGEMM_SMALL_KERNEL_RT | |||
| #define GEMM_SMALL_KERNEL_RR CGEMM_SMALL_KERNEL_RR | |||
| #define GEMM_SMALL_KERNEL_RC CGEMM_SMALL_KERNEL_RC | |||
| #define GEMM_SMALL_KERNEL_CN CGEMM_SMALL_KERNEL_CN | |||
| #define GEMM_SMALL_KERNEL_CT CGEMM_SMALL_KERNEL_CT | |||
| #define GEMM_SMALL_KERNEL_CR CGEMM_SMALL_KERNEL_CR | |||
| #define GEMM_SMALL_KERNEL_CC CGEMM_SMALL_KERNEL_CC | |||
| #define GEMM_SMALL_KERNEL_B0_NN CGEMM_SMALL_KERNEL_B0_NN | |||
| #define GEMM_SMALL_KERNEL_B0_NT CGEMM_SMALL_KERNEL_B0_NT | |||
| #define GEMM_SMALL_KERNEL_B0_NR CGEMM_SMALL_KERNEL_B0_NR | |||
| #define GEMM_SMALL_KERNEL_B0_NC CGEMM_SMALL_KERNEL_B0_NC | |||
| #define GEMM_SMALL_KERNEL_B0_TN CGEMM_SMALL_KERNEL_B0_TN | |||
| #define GEMM_SMALL_KERNEL_B0_TT CGEMM_SMALL_KERNEL_B0_TT | |||
| #define GEMM_SMALL_KERNEL_B0_TR CGEMM_SMALL_KERNEL_B0_TR | |||
| #define GEMM_SMALL_KERNEL_B0_TC CGEMM_SMALL_KERNEL_B0_TC | |||
| #define GEMM_SMALL_KERNEL_B0_RN CGEMM_SMALL_KERNEL_B0_RN | |||
| #define GEMM_SMALL_KERNEL_B0_RT CGEMM_SMALL_KERNEL_B0_RT | |||
| #define GEMM_SMALL_KERNEL_B0_RR CGEMM_SMALL_KERNEL_B0_RR | |||
| #define GEMM_SMALL_KERNEL_B0_RC CGEMM_SMALL_KERNEL_B0_RC | |||
| #define GEMM_SMALL_KERNEL_B0_CN CGEMM_SMALL_KERNEL_B0_CN | |||
| #define GEMM_SMALL_KERNEL_B0_CT CGEMM_SMALL_KERNEL_B0_CT | |||
| #define GEMM_SMALL_KERNEL_B0_CR CGEMM_SMALL_KERNEL_B0_CR | |||
| #define GEMM_SMALL_KERNEL_B0_CC CGEMM_SMALL_KERNEL_B0_CC | |||
| #endif | |||
| #endif | |||
| #ifndef ASSEMBLER | |||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) | |||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\ | |||
| || defined(ARCH_LOONGARCH64) || defined(ARCH_E2K) | |||
| extern BLASLONG gemm_offset_a; | |||
| extern BLASLONG gemm_offset_b; | |||
| extern BLASLONG sbgemm_p; | |||
| @@ -229,12 +229,7 @@ REALNAME: ;\ | |||
| #define BUFFER_SIZE ( 32 << 21) | |||
| #if defined(LOONGSON3A) | |||
| #define PAGESIZE (16UL << 10) | |||
| #define FIXED_PAGESIZE (16UL << 10) | |||
| #endif | |||
| #if defined(LOONGSON3B) | |||
| #if defined(LOONGSON3R3) || defined(LOONGSON3R4) | |||
| #define PAGESIZE (16UL << 10) | |||
| #define FIXED_PAGESIZE (16UL << 10) | |||
| #endif | |||
| @@ -250,7 +245,7 @@ REALNAME: ;\ | |||
| #define MAP_ANONYMOUS MAP_ANON | |||
| #endif | |||
| #if defined(LOONGSON3A) || defined(LOONGSON3B) | |||
| #if defined(LOONGSON3R3) || defined(LOONGSON3R4) | |||
| #define PREFETCHD_(x) ld $0, x | |||
| #define PREFETCHD(x) PREFETCHD_(x) | |||
| #else | |||
| @@ -145,6 +145,19 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG); | |||
| int (*sbneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); | |||
| int (*sblaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); | |||
| #ifdef SMALL_MATRIX_OPT | |||
| int (*sbgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); | |||
| int (*sbgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||
| int (*sbgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||
| int (*sbgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||
| int (*sbgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||
| int (*sbgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int (*sbgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int (*sbgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int (*sbgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, bfloat16 * A, BLASLONG lda, float alpha, bfloat16 * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| #endif | |||
| #endif | |||
| #if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX) | |||
| @@ -207,6 +220,20 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); | |||
| int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); | |||
| #endif | |||
| #ifdef BUILD_SINGLE | |||
| #ifdef SMALL_MATRIX_OPT | |||
| int (*sgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float beta); | |||
| int (*sgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||
| int (*sgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||
| int (*sgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||
| int (*sgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float beta, float * C, BLASLONG ldc); | |||
| int (*sgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int (*sgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int (*sgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int (*sgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| #endif | |||
| int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| @@ -314,6 +341,19 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); | |||
| int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); | |||
| #endif | |||
| #ifdef BUILD_DOUBLE | |||
| #ifdef SMALL_MATRIX_OPT | |||
| int (*dgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha, double beta); | |||
| int (*dgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); | |||
| int (*dgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); | |||
| int (*dgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); | |||
| int (*dgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double beta, double * C, BLASLONG ldc); | |||
| int (*dgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int (*dgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int (*dgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int (*dgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| #endif | |||
| int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); | |||
| int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); | |||
| int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); | |||
| @@ -513,6 +553,50 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); | |||
| int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); | |||
| int (*cgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); | |||
| #ifdef SMALL_MATRIX_OPT | |||
| int (*cgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, float alpha0, float alpha1, float beta0, float beta1); | |||
| int (*cgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_nr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_nc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_tr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_tc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_rn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_rt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_rr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_rc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_cn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_ct )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_cr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_cc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float beta0, float beta1, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_b0_nr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_b0_nc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_b0_tr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_b0_tc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_b0_rn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_b0_rt )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_b0_rr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_b0_rc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_b0_cn )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_b0_ct )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_b0_cr )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| int (*cgemm_small_kernel_b0_cc )(BLASLONG m, BLASLONG n, BLASLONG k, float * A, BLASLONG lda, float alpha0, float alpha1, float * B, BLASLONG ldb, float * C, BLASLONG ldc); | |||
| #endif | |||
| int (*ctrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*ctrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*ctrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| @@ -679,6 +763,50 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); | |||
| int (*zgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); | |||
| int (*zgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); | |||
| #ifdef SMALL_MATRIX_OPT | |||
| int (*zgemm_small_matrix_permit)(int transa, int transb, BLASLONG m, BLASLONG n, BLASLONG k, double alpha0, double alpha1, double beta0, double beta1); | |||
| int (*zgemm_small_kernel_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_nr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_nc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_tr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_tc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_rn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_rt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_rr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_rc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_cn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_ct )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_cr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_cc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double beta0, double beta1, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_b0_nn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_b0_nt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_b0_nr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_b0_nc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_b0_tn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_b0_tt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_b0_tr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_b0_tc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_b0_rn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_b0_rt )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_b0_rr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_b0_rc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_b0_cn )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_b0_ct )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_b0_cr )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| int (*zgemm_small_kernel_b0_cc )(BLASLONG m, BLASLONG n, BLASLONG k, double * A, BLASLONG lda, double alpha0, double alpha1, double * B, BLASLONG ldb, double * C, BLASLONG ldc); | |||
| #endif | |||
| int (*ztrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); | |||
| int (*ztrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); | |||
| int (*ztrsm_kernel_LR)(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG, BLASLONG); | |||
| @@ -1069,6 +1197,8 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); | |||
| extern gotoblas_t *gotoblas; | |||
| #define FUNC_OFFSET(func) (size_t)(&((gotoblas_t *)NULL)->func) | |||
| #define DTB_ENTRIES gotoblas -> dtb_entries | |||
| #define GEMM_OFFSET_A gotoblas -> offsetA | |||
| #define GEMM_OFFSET_B gotoblas -> offsetB | |||
| @@ -1174,6 +1304,8 @@ extern gotoblas_t *gotoblas; | |||
| #else | |||
| #define FUNC_OFFSET(func) (size_t)(func) | |||
| #define DTB_ENTRIES DTB_DEFAULT_ENTRIES | |||
| #define GEMM_OFFSET_A GEMM_DEFAULT_OFFSET_A | |||
| @@ -164,6 +164,8 @@ | |||
| #define SGEADD_K sgeadd_k | |||
| #define SGEMM_SMALL_MATRIX_PERMIT sgemm_small_matrix_permit | |||
| #else | |||
| #define SAMAX_K gotoblas -> samax_k | |||
| @@ -299,8 +301,21 @@ | |||
| #define SGEADD_K gotoblas -> sgeadd_k | |||
| #define SGEMM_SMALL_MATRIX_PERMIT gotoblas -> sgemm_small_matrix_permit | |||
| #endif | |||
| #define SGEMM_SMALL_KERNEL_NN FUNC_OFFSET(sgemm_small_kernel_nn) | |||
| #define SGEMM_SMALL_KERNEL_NT FUNC_OFFSET(sgemm_small_kernel_nt) | |||
| #define SGEMM_SMALL_KERNEL_TN FUNC_OFFSET(sgemm_small_kernel_tn) | |||
| #define SGEMM_SMALL_KERNEL_TT FUNC_OFFSET(sgemm_small_kernel_tt) | |||
| #define SGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(sgemm_small_kernel_b0_nn) | |||
| #define SGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(sgemm_small_kernel_b0_nt) | |||
| #define SGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(sgemm_small_kernel_b0_tn) | |||
| #define SGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(sgemm_small_kernel_b0_tt) | |||
| #define SGEMM_NN sgemm_nn | |||
| #define SGEMM_CN sgemm_tn | |||
| #define SGEMM_TN sgemm_tn | |||
| @@ -24,6 +24,7 @@ | |||
| #define SBGEMM_BETA sbgemm_beta | |||
| #define SBGEMM_KERNEL sbgemm_kernel | |||
| #define SBGEMM_SMALL_MATRIX_PERMIT sbgemm_small_matrix_permit | |||
| #else | |||
| #define SBDOT_K gotoblas -> sbdot_k | |||
| @@ -41,8 +42,19 @@ | |||
| #define SBGEMM_BETA gotoblas -> sbgemm_beta | |||
| #define SBGEMM_KERNEL gotoblas -> sbgemm_kernel | |||
| #define SBGEMM_SMALL_MATRIX_PERMIT gotoblas -> sbgemm_small_matrix_permit | |||
| #endif | |||
| #define SBGEMM_SMALL_KERNEL_NN FUNC_OFFSET(sbgemm_small_kernel_nn) | |||
| #define SBGEMM_SMALL_KERNEL_NT FUNC_OFFSET(sbgemm_small_kernel_nt) | |||
| #define SBGEMM_SMALL_KERNEL_TN FUNC_OFFSET(sbgemm_small_kernel_tn) | |||
| #define SBGEMM_SMALL_KERNEL_TT FUNC_OFFSET(sbgemm_small_kernel_tt) | |||
| #define SBGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(sbgemm_small_kernel_b0_nn) | |||
| #define SBGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(sbgemm_small_kernel_b0_nt) | |||
| #define SBGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(sbgemm_small_kernel_b0_tn) | |||
| #define SBGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(sbgemm_small_kernel_b0_tt) | |||
| #define SBGEMM_NN sbgemm_nn | |||
| #define SBGEMM_CN sbgemm_tn | |||
| #define SBGEMM_TN sbgemm_tn | |||
| @@ -340,7 +340,8 @@ REALNAME: | |||
| .align 16; \ | |||
| .globl REALNAME ;\ | |||
| .type REALNAME, @function; \ | |||
| REALNAME: | |||
| REALNAME: \ | |||
| _CET_ENDBR | |||
| #ifdef PROFILE | |||
| #define PROFCODE call mcount | |||
| @@ -451,7 +451,8 @@ REALNAME: | |||
| .align 512; \ | |||
| .globl REALNAME ;\ | |||
| .type REALNAME, @function; \ | |||
| REALNAME: | |||
| REALNAME: \ | |||
| _CET_ENDBR | |||
| #ifdef PROFILE | |||
| #define PROFCODE call *mcount@GOTPCREL(%rip) | |||
| @@ -232,6 +232,8 @@ | |||
| #define ZGEADD_K zgeadd_k | |||
| #define ZGEMM_SMALL_MATRIX_PERMIT zgemm_small_matrix_permit | |||
| #else | |||
| #define ZAMAX_K gotoblas -> zamax_k | |||
| @@ -426,8 +428,51 @@ | |||
| #define ZGEADD_K gotoblas -> zgeadd_k | |||
| #define ZGEMM_SMALL_MATRIX_PERMIT gotoblas -> zgemm_small_matrix_permit | |||
| #endif | |||
| #define ZGEMM_SMALL_KERNEL_NN FUNC_OFFSET(zgemm_small_kernel_nn) | |||
| #define ZGEMM_SMALL_KERNEL_NT FUNC_OFFSET(zgemm_small_kernel_nt) | |||
| #define ZGEMM_SMALL_KERNEL_NR FUNC_OFFSET(zgemm_small_kernel_nr) | |||
| #define ZGEMM_SMALL_KERNEL_NC FUNC_OFFSET(zgemm_small_kernel_nc) | |||
| #define ZGEMM_SMALL_KERNEL_TN FUNC_OFFSET(zgemm_small_kernel_tn) | |||
| #define ZGEMM_SMALL_KERNEL_TT FUNC_OFFSET(zgemm_small_kernel_tt) | |||
| #define ZGEMM_SMALL_KERNEL_TR FUNC_OFFSET(zgemm_small_kernel_tr) | |||
| #define ZGEMM_SMALL_KERNEL_TC FUNC_OFFSET(zgemm_small_kernel_tc) | |||
| #define ZGEMM_SMALL_KERNEL_RN FUNC_OFFSET(zgemm_small_kernel_rn) | |||
| #define ZGEMM_SMALL_KERNEL_RT FUNC_OFFSET(zgemm_small_kernel_rt) | |||
| #define ZGEMM_SMALL_KERNEL_RR FUNC_OFFSET(zgemm_small_kernel_rr) | |||
| #define ZGEMM_SMALL_KERNEL_RC FUNC_OFFSET(zgemm_small_kernel_rc) | |||
| #define ZGEMM_SMALL_KERNEL_CN FUNC_OFFSET(zgemm_small_kernel_cn) | |||
| #define ZGEMM_SMALL_KERNEL_CT FUNC_OFFSET(zgemm_small_kernel_ct) | |||
| #define ZGEMM_SMALL_KERNEL_CR FUNC_OFFSET(zgemm_small_kernel_cr) | |||
| #define ZGEMM_SMALL_KERNEL_CC FUNC_OFFSET(zgemm_small_kernel_cc) | |||
| #define ZGEMM_SMALL_KERNEL_B0_NN FUNC_OFFSET(zgemm_small_kernel_b0_nn) | |||
| #define ZGEMM_SMALL_KERNEL_B0_NT FUNC_OFFSET(zgemm_small_kernel_b0_nt) | |||
| #define ZGEMM_SMALL_KERNEL_B0_NR FUNC_OFFSET(zgemm_small_kernel_b0_nr) | |||
| #define ZGEMM_SMALL_KERNEL_B0_NC FUNC_OFFSET(zgemm_small_kernel_b0_nc) | |||
| #define ZGEMM_SMALL_KERNEL_B0_TN FUNC_OFFSET(zgemm_small_kernel_b0_tn) | |||
| #define ZGEMM_SMALL_KERNEL_B0_TT FUNC_OFFSET(zgemm_small_kernel_b0_tt) | |||
| #define ZGEMM_SMALL_KERNEL_B0_TR FUNC_OFFSET(zgemm_small_kernel_b0_tr) | |||
| #define ZGEMM_SMALL_KERNEL_B0_TC FUNC_OFFSET(zgemm_small_kernel_b0_tc) | |||
| #define ZGEMM_SMALL_KERNEL_B0_RN FUNC_OFFSET(zgemm_small_kernel_b0_rn) | |||
| #define ZGEMM_SMALL_KERNEL_B0_RT FUNC_OFFSET(zgemm_small_kernel_b0_rt) | |||
| #define ZGEMM_SMALL_KERNEL_B0_RR FUNC_OFFSET(zgemm_small_kernel_b0_rr) | |||
| #define ZGEMM_SMALL_KERNEL_B0_RC FUNC_OFFSET(zgemm_small_kernel_b0_rc) | |||
| #define ZGEMM_SMALL_KERNEL_B0_CN FUNC_OFFSET(zgemm_small_kernel_b0_cn) | |||
| #define ZGEMM_SMALL_KERNEL_B0_CT FUNC_OFFSET(zgemm_small_kernel_b0_ct) | |||
| #define ZGEMM_SMALL_KERNEL_B0_CR FUNC_OFFSET(zgemm_small_kernel_b0_cr) | |||
| #define ZGEMM_SMALL_KERNEL_B0_CC FUNC_OFFSET(zgemm_small_kernel_b0_cc) | |||
| #define ZGEMM_NN zgemm_nn | |||
| #define ZGEMM_CN zgemm_cn | |||
| #define ZGEMM_TN zgemm_tn | |||
| @@ -1,13 +1,14 @@ | |||
| include ../Makefile.rule | |||
| TOPDIR = .. | |||
| include $(TOPDIR)/Makefile.system | |||
| all :: dgemv_tester dgemm_tester | |||
| dgemv_tester : | |||
| $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../libopenblas.a -lpthread -o dgemv_tester | |||
| $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemv_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemv_tester | |||
| ./dgemv_tester | |||
| dgemm_tester : dgemv_tester | |||
| $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../libopenblas.a -lpthread -o dgemm_tester | |||
| $(CXX) $(COMMON_OPT) -Wall -Wextra -Wshadow -fopenmp -std=c++11 dgemm_thread_safety.cpp ../$(LIBNAME) $(EXTRALIB) $(FEXTRALIB) -o dgemm_tester | |||
| ./dgemm_tester | |||
| clean :: | |||
| @@ -54,6 +54,7 @@ | |||
| #define VENDOR_TRANSMETA 9 | |||
| #define VENDOR_NSC 10 | |||
| #define VENDOR_HYGON 11 | |||
| #define VENDOR_ZHAOXIN 12 | |||
| #define VENDOR_UNKNOWN 99 | |||
| #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) | |||
| @@ -119,6 +120,7 @@ | |||
| #define CORE_SKYLAKEX 28 | |||
| #define CORE_DHYANA 29 | |||
| #define CORE_COOPERLAKE 30 | |||
| #define CORE_SAPPHIRERAPIDS 31 | |||
| #define HAVE_SSE (1 << 0) | |||
| #define HAVE_SSE2 (1 << 1) | |||
| @@ -144,6 +146,7 @@ | |||
| #define HAVE_AVX512VL (1 << 21) | |||
| #define HAVE_AVX2 (1 << 22) | |||
| #define HAVE_AVX512BF16 (1 << 23) | |||
| #define HAVE_AMXBF16 (1 << 24) | |||
| #define CACHE_INFO_L1_I 1 | |||
| #define CACHE_INFO_L1_D 2 | |||
| @@ -221,6 +224,7 @@ typedef struct { | |||
| #define CPUTYPE_SKYLAKEX 52 | |||
| #define CPUTYPE_DHYANA 53 | |||
| #define CPUTYPE_COOPERLAKE 54 | |||
| #define CPUTYPE_SAPPHIRERAPIDS 55 | |||
| #define CPUTYPE_HYGON_UNKNOWN 99 | |||
| @@ -26,20 +26,25 @@ | |||
| *****************************************************************************/ | |||
| #include <string.h> | |||
| #ifdef OS_DARWIN | |||
| #ifdef __APPLE__ | |||
| #include <sys/sysctl.h> | |||
| int32_t value; | |||
| size_t length=sizeof(value); | |||
| int64_t value64; | |||
| size_t length64=sizeof(value64); | |||
| #endif | |||
| #define CPU_UNKNOWN 0 | |||
| #define CPU_ARMV8 1 | |||
| // Arm | |||
| #define CPU_CORTEXA53 2 | |||
| #define CPU_CORTEXA55 14 | |||
| #define CPU_CORTEXA57 3 | |||
| #define CPU_CORTEXA72 4 | |||
| #define CPU_CORTEXA73 5 | |||
| #define CPU_NEOVERSEN1 11 | |||
| #define CPU_NEOVERSEV1 16 | |||
| #define CPU_NEOVERSEN2 17 | |||
| // Qualcomm | |||
| #define CPU_FALKOR 6 | |||
| // Cavium | |||
| @@ -52,6 +57,8 @@ size_t length=sizeof(value); | |||
| #define CPU_EMAG8180 10 | |||
| // Apple | |||
| #define CPU_VORTEX 13 | |||
| // Fujitsu | |||
| #define CPU_A64FX 15 | |||
| static char *cpuname[] = { | |||
| "UNKNOWN", | |||
| @@ -66,8 +73,12 @@ static char *cpuname[] = { | |||
| "TSV110", | |||
| "EMAG8180", | |||
| "NEOVERSEN1", | |||
| "NEOVERSEV1" | |||
| "NEOVERSEN2" | |||
| "THUNDERX3T110", | |||
| "VORTEX" | |||
| "VORTEX", | |||
| "CORTEXA55", | |||
| "A64FX" | |||
| }; | |||
| static char *cpuname_lower[] = { | |||
| @@ -83,8 +94,12 @@ static char *cpuname_lower[] = { | |||
| "tsv110", | |||
| "emag8180", | |||
| "neoversen1", | |||
| "neoversev1", | |||
| "neoversen2", | |||
| "thunderx3t110", | |||
| "vortex" | |||
| "vortex", | |||
| "cortexa55", | |||
| "a64fx" | |||
| }; | |||
| int get_feature(char *search) | |||
| @@ -161,6 +176,12 @@ int detect(void) | |||
| return CPU_CORTEXA73; | |||
| else if (strstr(cpu_part, "0xd0c")) | |||
| return CPU_NEOVERSEN1; | |||
| else if (strstr(cpu_part, "0xd40")) | |||
| return CPU_NEOVERSEV1; | |||
| else if (strstr(cpu_part, "0xd49")) | |||
| return CPU_NEOVERSEN2; | |||
| else if (strstr(cpu_part, "0xd05")) | |||
| return CPU_CORTEXA55; | |||
| } | |||
| // Qualcomm | |||
| else if (strstr(cpu_implementer, "0x51") && strstr(cpu_part, "0xc00")) | |||
| @@ -178,6 +199,9 @@ int detect(void) | |||
| // Ampere | |||
| else if (strstr(cpu_implementer, "0x50") && strstr(cpu_part, "0x000")) | |||
| return CPU_EMAG8180; | |||
| // Fujitsu | |||
| else if (strstr(cpu_implementer, "0x46") && strstr(cpu_part, "0x001")) | |||
| return CPU_A64FX; | |||
| } | |||
| p = (char *) NULL ; | |||
| @@ -207,9 +231,9 @@ int detect(void) | |||
| } | |||
| #else | |||
| #ifdef DARWIN | |||
| #ifdef __APPLE__ | |||
| sysctlbyname("hw.cpufamily",&value,&length,NULL,0); | |||
| if (value ==131287967) return CPU_VORTEX; | |||
| if (value ==131287967|| value == 458787763 ) return CPU_VORTEX; | |||
| #endif | |||
| return CPU_ARMV8; | |||
| #endif | |||
| @@ -260,7 +284,7 @@ int n=0; | |||
| printf("#define NUM_CORES %d\n",n); | |||
| #endif | |||
| #ifdef DARWIN | |||
| #ifdef __APPLE__ | |||
| sysctlbyname("hw.physicalcpu_max",&value,&length,NULL,0); | |||
| printf("#define NUM_CORES %d\n",value); | |||
| #endif | |||
| @@ -280,153 +304,196 @@ void get_cpuconfig(void) | |||
| switch (d) | |||
| { | |||
| case CPU_CORTEXA53: | |||
| printf("#define %s\n", cpuname[d]); | |||
| // Fall-through | |||
| case CPU_ARMV8: | |||
| // Minimum parameters for ARMv8 (based on A53) | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L2_SIZE 262144\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||
| case CPU_CORTEXA53: | |||
| case CPU_CORTEXA55: | |||
| printf("#define %s\n", cpuname[d]); | |||
| // Fall-through | |||
| case CPU_ARMV8: | |||
| // Minimum parameters for ARMv8 (based on A53) | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L2_SIZE 262144\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||
| break; | |||
| case CPU_CORTEXA57: | |||
| case CPU_CORTEXA72: | |||
| case CPU_CORTEXA73: | |||
| case CPU_CORTEXA57: | |||
| case CPU_CORTEXA72: | |||
| case CPU_CORTEXA73: | |||
| // Common minimum settings for these Arm cores | |||
| // Can change a lot, but we need to be conservative | |||
| // TODO: detect info from /sys if possible | |||
| printf("#define %s\n", cpuname[d]); | |||
| printf("#define L1_CODE_SIZE 49152\n"); | |||
| printf("#define L1_CODE_LINESIZE 64\n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 3\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 2\n"); | |||
| printf("#define L2_SIZE 524288\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| case CPU_NEOVERSEN1: | |||
| printf("#define %s\n", cpuname[d]); | |||
| printf("#define L1_CODE_SIZE 65536\n"); | |||
| printf("#define L1_CODE_LINESIZE 64\n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 4\n"); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 4\n"); | |||
| printf("#define L2_SIZE 1048576\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| case CPU_FALKOR: | |||
| printf("#define FALKOR\n"); | |||
| printf("#define L1_CODE_SIZE 65536\n"); | |||
| printf("#define L1_CODE_LINESIZE 64\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 128\n"); | |||
| printf("#define L2_SIZE 524288\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| break; | |||
| case CPU_THUNDERX: | |||
| printf("#define THUNDERX\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 128\n"); | |||
| printf("#define L2_SIZE 16777216\n"); | |||
| printf("#define L2_LINESIZE 128\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| break; | |||
| case CPU_THUNDERX2T99: | |||
| printf("#define THUNDERX2T99 \n"); | |||
| printf("#define L1_CODE_SIZE 32768 \n"); | |||
| printf("#define L1_CODE_LINESIZE 64 \n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 8 \n"); | |||
| printf("#define L1_DATA_SIZE 32768 \n"); | |||
| printf("#define L1_DATA_LINESIZE 64 \n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 8 \n"); | |||
| printf("#define L2_SIZE 262144 \n"); | |||
| printf("#define L2_LINESIZE 64 \n"); | |||
| printf("#define L2_ASSOCIATIVE 8 \n"); | |||
| printf("#define L3_SIZE 33554432 \n"); | |||
| printf("#define L3_LINESIZE 64 \n"); | |||
| printf("#define L3_ASSOCIATIVE 32 \n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| printf("#define %s\n", cpuname[d]); | |||
| printf("#define L1_CODE_SIZE 49152\n"); | |||
| printf("#define L1_CODE_LINESIZE 64\n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 3\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 2\n"); | |||
| printf("#define L2_SIZE 524288\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| case CPU_NEOVERSEN1: | |||
| printf("#define %s\n", cpuname[d]); | |||
| printf("#define L1_CODE_SIZE 65536\n"); | |||
| printf("#define L1_CODE_LINESIZE 64\n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 4\n"); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 4\n"); | |||
| printf("#define L2_SIZE 1048576\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define L2_ASSOCIATIVE 8\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 48\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| case CPU_NEOVERSEV1: | |||
| printf("#define %s\n", cpuname[d]); | |||
| printf("#define L1_CODE_SIZE 65536\n"); | |||
| printf("#define L1_CODE_LINESIZE 64\n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 4\n"); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 4\n"); | |||
| printf("#define L2_SIZE 1048576\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define L2_ASSOCIATIVE 8\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 48\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| case CPU_NEOVERSEN2: | |||
| printf("#define %s\n", cpuname[d]); | |||
| printf("#define L1_CODE_SIZE 65536\n"); | |||
| printf("#define L1_CODE_LINESIZE 64\n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 4\n"); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 4\n"); | |||
| printf("#define L2_SIZE 1048576\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define L2_ASSOCIATIVE 8\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 48\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| case CPU_FALKOR: | |||
| printf("#define FALKOR\n"); | |||
| printf("#define L1_CODE_SIZE 65536\n"); | |||
| printf("#define L1_CODE_LINESIZE 64\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 128\n"); | |||
| printf("#define L2_SIZE 524288\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| break; | |||
| case CPU_THUNDERX: | |||
| printf("#define THUNDERX\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 128\n"); | |||
| printf("#define L2_SIZE 16777216\n"); | |||
| printf("#define L2_LINESIZE 128\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| break; | |||
| case CPU_THUNDERX2T99: | |||
| printf("#define THUNDERX2T99 \n"); | |||
| printf("#define L1_CODE_SIZE 32768 \n"); | |||
| printf("#define L1_CODE_LINESIZE 64 \n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 8 \n"); | |||
| printf("#define L1_DATA_SIZE 32768 \n"); | |||
| printf("#define L1_DATA_LINESIZE 64 \n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 8 \n"); | |||
| printf("#define L2_SIZE 262144 \n"); | |||
| printf("#define L2_LINESIZE 64 \n"); | |||
| printf("#define L2_ASSOCIATIVE 8 \n"); | |||
| printf("#define L3_SIZE 33554432 \n"); | |||
| printf("#define L3_LINESIZE 64 \n"); | |||
| printf("#define L3_ASSOCIATIVE 32 \n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| case CPU_TSV110: | |||
| printf("#define TSV110 \n"); | |||
| printf("#define L1_CODE_SIZE 65536 \n"); | |||
| printf("#define L1_CODE_LINESIZE 64 \n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 4 \n"); | |||
| printf("#define L1_DATA_SIZE 65536 \n"); | |||
| printf("#define L1_DATA_LINESIZE 64 \n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 4 \n"); | |||
| printf("#define L2_SIZE 524228 \n"); | |||
| printf("#define L2_LINESIZE 64 \n"); | |||
| printf("#define L2_ASSOCIATIVE 8 \n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| case CPU_EMAG8180: | |||
| // Minimum parameters for ARMv8 (based on A53) | |||
| printf("#define EMAG8180\n"); | |||
| printf("#define L1_CODE_SIZE 32768\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L2_SIZE 262144\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| case CPU_THUNDERX3T110: | |||
| printf("#define THUNDERX3T110 \n"); | |||
| printf("#define L1_CODE_SIZE 65536 \n"); | |||
| printf("#define L1_CODE_LINESIZE 64 \n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 8 \n"); | |||
| printf("#define L1_DATA_SIZE 32768 \n"); | |||
| printf("#define L1_DATA_LINESIZE 64 \n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 8 \n"); | |||
| printf("#define L2_SIZE 524288 \n"); | |||
| printf("#define L2_LINESIZE 64 \n"); | |||
| printf("#define L2_ASSOCIATIVE 8 \n"); | |||
| printf("#define L3_SIZE 94371840 \n"); | |||
| printf("#define L3_LINESIZE 64 \n"); | |||
| printf("#define L3_ASSOCIATIVE 32 \n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| #ifdef DARWIN | |||
| case CPU_VORTEX: | |||
| printf("#define VORTEX \n"); | |||
| sysctlbyname("hw.l1icachesize",&value,&length,NULL,0); | |||
| printf("#define L1_CODE_SIZE %d \n",value); | |||
| sysctlbyname("hw.cachelinesize",&value,&length,NULL,0); | |||
| printf("#define L1_CODE_LINESIZE %d \n",value); | |||
| sysctlbyname("hw.l1dcachesize",&value,&length,NULL,0); | |||
| printf("#define L1_DATA_SIZE %d \n",value); | |||
| sysctlbyname("hw.l2dcachesize",&value,&length,NULL,0); | |||
| printf("#define L2_SIZE %d \n",value); | |||
| break; | |||
| case CPU_TSV110: | |||
| printf("#define TSV110 \n"); | |||
| printf("#define L1_CODE_SIZE 65536 \n"); | |||
| printf("#define L1_CODE_LINESIZE 64 \n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 4 \n"); | |||
| printf("#define L1_DATA_SIZE 65536 \n"); | |||
| printf("#define L1_DATA_LINESIZE 64 \n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 4 \n"); | |||
| printf("#define L2_SIZE 524228 \n"); | |||
| printf("#define L2_LINESIZE 64 \n"); | |||
| printf("#define L2_ASSOCIATIVE 8 \n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| case CPU_EMAG8180: | |||
| // Minimum parameters for ARMv8 (based on A53) | |||
| printf("#define EMAG8180\n"); | |||
| printf("#define L1_CODE_SIZE 32768\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L2_SIZE 262144\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| case CPU_THUNDERX3T110: | |||
| printf("#define THUNDERX3T110 \n"); | |||
| printf("#define L1_CODE_SIZE 65536 \n"); | |||
| printf("#define L1_CODE_LINESIZE 64 \n"); | |||
| printf("#define L1_CODE_ASSOCIATIVE 8 \n"); | |||
| printf("#define L1_DATA_SIZE 32768 \n"); | |||
| printf("#define L1_DATA_LINESIZE 64 \n"); | |||
| printf("#define L1_DATA_ASSOCIATIVE 8 \n"); | |||
| printf("#define L2_SIZE 524288 \n"); | |||
| printf("#define L2_LINESIZE 64 \n"); | |||
| printf("#define L2_ASSOCIATIVE 8 \n"); | |||
| printf("#define L3_SIZE 94371840 \n"); | |||
| printf("#define L3_LINESIZE 64 \n"); | |||
| printf("#define L3_ASSOCIATIVE 32 \n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| #ifdef __APPLE__ | |||
| case CPU_VORTEX: | |||
| printf("#define VORTEX \n"); | |||
| sysctlbyname("hw.l1icachesize",&value64,&length64,NULL,0); | |||
| printf("#define L1_CODE_SIZE %lld \n",value64); | |||
| sysctlbyname("hw.cachelinesize",&value64,&length64,NULL,0); | |||
| printf("#define L1_CODE_LINESIZE %lld \n",value64); | |||
| sysctlbyname("hw.l1dcachesize",&value64,&length64,NULL,0); | |||
| printf("#define L1_DATA_SIZE %lld \n",value64); | |||
| sysctlbyname("hw.l2cachesize",&value64,&length64,NULL,0); | |||
| printf("#define L2_SIZE %lld \n",value64); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64 \n"); | |||
| printf("#define DTB_SIZE 4096 \n"); | |||
| break; | |||
| #endif | |||
| case CPU_A64FX: | |||
| printf("#define A64FX\n"); | |||
| printf("#define L1_CODE_SIZE 65535\n"); | |||
| printf("#define L1_DATA_SIZE 65535\n"); | |||
| printf("#define L1_DATA_LINESIZE 256\n"); | |||
| printf("#define L2_SIZE 8388608\n"); | |||
| printf("#define L2_LINESIZE 256\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| break; | |||
| } | |||
| get_cpucount(); | |||
| } | |||
| @@ -0,0 +1,110 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2011-2020, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include <stdint.h> | |||
/* Core identifiers returned by detect(); they double as indices into cpuname[]. */
#define CPU_UNKNOWN     0
#define CPU_LOONGSON3R5 1

/* Configuration word selector handed to the cpucfg instruction in detect(). */
#define LOONGARCH_CFG2  0x02
/* Bit tested in that configuration word to recognise a LOONGSON3R5 core.
 * Parenthesised so the shift cannot be re-associated by neighbouring
 * operators at the expansion site (e.g. `LOONGARCH_LASX * 2`). */
#define LOONGARCH_LASX  (1<<7)

/* Printable core names, indexed by the CPU_* identifiers above. */
static char *cpuname[] = {
  "UNKNOWN",
  "LOONGSON3R5"
};
| int detect(void) { | |||
| uint32_t reg = 0; | |||
| __asm__ volatile ( | |||
| "cpucfg %0, %1 \n\t" | |||
| : "+&r"(reg) | |||
| : "r"(LOONGARCH_CFG2) | |||
| ); | |||
| if (reg & LOONGARCH_LASX) | |||
| return CPU_LOONGSON3R5; | |||
| else | |||
| return CPU_UNKNOWN; | |||
| } | |||
| char *get_corename(void) { | |||
| return cpuname[detect()]; | |||
| } | |||
/* Write the architecture name to stdout (no trailing newline). */
void get_architecture(void) {
  printf("%s", "LOONGARCH64");
}
| void get_subarchitecture(void) { | |||
| if (detect() == CPU_LOONGSON3R5) { | |||
| printf("LOONGSON3R5"); | |||
| } else { | |||
| printf("UNKNOWN"); | |||
| } | |||
| } | |||
/* Write the architecture sub-directory name to stdout (no trailing newline). */
void get_subdirname(void) {
  printf("%s", "loongarch64");
}
| void get_cpuconfig(void) { | |||
| if (detect() == CPU_LOONGSON3R5) { | |||
| printf("#define LOONGSON3R5\n"); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L2_SIZE 1048576\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| } else { | |||
| printf("#define LOONGSON3R5\n"); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 64\n"); | |||
| printf("#define L2_SIZE 1048576\n"); | |||
| printf("#define L2_LINESIZE 64\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 16\n"); | |||
| } | |||
| } | |||
| void get_libname(void){ | |||
| if (detect() == CPU_LOONGSON3R5) { | |||
| printf("loongson3r5\n"); | |||
| } else { | |||
| printf("loongarch64\n"); | |||
| } | |||
| } | |||
| @@ -165,6 +165,7 @@ void get_cpuconfig(void){ | |||
| }else{ | |||
| printf("#define UNKNOWN\n"); | |||
| } | |||
| if (!get_feature("msa")) printf("#define NO_MSA\n"); | |||
| } | |||
| void get_libname(void){ | |||
| @@ -178,3 +179,38 @@ void get_libname(void){ | |||
| printf("mips\n"); | |||
| } | |||
| } | |||
/*
 * Report whether the CPU feature list in /proc/cpuinfo mentions `search`.
 *
 * The first line starting with "Features" or "ASEs implemented" is taken as
 * the feature list; everything after its ':' is split on spaces and each
 * token is checked for `search` as a substring.
 *
 * search: feature name to look for (e.g. "msa"); NULL is treated as absent.
 *
 * Returns 1 if a token contains `search`, 0 otherwise. Also returns 0 when
 * not built for Linux, when /proc/cpuinfo cannot be opened, or when no
 * well-formed feature line is found.
 */
int get_feature(char *search)
{
#ifdef __linux
  FILE *infile;
  char buffer[2048], *p, *t;

  if (search == NULL) return 0;

  infile = fopen("/proc/cpuinfo", "r");
  /* /proc may be unavailable (containers, restricted sandboxes). */
  if (infile == NULL) return 0;

  p = (char *) NULL;
  while (fgets(buffer, sizeof(buffer), infile))
  {
    if (!strncmp("Features", buffer, 8) || !strncmp("ASEs implemented", buffer, 16))
    {
      /* Start right after the ':'; strtok() below skips the blank(s) that
       * follow it. A line without ':' leaves p NULL and is rejected. */
      p = strchr(buffer, ':');
      if (p != NULL) p++;
      break;
    }
  }

  fclose(infile);

  if (p == NULL) return 0;

  /* Examine every token, including the first one on the line. */
  for (t = strtok(p, " "); t != NULL; t = strtok(NULL, " "))
  {
    if (strstr(t, search)) { return(1); }
  }
#endif
  return(0);
}
| @@ -70,19 +70,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| /* or implied, of The University of Texas at Austin. */ | |||
| /*********************************************************************/ | |||
| #define CPU_UNKNOWN 0 | |||
| #define CPU_SICORTEX 1 | |||
| #define CPU_LOONGSON3A 2 | |||
| #define CPU_LOONGSON3B 3 | |||
| #define CPU_I6400 4 | |||
| #define CPU_P6600 5 | |||
| #define CPU_I6500 6 | |||
| #define CPU_UNKNOWN 0 | |||
| #define CPU_SICORTEX 1 | |||
| #define CPU_LOONGSON3R3 2 | |||
| #define CPU_LOONGSON3R4 3 | |||
| #define CPU_I6400 4 | |||
| #define CPU_P6600 5 | |||
| #define CPU_I6500 6 | |||
| static char *cpuname[] = { | |||
| "UNKNOWN", | |||
| "SICORTEX", | |||
| "LOONGSON3A", | |||
| "LOONGSON3B", | |||
| "LOONGSON3R3", | |||
| "LOONGSON3R4", | |||
| "I6400", | |||
| "P6600", | |||
| "I6500" | |||
| @@ -90,48 +90,13 @@ static char *cpuname[] = { | |||
| int detect(void){ | |||
| #ifdef __linux | |||
| #ifdef linux | |||
| FILE *infile; | |||
| char buffer[512], *p; | |||
| p = (char *)NULL; | |||
| infile = fopen("/proc/cpuinfo", "r"); | |||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||
| if (!strncmp("cpu", buffer, 3)){ | |||
| p = strchr(buffer, ':') + 2; | |||
| #if 0 | |||
| fprintf(stderr, "%s\n", p); | |||
| #endif | |||
| break; | |||
| } | |||
| } | |||
| fclose(infile); | |||
| if(p != NULL){ | |||
| if (strstr(p, "Loongson-3A")){ | |||
| return CPU_LOONGSON3A; | |||
| }else if(strstr(p, "Loongson-3B")){ | |||
| return CPU_LOONGSON3B; | |||
| }else if (strstr(p, "Loongson-3")){ | |||
| infile = fopen("/proc/cpuinfo", "r"); | |||
| p = (char *)NULL; | |||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||
| if (!strncmp("system type", buffer, 11)){ | |||
| p = strchr(buffer, ':') + 2; | |||
| break; | |||
| } | |||
| } | |||
| fclose(infile); | |||
| if (strstr(p, "loongson3a")) | |||
| return CPU_LOONGSON3A; | |||
| }else{ | |||
| return CPU_SICORTEX; | |||
| } | |||
| } | |||
| //Check model name for Loongson3 | |||
| infile = fopen("/proc/cpuinfo", "r"); | |||
| p = (char *)NULL; | |||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||
| if (!strncmp("model name", buffer, 10)){ | |||
| p = strchr(buffer, ':') + 2; | |||
| @@ -139,12 +104,14 @@ int detect(void){ | |||
| } | |||
| } | |||
| fclose(infile); | |||
| if(p != NULL){ | |||
| if (strstr(p, "Loongson-3A")){ | |||
| return CPU_LOONGSON3A; | |||
| }else if(strstr(p, "Loongson-3B")){ | |||
| return CPU_LOONGSON3B; | |||
| } | |||
| if (p != NULL){ | |||
| if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){ | |||
| return CPU_LOONGSON3R3; | |||
| } else if (strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){ | |||
| return CPU_LOONGSON3R4; | |||
| } else{ | |||
| return CPU_SICORTEX; | |||
| } | |||
| } | |||
| #endif | |||
| return CPU_UNKNOWN; | |||
| @@ -159,10 +126,10 @@ void get_architecture(void){ | |||
| } | |||
| void get_subarchitecture(void){ | |||
| if(detect()==CPU_LOONGSON3A) { | |||
| printf("LOONGSON3A"); | |||
| }else if(detect()==CPU_LOONGSON3B){ | |||
| printf("LOONGSON3B"); | |||
| if(detect()==CPU_LOONGSON3R3) { | |||
| printf("LOONGSON3R3"); | |||
| }else if(detect()==CPU_LOONGSON3R4){ | |||
| printf("LOONGSON3R4"); | |||
| }else if(detect()==CPU_I6400){ | |||
| printf("I6400"); | |||
| }else if(detect()==CPU_P6600){ | |||
| @@ -179,8 +146,8 @@ void get_subdirname(void){ | |||
| } | |||
| void get_cpuconfig(void){ | |||
| if(detect()==CPU_LOONGSON3A) { | |||
| printf("#define LOONGSON3A\n"); | |||
| if(detect()==CPU_LOONGSON3R3) { | |||
| printf("#define LOONGSON3R3\n"); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 32\n"); | |||
| printf("#define L2_SIZE 512488\n"); | |||
| @@ -188,8 +155,8 @@ void get_cpuconfig(void){ | |||
| printf("#define DTB_DEFAULT_ENTRIES 64\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||
| }else if(detect()==CPU_LOONGSON3B){ | |||
| printf("#define LOONGSON3B\n"); | |||
| }else if(detect()==CPU_LOONGSON3R4){ | |||
| printf("#define LOONGSON3R4\n"); | |||
| printf("#define L1_DATA_SIZE 65536\n"); | |||
| printf("#define L1_DATA_LINESIZE 32\n"); | |||
| printf("#define L2_SIZE 512488\n"); | |||
| @@ -234,13 +201,14 @@ void get_cpuconfig(void){ | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 8\n"); | |||
| } | |||
| if (!get_feature("msa")) printf("#define NO_MSA\n"); | |||
| } | |||
| void get_libname(void){ | |||
| if(detect()==CPU_LOONGSON3A) { | |||
| printf("loongson3a\n"); | |||
| }else if(detect()==CPU_LOONGSON3B) { | |||
| printf("loongson3b\n"); | |||
| if(detect()==CPU_LOONGSON3R3) { | |||
| printf("loongson3r3\n"); | |||
| }else if(detect()==CPU_LOONGSON3R4) { | |||
| printf("loongson3r4\n"); | |||
| }else if(detect()==CPU_I6400) { | |||
| printf("i6400\n"); | |||
| }else if(detect()==CPU_P6600) { | |||
| @@ -251,3 +219,38 @@ void get_libname(void){ | |||
| printf("mips64\n"); | |||
| } | |||
| } | |||
/*
 * Report whether the CPU feature list in /proc/cpuinfo mentions `search`.
 *
 * The first line starting with "Features" or "ASEs implemented" is taken as
 * the feature list; everything after its ':' is split on spaces and each
 * token is checked for `search` as a substring.
 *
 * search: feature name to look for (e.g. "msa"); NULL is treated as absent.
 *
 * Returns 1 if a token contains `search`, 0 otherwise. Also returns 0 when
 * not built for Linux, when /proc/cpuinfo cannot be opened, or when no
 * well-formed feature line is found.
 */
int get_feature(char *search)
{
#ifdef __linux
  FILE *infile;
  char buffer[2048], *p, *t;

  if (search == NULL) return 0;

  infile = fopen("/proc/cpuinfo", "r");
  /* /proc may be unavailable (containers, restricted sandboxes). */
  if (infile == NULL) return 0;

  p = (char *) NULL;
  while (fgets(buffer, sizeof(buffer), infile))
  {
    if (!strncmp("Features", buffer, 8) || !strncmp("ASEs implemented", buffer, 16))
    {
      /* Start right after the ':'; strtok() below skips the blank(s) that
       * follow it. A line without ':' leaves p NULL and is rejected. */
      p = strchr(buffer, ':');
      if (p != NULL) p++;
      break;
    }
  }

  fclose(infile);

  if (p == NULL) return 0;

  /* Examine every token, including the first one on the line. */
  for (t = strtok(p, " "); t != NULL; t = strtok(NULL, " "))
  {
    if (strstr(t, search)) { return(1); }
  }
#endif
  return(0);
}
| @@ -1,3 +1,4 @@ | |||
| //{ | |||
| /*********************************************************************/ | |||
| /* Copyright 2009, 2010 The University of Texas at Austin. */ | |||
| /* All rights reserved. */ | |||
| @@ -266,6 +267,31 @@ int support_avx512_bf16(){ | |||
| #endif | |||
| } | |||
| #define BIT_AMX_TILE 0x01000000 | |||
| #define BIT_AMX_BF16 0x00400000 | |||
| #define BIT_AMX_ENBD 0x00060000 | |||
| int support_amx_bf16() { | |||
| #if !defined(NO_AVX) && !defined(NO_AVX512) | |||
| int eax, ebx, ecx, edx; | |||
| int ret=0; | |||
| if (!support_avx512()) | |||
| return 0; | |||
| // CPUID.7.0:EDX indicates AMX support | |||
| cpuid_count(7, 0, &eax, &ebx, &ecx, &edx); | |||
| if ((edx & BIT_AMX_TILE) && (edx & BIT_AMX_BF16)) { | |||
| // CPUID.D.0:EAX[17:18] indicates AMX enabled | |||
| cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); | |||
| if ((eax & BIT_AMX_ENBD) == BIT_AMX_ENBD) | |||
| ret = 1; | |||
| } | |||
| return ret; | |||
| #else | |||
| return 0; | |||
| #endif | |||
| } | |||
| int get_vendor(void){ | |||
| int eax, ebx, ecx, edx; | |||
| char vendor[13]; | |||
| @@ -283,6 +309,7 @@ int get_vendor(void){ | |||
| if (!strcmp(vendor, "CyrixInstead")) return VENDOR_CYRIX; | |||
| if (!strcmp(vendor, "NexGenDriven")) return VENDOR_NEXGEN; | |||
| if (!strcmp(vendor, "CentaurHauls")) return VENDOR_CENTAUR; | |||
| if (!strcmp(vendor, " Shanghai ")) return VENDOR_ZHAOXIN; | |||
| if (!strcmp(vendor, "RiseRiseRise")) return VENDOR_RISE; | |||
| if (!strcmp(vendor, " SiS SiS SiS")) return VENDOR_SIS; | |||
| if (!strcmp(vendor, "GenuineTMx86")) return VENDOR_TRANSMETA; | |||
| @@ -296,9 +323,11 @@ int get_vendor(void){ | |||
| int get_cputype(int gettype){ | |||
| int eax, ebx, ecx, edx; | |||
| /* | |||
| int extend_family, family; | |||
| int extend_model, model; | |||
| int type, stepping; | |||
| */ | |||
| int feature = 0; | |||
| cpuid(1, &eax, &ebx, &ecx, &edx); | |||
| @@ -352,6 +381,7 @@ int get_cputype(int gettype){ | |||
| if (support_avx2()) feature |= HAVE_AVX2; | |||
| if (support_avx512()) feature |= HAVE_AVX512VL; | |||
| if (support_avx512_bf16()) feature |= HAVE_AVX512BF16; | |||
| if (support_amx_bf16()) feature |= HAVE_AMXBF16; | |||
| if ((ecx & (1 << 12)) != 0) feature |= HAVE_FMA3; | |||
| #endif | |||
| @@ -400,7 +430,8 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ | |||
| cpuid(0, &cpuid_level, &ebx, &ecx, &edx); | |||
| if (cpuid_level > 1) { | |||
| int numcalls =0 ; | |||
| int numcalls; | |||
| cpuid(2, &eax, &ebx, &ecx, &edx); | |||
| numcalls = BITMASK(eax, 0, 0xff); //FIXME some systems may require repeated calls to read all entries | |||
| info[ 0] = BITMASK(eax, 8, 0xff); | |||
| @@ -1066,7 +1097,8 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ | |||
| if ((get_vendor() == VENDOR_AMD) || | |||
| (get_vendor() == VENDOR_HYGON) || | |||
| (get_vendor() == VENDOR_CENTAUR)) { | |||
| (get_vendor() == VENDOR_CENTAUR) || | |||
| (get_vendor() == VENDOR_ZHAOXIN)) { | |||
| cpuid(0x80000005, &eax, &ebx, &ecx, &edx); | |||
| LDTB.size = 4096; | |||
| @@ -1189,7 +1221,7 @@ int get_cacheinfo(int type, cache_info_t *cacheinfo){ | |||
| int get_cpuname(void){ | |||
| int family, exfamily, model, vendor, exmodel; | |||
| int family, exfamily, model, vendor, exmodel, stepping; | |||
| if (!have_cpuid()) return CPUTYPE_80386; | |||
| @@ -1197,6 +1229,7 @@ int get_cpuname(void){ | |||
| exfamily = get_cputype(GET_EXFAMILY); | |||
| model = get_cputype(GET_MODEL); | |||
| exmodel = get_cputype(GET_EXMODEL); | |||
| stepping = get_cputype(GET_STEPPING); | |||
| vendor = get_vendor(); | |||
| @@ -1398,6 +1431,17 @@ int get_cpuname(void){ | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 10: // Ice Lake SP | |||
| if(support_avx512_bf16()) | |||
| return CPUTYPE_COOPERLAKE; | |||
| if(support_avx512()) | |||
| return CPUTYPE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| case 7: // family 6 exmodel 7 | |||
| @@ -1415,9 +1459,18 @@ int get_cpuname(void){ | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| case 9: | |||
| case 8: | |||
| switch (model) { | |||
| case 12: // Tiger Lake | |||
| case 13: // Tiger Lake (11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz) | |||
| if(support_avx512()) | |||
| return CPUTYPE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 14: // Kaby Lake and refreshes | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| @@ -1425,21 +1478,74 @@ int get_cpuname(void){ | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| case 10: //family 6 exmodel 10 | |||
| case 15: // Sapphire Rapids | |||
| if(support_avx512_bf16()) | |||
| return CPUTYPE_COOPERLAKE; | |||
| if(support_avx512()) | |||
| return CPUTYPE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| case 9: | |||
| switch (model) { | |||
| case 5: // Comet Lake H and S | |||
| case 6: // Comet Lake U | |||
| case 7: // Alder Lake desktop | |||
| case 10: // Alder Lake mobile | |||
| if(support_avx512_bf16()) | |||
| return CPUTYPE_COOPERLAKE; | |||
| if(support_avx512()) | |||
| return CPUTYPE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| } | |||
| return CPUTYPE_NEHALEM; | |||
| case 13: // Ice Lake NNPI | |||
| if(support_avx512()) | |||
| return CPUTYPE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 14: // Kaby Lake and refreshes | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| case 10: //family 6 exmodel 10 | |||
| switch (model) { | |||
| case 5: // Comet Lake H and S | |||
| case 6: // Comet Lake U | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| case 7: // Rocket Lake | |||
| if(support_avx512()) | |||
| return CPUTYPE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| } | |||
| break; | |||
| case 0x7: | |||
| return CPUTYPE_ITANIUM; | |||
| case 0xf: | |||
| @@ -1538,7 +1644,6 @@ int get_cpuname(void){ | |||
| else | |||
| return CPUTYPE_BARCELONA; | |||
| } | |||
| break; | |||
| case 10: // Zen3 | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| @@ -1598,13 +1703,20 @@ int get_cpuname(void){ | |||
| switch (family) { | |||
| case 0x5: | |||
| return CPUTYPE_CENTAURC6; | |||
| break; | |||
| case 0x6: | |||
| return CPUTYPE_NANO; | |||
| break; | |||
| if (model == 0xf && stepping < 0xe) | |||
| return CPUTYPE_NANO; | |||
| return CPUTYPE_NEHALEM; | |||
| default: | |||
| if (family >= 0x7) | |||
| return CPUTYPE_NEHALEM; | |||
| else | |||
| return CPUTYPE_VIAC3; | |||
| } | |||
| return CPUTYPE_VIAC3; | |||
| } | |||
| if (vendor == VENDOR_ZHAOXIN){ | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| if (vendor == VENDOR_RISE){ | |||
| @@ -1837,7 +1949,7 @@ char *get_lower_cpunamechar(void){ | |||
| int get_coretype(void){ | |||
| int family, exfamily, model, exmodel, vendor; | |||
| int family, exfamily, model, exmodel, vendor, stepping; | |||
| if (!have_cpuid()) return CORE_80486; | |||
| @@ -1845,6 +1957,7 @@ int get_coretype(void){ | |||
| exfamily = get_cputype(GET_EXFAMILY); | |||
| model = get_cputype(GET_MODEL); | |||
| exmodel = get_cputype(GET_EXMODEL); | |||
| stepping = get_cputype(GET_STEPPING); | |||
| vendor = get_vendor(); | |||
| @@ -2002,19 +2115,7 @@ int get_coretype(void){ | |||
| return CORE_NEHALEM; | |||
| } | |||
| break; | |||
| case 10: | |||
| switch (model) { | |||
| case 5: // Comet Lake H and S | |||
| case 6: // Comet Lake U | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CORE_HASWELL; | |||
| #else | |||
| return CORE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| case 5: | |||
| switch (model) { | |||
| case 6: | |||
| @@ -2068,6 +2169,7 @@ int get_coretype(void){ | |||
| return CORE_NEHALEM; | |||
| } | |||
| break; | |||
| case 6: | |||
| if (model == 6) | |||
| #ifndef NO_AVX512 | |||
| @@ -2081,12 +2183,27 @@ int get_coretype(void){ | |||
| #endif | |||
| else | |||
| return CORE_NEHALEM; | |||
| #endif | |||
| break; | |||
| #endif | |||
| if (model == 10 || model == 12) | |||
| #ifndef NO_AVX512 | |||
| if(support_avx512_bf16()) | |||
| return CORE_COOPERLAKE; | |||
| return CORE_SKYLAKEX; | |||
| #else | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CORE_HASWELL; | |||
| #else | |||
| return CORE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CORE_NEHALEM; | |||
| #endif | |||
| case 7: | |||
| if (model == 10) | |||
| return CORE_NEHALEM; | |||
| if (model == 14) | |||
| if (model == 13 || model == 14) // Ice Lake | |||
| #ifndef NO_AVX512 | |||
| return CORE_SKYLAKEX; | |||
| #else | |||
| @@ -2100,9 +2217,19 @@ int get_coretype(void){ | |||
| return CORE_NEHALEM; | |||
| #endif | |||
| break; | |||
| case 9: | |||
| case 8: | |||
| if (model == 14) { // Kaby Lake | |||
| if (model == 12 || model == 13) { // Tiger Lake | |||
| if(support_avx512()) | |||
| return CORE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CORE_HASWELL; | |||
| if(support_avx()) | |||
| return CORE_SANDYBRIDGE; | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| if (model == 14) { // Kaby Lake mobile | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CORE_HASWELL; | |||
| @@ -2112,12 +2239,82 @@ int get_coretype(void){ | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| } | |||
| if (model == 15) { // Sapphire Rapids | |||
| // get_coretype() reports CORE_* identifiers, like every other branch of | |||
| // this function; the CPUTYPE_* constants are the return values of | |||
| // get_cpuname() and must not be returned from here. | |||
| // Pick the most capable core type the CPU and OS actually support. | |||
| if(support_avx512_bf16()) | |||
| return CORE_COOPERLAKE; | |||
| if(support_avx512()) | |||
| return CORE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CORE_HASWELL; | |||
| if(support_avx()) | |||
| return CORE_SANDYBRIDGE; | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| break; | |||
| case 9: | |||
| if (model == 7 || model == 10) { // Alder Lake | |||
| if(support_avx2()) | |||
| return CORE_HASWELL; | |||
| if(support_avx()) | |||
| return CORE_SANDYBRIDGE; | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| if (model == 13) { // Ice Lake NNPI | |||
| if(support_avx512()) | |||
| return CORE_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return CORE_HASWELL; | |||
| if(support_avx()) | |||
| return CORE_SANDYBRIDGE; | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| if (model == 14) { // Kaby Lake desktop | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CORE_HASWELL; | |||
| #else | |||
| return CORE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| break; | |||
| case 10: | |||
| switch (model) { | |||
| case 5: // Comet Lake H and S | |||
| case 6: // Comet Lake U | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CORE_HASWELL; | |||
| #else | |||
| return CORE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CORE_NEHALEM; | |||
| case 7:// Rocket Lake | |||
| #ifndef NO_AVX512 | |||
| if(support_avx512()) | |||
| return CORE_SKYLAKEX; | |||
| #endif | |||
| #ifndef NO_AVX2 | |||
| if(support_avx2()) | |||
| return CORE_HASWELL; | |||
| #endif | |||
| if(support_avx()) | |||
| return CORE_SANDYBRIDGE; | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| case 15: | |||
| if (model <= 0x2) return CORE_NORTHWOOD; | |||
| else return CORE_PRESCOTT; | |||
| } | |||
| } | |||
| } | |||
| @@ -2216,10 +2413,19 @@ int get_coretype(void){ | |||
| if (vendor == VENDOR_CENTAUR) { | |||
| switch (family) { | |||
| case 0x6: | |||
| return CORE_NANO; | |||
| break; | |||
| if (model == 0xf && stepping < 0xe) | |||
| return CORE_NANO; | |||
| return CORE_NEHALEM; | |||
| default: | |||
| if (family >= 0x7) | |||
| return CORE_NEHALEM; | |||
| else | |||
| return CORE_VIAC3; | |||
| } | |||
| return CORE_VIAC3; | |||
| } | |||
| if (vendor == VENDOR_ZHAOXIN) { | |||
| return CORE_NEHALEM; | |||
| } | |||
| return CORE_UNKNOWN; | |||
| @@ -2302,6 +2508,7 @@ void get_cpuconfig(void){ | |||
| if (features & HAVE_AVX2 ) printf("#define HAVE_AVX2\n"); | |||
| if (features & HAVE_AVX512VL ) printf("#define HAVE_AVX512VL\n"); | |||
| if (features & HAVE_AVX512BF16 ) printf("#define HAVE_AVX512BF16\n"); | |||
| if (features & HAVE_AMXBF16 ) printf("#define HAVE_AMXBF16\n"); | |||
| if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); | |||
| if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); | |||
| if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); | |||
| @@ -2373,9 +2580,11 @@ void get_sse(void){ | |||
| if (features & HAVE_AVX2 ) printf("HAVE_AVX2=1\n"); | |||
| if (features & HAVE_AVX512VL ) printf("HAVE_AVX512VL=1\n"); | |||
| if (features & HAVE_AVX512BF16 ) printf("HAVE_AVX512BF16=1\n"); | |||
| if (features & HAVE_AMXBF16 ) printf("HAVE_AMXBF16=1\n"); | |||
| if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); | |||
| if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); | |||
| if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); | |||
| if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n"); | |||
| } | |||
| //} | |||
| @@ -27,57 +27,11 @@ | |||
| #include <string.h> | |||
| #define CPU_GENERIC 0 | |||
| #define CPU_Z13 1 | |||
| #define CPU_Z14 2 | |||
| #define CPU_Z15 3 | |||
| #include "cpuid_zarch.h" | |||
| static char *cpuname[] = { | |||
| "ZARCH_GENERIC", | |||
| "Z13", | |||
| "Z14", | |||
| "Z15" | |||
| }; | |||
| static char *cpuname_lower[] = { | |||
| "zarch_generic", | |||
| "z13", | |||
| "z14", | |||
| "z15" | |||
| }; | |||
| // Determine the CPU level from the machine type listed in /proc/sysinfo. | |||
| // Returns one of the CPU_* constants. CPU_GENERIC is returned when the file | |||
| // cannot be opened, when it has no usable "Type" line, or when the machine | |||
| // type is not one of the recognized model numbers. | |||
| int detect(void) | |||
| { | |||
| FILE *infile; | |||
| char buffer[512], *p, *colon; | |||
| p = (char *)NULL; | |||
| infile = fopen("/proc/sysinfo", "r"); | |||
| // Without /proc/sysinfo nothing can be detected; passing a NULL stream to | |||
| // fgets()/fclose() is undefined behaviour, so bail out early. | |||
| if (infile == NULL) return CPU_GENERIC; | |||
| while (fgets(buffer, sizeof(buffer), infile)){ | |||
| if (!strncmp("Type", buffer, 4)){ | |||
| // The machine type follows the colon. Search from just behind it so that | |||
| // a short or malformed line can never move p past the end of the string. | |||
| colon = strchr(buffer, ':'); | |||
| if (colon != NULL) p = colon + 1; | |||
| #if 0 | |||
| if (p != NULL) fprintf(stderr, "%s\n", p); | |||
| #endif | |||
| break; | |||
| } | |||
| } | |||
| fclose(infile); | |||
| // No "Type" line (or one without a colon): strstr(NULL, ...) is undefined. | |||
| if (p == NULL) return CPU_GENERIC; | |||
| if (strstr(p, "2964")) return CPU_Z13; | |||
| if (strstr(p, "2965")) return CPU_Z13; | |||
| if (strstr(p, "3906")) return CPU_Z14; | |||
| if (strstr(p, "3907")) return CPU_Z14; | |||
| if (strstr(p, "8561")) return CPU_Z14; // fallback z15 to z14 | |||
| if (strstr(p, "8562")) return CPU_Z14; // fallback z15 to z14 | |||
| return CPU_GENERIC; | |||
| } | |||
| // Print the lower-case name of the detected CPU level to stdout, | |||
| // without a trailing newline. | |||
| void get_libname(void) | |||
| { | |||
| printf("%s", cpuname_lower[detect()]); | |||
| } | |||
| @@ -0,0 +1,101 @@ | |||
| #include <stdlib.h> | |||
| #define CPU_GENERIC 0 | |||
| #define CPU_Z13 1 | |||
| #define CPU_Z14 2 | |||
| #define CPU_Z15 3 | |||
| // Upper-case names of the CPU levels; entries are listed in the order of | |||
| // the CPU_* constants defined above (CPU_GENERIC, CPU_Z13, CPU_Z14, CPU_Z15). | |||
| static char *cpuname[] = { | |||
| "ZARCH_GENERIC", | |||
| "Z13", | |||
| "Z14", | |||
| "Z15" | |||
| }; | |||
| // Lower-case counterparts of cpuname[], in the same CPU_* order. | |||
| static char *cpuname_lower[] = { | |||
| "zarch_generic", | |||
| "z13", | |||
| "z14", | |||
| "z15" | |||
| }; | |||
| // Guard the use of getauxval() on glibc version >= 2.16 | |||
| #ifdef __GLIBC__ | |||
| #include <features.h> | |||
| #if __GLIBC_PREREQ(2, 16) | |||
| #include <sys/auxv.h> | |||
| #define HAVE_GETAUXVAL 1 | |||
| // Return the hardware capability bits (AT_HWCAP) of the running system, | |||
| // restricted to the bits allowed by the LD_HWCAP_MASK environment variable | |||
| // when that variable is set. | |||
| static unsigned long get_hwcap(void) | |||
| { | |||
| // note that a missing auxval is interpreted as no capabilities | |||
| // available, which is safe. | |||
| unsigned long hwcap = getauxval(AT_HWCAP); | |||
| char *maskenv; | |||
| // honor requests for not using specific CPU features in LD_HWCAP_MASK | |||
| maskenv = getenv("LD_HWCAP_MASK"); | |||
| if (maskenv) | |||
| hwcap &= strtoul(maskenv, NULL, 0); | |||
| return hwcap; | |||
| } | |||
| #else // __GLIBC_PREREQ(2, 16) | |||
| // "#warn" is not a preprocessor directive and is rejected as soon as this | |||
| // branch becomes active; "#warning" emits the intended diagnostic. | |||
| #warning "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16" | |||
| // <sys/auxv.h> is not included on this path, so provide the capability bits | |||
| // that detect() tests. The values follow glibc's s390 bits/hwcap.h; since | |||
| // get_hwcap() returns 0 here they only need to make detect() compile. | |||
| #ifndef HWCAP_S390_VX | |||
| #define HWCAP_S390_VX 2048 | |||
| #endif | |||
| #ifndef HWCAP_S390_VXE | |||
| #define HWCAP_S390_VXE 8192 | |||
| #endif | |||
| // Fallback for glibc without getauxval(): report no capabilities. | |||
| static unsigned long get_hwcap(void) { | |||
| // treat missing support for getauxval() as no capabilities available, | |||
| // which is safe. | |||
| return 0; | |||
| } | |||
| #endif // __GLIBC_PREREQ(2, 16) | |||
| #endif // __GLIBC__ | |||
| // Select the architecture level (CPU_*) for optimized kernels from the | |||
| // hardware capability bits, the same way glibc picks its optimized | |||
| // implementations. | |||
| // | |||
| // A capability bit being set means both that the CPU implements the ISA | |||
| // extension and that software support for using it is present. For | |||
| // instance, HWCAP_S390_VX implies that the CPU executes SIMD instructions | |||
| // and that the Linux kernel manages the vector registers for applications. | |||
| // | |||
| // The defined bits are listed in glibc's sysdeps/s390/dl-procinfo.h and | |||
| // sysdeps/unix/sysv/linux/s390/bits/hwcap.h; they are derived from the | |||
| // "store facility list (extended)" instructions. | |||
| // (https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/s390/dl-procinfo.h;hb=HEAD) | |||
| // | |||
| // Bits evaluated at present: | |||
| // HWCAP_S390_VX - vector facility for z/Architecture (introduced with | |||
| // IBM z13); enables level CPU_Z13 (SIMD) | |||
| // HWCAP_S390_VXE - vector enhancements facility 1 (introduced with IBM | |||
| // z14); together with VX enables level CPU_Z14 | |||
| // (single-precision SIMD instructions) | |||
| // | |||
| // When optimized kernels for further ISA extensions are added (e.g. the | |||
| // vector-enhancements facility 2 introduced with IBM z15), add a new level | |||
| // (e.g. CPU_Z15) and gate it here on the matching hwcap | |||
| // (e.g. HWCAP_S390_VXRS_EXT2 for the z15 vector enhancements). | |||
| // | |||
| // To inspect the hwcaps of a system, let ld.so dump them by setting | |||
| // LD_SHOW_AUXV (e.g. LD_SHOW_AUXV=1 /bin/true). The init function for | |||
| // dynamic arch support also prints them when OPENBLAS_VERBOSE is 2 or higher. | |||
| static int detect(void) | |||
| { | |||
| const unsigned long caps = get_hwcap(); | |||
| const int has_vx = (caps & HWCAP_S390_VX) != 0; | |||
| const int has_vxe = (caps & HWCAP_S390_VXE) != 0; | |||
| // Every optimized level requires the base vector facility. | |||
| if (!has_vx) | |||
| return CPU_GENERIC; | |||
| return has_vxe ? CPU_Z14 : CPU_Z13; | |||
| } | |||
| @@ -84,7 +84,7 @@ OS_AIX | |||
| OS_OSF | |||
| #endif | |||
| #if defined(__WIN32) || defined(__WIN64) || defined(__WINNT) | |||
| #if defined(__WIN32) || defined(__WIN64) || defined(_WIN32) || defined(_WIN64) || defined(__WINNT) | |||
| OS_WINNT | |||
| #endif | |||
| @@ -141,7 +141,7 @@ ARCH_SPARC | |||
| ARCH_IA64 | |||
| #endif | |||
| #if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) | |||
| #if defined(__LP64) || defined(__LP64__) || defined(__ptr64) || defined(__x86_64__) || defined(__amd64__) || defined(__64BIT__) || defined(__aarch64__) | |||
| BINARY_64 | |||
| #endif | |||
| @@ -157,7 +157,15 @@ ARCH_ARM64 | |||
| ARCH_RISCV64 | |||
| #endif | |||
| #ifdef __loongarch64 | |||
| ARCH_LOONGARCH64 | |||
| #endif | |||
| #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) | |||
| HAVE_C11 | |||
| #endif | |||
| #if defined(__e2k__) | |||
| ARCH_E2K | |||
| #endif | |||
| @@ -4,10 +4,22 @@ include_directories(${PROJECT_BINARY_DIR}) | |||
| enable_language(Fortran) | |||
| set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DADD${BU} -DCBLAS") | |||
| if (CMAKE_Fortran_COMPILER_ID STREQUAL GNU) | |||
| set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fno-tree-vectorize") | |||
| endif() | |||
| if(WIN32) | |||
| FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1 | |||
| "$ErrorActionPreference = \"Stop\"\n" | |||
| "Get-Content $args[1] | & $args[0]\n" | |||
| ) | |||
| set(test_helper powershell -ExecutionPolicy Bypass "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.ps1") | |||
| else() | |||
| FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh | |||
| "$1 < $2\n" | |||
| ) | |||
| set(test_helper sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh") | |||
| endif() | |||
| foreach(float_type ${FLOAT_TYPES}) | |||
| string(SUBSTRING ${float_type} 0 1 float_char_upper) | |||
| @@ -21,7 +33,7 @@ foreach(float_type ${FLOAT_TYPES}) | |||
| c_${float_char}blas1.c) | |||
| target_link_libraries(x${float_char}cblat1 ${OpenBLAS_LIBNAME}) | |||
| add_test(NAME "x${float_char}cblat1" | |||
| COMMAND "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat1") | |||
| COMMAND $<TARGET_FILE:x${float_char}cblat1>) | |||
| #level2 | |||
| add_executable(x${float_char}cblat2 | |||
| @@ -33,7 +45,7 @@ foreach(float_type ${FLOAT_TYPES}) | |||
| constant.c) | |||
| target_link_libraries(x${float_char}cblat2 ${OpenBLAS_LIBNAME}) | |||
| add_test(NAME "x${float_char}cblat2" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat2" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2") | |||
| COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat2> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in2") | |||
| #level3 | |||
| add_executable(x${float_char}cblat3 | |||
| @@ -45,6 +57,6 @@ foreach(float_type ${FLOAT_TYPES}) | |||
| constant.c) | |||
| target_link_libraries(x${float_char}cblat3 ${OpenBLAS_LIBNAME}) | |||
| add_test(NAME "x${float_char}cblat3" | |||
| COMMAND sh "${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh" "${CMAKE_CURRENT_BINARY_DIR}/x${float_char}cblat3" "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3") | |||
| COMMAND ${test_helper} $<TARGET_FILE:x${float_char}cblat3> "${PROJECT_SOURCE_DIR}/ctest/${float_char}in3") | |||
| endforeach() | |||
| @@ -6,6 +6,9 @@ TOPDIR = .. | |||
| include $(TOPDIR)/Makefile.system | |||
| override CFLAGS += -DADD$(BU) -DCBLAS | |||
| ifeq ($(F_COMPILER),GFORTRAN) | |||
| override FFLAGS += -fno-tree-vectorize | |||
| endif | |||
| override TARGET_ARCH= | |||
| override TARGET_MACH= | |||
| @@ -212,6 +215,9 @@ ifeq ($(C_COMPILER), CLANG) | |||
| CEXTRALIB = -lomp | |||
| endif | |||
| endif | |||
| ifeq ($(F_COMPILER), NAG) | |||
| CEXTRALIB = -lgomp | |||
| endif | |||
| endif | |||
| ifeq ($(BUILD_SINGLE),1) | |||
| @@ -20,7 +20,7 @@ void F77_cgemv(int *order, char *transp, int *m, int *n, | |||
| get_transpose_type(transp, &trans); | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A = (CBLAS_TEST_COMPLEX *)malloc( (*m)*LDA*sizeof( CBLAS_TEST_COMPLEX) ); | |||
| A = (CBLAS_TEST_COMPLEX *)malloc( (*m)*(size_t)LDA*sizeof( CBLAS_TEST_COMPLEX) ); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*n; j++ ){ | |||
| A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; | |||
| @@ -50,7 +50,7 @@ void F77_cgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, | |||
| get_transpose_type(transp, &trans); | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *ku+*kl+2; | |||
| A=( CBLAS_TEST_COMPLEX* )malloc((*n+*kl)*LDA*sizeof(CBLAS_TEST_COMPLEX)); | |||
| A=( CBLAS_TEST_COMPLEX* )malloc((*n+*kl)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); | |||
| for( i=0; i<*ku; i++ ){ | |||
| irow=*ku+*kl-i; | |||
| jcol=(*ku)-i; | |||
| @@ -94,7 +94,7 @@ void F77_cgeru(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); | |||
| A=(CBLAS_TEST_COMPLEX*)malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*n; j++ ){ | |||
| A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; | |||
| @@ -122,7 +122,7 @@ void F77_cgerc(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); | |||
| A=(CBLAS_TEST_COMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) ); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*n; j++ ){ | |||
| A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; | |||
| @@ -154,7 +154,7 @@ void F77_chemv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A = (CBLAS_TEST_COMPLEX *)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); | |||
| A = (CBLAS_TEST_COMPLEX *)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ){ | |||
| A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; | |||
| @@ -190,7 +190,7 @@ int i,irow,j,jcol,LDA; | |||
| *incx, beta, y, *incy ); | |||
| else { | |||
| LDA = *k+2; | |||
| A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX)); | |||
| A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); | |||
| if (uplo == CblasUpper) { | |||
| for( i=0; i<*k; i++ ){ | |||
| irow=*k-i; | |||
| @@ -251,8 +251,8 @@ void F77_chpmv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, | |||
| beta, y, *incy); | |||
| else { | |||
| LDA = *n; | |||
| A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX )); | |||
| AP = (CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)* | |||
| A = (CBLAS_TEST_COMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX )); | |||
| AP = (CBLAS_TEST_COMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)* | |||
| sizeof( CBLAS_TEST_COMPLEX )); | |||
| if (uplo == CblasUpper) { | |||
| for( j=0, k=0; j<*n; j++ ) | |||
| @@ -311,7 +311,7 @@ void F77_ctbmv(int *order, char *uplow, char *transp, char *diagn, | |||
| x, *incx); | |||
| else { | |||
| LDA = *k+2; | |||
| A=(CBLAS_TEST_COMPLEX *)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX)); | |||
| A=(CBLAS_TEST_COMPLEX *)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); | |||
| if (uplo == CblasUpper) { | |||
| for( i=0; i<*k; i++ ){ | |||
| irow=*k-i; | |||
| @@ -375,7 +375,7 @@ void F77_ctbsv(int *order, char *uplow, char *transp, char *diagn, | |||
| *incx); | |||
| else { | |||
| LDA = *k+2; | |||
| A=(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX )); | |||
| A=(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX )); | |||
| if (uplo == CblasUpper) { | |||
| for( i=0; i<*k; i++ ){ | |||
| irow=*k-i; | |||
| @@ -436,8 +436,8 @@ void F77_ctpmv(int *order, char *uplow, char *transp, char *diagn, | |||
| cblas_ctpmv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx ); | |||
| else { | |||
| LDA = *n; | |||
| A=(CBLAS_TEST_COMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); | |||
| AP=(CBLAS_TEST_COMPLEX*)malloc((((LDA+1)*LDA)/2)* | |||
| A=(CBLAS_TEST_COMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); | |||
| AP=(CBLAS_TEST_COMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)* | |||
| sizeof(CBLAS_TEST_COMPLEX)); | |||
| if (uplo == CblasUpper) { | |||
| for( j=0, k=0; j<*n; j++ ) | |||
| @@ -491,8 +491,8 @@ void F77_ctpsv(int *order, char *uplow, char *transp, char *diagn, | |||
| cblas_ctpsv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx ); | |||
| else { | |||
| LDA = *n; | |||
| A=(CBLAS_TEST_COMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); | |||
| AP=(CBLAS_TEST_COMPLEX*)malloc((((LDA+1)*LDA)/2)* | |||
| A=(CBLAS_TEST_COMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); | |||
| AP=(CBLAS_TEST_COMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)* | |||
| sizeof(CBLAS_TEST_COMPLEX)); | |||
| if (uplo == CblasUpper) { | |||
| for( j=0, k=0; j<*n; j++ ) | |||
| @@ -544,7 +544,7 @@ void F77_ctrmv(int *order, char *uplow, char *transp, char *diagn, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA=*n+1; | |||
| A=(CBLAS_TEST_COMPLEX*)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); | |||
| A=(CBLAS_TEST_COMPLEX*)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) { | |||
| A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; | |||
| @@ -573,7 +573,7 @@ void F77_ctrsv(int *order, char *uplow, char *transp, char *diagn, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A =(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); | |||
| A =(CBLAS_TEST_COMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) { | |||
| A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; | |||
| @@ -601,8 +601,8 @@ void F77_chpr(int *order, char *uplow, int *n, float *alpha, | |||
| cblas_chpr(CblasRowMajor, UNDEFINED, *n, *alpha, x, *incx, ap ); | |||
| else { | |||
| LDA = *n; | |||
| A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); | |||
| AP = ( CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)* | |||
| A = (CBLAS_TEST_COMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); | |||
| AP = ( CBLAS_TEST_COMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)* | |||
| sizeof( CBLAS_TEST_COMPLEX )); | |||
| if (uplo == CblasUpper) { | |||
| for( j=0, k=0; j<*n; j++ ) | |||
| @@ -678,8 +678,8 @@ void F77_chpr2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, | |||
| *incy, ap ); | |||
| else { | |||
| LDA = *n; | |||
| A=(CBLAS_TEST_COMPLEX*)malloc( LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); | |||
| AP=(CBLAS_TEST_COMPLEX*)malloc( (((LDA+1)*LDA)/2)* | |||
| A=(CBLAS_TEST_COMPLEX*)malloc( (size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); | |||
| AP=(CBLAS_TEST_COMPLEX*)malloc( ((((size_t)LDA+1)*LDA)/2)* | |||
| sizeof( CBLAS_TEST_COMPLEX )); | |||
| if (uplo == CblasUpper) { | |||
| for( j=0, k=0; j<*n; j++ ) | |||
| @@ -750,7 +750,7 @@ void F77_cher(int *order, char *uplow, int *n, float *alpha, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A=(CBLAS_TEST_COMPLEX*)malloc((*n)*LDA*sizeof( CBLAS_TEST_COMPLEX )); | |||
| A=(CBLAS_TEST_COMPLEX*)malloc((*n)*(size_t)LDA*sizeof( CBLAS_TEST_COMPLEX )); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) { | |||
| @@ -784,7 +784,7 @@ void F77_cher2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A= ( CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); | |||
| A= ( CBLAS_TEST_COMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) { | |||
| @@ -19,7 +19,7 @@ void F77_dgemv(int *order, char *transp, int *m, int *n, double *alpha, | |||
| get_transpose_type(transp, &trans); | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); | |||
| A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) ); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| A[ LDA*i+j ]=a[ (*lda)*j+i ]; | |||
| @@ -43,7 +43,7 @@ void F77_dger(int *order, int *m, int *n, double *alpha, double *x, int *incx, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); | |||
| A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) ); | |||
| for( i=0; i<*m; i++ ) { | |||
| for( j=0; j<*n; j++ ) | |||
| @@ -74,7 +74,7 @@ void F77_dtrmv(int *order, char *uplow, char *transp, char *diagn, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); | |||
| A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| A[ LDA*i+j ]=a[ (*lda)*j+i ]; | |||
| @@ -102,7 +102,7 @@ void F77_dtrsv(int *order, char *uplow, char *transp, char *diagn, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); | |||
| A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| A[ LDA*i+j ]=a[ (*lda)*j+i ]; | |||
| @@ -123,7 +123,7 @@ void F77_dsymv(int *order, char *uplow, int *n, double *alpha, double *a, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); | |||
| A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| A[ LDA*i+j ]=a[ (*lda)*j+i ]; | |||
| @@ -146,7 +146,7 @@ void F77_dsyr(int *order, char *uplow, int *n, double *alpha, double *x, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); | |||
| A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| A[ LDA*i+j ]=a[ (*lda)*j+i ]; | |||
| @@ -170,7 +170,7 @@ void F77_dsyr2(int *order, char *uplow, int *n, double *alpha, double *x, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); | |||
| A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| A[ LDA*i+j ]=a[ (*lda)*j+i ]; | |||
| @@ -196,7 +196,7 @@ void F77_dgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *ku+*kl+2; | |||
| A = ( double* )malloc( (*n+*kl)*LDA*sizeof( double ) ); | |||
| A = ( double* )malloc( (*n+*kl)*(size_t)LDA*sizeof( double ) ); | |||
| for( i=0; i<*ku; i++ ){ | |||
| irow=*ku+*kl-i; | |||
| jcol=(*ku)-i; | |||
| @@ -236,7 +236,7 @@ void F77_dtbmv(int *order, char *uplow, char *transp, char *diagn, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *k+1; | |||
| A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); | |||
| A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) ); | |||
| if (uplo == CblasUpper) { | |||
| for( i=0; i<*k; i++ ){ | |||
| irow=*k-i; | |||
| @@ -282,7 +282,7 @@ void F77_dtbsv(int *order, char *uplow, char *transp, char *diagn, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *k+1; | |||
| A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); | |||
| A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) ); | |||
| if (uplo == CblasUpper) { | |||
| for( i=0; i<*k; i++ ){ | |||
| irow=*k-i; | |||
| @@ -325,7 +325,7 @@ void F77_dsbmv(int *order, char *uplow, int *n, int *k, double *alpha, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *k+1; | |||
| A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); | |||
| A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) ); | |||
| if (uplo == CblasUpper) { | |||
| for( i=0; i<*k; i++ ){ | |||
| irow=*k-i; | |||
| @@ -369,8 +369,8 @@ void F77_dspmv(int *order, char *uplow, int *n, double *alpha, double *ap, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n; | |||
| A = ( double* )malloc( LDA*LDA*sizeof( double ) ); | |||
| AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) ); | |||
| A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) ); | |||
| AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) ); | |||
| if (uplo == CblasUpper) { | |||
| for( j=0, k=0; j<*n; j++ ) | |||
| for( i=0; i<j+1; i++, k++ ) | |||
| @@ -411,8 +411,8 @@ void F77_dtpmv(int *order, char *uplow, char *transp, char *diagn, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n; | |||
| A = ( double* )malloc( LDA*LDA*sizeof( double ) ); | |||
| AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) ); | |||
| A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) ); | |||
| AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) ); | |||
| if (uplo == CblasUpper) { | |||
| for( j=0, k=0; j<*n; j++ ) | |||
| for( i=0; i<j+1; i++, k++ ) | |||
| @@ -451,8 +451,8 @@ void F77_dtpsv(int *order, char *uplow, char *transp, char *diagn, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n; | |||
| A = ( double* )malloc( LDA*LDA*sizeof( double ) ); | |||
| AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) ); | |||
| A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) ); | |||
| AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) ); | |||
| if (uplo == CblasUpper) { | |||
| for( j=0, k=0; j<*n; j++ ) | |||
| for( i=0; i<j+1; i++, k++ ) | |||
| @@ -488,8 +488,8 @@ void F77_dspr(int *order, char *uplow, int *n, double *alpha, double *x, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n; | |||
| A = ( double* )malloc( LDA*LDA*sizeof( double ) ); | |||
| AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) ); | |||
| A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) ); | |||
| AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) ); | |||
| if (uplo == CblasUpper) { | |||
| for( j=0, k=0; j<*n; j++ ) | |||
| for( i=0; i<j+1; i++, k++ ) | |||
| @@ -540,8 +540,8 @@ void F77_dspr2(int *order, char *uplow, int *n, double *alpha, double *x, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n; | |||
| A = ( double* )malloc( LDA*LDA*sizeof( double ) ); | |||
| AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) ); | |||
| A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) ); | |||
| AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) ); | |||
| if (uplo == CblasUpper) { | |||
| for( j=0, k=0; j<*n; j++ ) | |||
| for( i=0; i<j+1; i++, k++ ) | |||
| @@ -26,34 +26,34 @@ void F77_dgemm(int *order, char *transpa, char *transpb, int *m, int *n, | |||
| if (*order == TEST_ROW_MJR) { | |||
| if (transa == CblasNoTrans) { | |||
| LDA = *k+1; | |||
| A = (double *)malloc( (*m)*LDA*sizeof( double ) ); | |||
| A = (double *)malloc( (*m)*(size_t)LDA*sizeof( double ) ); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*k; j++ ) | |||
| A[i*LDA+j]=a[j*(*lda)+i]; | |||
| } | |||
| else { | |||
| LDA = *m+1; | |||
| A = ( double* )malloc( LDA*(*k)*sizeof( double ) ); | |||
| A = ( double* )malloc( (size_t)LDA*(*k)*sizeof( double ) ); | |||
| for( i=0; i<*k; i++ ) | |||
| for( j=0; j<*m; j++ ) | |||
| A[i*LDA+j]=a[j*(*lda)+i]; | |||
| } | |||
| if (transb == CblasNoTrans) { | |||
| LDB = *n+1; | |||
| B = ( double* )malloc( (*k)*LDB*sizeof( double ) ); | |||
| B = ( double* )malloc( (*k)*(size_t)LDB*sizeof( double ) ); | |||
| for( i=0; i<*k; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| B[i*LDB+j]=b[j*(*ldb)+i]; | |||
| } | |||
| else { | |||
| LDB = *k+1; | |||
| B = ( double* )malloc( LDB*(*n)*sizeof( double ) ); | |||
| B = ( double* )malloc( (size_t)LDB*(*n)*sizeof( double ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*k; j++ ) | |||
| B[i*LDB+j]=b[j*(*ldb)+i]; | |||
| } | |||
| LDC = *n+1; | |||
| C = ( double* )malloc( (*m)*LDC*sizeof( double ) ); | |||
| C = ( double* )malloc( (*m)*(size_t)LDC*sizeof( double ) ); | |||
| for( j=0; j<*n; j++ ) | |||
| for( i=0; i<*m; i++ ) | |||
| C[i*LDC+j]=c[j*(*ldc)+i]; | |||
| @@ -89,25 +89,25 @@ void F77_dsymm(int *order, char *rtlf, char *uplow, int *m, int *n, | |||
| if (*order == TEST_ROW_MJR) { | |||
| if (side == CblasLeft) { | |||
| LDA = *m+1; | |||
| A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); | |||
| A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) ); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*m; j++ ) | |||
| A[i*LDA+j]=a[j*(*lda)+i]; | |||
| } | |||
| else{ | |||
| LDA = *n+1; | |||
| A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); | |||
| A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| A[i*LDA+j]=a[j*(*lda)+i]; | |||
| } | |||
| LDB = *n+1; | |||
| B = ( double* )malloc( (*m)*LDB*sizeof( double ) ); | |||
| B = ( double* )malloc( (*m)*(size_t)LDB*sizeof( double ) ); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| B[i*LDB+j]=b[j*(*ldb)+i]; | |||
| LDC = *n+1; | |||
| C = ( double* )malloc( (*m)*LDC*sizeof( double ) ); | |||
| C = ( double* )malloc( (*m)*(size_t)LDC*sizeof( double ) ); | |||
| for( j=0; j<*n; j++ ) | |||
| for( i=0; i<*m; i++ ) | |||
| C[i*LDC+j]=c[j*(*ldc)+i]; | |||
| @@ -143,20 +143,20 @@ void F77_dsyrk(int *order, char *uplow, char *transp, int *n, int *k, | |||
| if (*order == TEST_ROW_MJR) { | |||
| if (trans == CblasNoTrans) { | |||
| LDA = *k+1; | |||
| A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); | |||
| A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*k; j++ ) | |||
| A[i*LDA+j]=a[j*(*lda)+i]; | |||
| } | |||
| else{ | |||
| LDA = *n+1; | |||
| A = ( double* )malloc( (*k)*LDA*sizeof( double ) ); | |||
| A = ( double* )malloc( (*k)*(size_t)LDA*sizeof( double ) ); | |||
| for( i=0; i<*k; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| A[i*LDA+j]=a[j*(*lda)+i]; | |||
| } | |||
| LDC = *n+1; | |||
| C = ( double* )malloc( (*n)*LDC*sizeof( double ) ); | |||
| C = ( double* )malloc( (*n)*(size_t)LDC*sizeof( double ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| C[i*LDC+j]=c[j*(*ldc)+i]; | |||
| @@ -191,8 +191,8 @@ void F77_dsyr2k(int *order, char *uplow, char *transp, int *n, int *k, | |||
| if (trans == CblasNoTrans) { | |||
| LDA = *k+1; | |||
| LDB = *k+1; | |||
| A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); | |||
| B = ( double* )malloc( (*n)*LDB*sizeof( double ) ); | |||
| A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); | |||
| B = ( double* )malloc( (*n)*(size_t)LDB*sizeof( double ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*k; j++ ) { | |||
| A[i*LDA+j]=a[j*(*lda)+i]; | |||
| @@ -202,8 +202,8 @@ void F77_dsyr2k(int *order, char *uplow, char *transp, int *n, int *k, | |||
| else { | |||
| LDA = *n+1; | |||
| LDB = *n+1; | |||
| A = ( double* )malloc( LDA*(*k)*sizeof( double ) ); | |||
| B = ( double* )malloc( LDB*(*k)*sizeof( double ) ); | |||
| A = ( double* )malloc( (size_t)LDA*(*k)*sizeof( double ) ); | |||
| B = ( double* )malloc( (size_t)LDB*(*k)*sizeof( double ) ); | |||
| for( i=0; i<*k; i++ ) | |||
| for( j=0; j<*n; j++ ){ | |||
| A[i*LDA+j]=a[j*(*lda)+i]; | |||
| @@ -211,7 +211,7 @@ void F77_dsyr2k(int *order, char *uplow, char *transp, int *n, int *k, | |||
| } | |||
| } | |||
| LDC = *n+1; | |||
| C = ( double* )malloc( (*n)*LDC*sizeof( double ) ); | |||
| C = ( double* )malloc( (*n)*(size_t)LDC*sizeof( double ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| C[i*LDC+j]=c[j*(*ldc)+i]; | |||
| @@ -249,20 +249,20 @@ void F77_dtrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, | |||
| if (*order == TEST_ROW_MJR) { | |||
| if (side == CblasLeft) { | |||
| LDA = *m+1; | |||
| A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); | |||
| A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) ); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*m; j++ ) | |||
| A[i*LDA+j]=a[j*(*lda)+i]; | |||
| } | |||
| else{ | |||
| LDA = *n+1; | |||
| A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); | |||
| A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| A[i*LDA+j]=a[j*(*lda)+i]; | |||
| } | |||
| LDB = *n+1; | |||
| B = ( double* )malloc( (*m)*LDB*sizeof( double ) ); | |||
| B = ( double* )malloc( (*m)*(size_t)LDB*sizeof( double ) ); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| B[i*LDB+j]=b[j*(*ldb)+i]; | |||
| @@ -300,20 +300,20 @@ void F77_dtrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, | |||
| if (*order == TEST_ROW_MJR) { | |||
| if (side == CblasLeft) { | |||
| LDA = *m+1; | |||
| A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); | |||
| A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) ); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*m; j++ ) | |||
| A[i*LDA+j]=a[j*(*lda)+i]; | |||
| } | |||
| else{ | |||
| LDA = *n+1; | |||
| A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); | |||
| A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| A[i*LDA+j]=a[j*(*lda)+i]; | |||
| } | |||
| LDB = *n+1; | |||
| B = ( double* )malloc( (*m)*LDB*sizeof( double ) ); | |||
| B = ( double* )malloc( (*m)*(size_t)LDB*sizeof( double ) ); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| B[i*LDB+j]=b[j*(*ldb)+i]; | |||
| @@ -19,7 +19,7 @@ void F77_sgemv(int *order, char *transp, int *m, int *n, float *alpha, | |||
| get_transpose_type(transp, &trans); | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A = ( float* )malloc( (*m)*LDA*sizeof( float ) ); | |||
| A = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) ); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| A[ LDA*i+j ]=a[ (*lda)*j+i ]; | |||
| @@ -43,7 +43,7 @@ void F77_sger(int *order, int *m, int *n, float *alpha, float *x, int *incx, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A = ( float* )malloc( (*m)*LDA*sizeof( float ) ); | |||
| A = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) ); | |||
| for( i=0; i<*m; i++ ) { | |||
| for( j=0; j<*n; j++ ) | |||
| @@ -74,7 +74,7 @@ void F77_strmv(int *order, char *uplow, char *transp, char *diagn, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); | |||
| A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| A[ LDA*i+j ]=a[ (*lda)*j+i ]; | |||
| @@ -102,7 +102,7 @@ void F77_strsv(int *order, char *uplow, char *transp, char *diagn, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); | |||
| A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| A[ LDA*i+j ]=a[ (*lda)*j+i ]; | |||
| @@ -123,7 +123,7 @@ void F77_ssymv(int *order, char *uplow, int *n, float *alpha, float *a, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); | |||
| A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| A[ LDA*i+j ]=a[ (*lda)*j+i ]; | |||
| @@ -146,7 +146,7 @@ void F77_ssyr(int *order, char *uplow, int *n, float *alpha, float *x, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); | |||
| A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| A[ LDA*i+j ]=a[ (*lda)*j+i ]; | |||
| @@ -170,7 +170,7 @@ void F77_ssyr2(int *order, char *uplow, int *n, float *alpha, float *x, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); | |||
| A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| A[ LDA*i+j ]=a[ (*lda)*j+i ]; | |||
| @@ -196,7 +196,7 @@ void F77_sgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *ku+*kl+2; | |||
| A = ( float* )malloc( (*n+*kl)*LDA*sizeof( float ) ); | |||
| A = ( float* )malloc( (*n+*kl)*(size_t)LDA*sizeof( float ) ); | |||
| for( i=0; i<*ku; i++ ){ | |||
| irow=*ku+*kl-i; | |||
| jcol=(*ku)-i; | |||
| @@ -236,7 +236,7 @@ void F77_stbmv(int *order, char *uplow, char *transp, char *diagn, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *k+1; | |||
| A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) ); | |||
| A = ( float* )malloc( (*n+*k)*(size_t)LDA*sizeof( float ) ); | |||
| if (uplo == CblasUpper) { | |||
| for( i=0; i<*k; i++ ){ | |||
| irow=*k-i; | |||
| @@ -282,7 +282,7 @@ void F77_stbsv(int *order, char *uplow, char *transp, char *diagn, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *k+1; | |||
| A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) ); | |||
| A = ( float* )malloc( (*n+*k)*(size_t)LDA*sizeof( float ) ); | |||
| if (uplo == CblasUpper) { | |||
| for( i=0; i<*k; i++ ){ | |||
| irow=*k-i; | |||
| @@ -325,7 +325,7 @@ void F77_ssbmv(int *order, char *uplow, int *n, int *k, float *alpha, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *k+1; | |||
| A = ( float* )malloc( (*n+*k)*LDA*sizeof( float ) ); | |||
| A = ( float* )malloc( (*n+*k)*(size_t)LDA*sizeof( float ) ); | |||
| if (uplo == CblasUpper) { | |||
| for( i=0; i<*k; i++ ){ | |||
| irow=*k-i; | |||
| @@ -369,8 +369,8 @@ void F77_sspmv(int *order, char *uplow, int *n, float *alpha, float *ap, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n; | |||
| A = ( float* )malloc( LDA*LDA*sizeof( float ) ); | |||
| AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) ); | |||
| A = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) ); | |||
| AP = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) ); | |||
| if (uplo == CblasUpper) { | |||
| for( j=0, k=0; j<*n; j++ ) | |||
| for( i=0; i<j+1; i++, k++ ) | |||
| @@ -410,8 +410,8 @@ void F77_stpmv(int *order, char *uplow, char *transp, char *diagn, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n; | |||
| A = ( float* )malloc( LDA*LDA*sizeof( float ) ); | |||
| AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) ); | |||
| A = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) ); | |||
| AP = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) ); | |||
| if (uplo == CblasUpper) { | |||
| for( j=0, k=0; j<*n; j++ ) | |||
| for( i=0; i<j+1; i++, k++ ) | |||
| @@ -449,8 +449,8 @@ void F77_stpsv(int *order, char *uplow, char *transp, char *diagn, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n; | |||
| A = ( float* )malloc( LDA*LDA*sizeof( float ) ); | |||
| AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) ); | |||
| A = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) ); | |||
| AP = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) ); | |||
| if (uplo == CblasUpper) { | |||
| for( j=0, k=0; j<*n; j++ ) | |||
| for( i=0; i<j+1; i++, k++ ) | |||
| @@ -485,8 +485,8 @@ void F77_sspr(int *order, char *uplow, int *n, float *alpha, float *x, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n; | |||
| A = ( float* )malloc( LDA*LDA*sizeof( float ) ); | |||
| AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) ); | |||
| A = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) ); | |||
| AP = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) ); | |||
| if (uplo == CblasUpper) { | |||
| for( j=0, k=0; j<*n; j++ ) | |||
| for( i=0; i<j+1; i++, k++ ) | |||
| @@ -536,8 +536,8 @@ void F77_sspr2(int *order, char *uplow, int *n, float *alpha, float *x, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n; | |||
| A = ( float* )malloc( LDA*LDA*sizeof( float ) ); | |||
| AP = ( float* )malloc( (((LDA+1)*LDA)/2)*sizeof( float ) ); | |||
| A = ( float* )malloc( (size_t)LDA*LDA*sizeof( float ) ); | |||
| AP = ( float* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( float ) ); | |||
| if (uplo == CblasUpper) { | |||
| for( j=0, k=0; j<*n; j++ ) | |||
| for( i=0; i<j+1; i++, k++ ) | |||
| @@ -23,34 +23,34 @@ void F77_sgemm(int *order, char *transpa, char *transpb, int *m, int *n, | |||
| if (*order == TEST_ROW_MJR) { | |||
| if (transa == CblasNoTrans) { | |||
| LDA = *k+1; | |||
| A = (float *)malloc( (*m)*LDA*sizeof( float ) ); | |||
| A = (float *)malloc( (*m)*(size_t)LDA*sizeof( float ) ); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*k; j++ ) | |||
| A[i*LDA+j]=a[j*(*lda)+i]; | |||
| } | |||
| else { | |||
| LDA = *m+1; | |||
| A = ( float* )malloc( LDA*(*k)*sizeof( float ) ); | |||
| A = ( float* )malloc( (size_t)LDA*(*k)*sizeof( float ) ); | |||
| for( i=0; i<*k; i++ ) | |||
| for( j=0; j<*m; j++ ) | |||
| A[i*LDA+j]=a[j*(*lda)+i]; | |||
| } | |||
| if (transb == CblasNoTrans) { | |||
| LDB = *n+1; | |||
| B = ( float* )malloc( (*k)*LDB*sizeof( float ) ); | |||
| B = ( float* )malloc( (*k)*(size_t)LDB*sizeof( float ) ); | |||
| for( i=0; i<*k; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| B[i*LDB+j]=b[j*(*ldb)+i]; | |||
| } | |||
| else { | |||
| LDB = *k+1; | |||
| B = ( float* )malloc( LDB*(*n)*sizeof( float ) ); | |||
| B = ( float* )malloc( (size_t)LDB*(*n)*sizeof( float ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*k; j++ ) | |||
| B[i*LDB+j]=b[j*(*ldb)+i]; | |||
| } | |||
| LDC = *n+1; | |||
| C = ( float* )malloc( (*m)*LDC*sizeof( float ) ); | |||
| C = ( float* )malloc( (*m)*(size_t)LDC*sizeof( float ) ); | |||
| for( j=0; j<*n; j++ ) | |||
| for( i=0; i<*m; i++ ) | |||
| C[i*LDC+j]=c[j*(*ldc)+i]; | |||
| @@ -85,25 +85,25 @@ void F77_ssymm(int *order, char *rtlf, char *uplow, int *m, int *n, | |||
| if (*order == TEST_ROW_MJR) { | |||
| if (side == CblasLeft) { | |||
| LDA = *m+1; | |||
| A = ( float* )malloc( (*m)*LDA*sizeof( float ) ); | |||
| A = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) ); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*m; j++ ) | |||
| A[i*LDA+j]=a[j*(*lda)+i]; | |||
| } | |||
| else{ | |||
| LDA = *n+1; | |||
| A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); | |||
| A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| A[i*LDA+j]=a[j*(*lda)+i]; | |||
| } | |||
| LDB = *n+1; | |||
| B = ( float* )malloc( (*m)*LDB*sizeof( float ) ); | |||
| B = ( float* )malloc( (*m)*(size_t)LDB*sizeof( float ) ); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| B[i*LDB+j]=b[j*(*ldb)+i]; | |||
| LDC = *n+1; | |||
| C = ( float* )malloc( (*m)*LDC*sizeof( float ) ); | |||
| C = ( float* )malloc( (*m)*(size_t)LDC*sizeof( float ) ); | |||
| for( j=0; j<*n; j++ ) | |||
| for( i=0; i<*m; i++ ) | |||
| C[i*LDC+j]=c[j*(*ldc)+i]; | |||
| @@ -139,20 +139,20 @@ void F77_ssyrk(int *order, char *uplow, char *transp, int *n, int *k, | |||
| if (*order == TEST_ROW_MJR) { | |||
| if (trans == CblasNoTrans) { | |||
| LDA = *k+1; | |||
| A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); | |||
| A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*k; j++ ) | |||
| A[i*LDA+j]=a[j*(*lda)+i]; | |||
| } | |||
| else{ | |||
| LDA = *n+1; | |||
| A = ( float* )malloc( (*k)*LDA*sizeof( float ) ); | |||
| A = ( float* )malloc( (*k)*(size_t)LDA*sizeof( float ) ); | |||
| for( i=0; i<*k; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| A[i*LDA+j]=a[j*(*lda)+i]; | |||
| } | |||
| LDC = *n+1; | |||
| C = ( float* )malloc( (*n)*LDC*sizeof( float ) ); | |||
| C = ( float* )malloc( (*n)*(size_t)LDC*sizeof( float ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| C[i*LDC+j]=c[j*(*ldc)+i]; | |||
| @@ -187,8 +187,8 @@ void F77_ssyr2k(int *order, char *uplow, char *transp, int *n, int *k, | |||
| if (trans == CblasNoTrans) { | |||
| LDA = *k+1; | |||
| LDB = *k+1; | |||
| A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); | |||
| B = ( float* )malloc( (*n)*LDB*sizeof( float ) ); | |||
| A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) ); | |||
| B = ( float* )malloc( (*n)*(size_t)LDB*sizeof( float ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*k; j++ ) { | |||
| A[i*LDA+j]=a[j*(*lda)+i]; | |||
| @@ -198,8 +198,8 @@ void F77_ssyr2k(int *order, char *uplow, char *transp, int *n, int *k, | |||
| else { | |||
| LDA = *n+1; | |||
| LDB = *n+1; | |||
| A = ( float* )malloc( LDA*(*k)*sizeof( float ) ); | |||
| B = ( float* )malloc( LDB*(*k)*sizeof( float ) ); | |||
| A = ( float* )malloc( (size_t)LDA*(*k)*sizeof( float ) ); | |||
| B = ( float* )malloc( (size_t)LDB*(*k)*sizeof( float ) ); | |||
| for( i=0; i<*k; i++ ) | |||
| for( j=0; j<*n; j++ ){ | |||
| A[i*LDA+j]=a[j*(*lda)+i]; | |||
| @@ -207,7 +207,7 @@ void F77_ssyr2k(int *order, char *uplow, char *transp, int *n, int *k, | |||
| } | |||
| } | |||
| LDC = *n+1; | |||
| C = ( float* )malloc( (*n)*LDC*sizeof( float ) ); | |||
| C = ( float* )malloc( (*n)*(size_t)LDC*sizeof( float ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| C[i*LDC+j]=c[j*(*ldc)+i]; | |||
| @@ -245,20 +245,20 @@ void F77_strmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, | |||
| if (*order == TEST_ROW_MJR) { | |||
| if (side == CblasLeft) { | |||
| LDA = *m+1; | |||
| A = ( float* )malloc( (*m)*LDA*sizeof( float ) ); | |||
| A = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) ); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*m; j++ ) | |||
| A[i*LDA+j]=a[j*(*lda)+i]; | |||
| } | |||
| else{ | |||
| LDA = *n+1; | |||
| A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); | |||
| A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| A[i*LDA+j]=a[j*(*lda)+i]; | |||
| } | |||
| LDB = *n+1; | |||
| B = ( float* )malloc( (*m)*LDB*sizeof( float ) ); | |||
| B = ( float* )malloc( (*m)*(size_t)LDB*sizeof( float ) ); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| B[i*LDB+j]=b[j*(*ldb)+i]; | |||
| @@ -296,20 +296,20 @@ void F77_strsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, | |||
| if (*order == TEST_ROW_MJR) { | |||
| if (side == CblasLeft) { | |||
| LDA = *m+1; | |||
| A = ( float* )malloc( (*m)*LDA*sizeof( float ) ); | |||
| A = ( float* )malloc( (*m)*(size_t)LDA*sizeof( float ) ); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*m; j++ ) | |||
| A[i*LDA+j]=a[j*(*lda)+i]; | |||
| } | |||
| else{ | |||
| LDA = *n+1; | |||
| A = ( float* )malloc( (*n)*LDA*sizeof( float ) ); | |||
| A = ( float* )malloc( (*n)*(size_t)LDA*sizeof( float ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| A[i*LDA+j]=a[j*(*lda)+i]; | |||
| } | |||
| LDB = *n+1; | |||
| B = ( float* )malloc( (*m)*LDB*sizeof( float ) ); | |||
| B = ( float* )malloc( (*m)*(size_t)LDB*sizeof( float ) ); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| B[i*LDB+j]=b[j*(*ldb)+i]; | |||
| @@ -20,7 +20,7 @@ void F77_zgemv(int *order, char *transp, int *m, int *n, | |||
| get_transpose_type(transp, &trans); | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A = (CBLAS_TEST_ZOMPLEX *)malloc( (*m)*LDA*sizeof( CBLAS_TEST_ZOMPLEX) ); | |||
| A = (CBLAS_TEST_ZOMPLEX *)malloc( (*m)*(size_t)LDA*sizeof( CBLAS_TEST_ZOMPLEX) ); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*n; j++ ){ | |||
| A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; | |||
| @@ -50,7 +50,7 @@ void F77_zgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, | |||
| get_transpose_type(transp, &trans); | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *ku+*kl+2; | |||
| A=( CBLAS_TEST_ZOMPLEX* )malloc((*n+*kl)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| A=( CBLAS_TEST_ZOMPLEX* )malloc((*n+*kl)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| for( i=0; i<*ku; i++ ){ | |||
| irow=*ku+*kl-i; | |||
| jcol=(*ku)-i; | |||
| @@ -94,7 +94,7 @@ void F77_zgeru(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*n; j++ ){ | |||
| A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; | |||
| @@ -122,7 +122,7 @@ void F77_zgerc(int *order, int *m, int *n, CBLAS_TEST_ZOMPLEX *alpha, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*n; j++ ){ | |||
| A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; | |||
| @@ -154,7 +154,7 @@ void F77_zhemv(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A = (CBLAS_TEST_ZOMPLEX *)malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| A = (CBLAS_TEST_ZOMPLEX *)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ){ | |||
| A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; | |||
| @@ -190,7 +190,7 @@ int i,irow,j,jcol,LDA; | |||
| *incx, beta, y, *incy ); | |||
| else { | |||
| LDA = *k+2; | |||
| A =(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| A =(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| if (uplo == CblasUpper) { | |||
| for( i=0; i<*k; i++ ){ | |||
| irow=*k-i; | |||
| @@ -251,8 +251,8 @@ void F77_zhpmv(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha, | |||
| beta, y, *incy); | |||
| else { | |||
| LDA = *n; | |||
| A = (CBLAS_TEST_ZOMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX )); | |||
| AP = (CBLAS_TEST_ZOMPLEX* )malloc( (((LDA+1)*LDA)/2)* | |||
| A = (CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX )); | |||
| AP = (CBLAS_TEST_ZOMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)* | |||
| sizeof( CBLAS_TEST_ZOMPLEX )); | |||
| if (uplo == CblasUpper) { | |||
| for( j=0, k=0; j<*n; j++ ) | |||
| @@ -311,7 +311,7 @@ void F77_ztbmv(int *order, char *uplow, char *transp, char *diagn, | |||
| x, *incx); | |||
| else { | |||
| LDA = *k+2; | |||
| A=(CBLAS_TEST_ZOMPLEX *)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| A=(CBLAS_TEST_ZOMPLEX *)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| if (uplo == CblasUpper) { | |||
| for( i=0; i<*k; i++ ){ | |||
| irow=*k-i; | |||
| @@ -375,7 +375,7 @@ void F77_ztbsv(int *order, char *uplow, char *transp, char *diagn, | |||
| *incx); | |||
| else { | |||
| LDA = *k+2; | |||
| A=(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX )); | |||
| A=(CBLAS_TEST_ZOMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX )); | |||
| if (uplo == CblasUpper) { | |||
| for( i=0; i<*k; i++ ){ | |||
| irow=*k-i; | |||
| @@ -436,8 +436,8 @@ void F77_ztpmv(int *order, char *uplow, char *transp, char *diagn, | |||
| cblas_ztpmv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx ); | |||
| else { | |||
| LDA = *n; | |||
| A=(CBLAS_TEST_ZOMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| AP=(CBLAS_TEST_ZOMPLEX*)malloc((((LDA+1)*LDA)/2)* | |||
| A=(CBLAS_TEST_ZOMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| AP=(CBLAS_TEST_ZOMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)* | |||
| sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| if (uplo == CblasUpper) { | |||
| for( j=0, k=0; j<*n; j++ ) | |||
| @@ -491,8 +491,8 @@ void F77_ztpsv(int *order, char *uplow, char *transp, char *diagn, | |||
| cblas_ztpsv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx ); | |||
| else { | |||
| LDA = *n; | |||
| A=(CBLAS_TEST_ZOMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| AP=(CBLAS_TEST_ZOMPLEX*)malloc((((LDA+1)*LDA)/2)* | |||
| A=(CBLAS_TEST_ZOMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| AP=(CBLAS_TEST_ZOMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)* | |||
| sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| if (uplo == CblasUpper) { | |||
| for( j=0, k=0; j<*n; j++ ) | |||
| @@ -544,7 +544,7 @@ void F77_ztrmv(int *order, char *uplow, char *transp, char *diagn, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA=*n+1; | |||
| A=(CBLAS_TEST_ZOMPLEX*)malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| A=(CBLAS_TEST_ZOMPLEX*)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) { | |||
| A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; | |||
| @@ -573,7 +573,7 @@ void F77_ztrsv(int *order, char *uplow, char *transp, char *diagn, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A =(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| A =(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) { | |||
| A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; | |||
| @@ -601,8 +601,8 @@ void F77_zhpr(int *order, char *uplow, int *n, double *alpha, | |||
| cblas_zhpr(CblasRowMajor, UNDEFINED, *n, *alpha, x, *incx, ap ); | |||
| else { | |||
| LDA = *n; | |||
| A = (CBLAS_TEST_ZOMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| AP = ( CBLAS_TEST_ZOMPLEX* )malloc( (((LDA+1)*LDA)/2)* | |||
| A = (CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| AP = ( CBLAS_TEST_ZOMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)* | |||
| sizeof( CBLAS_TEST_ZOMPLEX )); | |||
| if (uplo == CblasUpper) { | |||
| for( j=0, k=0; j<*n; j++ ) | |||
| @@ -678,8 +678,8 @@ void F77_zhpr2(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha, | |||
| *incy, ap ); | |||
| else { | |||
| LDA = *n; | |||
| A=(CBLAS_TEST_ZOMPLEX*)malloc( LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| AP=(CBLAS_TEST_ZOMPLEX*)malloc( (((LDA+1)*LDA)/2)* | |||
| A=(CBLAS_TEST_ZOMPLEX*)malloc( (size_t)LDA*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| AP=(CBLAS_TEST_ZOMPLEX*)malloc( ((((size_t)LDA+1)*LDA)/2)* | |||
| sizeof( CBLAS_TEST_ZOMPLEX )); | |||
| if (uplo == CblasUpper) { | |||
| for( j=0, k=0; j<*n; j++ ) | |||
| @@ -750,7 +750,7 @@ void F77_zher(int *order, char *uplow, int *n, double *alpha, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A=(CBLAS_TEST_ZOMPLEX*)malloc((*n)*LDA*sizeof( CBLAS_TEST_ZOMPLEX )); | |||
| A=(CBLAS_TEST_ZOMPLEX*)malloc((*n)*(size_t)LDA*sizeof( CBLAS_TEST_ZOMPLEX )); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) { | |||
| @@ -784,7 +784,7 @@ void F77_zher2(int *order, char *uplow, int *n, CBLAS_TEST_ZOMPLEX *alpha, | |||
| if (*order == TEST_ROW_MJR) { | |||
| LDA = *n+1; | |||
| A= ( CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| A= ( CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) { | |||
| @@ -26,7 +26,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n, | |||
| if (*order == TEST_ROW_MJR) { | |||
| if (transa == CblasNoTrans) { | |||
| LDA = *k+1; | |||
| A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| A=(CBLAS_TEST_ZOMPLEX*)malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*k; j++ ) { | |||
| A[i*LDA+j].real=a[j*(*lda)+i].real; | |||
| @@ -35,7 +35,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n, | |||
| } | |||
| else { | |||
| LDA = *m+1; | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| for( i=0; i<*k; i++ ) | |||
| for( j=0; j<*m; j++ ) { | |||
| A[i*LDA+j].real=a[j*(*lda)+i].real; | |||
| @@ -45,7 +45,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n, | |||
| if (transb == CblasNoTrans) { | |||
| LDB = *n+1; | |||
| B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDB*sizeof(CBLAS_TEST_ZOMPLEX) ); | |||
| B=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX) ); | |||
| for( i=0; i<*k; i++ ) | |||
| for( j=0; j<*n; j++ ) { | |||
| B[i*LDB+j].real=b[j*(*ldb)+i].real; | |||
| @@ -54,7 +54,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n, | |||
| } | |||
| else { | |||
| LDB = *k+1; | |||
| B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| B=(CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDB*(*n)*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*k; j++ ) { | |||
| B[i*LDB+j].real=b[j*(*ldb)+i].real; | |||
| @@ -63,7 +63,7 @@ void F77_zgemm(int *order, char *transpa, char *transpb, int *m, int *n, | |||
| } | |||
| LDC = *n+1; | |||
| C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| for( j=0; j<*n; j++ ) | |||
| for( i=0; i<*m; i++ ) { | |||
| C[i*LDC+j].real=c[j*(*ldc)+i].real; | |||
| @@ -103,7 +103,7 @@ void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n, | |||
| if (*order == TEST_ROW_MJR) { | |||
| if (side == CblasLeft) { | |||
| LDA = *m+1; | |||
| A= (CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| A= (CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*m; j++ ) { | |||
| A[i*LDA+j].real=a[j*(*lda)+i].real; | |||
| @@ -112,7 +112,7 @@ void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n, | |||
| } | |||
| else{ | |||
| LDA = *n+1; | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) { | |||
| A[i*LDA+j].real=a[j*(*lda)+i].real; | |||
| @@ -120,14 +120,14 @@ void F77_zhemm(int *order, char *rtlf, char *uplow, int *m, int *n, | |||
| } | |||
| } | |||
| LDB = *n+1; | |||
| B=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| B=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*n; j++ ) { | |||
| B[i*LDB+j].real=b[j*(*ldb)+i].real; | |||
| B[i*LDB+j].imag=b[j*(*ldb)+i].imag; | |||
| } | |||
| LDC = *n+1; | |||
| C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| for( j=0; j<*n; j++ ) | |||
| for( i=0; i<*m; i++ ) { | |||
| C[i*LDC+j].real=c[j*(*ldc)+i].real; | |||
| @@ -167,25 +167,25 @@ void F77_zsymm(int *order, char *rtlf, char *uplow, int *m, int *n, | |||
| if (*order == TEST_ROW_MJR) { | |||
| if (side == CblasLeft) { | |||
| LDA = *m+1; | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*m; j++ ) | |||
| A[i*LDA+j]=a[j*(*lda)+i]; | |||
| } | |||
| else{ | |||
| LDA = *n+1; | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| A[i*LDA+j]=a[j*(*lda)+i]; | |||
| } | |||
| LDB = *n+1; | |||
| B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX )); | |||
| B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX )); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*n; j++ ) | |||
| B[i*LDB+j]=b[j*(*ldb)+i]; | |||
| LDC = *n+1; | |||
| C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| C=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| for( j=0; j<*n; j++ ) | |||
| for( i=0; i<*m; i++ ) | |||
| C[i*LDC+j]=c[j*(*ldc)+i]; | |||
| @@ -221,7 +221,7 @@ void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k, | |||
| if (*order == TEST_ROW_MJR) { | |||
| if (trans == CblasNoTrans) { | |||
| LDA = *k+1; | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*k; j++ ) { | |||
| A[i*LDA+j].real=a[j*(*lda)+i].real; | |||
| @@ -230,7 +230,7 @@ void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k, | |||
| } | |||
| else{ | |||
| LDA = *n+1; | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| for( i=0; i<*k; i++ ) | |||
| for( j=0; j<*n; j++ ) { | |||
| A[i*LDA+j].real=a[j*(*lda)+i].real; | |||
| @@ -238,7 +238,7 @@ void F77_zherk(int *order, char *uplow, char *transp, int *n, int *k, | |||
| } | |||
| } | |||
| LDC = *n+1; | |||
| C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) { | |||
| C[i*LDC+j].real=c[j*(*ldc)+i].real; | |||
| @@ -277,7 +277,7 @@ void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k, | |||
| if (*order == TEST_ROW_MJR) { | |||
| if (trans == CblasNoTrans) { | |||
| LDA = *k+1; | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*k; j++ ) { | |||
| A[i*LDA+j].real=a[j*(*lda)+i].real; | |||
| @@ -286,7 +286,7 @@ void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k, | |||
| } | |||
| else{ | |||
| LDA = *n+1; | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((*k)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| for( i=0; i<*k; i++ ) | |||
| for( j=0; j<*n; j++ ) { | |||
| A[i*LDA+j].real=a[j*(*lda)+i].real; | |||
| @@ -294,7 +294,7 @@ void F77_zsyrk(int *order, char *uplow, char *transp, int *n, int *k, | |||
| } | |||
| } | |||
| LDC = *n+1; | |||
| C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| C=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) { | |||
| C[i*LDC+j].real=c[j*(*ldc)+i].real; | |||
| @@ -333,8 +333,8 @@ void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k, | |||
| if (trans == CblasNoTrans) { | |||
| LDA = *k+1; | |||
| LDB = *k+1; | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX )); | |||
| B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_ZOMPLEX )); | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX )); | |||
| B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX )); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*k; j++ ) { | |||
| A[i*LDA+j].real=a[j*(*lda)+i].real; | |||
| @@ -346,8 +346,8 @@ void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k, | |||
| else { | |||
| LDA = *n+1; | |||
| LDB = *n+1; | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc( LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| B=(CBLAS_TEST_ZOMPLEX* )malloc( LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc( (size_t)LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| B=(CBLAS_TEST_ZOMPLEX* )malloc( (size_t)LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| for( i=0; i<*k; i++ ) | |||
| for( j=0; j<*n; j++ ){ | |||
| A[i*LDA+j].real=a[j*(*lda)+i].real; | |||
| @@ -357,7 +357,7 @@ void F77_zher2k(int *order, char *uplow, char *transp, int *n, int *k, | |||
| } | |||
| } | |||
| LDC = *n+1; | |||
| C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) { | |||
| C[i*LDC+j].real=c[j*(*ldc)+i].real; | |||
| @@ -397,8 +397,8 @@ void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k, | |||
| if (trans == CblasNoTrans) { | |||
| LDA = *k+1; | |||
| LDB = *k+1; | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDB*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| B=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*k; j++ ) { | |||
| A[i*LDA+j].real=a[j*(*lda)+i].real; | |||
| @@ -410,8 +410,8 @@ void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k, | |||
| else { | |||
| LDA = *n+1; | |||
| LDB = *n+1; | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc(LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| B=(CBLAS_TEST_ZOMPLEX* )malloc(LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDA*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| B=(CBLAS_TEST_ZOMPLEX* )malloc((size_t)LDB*(*k)*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| for( i=0; i<*k; i++ ) | |||
| for( j=0; j<*n; j++ ){ | |||
| A[i*LDA+j].real=a[j*(*lda)+i].real; | |||
| @@ -421,7 +421,7 @@ void F77_zsyr2k(int *order, char *uplow, char *transp, int *n, int *k, | |||
| } | |||
| } | |||
| LDC = *n+1; | |||
| C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*LDC*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| C=(CBLAS_TEST_ZOMPLEX* )malloc( (*n)*(size_t)LDC*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) { | |||
| C[i*LDC+j].real=c[j*(*ldc)+i].real; | |||
| @@ -463,7 +463,7 @@ void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, | |||
| if (*order == TEST_ROW_MJR) { | |||
| if (side == CblasLeft) { | |||
| LDA = *m+1; | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*m; j++ ) { | |||
| A[i*LDA+j].real=a[j*(*lda)+i].real; | |||
| @@ -472,7 +472,7 @@ void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, | |||
| } | |||
| else{ | |||
| LDA = *n+1; | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) { | |||
| A[i*LDA+j].real=a[j*(*lda)+i].real; | |||
| @@ -480,7 +480,7 @@ void F77_ztrmm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, | |||
| } | |||
| } | |||
| LDB = *n+1; | |||
| B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*n; j++ ) { | |||
| B[i*LDB+j].real=b[j*(*ldb)+i].real; | |||
| @@ -522,7 +522,7 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, | |||
| if (*order == TEST_ROW_MJR) { | |||
| if (side == CblasLeft) { | |||
| LDA = *m+1; | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc( (*m)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX ) ); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*m; j++ ) { | |||
| A[i*LDA+j].real=a[j*(*lda)+i].real; | |||
| @@ -531,7 +531,7 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, | |||
| } | |||
| else{ | |||
| LDA = *n+1; | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| A=(CBLAS_TEST_ZOMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| for( i=0; i<*n; i++ ) | |||
| for( j=0; j<*n; j++ ) { | |||
| A[i*LDA+j].real=a[j*(*lda)+i].real; | |||
| @@ -539,7 +539,7 @@ void F77_ztrsm(int *order, char *rtlf, char *uplow, char *transp, char *diagn, | |||
| } | |||
| } | |||
| LDB = *n+1; | |||
| B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*LDB*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| B=(CBLAS_TEST_ZOMPLEX* )malloc((*m)*(size_t)LDB*sizeof(CBLAS_TEST_ZOMPLEX)); | |||
| for( i=0; i<*m; i++ ) | |||
| for( j=0; j<*n; j++ ) { | |||
| B[i*LDB+j].real=b[j*(*ldb)+i].real; | |||
| @@ -1,3 +1,4 @@ | |||
| #include "cblas_test.h" | |||
| int CBLAS_CallFromC; | |||
| int RowMajorStrg; | |||
| @@ -81,6 +81,7 @@ foreach (float_type ${FLOAT_TYPES}) | |||
| GenerateNamedObjects("gbmv_thread.c" "TRANSA" "gbmv_thread_t" false "" "" false ${float_type}) | |||
| endif () | |||
| # special defines for complex | |||
| if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") | |||
| foreach (u_source ${U_SOURCES}) | |||
| @@ -197,6 +198,13 @@ foreach (float_type ${FLOAT_TYPES}) | |||
| endif () | |||
| endforeach () | |||
| if (BUILD_BFLOAT16) | |||
| if (USE_THREAD) | |||
| GenerateNamedObjects("sbgemv_thread.c" "" "gemv_thread_n" false "" "" false "BFLOAT16") | |||
| GenerateNamedObjects("sbgemv_thread.c" "TRANSA" "gemv_thread_t" false "" "" false "BFLOAT16") | |||
| endif () | |||
| endif () | |||
| if ( BUILD_COMPLEX AND NOT BUILD_SINGLE) | |||
| if (USE_THREAD) | |||
| GenerateNamedObjects("gemv_thread.c" "" "gemv_thread_n" false "" "" false "SINGLE") | |||
| @@ -64,9 +64,9 @@ CBLASOBJS += \ | |||
| chpmv_U.$(SUFFIX) chpmv_L.$(SUFFIX) chpmv_V.$(SUFFIX) chpmv_M.$(SUFFIX) \ | |||
| chpr_U.$(SUFFIX) chpr_L.$(SUFFIX) chpr_V.$(SUFFIX) chpr_M.$(SUFFIX) \ | |||
| chpr2_U.$(SUFFIX) chpr2_L.$(SUFFIX) chpr2_V.$(SUFFIX) chpr2_M.$(SUFFIX) \ | |||
| csbmv_U.$(SUFFIX) csbmv_L.$(SUFFIX) cspmv_U.$(SUFFIX) cspmv_L.$(SUFFIX) \ | |||
| cspr_U.$(SUFFIX) cspr_L.$(SUFFIX) cspr2_U.$(SUFFIX) cspr2_L.$(SUFFIX) \ | |||
| csyr_U.$(SUFFIX) csyr_L.$(SUFFIX) csyr2_U.$(SUFFIX) csyr2_L.$(SUFFIX) \ | |||
| csbmv_U.$(SUFFIX) csbmv_L.$(SUFFIX) \ | |||
| cspr2_U.$(SUFFIX) cspr2_L.$(SUFFIX) \ | |||
| csyr2_U.$(SUFFIX) csyr2_L.$(SUFFIX) \ | |||
| ctbmv_NUU.$(SUFFIX) ctbmv_NUN.$(SUFFIX) ctbmv_NLU.$(SUFFIX) ctbmv_NLN.$(SUFFIX) \ | |||
| ctbmv_TUU.$(SUFFIX) ctbmv_TUN.$(SUFFIX) ctbmv_TLU.$(SUFFIX) ctbmv_TLN.$(SUFFIX) \ | |||
| ctbmv_RUU.$(SUFFIX) ctbmv_RUN.$(SUFFIX) ctbmv_RLU.$(SUFFIX) ctbmv_RLN.$(SUFFIX) \ | |||
| @@ -92,6 +92,13 @@ CBLASOBJS += \ | |||
| ctrsv_RUU.$(SUFFIX) ctrsv_RUN.$(SUFFIX) ctrsv_RLU.$(SUFFIX) ctrsv_RLN.$(SUFFIX) \ | |||
| ctrsv_CUU.$(SUFFIX) ctrsv_CUN.$(SUFFIX) ctrsv_CLU.$(SUFFIX) ctrsv_CLN.$(SUFFIX) | |||
| ifndef NO_LAPACK | |||
| CBLASOBJS += \ | |||
| cspmv_U.$(SUFFIX) cspmv_L.$(SUFFIX) \ | |||
| cspr_U.$(SUFFIX) cspr_L.$(SUFFIX) \ | |||
| csyr_U.$(SUFFIX) csyr_L.$(SUFFIX) | |||
| endif | |||
| ZBLASOBJS += \ | |||
| zgbmv_n.$(SUFFIX) zgbmv_t.$(SUFFIX) zgbmv_r.$(SUFFIX) zgbmv_c.$(SUFFIX) \ | |||
| zgbmv_o.$(SUFFIX) zgbmv_u.$(SUFFIX) zgbmv_s.$(SUFFIX) zgbmv_d.$(SUFFIX) \ | |||
| @@ -12,6 +12,12 @@ foreach (GEMM_DEFINE ${GEMM_DEFINES}) | |||
| if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) | |||
| GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0) | |||
| endif () | |||
| if (BUILD_BFLOAT16) | |||
| GenerateNamedObjects("gemm.c" "${GEMM_DEFINE}" "gemm_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16") | |||
| if (USE_THREAD AND NOT USE_SIMPLE_THREADED_LEVEL3) | |||
| GenerateNamedObjects("gemm.c" "${GEMM_DEFINE};THREADED_LEVEL3" "gemm_thread_${GEMM_DEFINE_LC}" 0 "" "" false "BFLOAT16") | |||
| endif () | |||
| endif () | |||
| endforeach () | |||
| if ( BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) | |||
| @@ -425,7 +425,7 @@ cgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| cgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| cgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -473,7 +473,7 @@ zgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| zgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| zgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -521,7 +521,7 @@ xgemm_rr.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| xgemm_rc.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| xgemm_cn.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -632,7 +632,7 @@ cgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| cgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| cgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -680,7 +680,7 @@ zgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| zgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| zgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -728,7 +728,7 @@ xgemm_thread_rr.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| xgemm_thread_rc.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| xgemm_thread_cn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -1895,7 +1895,7 @@ cgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| cgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| cgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -1943,7 +1943,7 @@ zgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| zgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| zgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -1991,7 +1991,7 @@ xgemm3m_rr.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| xgemm3m_rc.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| xgemm3m_cn.$(SUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -2048,7 +2048,7 @@ cgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| cgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| cgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -2096,7 +2096,7 @@ zgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| zgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| zgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -2144,7 +2144,7 @@ xgemm3m_thread_rr.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| xgemm3m_thread_rc.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| xgemm3m_thread_cn.$(SUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -2817,7 +2817,7 @@ cgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| cgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| cgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -2865,7 +2865,7 @@ zgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| zgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| zgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -2913,7 +2913,7 @@ xgemm_rr.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| xgemm_rc.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| xgemm_cn.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -3025,7 +3025,7 @@ cgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| cgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| cgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -3073,7 +3073,7 @@ zgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| zgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| zgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -3121,7 +3121,7 @@ xgemm_thread_rr.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| xgemm_thread_rc.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| xgemm_thread_cn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -4288,7 +4288,7 @@ cgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| cgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| cgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -4336,7 +4336,7 @@ zgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| zgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| zgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -4384,7 +4384,7 @@ xgemm3m_rr.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| xgemm3m_rc.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| xgemm3m_cn.$(PSUFFIX) : gemm3m.c gemm3m_level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -4441,7 +4441,7 @@ cgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| cgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| cgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -4489,7 +4489,7 @@ zgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| zgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| zgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -4537,7 +4537,7 @@ xgemm3m_thread_rr.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRR $< -o $(@F) | |||
| xgemm3m_thread_rc.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC $< -o $(@F) | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DRC=RC $< -o $(@F) | |||
| xgemm3m_thread_cn.$(PSUFFIX) : gemm3m.c level3_gemm3m_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DXDOUBLE -DCOMPLEX -DCN $< -o $(@F) | |||
| @@ -333,14 +333,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| #else | |||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
| min_jj = min_j + js - jjs; | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; | |||
| else | |||
| if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; | |||
| /* | |||
| if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; | |||
| else | |||
| */ | |||
| if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | |||
| #endif | |||
| @@ -367,14 +367,16 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| /* Split local region of B into parts */ | |||
| for(jjs = js; jjs < MIN(n_to, js + div_n); jjs += min_jj){ | |||
| min_jj = MIN(n_to, js + div_n) - jjs; | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; | |||
| else | |||
| /* | |||
| if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; | |||
| else | |||
| */ | |||
| if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | |||
| #endif | |||
| /* Copy part of local region of B into workspace */ | |||
| @@ -138,7 +138,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
| min_jj = min_j + js - jjs; | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -215,7 +215,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
| min_jj = min_j + js - jjs; | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -320,7 +320,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
| min_jj = min_j + js - jjs; | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -399,7 +399,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
| min_jj = min_j + js - jjs; | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -122,7 +122,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = 0; jjs < ls - js; jjs += min_jj){ | |||
| min_jj = ls - js - jjs; | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -146,7 +146,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = 0; jjs < min_l; jjs += min_jj){ | |||
| min_jj = min_l - jjs; | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -203,7 +203,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
| min_jj = min_j + js - jjs; | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -258,7 +258,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = 0; jjs < min_l; jjs += min_jj){ | |||
| min_jj = min_l - jjs; | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -283,7 +283,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = 0; jjs < js - ls - min_l; jjs += min_jj){ | |||
| min_jj = js - ls - min_l - jjs; | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -344,7 +344,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO | |||
| for(jjs = js; jjs < js + min_j; jjs += min_jj){ | |||
| min_jj = min_j + js - jjs; | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ | |||
| if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | |||
| #else | |||
| @@ -49,6 +49,8 @@ GenerateNamedObjects("openblas_get_config.c;openblas_get_parallel.c" "" "" 0 "" | |||
| if (DYNAMIC_ARCH) | |||
| if (ARM64) | |||
| list(APPEND COMMON_SOURCES dynamic_arm64.c) | |||
| elseif (POWER) | |||
| list(APPEND COMMON_SOURCES dynamic_power.c) | |||
| else () | |||
| list(APPEND COMMON_SOURCES dynamic.c) | |||
| endif () | |||
| @@ -24,10 +24,14 @@ else | |||
| ifeq ($(ARCH),zarch) | |||
| COMMONOBJS += dynamic_zarch.$(SUFFIX) | |||
| else | |||
| ifeq ($(ARCH),mips64) | |||
| COMMONOBJS += dynamic_mips64.$(SUFFIX) | |||
| else | |||
| COMMONOBJS += dynamic.$(SUFFIX) | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| else | |||
| COMMONOBJS += parameter.$(SUFFIX) | |||
| endif | |||
| @@ -92,10 +96,14 @@ else | |||
| ifeq ($(ARCH),zarch) | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX) | |||
| else | |||
| ifeq ($(ARCH),mips64) | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_mips64.$(SUFFIX) | |||
| else | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) | |||
| endif | |||
| endif | |||
| endif | |||
| endif | |||
| else | |||
| HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) | |||
| endif | |||
| @@ -209,7 +209,8 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| /* REAL / Double */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, | |||
| double *, BLASLONG, double *, BLASLONG, | |||
| double *, BLASLONG, void *) = func; | |||
| double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, | |||
| double *, BLASLONG, double *, BLASLONG, void *)) func; | |||
| afunc(args -> m, args -> n, args -> k, | |||
| ((double *)args -> alpha)[0], | |||
| @@ -220,7 +221,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| /* REAL / Single */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, | |||
| float *, BLASLONG, float *, BLASLONG, | |||
| float *, BLASLONG, void *) = func; | |||
| float *, BLASLONG, void *) = (void (*) | |||
| (BLASLONG, BLASLONG, BLASLONG, float, | |||
| float *, BLASLONG, float *, BLASLONG, | |||
| float *, BLASLONG, void *)) func; | |||
| afunc(args -> m, args -> n, args -> k, | |||
| ((float *)args -> alpha)[0], | |||
| @@ -232,7 +236,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| /* REAL / BFLOAT16 */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, | |||
| bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, | |||
| bfloat16 *, BLASLONG, void *) = func; | |||
| bfloat16 *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, bfloat16, | |||
| bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, | |||
| bfloat16 *, BLASLONG, void *)) func; | |||
| afunc(args -> m, args -> n, args -> k, | |||
| ((bfloat16 *)args -> alpha)[0], | |||
| @@ -243,7 +249,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| /* REAL / BLAS_STOBF16 */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, | |||
| float *, BLASLONG, bfloat16 *, BLASLONG, | |||
| float *, BLASLONG, void *) = func; | |||
| float *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, float, | |||
| float *, BLASLONG, bfloat16 *, BLASLONG, | |||
| float *, BLASLONG, void *)) func; | |||
| afunc(args -> m, args -> n, args -> k, | |||
| ((float *)args -> alpha)[0], | |||
| @@ -254,7 +262,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| /* REAL / BLAS_DTOBF16 */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, | |||
| double *, BLASLONG, bfloat16 *, BLASLONG, | |||
| double *, BLASLONG, void *) = func; | |||
| double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, | |||
| double *, BLASLONG, bfloat16 *, BLASLONG, | |||
| double *, BLASLONG, void *)) func; | |||
| afunc(args -> m, args -> n, args -> k, | |||
| ((double *)args -> alpha)[0], | |||
| @@ -271,7 +281,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| /* COMPLEX / Extended Double */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, | |||
| xdouble *, BLASLONG, xdouble *, BLASLONG, | |||
| xdouble *, BLASLONG, void *) = func; | |||
| xdouble *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, | |||
| xdouble *, BLASLONG, xdouble *, BLASLONG, | |||
| xdouble *, BLASLONG, void *)) func; | |||
| afunc(args -> m, args -> n, args -> k, | |||
| ((xdouble *)args -> alpha)[0], | |||
| @@ -285,7 +297,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| /* COMPLEX / Double */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, | |||
| double *, BLASLONG, double *, BLASLONG, | |||
| double *, BLASLONG, void *) = func; | |||
| double *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, double, double, | |||
| double *, BLASLONG, double *, BLASLONG, | |||
| double *, BLASLONG, void *)) func; | |||
| afunc(args -> m, args -> n, args -> k, | |||
| ((double *)args -> alpha)[0], | |||
| @@ -297,7 +311,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| /* COMPLEX / Single */ | |||
| void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, | |||
| float *, BLASLONG, float *, BLASLONG, | |||
| float *, BLASLONG, void *) = func; | |||
| float *, BLASLONG, void *) = (void (*)(BLASLONG, BLASLONG, BLASLONG, float, float, | |||
| float *, BLASLONG, float *, BLASLONG, | |||
| float *, BLASLONG, void *)) func; | |||
| afunc(args -> m, args -> n, args -> k, | |||
| ((float *)args -> alpha)[0], | |||
| @@ -425,7 +441,7 @@ blas_queue_t *tscq; | |||
| #endif | |||
| if (queue) { | |||
| int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine; | |||
| int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = (int (*)(blas_arg_t *, void *, void *, void *, void *, BLASLONG))queue -> routine; | |||
| atomic_store_queue(&thread_status[cpu].queue, (blas_queue_t *)1); | |||
| @@ -503,7 +519,7 @@ blas_queue_t *tscq; | |||
| legacy_exec(routine, queue -> mode, queue -> args, sb); | |||
| } else | |||
| if (queue -> mode & BLAS_PTHREAD) { | |||
| void (*pthreadcompat)(void *) = queue -> routine; | |||
| void (*pthreadcompat)(void *) = (void(*)(void*))queue -> routine; | |||
| (pthreadcompat)(queue -> args); | |||
| } else | |||
| (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position); | |||
| @@ -871,13 +887,13 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){ | |||
| fprintf(STDERR, "\n"); | |||
| #endif | |||
| routine = queue -> routine; | |||
| routine = (int (*)(blas_arg_t *, void *, void *, double *, double *, BLASLONG))queue -> routine; | |||
| if (queue -> mode & BLAS_LEGACY) { | |||
| legacy_exec(routine, queue -> mode, queue -> args, queue -> sb); | |||
| } else | |||
| if (queue -> mode & BLAS_PTHREAD) { | |||
| void (*pthreadcompat)(void *) = queue -> routine; | |||
| void (*pthreadcompat)(void *) = (void (*)(void*))queue -> routine; | |||
| (pthreadcompat)(queue -> args); | |||
| } else | |||
| (routine)(queue -> args, queue -> range_m, queue -> range_n, | |||
| @@ -967,9 +983,11 @@ void goto_set_num_threads(int num_threads) { | |||
| blas_cpu_number = num_threads; | |||
| #if defined(ARCH_MIPS64) | |||
| #ifndef DYNAMIC_ARCH | |||
| //set parameters for different number of threads. | |||
| blas_set_parameter(); | |||
| #endif | |||
| #endif | |||
| } | |||
| @@ -1022,38 +1040,39 @@ int BLASFUNC(blas_thread_shutdown)(void){ | |||
| int i; | |||
| if (!blas_server_avail) return 0; | |||
| LOCK_COMMAND(&server_lock); | |||
| for (i = 0; i < blas_num_threads - 1; i++) { | |||
| if (blas_server_avail) { | |||
| for (i = 0; i < blas_num_threads - 1; i++) { | |||
| pthread_mutex_lock (&thread_status[i].lock); | |||
| atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1); | |||
| thread_status[i].status = THREAD_STATUS_WAKEUP; | |||
| pthread_cond_signal (&thread_status[i].wakeup); | |||
| pthread_mutex_lock (&thread_status[i].lock); | |||
| pthread_mutex_unlock(&thread_status[i].lock); | |||
| atomic_store_queue(&thread_status[i].queue, (blas_queue_t *)-1); | |||
| thread_status[i].status = THREAD_STATUS_WAKEUP; | |||
| pthread_cond_signal (&thread_status[i].wakeup); | |||
| } | |||
| pthread_mutex_unlock(&thread_status[i].lock); | |||
| for(i = 0; i < blas_num_threads - 1; i++){ | |||
| pthread_join(blas_threads[i], NULL); | |||
| } | |||
| } | |||
| for(i = 0; i < blas_num_threads - 1; i++){ | |||
| pthread_mutex_destroy(&thread_status[i].lock); | |||
| pthread_cond_destroy (&thread_status[i].wakeup); | |||
| } | |||
| for(i = 0; i < blas_num_threads - 1; i++){ | |||
| pthread_join(blas_threads[i], NULL); | |||
| } | |||
| for(i = 0; i < blas_num_threads - 1; i++){ | |||
| pthread_mutex_destroy(&thread_status[i].lock); | |||
| pthread_cond_destroy (&thread_status[i].wakeup); | |||
| } | |||
| #ifdef NEED_STACKATTR | |||
| pthread_attr_destory(&attr); | |||
| pthread_attr_destroy(&attr); | |||
| #endif | |||
| blas_server_avail = 0; | |||
| blas_server_avail = 0; | |||
| } | |||
| UNLOCK_COMMAND(&server_lock); | |||
| return 0; | |||
| @@ -40,7 +40,7 @@ | |||
| #include <stdlib.h> | |||
| #include "common.h" | |||
| #if defined(OS_CYGWIN_NT) && !defined(unlikely) | |||
| #if !defined(unlikely) | |||
| #ifdef __GNUC__ | |||
| #define unlikely(x) __builtin_expect(!!(x), 0) | |||
| #else | |||
| @@ -391,8 +391,9 @@ int blas_thread_init(void){ | |||
| int exec_blas_async(BLASLONG pos, blas_queue_t *queue){ | |||
| #if defined(SMP_SERVER) && defined(OS_CYGWIN_NT) | |||
| #if defined(SMP_SERVER) | |||
| // Handle lazy re-init of the thread-pool after a POSIX fork | |||
| // on Cygwin or as delayed init when a static library is used | |||
| if (unlikely(blas_server_avail == 0)) blas_thread_init(); | |||
| #endif | |||
| @@ -292,6 +292,7 @@ extern gotoblas_t gotoblas_COOPERLAKE; | |||
| #define VENDOR_AMD 2 | |||
| #define VENDOR_CENTAUR 3 | |||
| #define VENDOR_HYGON 4 | |||
| #define VENDOR_ZHAOXIN 5 | |||
| #define VENDOR_UNKNOWN 99 | |||
| #define BITMASK(a, b, c) ((((a) >> (b)) & (c))) | |||
| @@ -404,6 +405,7 @@ static int get_vendor(void){ | |||
| if (!strcmp(vendor.vchar, "GenuineIntel")) return VENDOR_INTEL; | |||
| if (!strcmp(vendor.vchar, "AuthenticAMD")) return VENDOR_AMD; | |||
| if (!strcmp(vendor.vchar, "CentaurHauls")) return VENDOR_CENTAUR; | |||
| if (!strcmp(vendor.vchar, " Shanghai ")) return VENDOR_ZHAOXIN; | |||
| if (!strcmp(vendor.vchar, "HygonGenuine")) return VENDOR_HYGON; | |||
| if ((eax == 0) || ((eax & 0x500) != 0)) return VENDOR_INTEL; | |||
| @@ -414,7 +416,7 @@ static int get_vendor(void){ | |||
| static gotoblas_t *get_coretype(void){ | |||
| int eax, ebx, ecx, edx; | |||
| int family, exfamily, model, vendor, exmodel; | |||
| int family, exfamily, model, vendor, exmodel, stepping; | |||
| cpuid(1, &eax, &ebx, &ecx, &edx); | |||
| @@ -422,6 +424,7 @@ static gotoblas_t *get_coretype(void){ | |||
| exfamily = BITMASK(eax, 20, 0xff); | |||
| model = BITMASK(eax, 4, 0x0f); | |||
| exmodel = BITMASK(eax, 16, 0x0f); | |||
| stepping = BITMASK(eax, 0, 0x0f); | |||
| vendor = get_vendor(); | |||
| @@ -621,11 +624,27 @@ static gotoblas_t *get_coretype(void){ | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| } | |||
| if (model == 10 || model == 12){ | |||
| // Ice Lake SP | |||
| if(support_avx512_bf16()) | |||
| return &gotoblas_COOPERLAKE; | |||
| if (support_avx512()) | |||
| return &gotoblas_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| } | |||
| return NULL; | |||
| case 7: | |||
| if (model == 10) // Goldmont Plus | |||
| return &gotoblas_NEHALEM; | |||
| if (model == 14) { | |||
| if (model == 13 || model == 14) { | |||
| // Ice Lake | |||
| if (support_avx512()) | |||
| return &gotoblas_SKYLAKEX; | |||
| @@ -642,8 +661,68 @@ static gotoblas_t *get_coretype(void){ | |||
| } | |||
| } | |||
| return NULL; | |||
| case 9: | |||
| case 8: | |||
| if (model == 12 || model == 13) { // Tiger Lake | |||
| if (support_avx512()) | |||
| return &gotoblas_SKYLAKEX; | |||
| if(support_avx2()){ | |||
| openblas_warning(FALLBACK_VERBOSE, HASWELL_FALLBACK); | |||
| return &gotoblas_HASWELL; | |||
| } | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| } | |||
| if (model == 14 ) { // Kaby Lake, Coffee Lake | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| if (model == 15){ // Sapphire Rapids | |||
| if(support_avx512_bf16()) | |||
| return &gotoblas_COOPERLAKE; | |||
| if (support_avx512()) | |||
| return &gotoblas_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| } | |||
| return NULL; | |||
| case 9: | |||
| if (model == 7 || model == 10) { // Alder Lake | |||
| if(support_avx512_bf16()) | |||
| return &gotoblas_COOPERLAKE; | |||
| if (support_avx512()) | |||
| return &gotoblas_SKYLAKEX; | |||
| if(support_avx2()){ | |||
| return &gotoblas_HASWELL; | |||
| } | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| } | |||
| if (model == 14 ) { // Kaby Lake, Coffee Lake | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| @@ -655,8 +734,9 @@ static gotoblas_t *get_coretype(void){ | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| return NULL; | |||
| case 10: | |||
| if (model == 5 || model == 6) { | |||
| if (model == 5 || model == 6) { | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| if(support_avx()) { | |||
| @@ -666,7 +746,20 @@ static gotoblas_t *get_coretype(void){ | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| } | |||
| if (model == 7) { | |||
| if (support_avx512()) | |||
| return &gotoblas_SKYLAKEX; | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| return NULL; | |||
| } | |||
| case 0xf: | |||
| @@ -779,10 +872,19 @@ static gotoblas_t *get_coretype(void){ | |||
| if (vendor == VENDOR_CENTAUR) { | |||
| switch (family) { | |||
| case 0x6: | |||
| return &gotoblas_NANO; | |||
| if (model == 0xf && stepping < 0xe) | |||
| return &gotoblas_NANO; | |||
| return &gotoblas_NEHALEM; | |||
| default: | |||
| if (family >= 0x7) | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| } | |||
| if (vendor == VENDOR_ZHAOXIN) { | |||
| return &gotoblas_NEHALEM; | |||
| } | |||
| return NULL; | |||
| } | |||
| @@ -962,7 +1064,13 @@ void gotoblas_dynamic_init(void) { | |||
| #ifdef ARCH_X86 | |||
| if (gotoblas == NULL) gotoblas = &gotoblas_KATMAI; | |||
| #else | |||
| if (gotoblas == NULL) gotoblas = &gotoblas_PRESCOTT; | |||
| if (gotoblas == NULL) { | |||
| if (support_avx512_bf16()) gotoblas = &gotoblas_COOPERLAKE; | |||
| else if (support_avx512()) gotoblas = &gotoblas_SKYLAKEX; | |||
| else if (support_avx2()) gotoblas = &gotoblas_HASWELL; | |||
| else if (support_avx()) gotoblas = &gotoblas_SANDYBRIDGE; | |||
| else gotoblas = &gotoblas_PRESCOTT; | |||
| } | |||
| /* sanity check, if 64bit pointer we can't have a 32 bit cpu */ | |||
| if (sizeof(void*) == 8) { | |||
| if (gotoblas == &gotoblas_KATMAI || | |||
| @@ -43,6 +43,68 @@ | |||
| #endif | |||
| extern gotoblas_t gotoblas_ARMV8; | |||
| #ifdef DYNAMIC_LIST | |||
| #ifdef DYN_CORTEXA53 | |||
| extern gotoblas_t gotoblas_CORTEXA53; | |||
| #else | |||
| #define gotoblas_CORTEXA53 gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_CORTEXA57 | |||
| extern gotoblas_t gotoblas_CORTEXA57; | |||
| #else | |||
| #define gotoblas_CORTEXA57 gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_CORTEXA72 | |||
| extern gotoblas_t gotoblas_CORTEXA72; | |||
| #else | |||
| #define gotoblas_CORTEXA72 gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_CORTEXA73 | |||
| extern gotoblas_t gotoblas_CORTEXA73; | |||
| #else | |||
| #define gotoblas_CORTEXA73 gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_FALKOR | |||
| extern gotoblas_t gotoblas_FALKOR; | |||
| #else | |||
| #define gotoblas_FALKOR gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_TSV110 | |||
| extern gotoblas_t gotoblas_TSV110; | |||
| #else | |||
| #define gotoblas_TSV110 gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_THUNDERX | |||
| extern gotoblas_t gotoblas_THUNDERX; | |||
| #else | |||
| #define gotoblas_THUNDERX gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_THUNDERX2T99 | |||
| extern gotoblas_t gotoblas_THUNDERX2T99; | |||
| #else | |||
| #define gotoblas_THUNDERX2T99 gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_THUNDERX3T110 | |||
| extern gotoblas_t gotoblas_THUNDERX3T110; | |||
| #else | |||
| #define gotoblas_THUNDERX3T110 gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_EMAG8180 | |||
| extern gotoblas_t gotoblas_EMAG8180; | |||
| #else | |||
| #define gotoblas_EMAG8180 gotoblas_ARMV8 | |||
| #endif | |||
| #ifdef DYN_NEOVERSEN1 | |||
| extern gotoblas_t gotoblas_NEOVERSEN1; | |||
| #else | |||
| #define gotoblas_NEOVERSEN1 gotoblas_ARMV8 | |||
| #endif | |||
| /* | |||
|  * Cortex-A55 kernel table. Every other core in this list is guarded by | |||
|  * DYN_<CORE> spelled exactly like its gotoblas_<CORE> symbol, so the guard | |||
|  * here must accept DYN_CORTEXA55. The older DYN_CORTEX_A55 spelling is still | |||
|  * honoured so existing build flags keep working. Without either macro the | |||
|  * core is aliased to the generic ARMV8 table. | |||
|  */ | |||
| #if defined(DYN_CORTEXA55) || defined(DYN_CORTEX_A55) | |||
| extern gotoblas_t gotoblas_CORTEXA55; | |||
| #else | |||
| #define gotoblas_CORTEXA55 gotoblas_ARMV8 | |||
| #endif | |||
| #else | |||
| extern gotoblas_t gotoblas_CORTEXA53; | |||
| extern gotoblas_t gotoblas_CORTEXA57; | |||
| extern gotoblas_t gotoblas_CORTEXA72; | |||
| @@ -54,10 +116,12 @@ extern gotoblas_t gotoblas_TSV110; | |||
| extern gotoblas_t gotoblas_EMAG8180; | |||
| extern gotoblas_t gotoblas_NEOVERSEN1; | |||
| extern gotoblas_t gotoblas_THUNDERX3T110; | |||
| extern gotoblas_t gotoblas_CORTEXA55; | |||
| #endif | |||
| extern void openblas_warning(int verbose, const char * msg); | |||
| #define NUM_CORETYPES 12 | |||
| #define NUM_CORETYPES 13 | |||
| /* | |||
| * In case asm/hwcap.h is outdated on the build system, make sure | |||
| @@ -68,7 +132,7 @@ extern void openblas_warning(int verbose, const char * msg); | |||
| #endif | |||
| #define get_cpu_ftr(id, var) ({ \ | |||
| __asm__("mrs %0, "#id : "=r" (var)); \ | |||
| __asm__ __volatile__ ("mrs %0, "#id : "=r" (var)); \ | |||
| }) | |||
| static char *corename[] = { | |||
| @@ -83,7 +147,10 @@ static char *corename[] = { | |||
| "tsv110", | |||
| "emag8180", | |||
| "neoversen1", | |||
| "neoversev1", | |||
| "neoversen2", | |||
| "thunderx3t110", | |||
| "cortexa55", | |||
| "unknown" | |||
| }; | |||
| @@ -100,6 +167,7 @@ char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_EMAG8180) return corename[ 9]; | |||
| if (gotoblas == &gotoblas_NEOVERSEN1) return corename[10]; | |||
| if (gotoblas == &gotoblas_THUNDERX3T110) return corename[11]; | |||
| if (gotoblas == &gotoblas_CORTEXA55) return corename[12]; | |||
| return corename[NUM_CORETYPES]; | |||
| } | |||
| @@ -131,6 +199,7 @@ static gotoblas_t *force_coretype(char *coretype) { | |||
| case 9: return (&gotoblas_EMAG8180); | |||
| case 10: return (&gotoblas_NEOVERSEN1); | |||
| case 11: return (&gotoblas_THUNDERX3T110); | |||
| case 12: return (&gotoblas_CORTEXA55); | |||
| } | |||
| snprintf(message, 128, "Core not found: %s\n", coretype); | |||
| openblas_warning(1, message); | |||
| @@ -189,6 +258,8 @@ static gotoblas_t *get_coretype(void) { | |||
| return &gotoblas_CORTEXA73; | |||
| case 0xd0c: // Neoverse N1 | |||
| return &gotoblas_NEOVERSEN1; | |||
| case 0xd05: // Cortex A55 | |||
| return &gotoblas_CORTEXA55; | |||
| } | |||
| break; | |||
| case 0x42: // Broadcom | |||
| @@ -0,0 +1,230 @@ | |||
| /***************************************************************************** | |||
| Copyright (c) 2020, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written | |||
| permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include <sys/wait.h> | |||
| #include <stdio.h> | |||
| #include <unistd.h> | |||
| #include <stdlib.h> | |||
| #include <string.h> | |||
| #include <sys/resource.h> | |||
| #include "common.h" | |||
| /* Kernel tables for the two Loongson 3 cores selectable at run time (declared here, defined elsewhere). */ | |||
| extern gotoblas_t gotoblas_LOONGSON3R3; | |||
| extern gotoblas_t gotoblas_LOONGSON3R4; | |||
| /* Diagnostic hook used below; the first argument is named "verbose" and is passed 0, 1 or 2 in this file. */ | |||
| extern void openblas_warning(int verbose, const char * msg); | |||
| /* Number of real core types; also the index of the trailing "UNKNOWN" entry in corename[]. */ | |||
| #define NUM_CORETYPES 2 | |||
| /* Printable core names. Indices 0..NUM_CORETYPES-1 must stay in step with the switch in force_coretype(). */ | |||
| static char *corename[] = { | |||
| "loongson3r3", | |||
| "loongson3r4", | |||
| "UNKNOWN" | |||
| }; | |||
| /* | |||
|  * Return the printable name of the currently selected kernel table. | |||
|  * Falls back to the trailing "UNKNOWN" entry when gotoblas points at | |||
|  * neither Loongson table (including when it is still NULL). | |||
|  */ | |||
| char *gotoblas_corename(void) { | |||
|   int idx = NUM_CORETYPES; /* slot of the "UNKNOWN" entry */ | |||
|   if (gotoblas == &gotoblas_LOONGSON3R3) | |||
|     idx = 0; | |||
|   else if (gotoblas == &gotoblas_LOONGSON3R4) | |||
|     idx = 1; | |||
|   return corename[idx]; | |||
| } | |||
| /* | |||
|  * Map a user-supplied core name onto its kernel table. | |||
|  * | |||
|  * coretype is compared case-insensitively (first 20 characters at most) | |||
|  * against the entries of corename[]. Returns the matching table, or NULL | |||
|  * after emitting a "Core not found" warning when nothing matches. | |||
|  */ | |||
| static gotoblas_t *force_coretype(char *coretype) { | |||
|   char message[128]; | |||
|   int idx; | |||
|   for (idx = 0; idx < NUM_CORETYPES; idx++) { | |||
|     if (strncasecmp(coretype, corename[idx], 20) != 0) | |||
|       continue; | |||
|     /* First match wins; translate the table index into a kernel table. */ | |||
|     if (idx == 0) return (&gotoblas_LOONGSON3R3); | |||
|     if (idx == 1) return (&gotoblas_LOONGSON3R4); | |||
|     break; /* matched a name with no table: report it like a miss */ | |||
|   } | |||
|   snprintf(message, 128, "Core not found: %s\n", coretype); | |||
|   openblas_warning(1, message); | |||
|   return NULL; | |||
| } | |||
| /* | |||
|  * Bits tested in the word read back by get_coretype_from_cpucfg(): | |||
|  * MSA_MASK set selects LOONGSON3R4, otherwise MMI_MASK set selects LOONGSON3R3. | |||
|  * Presumably these are the MMI / MSA SIMD feature flags - confirm against the Loongson manual. | |||
|  */ | |||
| #define MMI_MASK 0x00000010 | |||
| #define MSA_MASK 0x00000020 | |||
| /* Pipe used by cpucfg_test(): the forked child writes its result to fd[1], the parent reads it from fd[0]. */ | |||
| /* NOTE(review): fd and support_cpucfg have external linkage and very generic names; consider making them static if no other file uses them. */ | |||
| int fd[2]; | |||
| /* Result of the last cpucfg_test() run: 1 if the probe instruction executed, 0 otherwise. */ | |||
| int support_cpucfg; | |||
| /* | |||
|  * SIGILL handler installed in the forked probe child by cpucfg_test(). | |||
|  * | |||
|  * Closes the write end of the pipe without writing anything, so the parent's | |||
|  * read() sees end-of-file, then terminates the child. _exit() is used rather | |||
|  * than exit(): exit() is not async-signal-safe, and in a forked child it | |||
|  * would run the parent's atexit handlers and flush inherited stdio buffers. | |||
|  */ | |||
| static void handler(int signum) | |||
| { | |||
|   (void)signum; /* only SIGILL is routed here */ | |||
|   close(fd[1]); | |||
|   _exit(1); | |||
| } | |||
| /* Brief : Function to check if cpucfg supported on loongson | |||
|  * | |||
|  * The probe instruction is executed in a forked child so that a SIGILL on | |||
|  * CPUs without cpucfg cannot kill the calling process. The child reports | |||
|  * success by writing a 1 into the pipe; on SIGILL the handler closes the | |||
|  * pipe without writing, so the parent's read() returns 0. | |||
|  * | |||
|  * Return: 1 supported | |||
|  *         0 not supported (also returned when pipe() or fork() fails) | |||
|  */ | |||
| static int cpucfg_test(void) { | |||
|   pid_t pid; | |||
|   int status = 0; | |||
|   support_cpucfg = 0; | |||
|   /* Without a pipe there is no way to get the result back: report "unsupported". */ | |||
|   if (pipe(fd) != 0) | |||
|     return support_cpucfg; | |||
|   pid = fork(); | |||
|   if (pid == 0) { /* Subprocess */ | |||
|     struct sigaction act; | |||
|     close(fd[0]); | |||
|     /* Set signal action for SIGILL. Zero the struct first so sa_mask and | |||
|      * sa_flags are not left as stack garbage. */ | |||
|     memset(&act, 0, sizeof(act)); | |||
|     act.sa_handler = handler; | |||
|     sigaction(SIGILL,&act,NULL); | |||
|     /* Execute cpucfg in subprocess. */ | |||
|     __asm__ volatile( | |||
|       ".insn \n\t" | |||
|       ".word (0xc8080118) \n\t" | |||
|       ::: | |||
|     ); | |||
|     support_cpucfg = 1; | |||
|     write(fd[1],&support_cpucfg,sizeof(support_cpucfg)); | |||
|     close(fd[1]); | |||
|     /* _exit: do not run the parent's atexit handlers or flush its stdio buffers in the child. */ | |||
|     _exit(0); | |||
|   } else if (pid > 0){ /* Parent process*/ | |||
|     close(fd[1]); | |||
|     if ((waitpid(pid,&status,0) <= 0) || | |||
|         (read(fd[0],&support_cpucfg,sizeof(support_cpucfg)) <= 0)) | |||
|       support_cpucfg = 0; | |||
|     close(fd[0]); | |||
|   } else { | |||
|     /* fork() failed: release both pipe ends instead of leaking them. */ | |||
|     close(fd[0]); | |||
|     close(fd[1]); | |||
|     support_cpucfg = 0; | |||
|   } | |||
|   return support_cpucfg; | |||
| } | |||
| /* | |||
|  * Select the kernel table from the CPU feature word. | |||
|  * | |||
|  * The asm loads 1 into $8, executes a hand-encoded instruction (presumably | |||
|  * "cpucfg $9, $8" - confirm against the Loongson manual) and stores $9 into | |||
|  * flag. Must only be called after cpucfg_test() returned 1. | |||
|  * | |||
|  * Returns LOONGSON3R4 if MSA_MASK is set, else LOONGSON3R3 if MMI_MASK is | |||
|  * set, else NULL. | |||
|  */ | |||
| static gotoblas_t *get_coretype_from_cpucfg(void) { | |||
|   int flag = 0; | |||
|   __asm__ volatile( | |||
|     ".insn \n\t" | |||
|     "dli $8, 0x01 \n\t" | |||
|     ".word (0xc9084918) \n\t" | |||
|     "usw $9, 0x00(%0) \n\t" | |||
|     : | |||
|     : "r"(&flag) | |||
|     /* $8 and $9 are overwritten by the sequence above; declare them so the | |||
|      * compiler neither keeps live values there nor assigns %0 to them. */ | |||
|     : "memory", "$8", "$9" | |||
|   ); | |||
|   if (flag & MSA_MASK) | |||
|     return (&gotoblas_LOONGSON3R4); | |||
|   if (flag & MMI_MASK) | |||
|     return (&gotoblas_LOONGSON3R3); | |||
|   return NULL; | |||
| } | |||
| /* | |||
|  * Fallback detection: read the first "model name" line of /proc/cpuinfo and | |||
|  * match it against the known Loongson 3 model strings. | |||
|  * | |||
|  * Returns the matching kernel table, or NULL when the file cannot be opened, | |||
|  * has no usable "model name" line, names an unknown model, or the build is | |||
|  * not for Linux. | |||
|  */ | |||
| static gotoblas_t *get_coretype_from_cpuinfo(void) { | |||
| /* "linux" is not predefined in strict ISO modes; also accept the reserved spelling. */ | |||
| #if defined(linux) || defined(__linux__) | |||
|   FILE *infile; | |||
|   char buffer[512], *p; | |||
|   p = (char *)NULL; | |||
|   //Check model name for Loongson3 | |||
|   infile = fopen("/proc/cpuinfo", "r"); | |||
|   /* /proc may be unmounted or unreadable (containers, sandboxes). */ | |||
|   if (infile == NULL) | |||
|     return NULL; | |||
|   while (fgets(buffer, sizeof(buffer), infile)){ | |||
|     if (!strncmp("model name", buffer, 10)){ | |||
|       /* Only step past the ':' when one exists; p + 1 is at worst the | |||
|        * terminating NUL, so it never leaves the buffer. */ | |||
|       p = strchr(buffer, ':'); | |||
|       if (p != NULL) | |||
|         p++; | |||
|       break; | |||
|     } | |||
|   } | |||
|   fclose(infile); | |||
|   if(p != NULL){ | |||
|     if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")) | |||
|       return (&gotoblas_LOONGSON3R3); | |||
|     if (strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")) | |||
|       return (&gotoblas_LOONGSON3R4); | |||
|   } | |||
| #endif | |||
|   return NULL; | |||
| } | |||
| /* | |||
|  * Auto-detect the running core: use the cpucfg instruction when the probe | |||
|  * reports it as usable, otherwise fall back to parsing /proc/cpuinfo. | |||
|  * May return NULL when neither method recognises the CPU. | |||
|  */ | |||
| static gotoblas_t *get_coretype(void) { | |||
|   if (cpucfg_test() == 1) | |||
|     return get_coretype_from_cpucfg(); | |||
|   return get_coretype_from_cpuinfo(); | |||
| } | |||
| /* | |||
|  * Select and initialise the kernel table for this process. | |||
|  * | |||
|  * Does nothing if a table is already selected. Otherwise the core is taken | |||
|  * from the OPENBLAS_CORETYPE environment variable when set, or auto-detected; | |||
|  * if that yields nothing, loongson3r3 is used with a warning. The table's | |||
|  * init() hook is then called. The process exits with status 1 when the | |||
|  * selected table has no init function. | |||
|  */ | |||
| void gotoblas_dynamic_init(void) { | |||
|   char coremsg[128]; | |||
|   char *p; | |||
|   if (gotoblas) return; | |||
|   p = getenv("OPENBLAS_CORETYPE"); | |||
|   if ( p ) | |||
|   { | |||
|     gotoblas = force_coretype(p); | |||
|   } | |||
|   else | |||
|   { | |||
|     gotoblas = get_coretype(); | |||
|   } | |||
|   if (gotoblas == NULL) | |||
|   { | |||
|     snprintf(coremsg, sizeof(coremsg), "Falling back to loongson3r3 core\n"); | |||
|     openblas_warning(1, coremsg); | |||
|     gotoblas = &gotoblas_LOONGSON3R3; | |||
|   } | |||
|   if (gotoblas && gotoblas->init) { | |||
|     /* Bounded formatting with the same 20-character cap as before. The old | |||
|      * strncpy() into an uninitialised buffer did not guarantee a terminator | |||
|      * for names of 20 characters or more. */ | |||
|     snprintf(coremsg, sizeof(coremsg), "Core: %.20s\n", gotoblas_corename()); | |||
|     openblas_warning(2, coremsg); | |||
|     gotoblas -> init(); | |||
|   } else { | |||
|     openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); | |||
|     exit(1); | |||
|   } | |||
| } | |||
| /* Forget the selected kernel table so that a later gotoblas_dynamic_init() runs detection again. */ | |||
| void gotoblas_dynamic_quit(void) | |||
| { | |||
|   gotoblas = (gotoblas_t *)NULL; | |||
| } | |||
| @@ -6,10 +6,6 @@ extern gotoblas_t gotoblas_POWER8; | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | |||
| extern gotoblas_t gotoblas_POWER9; | |||
| #endif | |||
| //#if (!defined __GNUC__) || ( __GNUC__ >= 11) \ | |||
| // || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) | |||
| //#define HAVE_P10_SUPPORT 1 | |||
| //#endif | |||
| #ifdef HAVE_P10_SUPPORT | |||
| extern gotoblas_t gotoblas_POWER10; | |||
| #endif | |||
| @@ -27,7 +23,9 @@ static char *corename[] = { | |||
| #define NUM_CORETYPES 4 | |||
| char *gotoblas_corename(void) { | |||
| #ifndef C_PGI | |||
| if (gotoblas == &gotoblas_POWER6) return corename[1]; | |||
| #endif | |||
| if (gotoblas == &gotoblas_POWER8) return corename[2]; | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | |||
| if (gotoblas == &gotoblas_POWER9) return corename[3]; | |||
| @@ -38,10 +36,164 @@ char *gotoblas_corename(void) { | |||
| return corename[0]; | |||
| } | |||
| #if defined(__clang__) | |||
/*
 * Stand-in for __builtin_cpu_supports(), compiled only under the
 * surrounding __clang__ guard. It reports every queried feature as
 * unsupported, whatever `arg` names.
 */
static int __builtin_cpu_supports(char* arg)
{
  (void)arg; /* the feature name is intentionally ignored */
  return 0;
}
| #endif | |||
| #if defined(C_PGI) || defined(__clang__) | |||
| /* | |||
| * NV HPC compilers do not yet implement __builtin_cpu_is(). | |||
| * Fake a version here for use in the CPU detection code below. | |||
| * | |||
| * Strategy here is to first check the CPU to see what it actually is, | |||
| * and then test the input to see if what the CPU actually is matches | |||
| * what was requested. | |||
| */ | |||
| #include <string.h> | |||
| /* | |||
| * Define POWER processor version table. | |||
| * | |||
| * NOTE NV HPC SDK compilers only support POWER8 and POWER9 at this time | |||
| */ | |||
/* CPU generation codes stored in pvrPOWER[].cpu_type. */
#define CPU_UNKNOWN 0
#define CPU_POWER5 5
#define CPU_POWER6 6
#define CPU_POWER8 8
#define CPU_POWER9 9
#define CPU_POWER10 10

/* Builds one table row; keeps the designated-initializer form in one place. */
#define PVR_ENTRY(mask, value, name, type) \
  { .pvr_mask = (mask), .pvr_value = (value), .cpu_name = (name), .cpu_type = (type) }

/*
 * Processor Version Register lookup table.
 *
 * A row matches when (pvr & pvr_mask) == pvr_value. Rows are scanned in
 * order and the first match wins, so the more specific POWER9 DD2.0/DD2.1
 * masks must stay ahead of the generic POWER9 row, and the all-zero
 * terminator (which matches any value) must stay last.
 */
static struct {
  uint32_t pvr_mask;
  uint32_t pvr_value;
  const char* cpu_name;
  uint32_t cpu_type;
} pvrPOWER [] = {
  /* POWER6 in P5+ mode; 2.04-compliant processor */
  PVR_ENTRY(0xffffffff, 0x0f000001, "POWER5+",         CPU_POWER5),
  /* Power6 aka POWER6X */
  PVR_ENTRY(0xffff0000, 0x003e0000, "POWER6 (raw)",    CPU_POWER6),
  /* Power7 (reported with the POWER6 type code) */
  PVR_ENTRY(0xffff0000, 0x003f0000, "POWER7 (raw)",    CPU_POWER6),
  /* Power7+ (reported with the POWER6 type code) */
  PVR_ENTRY(0xffff0000, 0x004A0000, "POWER7+ (raw)",   CPU_POWER6),
  /* Power8E */
  PVR_ENTRY(0xffff0000, 0x004b0000, "POWER8E (raw)",   CPU_POWER8),
  /* Power8NVL */
  PVR_ENTRY(0xffff0000, 0x004c0000, "POWER8NVL (raw)", CPU_POWER8),
  /* Power8 */
  PVR_ENTRY(0xffff0000, 0x004d0000, "POWER8 (raw)",    CPU_POWER8),
  /* Power9 DD2.0 */
  PVR_ENTRY(0xffffefff, 0x004e0200, "POWER9 (raw)",    CPU_POWER9),
  /* Power9 DD 2.1 */
  PVR_ENTRY(0xffffefff, 0x004e0201, "POWER9 (raw)",    CPU_POWER9),
  /* Power9 DD2.2 or later */
  PVR_ENTRY(0xffff0000, 0x004e0000, "POWER9 (raw)",    CPU_POWER9),
  /* Power10 */
  PVR_ENTRY(0xffff0000, 0x00800000, "POWER10 (raw)",   CPU_POWER10),
  /* End of table, pvr_mask and pvr_value must be zero */
  PVR_ENTRY(0x0,        0x0,        "Unknown",         CPU_UNKNOWN),
};

#undef PVR_ENTRY
| static int __builtin_cpu_is(const char *cpu) { | |||
| int i; | |||
| uint32_t pvr; | |||
| uint32_t cpu_type; | |||
| asm("mfpvr %0" : "=r"(pvr)); | |||
| for (i = 0 ; i < sizeof pvrPOWER / sizeof *pvrPOWER ; ++i) { | |||
| if ((pvr & pvrPOWER[i].pvr_mask) == pvrPOWER[i].pvr_value) { | |||
| break; | |||
| } | |||
| } | |||
| #if defined(DEBUG) | |||
| printf("%s: returning CPU=%s, cpu_type=%p\n", __func__, | |||
| pvrPOWER[i].cpu_name, pvrPOWER[i].cpu_type); | |||
| #endif | |||
| cpu_type = pvrPOWER[i].cpu_type; | |||
| if (!strcmp(cpu, "power8")) | |||
| return cpu_type == CPU_POWER8; | |||
| if (!strcmp(cpu, "power9")) | |||
| return cpu_type == CPU_POWER9; | |||
| return 0; | |||
| } | |||
| #endif /* C_PGI || __clang__ */ | |||
| static gotoblas_t *get_coretype(void) { | |||
| #ifndef C_PGI | |||
| if (__builtin_cpu_is("power6") || __builtin_cpu_is("power6x")) | |||
| return &gotoblas_POWER6; | |||
| #endif | |||
| if (__builtin_cpu_is("power8")) | |||
| return &gotoblas_POWER8; | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | |||
| @@ -52,6 +204,11 @@ static gotoblas_t *get_coretype(void) { | |||
| if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")) | |||
| return &gotoblas_POWER10; | |||
| #endif | |||
| /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 11) || (__GNUC__ == 10 && __GNUC_MINOR__ >= 2) | |||
| if (__builtin_cpu_is("power10")) | |||
| return &gotoblas_POWER9; | |||
| #endif | |||
| return NULL; | |||
| } | |||
| @@ -72,7 +229,9 @@ static gotoblas_t *force_coretype(char * coretype) { | |||
| switch (found) | |||
| { | |||
| #ifndef C_PGI | |||
| case 1: return (&gotoblas_POWER6); | |||
| #endif | |||
| case 2: return (&gotoblas_POWER8); | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | |||
| case 3: return (&gotoblas_POWER9); | |||
| @@ -1,38 +1,7 @@ | |||
| #include "common.h" | |||
| #include "cpuid_zarch.h" | |||
| #include <stdbool.h> | |||
// Guard the use of getauxval() on glibc version >= 2.16
#ifdef __GLIBC__
#include <features.h>
#if __GLIBC_PREREQ(2, 16)
#include <sys/auxv.h>
#define HAVE_GETAUXVAL 1

/*
 * Return the hardware capability bits from getauxval(AT_HWCAP), masked by
 * the LD_HWCAP_MASK environment variable when it is set (parsed with
 * strtoul, base auto-detected).
 */
static unsigned long get_hwcap(void)
{
	unsigned long hwcap = getauxval(AT_HWCAP);
	char *maskenv;

	// honor requests for not using specific CPU features in LD_HWCAP_MASK
	maskenv = getenv("LD_HWCAP_MASK");
	if (maskenv)
		hwcap &= strtoul(maskenv, NULL, 0);

	return hwcap;
	// note that a missing auxval is interpreted as no capabilities
	// available, which is safe.
}

#else // __GLIBC_PREREQ(2, 16)
// "#warn" is not a preprocessing directive; "#warning" emits the intended
// diagnostic instead of aborting the build on old glibc.
#warning "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16"

static unsigned long get_hwcap(void) {
	// treat missing support for getauxval() as no capabilities available,
	// which is safe.
	return 0;
}
#endif // __GLIBC_PREREQ(2, 16)
#else // !__GLIBC__

// Without glibc there is no version macro to test, so report no
// capabilities rather than leaving get_hwcap() undefined.
static unsigned long get_hwcap(void) {
	return 0;
}
#endif // __GLIBC__
| extern gotoblas_t gotoblas_ZARCH_GENERIC; | |||
| #ifdef DYN_Z13 | |||
| @@ -44,25 +13,19 @@ extern gotoblas_t gotoblas_Z14; | |||
| #define NUM_CORETYPES 4 | |||
| extern int openblas_verbose(); | |||
| extern void openblas_warning(int verbose, const char* msg); | |||
| static char* corename[] = { | |||
| "unknown", | |||
| "Z13", | |||
| "Z14", | |||
| "ZARCH_GENERIC", | |||
| }; | |||
| char* gotoblas_corename(void) { | |||
| #ifdef DYN_Z13 | |||
| if (gotoblas == &gotoblas_Z13) return corename[1]; | |||
| if (gotoblas == &gotoblas_Z13) return cpuname[CPU_Z13]; | |||
| #endif | |||
| #ifdef DYN_Z14 | |||
| if (gotoblas == &gotoblas_Z14) return corename[2]; | |||
| if (gotoblas == &gotoblas_Z14) return cpuname[CPU_Z14]; | |||
| #endif | |||
| if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3]; | |||
| if (gotoblas == &gotoblas_ZARCH_GENERIC) return cpuname[CPU_GENERIC]; | |||
| return corename[0]; | |||
| return "unknown"; | |||
| } | |||
| #ifndef HWCAP_S390_VXE | |||
| @@ -79,25 +42,28 @@ char* gotoblas_corename(void) { | |||
| */ | |||
| static gotoblas_t* get_coretype(void) { | |||
| unsigned long hwcap __attribute__((unused)) = get_hwcap(); | |||
| int cpu = detect(); | |||
| #ifdef DYN_Z14 | |||
| switch(cpu) { | |||
| // z14 and z15 systems: exploit Vector Facility (SIMD) and | |||
| // Vector-Enhancements Facility 1 (float SIMD instructions), if present. | |||
| if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) | |||
| case CPU_Z14: | |||
| #ifdef DYN_Z14 | |||
| return &gotoblas_Z14; | |||
| #endif | |||
| #ifdef DYN_Z13 | |||
| // z13: Vector Facility (SIMD for double) | |||
| if (hwcap & HWCAP_S390_VX) | |||
| case CPU_Z13: | |||
| #ifdef DYN_Z13 | |||
| return &gotoblas_Z13; | |||
| #endif | |||
| default: | |||
| // fallback in case of missing compiler support, systems before z13, or | |||
| // when the OS does not advertise support for the Vector Facility (e.g., | |||
| // missing support in the OS kernel) | |||
| return &gotoblas_ZARCH_GENERIC; | |||
| return &gotoblas_ZARCH_GENERIC; | |||
| } | |||
| } | |||
| static gotoblas_t* force_coretype(char* coretype) { | |||
| @@ -108,28 +74,28 @@ static gotoblas_t* force_coretype(char* coretype) { | |||
| for (i = 0; i < NUM_CORETYPES; i++) | |||
| { | |||
| if (!strncasecmp(coretype, corename[i], 20)) | |||
| if (!strncasecmp(coretype, cpuname[i], 20)) | |||
| { | |||
| found = i; | |||
| break; | |||
| } | |||
| } | |||
| if (found == 1) { | |||
| if (found == CPU_Z13) { | |||
| #ifdef DYN_Z13 | |||
| return &gotoblas_Z13; | |||
| #else | |||
| openblas_warning(1, "Z13 support not compiled in"); | |||
| return NULL; | |||
| #endif | |||
| } else if (found == 2) { | |||
| } else if (found == CPU_Z14) { | |||
| #ifdef DYN_Z14 | |||
| return &gotoblas_Z14; | |||
| #else | |||
| openblas_warning(1, "Z14 support not compiled in"); | |||
| return NULL; | |||
| #endif | |||
| } else if (found == 3) { | |||
| } else if (found == CPU_GENERIC) { | |||
| return &gotoblas_ZARCH_GENERIC; | |||
| } | |||
| @@ -155,6 +121,11 @@ void gotoblas_dynamic_init(void) { | |||
| else | |||
| { | |||
| gotoblas = get_coretype(); | |||
| if (openblas_verbose() >= 2) { | |||
| snprintf(coremsg, sizeof(coremsg), "Choosing kernels based on getauxval(AT_HWCAP)=0x%lx\n", | |||
| getauxval(AT_HWCAP)); | |||
| openblas_warning(2, coremsg); | |||
| } | |||
| } | |||
| if (gotoblas == NULL) | |||
| @@ -165,9 +136,11 @@ void gotoblas_dynamic_init(void) { | |||
| } | |||
| if (gotoblas && gotoblas->init) { | |||
| strncpy(coren, gotoblas_corename(), 20); | |||
| sprintf(coremsg, "Core: %s\n", coren); | |||
| openblas_warning(2, coremsg); | |||
| if (openblas_verbose() >= 2) { | |||
| strncpy(coren, gotoblas_corename(), 20); | |||
| sprintf(coremsg, "Core: %s\n", coren); | |||
| openblas_warning(2, coremsg); | |||
| } | |||
| gotoblas->init(); | |||
| } | |||
| else { | |||
| @@ -73,6 +73,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
/*
 * Branch-prediction hints. Under GNU-compatible compilers they expand to
 * __builtin_expect() on the normalised (0/1) condition; elsewhere they
 * expand to the bare expression.
 *
 * Each macro has its own guard so that a predefined likely() does not
 * leave unlikely() undefined (or the other way round).
 */
#ifndef likely
#ifdef __GNUC__
#define likely(x) __builtin_expect(!!(x), 1)
#else
#define likely(x) (x)
#endif
#endif

#ifndef unlikely
#ifdef __GNUC__
#define unlikely(x) __builtin_expect(!!(x), 0)
#else
#define unlikely(x) (x)
#endif
#endif
| #if defined(USE_TLS) && defined(SMP) | |||
| #define COMPILE_TLS | |||
| @@ -222,11 +232,11 @@ int get_num_procs(void); | |||
| #else | |||
| int get_num_procs(void) { | |||
| static int nums = 0; | |||
| int ret; | |||
| #if defined(__GLIBC_PREREQ) | |||
| cpu_set_t cpuset,*cpusetp; | |||
| size_t size; | |||
| int ret; | |||
| #if defined(__GLIBC_PREREQ) | |||
| #if !__GLIBC_PREREQ(2, 7) | |||
| int i; | |||
| #if !__GLIBC_PREREQ(2, 6) | |||
| @@ -236,6 +246,15 @@ int get_num_procs(void) { | |||
| #endif | |||
| if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); | |||
| #if defined(USE_OPENMP) | |||
| #if _OPENMP >= 201511 | |||
| ret = omp_get_num_places(); | |||
| if (ret >0 ) nums = ret; | |||
| #endif | |||
| return nums; | |||
| #endif | |||
| #if !defined(OS_LINUX) | |||
| return nums; | |||
| #endif | |||
| @@ -428,7 +447,7 @@ extern int openblas_goto_num_threads_env(); | |||
| extern int openblas_omp_num_threads_env(); | |||
| int blas_get_cpu_number(void){ | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) | |||
| int max_num; | |||
| #endif | |||
| int blas_goto_num = 0; | |||
| @@ -436,7 +455,7 @@ int blas_get_cpu_number(void){ | |||
| if (blas_num_threads) return blas_num_threads; | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) | |||
| max_num = get_num_procs(); | |||
| #endif | |||
| @@ -460,7 +479,7 @@ int blas_get_cpu_number(void){ | |||
| else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; | |||
| else blas_num_threads = MAX_CPU_NUMBER; | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) | |||
| if (blas_num_threads > max_num) blas_num_threads = max_num; | |||
| #endif | |||
| @@ -1241,7 +1260,7 @@ UNLOCK_COMMAND(&alloc_lock); | |||
| func = &memoryalloc[0]; | |||
| while ((func != NULL) && (map_address == (void *) -1)) { | |||
| while ((*func != NULL) && (map_address == (void *) -1)) { | |||
| map_address = (*func)((void *)base_address); | |||
| @@ -1291,7 +1310,12 @@ UNLOCK_COMMAND(&alloc_lock); | |||
| return (void *)(((char *)alloc_info) + sizeof(struct alloc_t)); | |||
| error: | |||
| printf("OpenBLAS : Program will terminate because you tried to allocate too many memory regions.\n"); | |||
| printf("OpenBLAS : Program will terminate because you tried to allocate too many TLS memory regions.\n"); | |||
| printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS); | |||
| printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n"); | |||
| printf("a sufficiently small number. This error typically occurs when the software that relies on\n"); | |||
| printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n"); | |||
| printf("cpu cores than what OpenBLAS was configured to handle.\n"); | |||
| return NULL; | |||
| } | |||
| @@ -1619,10 +1643,12 @@ static int on_process_term(void) | |||
| #else | |||
| #pragma data_seg(".CRT$XLB") | |||
| #endif | |||
| static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; | |||
| #ifdef _WIN64 | |||
| static const PIMAGE_TLS_CALLBACK dll_callback(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; | |||
| #pragma const_seg() | |||
| #else | |||
| static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain; | |||
| #pragma data_seg() | |||
| #endif | |||
| @@ -1631,10 +1657,12 @@ static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOI | |||
| #else | |||
| #pragma data_seg(".CRT$XTU") | |||
| #endif | |||
| static int(*p_process_term)(void) = on_process_term; | |||
| #ifdef _WIN64 | |||
| static const int(*p_process_term)(void) = on_process_term; | |||
| #pragma const_seg() | |||
| #else | |||
| static int(*p_process_term)(void) = on_process_term; | |||
| #pragma data_seg() | |||
| #endif | |||
| #endif | |||
| @@ -1668,16 +1696,23 @@ void gotoblas_dummy_for_PGI(void) { | |||
| #ifndef MEM_LARGE_PAGES | |||
| #define MEM_LARGE_PAGES 0x20000000 | |||
| #endif | |||
| #else | |||
| #elif !defined(OS_EMBEDDED) | |||
| #define ALLOC_MMAP | |||
| #define ALLOC_MALLOC | |||
| #else | |||
| #define ALLOC_MALLOC | |||
| inline int puts(const char *str) { return 0; } | |||
| inline int printf(const char *format, ...) { return 0; } | |||
| inline char *getenv(const char *name) { return ""; } | |||
| inline int atoi(const char *str) { return 0; } | |||
| #endif | |||
| #include <stdlib.h> | |||
| #include <stdio.h> | |||
| #include <fcntl.h> | |||
| #if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT) | |||
| #if (!defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)) && !defined(OS_EMBEDDED) | |||
| #include <sys/mman.h> | |||
| #ifndef NO_SYSV_IPC | |||
| #include <sys/shm.h> | |||
| @@ -1691,7 +1726,6 @@ void gotoblas_dummy_for_PGI(void) { | |||
| #include <sys/sysinfo.h> | |||
| #include <sched.h> | |||
| #include <errno.h> | |||
| #include <linux/unistd.h> | |||
| #include <sys/syscall.h> | |||
| #include <sys/time.h> | |||
| #include <sys/resource.h> | |||
| @@ -1767,11 +1801,12 @@ int get_num_procs(void); | |||
| int get_num_procs(void) { | |||
| static int nums = 0; | |||
| int ret; | |||
| #if defined(__GLIBC_PREREQ) | |||
| cpu_set_t cpuset,*cpusetp; | |||
| size_t size; | |||
| int ret; | |||
| #if !__GLIBC_PREREQ(2, 7) | |||
| int i; | |||
| #if !__GLIBC_PREREQ(2, 6) | |||
| @@ -1781,10 +1816,20 @@ int get_num_procs(void) { | |||
| #endif | |||
| if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF); | |||
| #if defined(USE_OPENMP) | |||
| /* if (omp_get_proc_bind() != omp_proc_bind_false) */ | |||
| #if _OPENMP >= 201511 | |||
| ret = omp_get_num_places(); | |||
| if (ret >0 ) nums = ret; | |||
| #endif | |||
| return nums; | |||
| #endif | |||
| #if !defined(OS_LINUX) | |||
| return nums; | |||
| #endif | |||
| #if !defined(__GLIBC_PREREQ) | |||
| return nums; | |||
| #else | |||
| @@ -1969,7 +2014,7 @@ extern int openblas_goto_num_threads_env(); | |||
| extern int openblas_omp_num_threads_env(); | |||
| int blas_get_cpu_number(void){ | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) | |||
| int max_num; | |||
| #endif | |||
| int blas_goto_num = 0; | |||
| @@ -1977,7 +2022,7 @@ int blas_get_cpu_number(void){ | |||
| if (blas_num_threads) return blas_num_threads; | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) | |||
| max_num = get_num_procs(); | |||
| #endif | |||
| @@ -2001,7 +2046,7 @@ int blas_get_cpu_number(void){ | |||
| else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; | |||
| else blas_num_threads = MAX_CPU_NUMBER; | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) | |||
| #if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU) | |||
| if (blas_num_threads > max_num) blas_num_threads = max_num; | |||
| #endif | |||
| @@ -2045,6 +2090,7 @@ struct release_t { | |||
| int hugetlb_allocated = 0; | |||
| static struct release_t release_info[NUM_BUFFERS]; | |||
| static struct release_t *new_release_info; | |||
| static int release_pos = 0; | |||
| #if defined(OS_LINUX) && !defined(NO_WARMUP) | |||
| @@ -2095,8 +2141,13 @@ static void *alloc_mmap(void *address){ | |||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||
| LOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| if (likely(release_pos < NUM_BUFFERS)) { | |||
| release_info[release_pos].address = map_address; | |||
| release_info[release_pos].func = alloc_mmap_free; | |||
| } else { | |||
| new_release_info[release_pos-NUM_BUFFERS].address = map_address; | |||
| new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free; | |||
| } | |||
| release_pos ++; | |||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| @@ -2259,8 +2310,13 @@ static void *alloc_mmap(void *address){ | |||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||
| LOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| if (likely(release_pos < NUM_BUFFERS)) { | |||
| release_info[release_pos].address = map_address; | |||
| release_info[release_pos].func = alloc_mmap_free; | |||
| } else { | |||
| new_release_info[release_pos-NUM_BUFFERS].address = map_address; | |||
| new_release_info[release_pos-NUM_BUFFERS].func = alloc_mmap_free; | |||
| } | |||
| release_pos ++; | |||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| @@ -2292,8 +2348,13 @@ static void *alloc_malloc(void *address){ | |||
| if (map_address == (void *)NULL) map_address = (void *)-1; | |||
| if (map_address != (void *)-1) { | |||
| if (likely(release_pos < NUM_BUFFERS)) { | |||
| release_info[release_pos].address = map_address; | |||
| release_info[release_pos].func = alloc_malloc_free; | |||
| } else { | |||
| new_release_info[release_pos-NUM_BUFFERS].address = map_address; | |||
| new_release_info[release_pos-NUM_BUFFERS].func = alloc_malloc_free; | |||
| } | |||
| release_pos ++; | |||
| } | |||
| @@ -2326,8 +2387,13 @@ static void *alloc_qalloc(void *address){ | |||
| if (map_address == (void *)NULL) map_address = (void *)-1; | |||
| if (map_address != (void *)-1) { | |||
| if (likely(release_pos < NUM_BUFFERS)) { | |||
| release_info[release_pos].address = map_address; | |||
| release_info[release_pos].func = alloc_qalloc_free; | |||
| } else { | |||
| new_release_info[release_pos-NUM_BUFFERS].address = map_address; | |||
| new_release_info[release_pos-NUM_BUFFERS].func = alloc_qalloc_free; | |||
| } | |||
| release_pos ++; | |||
| } | |||
| @@ -2355,8 +2421,13 @@ static void *alloc_windows(void *address){ | |||
| if (map_address == (void *)NULL) map_address = (void *)-1; | |||
| if (map_address != (void *)-1) { | |||
| if (likely(release_pos < NUM_BUFFERS)) { | |||
| release_info[release_pos].address = map_address; | |||
| release_info[release_pos].func = alloc_windows_free; | |||
| } else { | |||
| new_release_info[release_pos-NUM_BUFFERS].address = map_address; | |||
| new_release_info[release_pos-NUM_BUFFERS].func = alloc_windows_free; | |||
| } | |||
| release_pos ++; | |||
| } | |||
| @@ -2399,9 +2470,15 @@ static void *alloc_devicedirver(void *address){ | |||
| fd, 0); | |||
| if (map_address != (void *)-1) { | |||
| if (likely(release_pos < NUM_BUFFERS)) { | |||
| release_info[release_pos].address = map_address; | |||
| release_info[release_pos].attr = fd; | |||
| release_info[release_pos].func = alloc_devicedirver_free; | |||
| } else { | |||
| new_release_info[release_pos-NUM_BUFFERS].address = map_address; | |||
| new_release_info[release_pos-NUM_BUFFERS].attr = fd; | |||
| new_release_info[release_pos-NUM_BUFFERS].func = alloc_devicedirver_free; | |||
| } | |||
| release_pos ++; | |||
| } | |||
| @@ -2435,9 +2512,15 @@ static void *alloc_shm(void *address){ | |||
| shmctl(shmid, IPC_RMID, 0); | |||
| if (likely(release_pos < NUM_BUFFERS)) { | |||
| release_info[release_pos].address = map_address; | |||
| release_info[release_pos].attr = shmid; | |||
| release_info[release_pos].func = alloc_shm_free; | |||
| } else { | |||
| new_release_info[release_pos-NUM_BUFFERS].address = map_address; | |||
| new_release_info[release_pos-NUM_BUFFERS].attr = shmid; | |||
| new_release_info[release_pos-NUM_BUFFERS].func = alloc_shm_free; | |||
| } | |||
| release_pos ++; | |||
| } | |||
| @@ -2541,8 +2624,13 @@ static void *alloc_hugetlb(void *address){ | |||
| #endif | |||
| if (map_address != (void *)-1){ | |||
| if (likely(release_pos < NUM_BUFFERS)) { | |||
| release_info[release_pos].address = map_address; | |||
| release_info[release_pos].func = alloc_hugetlb_free; | |||
| } else { | |||
| new_release_info[release_pos-NUM_BUFFERS].address = map_address; | |||
| new_release_info[release_pos-NUM_BUFFERS].func = alloc_hugetlb_free; | |||
| } | |||
| release_pos ++; | |||
| } | |||
| @@ -2589,9 +2677,15 @@ static void *alloc_hugetlbfile(void *address){ | |||
| fd, 0); | |||
| if (map_address != (void *)-1) { | |||
| if (likely(release_pos < NUM_BUFFERS)) { | |||
| release_info[release_pos].address = map_address; | |||
| release_info[release_pos].attr = fd; | |||
| release_info[release_pos].func = alloc_hugetlbfile_free; | |||
| } else { | |||
| new_release_info[release_pos-NUM_BUFFERS].address = map_address; | |||
| new_release_info[release_pos-NUM_BUFFERS].attr = fd; | |||
| new_release_info[release_pos-NUM_BUFFERS].func = alloc_hugetlbfile_free; | |||
| } | |||
| release_pos ++; | |||
| } | |||
| @@ -2621,8 +2715,25 @@ static volatile struct { | |||
| } memory[NUM_BUFFERS]; | |||
| static int memory_initialized = 0; | |||
| struct newmemstruct | |||
| { | |||
| BLASULONG lock; | |||
| void *addr; | |||
| #if defined(WHEREAMI) && !defined(USE_OPENMP) | |||
| int pos; | |||
| #endif | |||
| int used; | |||
| #ifndef __64BIT__ | |||
| char dummy[48]; | |||
| #else | |||
| char dummy[40]; | |||
| #endif | |||
| }; | |||
| static volatile struct newmemstruct *newmemory; | |||
| static int memory_initialized = 0; | |||
| static int memory_overflowed = 0; | |||
| /* Memory allocation routine */ | |||
| /* procpos ... indicates where it comes from */ | |||
| /* 0 : Level 3 functions */ | |||
| @@ -2631,6 +2742,8 @@ static int memory_initialized = 0; | |||
| void *blas_memory_alloc(int procpos){ | |||
| int i; | |||
| int position; | |||
| #if defined(WHEREAMI) && !defined(USE_OPENMP) | |||
| int mypos = 0; | |||
| @@ -2761,6 +2874,25 @@ void *blas_memory_alloc(int procpos){ | |||
| position ++; | |||
| } while (position < NUM_BUFFERS); | |||
| if (memory_overflowed) { | |||
| do { | |||
| RMB; | |||
| #if defined(USE_OPENMP) | |||
| if (!newmemory[position-NUM_BUFFERS].used) { | |||
| blas_lock(&newmemory[position-NUM_BUFFERS].lock); | |||
| #endif | |||
| if (!newmemory[position-NUM_BUFFERS].used) goto allocation2; | |||
| #if defined(USE_OPENMP) | |||
| blas_unlock(&newmemory[position-NUM_BUFFERS].lock); | |||
| } | |||
| #endif | |||
| position ++; | |||
| } while (position < 512+NUM_BUFFERS); | |||
| } | |||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| @@ -2788,7 +2920,7 @@ void *blas_memory_alloc(int procpos){ | |||
| func = &memoryalloc[0]; | |||
| while ((func != NULL) && (map_address == (void *) -1)) { | |||
| while ((*func != NULL) && (map_address == (void *) -1)) { | |||
| map_address = (*func)((void *)base_address); | |||
| @@ -2868,8 +3000,102 @@ void *blas_memory_alloc(int procpos){ | |||
| return (void *)memory[position].addr; | |||
| error: | |||
| printf("BLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); | |||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||
| LOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| if (memory_overflowed) goto terminate; | |||
| fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n"); | |||
| memory_overflowed=1; | |||
| new_release_info = (struct release_t*) malloc(512*sizeof(struct release_t)); | |||
| newmemory = (struct newmemstruct*) malloc(512*sizeof(struct newmemstruct)); | |||
| for (i = 0; i < 512; i++) { | |||
| newmemory[i].addr = (void *)0; | |||
| #if defined(WHEREAMI) && !defined(USE_OPENMP) | |||
| newmemory[i].pos = -1; | |||
| #endif | |||
| newmemory[i].used = 0; | |||
| newmemory[i].lock = 0; | |||
| } | |||
| allocation2: | |||
| newmemory[position-NUM_BUFFERS].used = 1; | |||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| #else | |||
| blas_unlock(&newmemory[position-NUM_BUFFERS].lock); | |||
| #endif | |||
| do { | |||
| #ifdef DEBUG | |||
| printf("Allocation Start : %lx\n", base_address); | |||
| #endif | |||
| map_address = (void *)-1; | |||
| func = &memoryalloc[0]; | |||
| while ((*func != NULL) && (map_address == (void *) -1)) { | |||
| map_address = (*func)((void *)base_address); | |||
| #ifdef ALLOC_DEVICEDRIVER | |||
| if ((*func == alloc_devicedirver) && (map_address == (void *)-1)) { | |||
| fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n"); | |||
| } | |||
| #endif | |||
| #ifdef ALLOC_HUGETLBFILE | |||
| if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) { | |||
| #ifndef OS_WINDOWS | |||
| fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n"); | |||
| #endif | |||
| } | |||
| #endif | |||
| #if (defined ALLOC_SHM) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS) | |||
| if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1; | |||
| #endif | |||
| func ++; | |||
| } | |||
| #ifdef DEBUG | |||
| printf(" Success -> %08lx\n", map_address); | |||
| #endif | |||
| if (((BLASLONG) map_address) == -1) base_address = 0UL; | |||
| if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE; | |||
| } while ((BLASLONG)map_address == -1); | |||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||
| LOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| newmemory[position-NUM_BUFFERS].addr = map_address; | |||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| #ifdef DEBUG | |||
| printf(" Mapping Succeeded. %p(%d)\n", (void *)newmemory[position-NUM_BUFFERS].addr, position); | |||
| #endif | |||
| #if defined(WHEREAMI) && !defined(USE_OPENMP) | |||
| if (newmemory[position-NUM_BUFFERS].pos == -1) newmemory[position-NUM_BUFFERS].pos = mypos; | |||
| #endif | |||
| return (void *)newmemory[position-NUM_BUFFERS].addr; | |||
| terminate: | |||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| printf("OpenBLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n"); | |||
| printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS); | |||
| printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n"); | |||
| printf("a sufficiently small number. This error typically occurs when the software that relies on\n"); | |||
| printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n"); | |||
| printf("cpu cores than what OpenBLAS was configured to handle.\n"); | |||
| return NULL; | |||
| } | |||
| @@ -2888,13 +3114,28 @@ void blas_memory_free(void *free_area){ | |||
| while ((position < NUM_BUFFERS) && (memory[position].addr != free_area)) | |||
| position++; | |||
| if (position >= NUM_BUFFERS) goto error; | |||
| if (position >= NUM_BUFFERS && !memory_overflowed) goto error; | |||
| #ifdef DEBUG | |||
| if (memory[position].addr != free_area) goto error; | |||
| printf(" Position : %d\n", position); | |||
| #endif | |||
| if (unlikely(memory_overflowed && position >= NUM_BUFFERS)) { | |||
| /* free_area was not found in memory[]; keep scanning in the overflow table. */ | |||
| /* position keeps counting from NUM_BUFFERS, so overflow slots are indexed by position-NUM_BUFFERS. */ | |||
| while ((position < NUM_BUFFERS+512) && (newmemory[position-NUM_BUFFERS].addr != free_area)) | |||
| position++; | |||
| /* Address is in neither table: report it instead of writing past the end of newmemory[]. */ | |||
| if (position >= NUM_BUFFERS+512) goto error; | |||
| // arm: ensure all writes are finished before other thread takes this memory | |||
| WMB; | |||
| /* Release the slot that was actually matched: apply the same NUM_BUFFERS offset as the search loop. */ | |||
| newmemory[position-NUM_BUFFERS].used = 0; | |||
| #if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP) | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| #endif | |||
| #ifdef DEBUG | |||
| printf("Unmap from overflow area succeeded.\n\n"); | |||
| #endif | |||
| return; | |||
| } else { | |||
| // arm: ensure all writes are finished before other thread takes this memory | |||
| WMB; | |||
| @@ -2908,7 +3149,7 @@ void blas_memory_free(void *free_area){ | |||
| #endif | |||
| return; | |||
| } | |||
| error: | |||
| printf("BLAS : Bad memory unallocation! : %4d %p\n", position, free_area); | |||
| @@ -2943,7 +3184,10 @@ void blas_shutdown(void){ | |||
| LOCK_COMMAND(&alloc_lock); | |||
| for (pos = 0; pos < release_pos; pos ++) { /* invoke the release callback recorded for every slot below release_pos */ | |||
| if (likely(pos < NUM_BUFFERS)) | |||
| release_info[pos].func(&release_info[pos]); | |||
| else /* slots from NUM_BUFFERS upward are stored in new_release_info, indexed from zero */ | |||
| new_release_info[pos-NUM_BUFFERS].func(&new_release_info[pos-NUM_BUFFERS]); | |||
| } | |||
| #ifdef SEEK_ADDRESS | |||
| @@ -2960,6 +3204,15 @@ void blas_shutdown(void){ | |||
| #endif | |||
| memory[pos].lock = 0; | |||
| } | |||
| if (memory_overflowed) /* the overflow table is only reset when the overflow flag is set */ | |||
| for (pos = 0; pos < 512; pos ++){ /* 512 is the same overflow slot count that blas_memory_free scans */ | |||
| newmemory[pos].addr = (void *)0; | |||
| newmemory[pos].used = 0; | |||
| #if defined(WHEREAMI) && !defined(USE_OPENMP) | |||
| newmemory[pos].pos = -1; /* -1 is the value blas_memory_alloc checks for before storing mypos */ | |||
| #endif | |||
| newmemory[pos].lock = 0; | |||
| } | |||
| UNLOCK_COMMAND(&alloc_lock); | |||
| @@ -183,7 +183,7 @@ int get_L2_size(void){ | |||
| defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ | |||
| defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ | |||
| defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || \ | |||
| defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| cpuid(0x80000006, &eax, &ebx, &ecx, &edx); | |||
| @@ -269,7 +269,7 @@ void blas_set_parameter(void){ | |||
| int factor; | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || \ | |||
| defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || \ | |||
| defined(SKYLAKEX) || defined(COOPERLAKE) | |||
| defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | |||
| int size = 16; | |||
| #else | |||
| int size = get_L2_size(); | |||
| @@ -524,6 +524,9 @@ void blas_set_parameter(void){ | |||
| xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M; | |||
| #endif | |||
| #ifdef BUILD_BFLOAT16 | |||
| sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; | |||
| #endif | |||
| sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; | |||
| dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; | |||
| cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; | |||
| @@ -629,7 +632,9 @@ void blas_set_parameter(void){ | |||
| xgemm_p = 16 * (size + 1); | |||
| #endif | |||
| #ifdef BUILD_BFLOAT16 | |||
| sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; | |||
| #endif | |||
| sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; | |||
| dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; | |||
| cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; | |||
| @@ -717,7 +722,7 @@ void blas_set_parameter(void){ | |||
| #if defined(ARCH_MIPS64) | |||
| void blas_set_parameter(void){ | |||
| #if defined(LOONGSON3A) | |||
| #if defined(LOONGSON3R3) || defined(LOONGSON3R4) | |||
| #ifdef SMP | |||
| if(blas_num_threads == 1){ | |||
| #endif | |||
| @@ -731,20 +736,6 @@ void blas_set_parameter(void){ | |||
| #endif | |||
| #endif | |||
| #if defined(LOONGSON3B) | |||
| #ifdef SMP | |||
| if(blas_num_threads == 1 || blas_num_threads == 2){ | |||
| #endif | |||
| //single thread | |||
| dgemm_r = 640; | |||
| #ifdef SMP | |||
| }else{ | |||
| //multi thread | |||
| dgemm_r = 160; | |||
| } | |||
| #endif | |||
| #endif | |||
| } | |||
| #endif | |||
| @@ -139,9 +139,17 @@ endif | |||
| ifneq (,$(filter 1 2,$(NOFORTRAN))) | |||
| #only build without Fortran | |||
| $(CC) $(CFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||
| else | |||
| ifeq ($(F_COMPILER), INTEL) | |||
| $(FC) $(FFLAGS) $(LDFLAGS) -all-load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def | |||
| else | |||
| ifeq ($(F_COMPILER), FLANG) | |||
| $(FC) $(FFLAGS) $(LDFLAGS) -fno-fortran-main -Mnomain -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||
| else | |||
| $(FC) $(FFLAGS) $(LDFLAGS) -all_load -headerpad_max_install_names -install_name "$(CURDIR)/../$(INTERNALNAME)" -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) | |||
| endif | |||
| endif | |||
| endif | |||
| dllinit.$(SUFFIX) : dllinit.c | |||
| $(CC) $(CFLAGS) -c -o $(@F) -s $< | |||
| @@ -1,4 +1,4 @@ | |||
| #!/usr/bin/perl | |||
| #!/usr/bin/env perl | |||
| # Changelog | |||
| # 2017/09/03 staticfloat | |||