| @@ -0,0 +1,81 @@ | |||
| name: continuous build | |||
| on: [push, pull_request] | |||
| jobs: | |||
| build: | |||
| runs-on: ${{ matrix.os }} | |||
| strategy: | |||
| fail-fast: false | |||
| matrix: | |||
| os: [ubuntu-latest, macos-latest] | |||
| build: [cmake, make] | |||
| steps: | |||
| - name: Checkout repository | |||
| uses: actions/checkout@v2 | |||
| - name: Compilation cache | |||
| uses: actions/cache@v2 | |||
| with: | |||
| path: ~/.ccache | |||
| # We include the commit sha in the cache key, as new cache entries are | |||
| # only created if there is no existing entry for the key yet. | |||
| key: ${{ runner.os }}-ccache-${{ github.sha }} | |||
| # Restore any ccache cache entry, if none for | |||
| # ${{ runner.os }}-ccache-${{ github.sha }} exists | |||
| restore-keys: | | |||
| ${{ runner.os }}-ccache | |||
| - name: Print system information | |||
| run: | | |||
| if [ "$RUNNER_OS" == "Linux" ]; then | |||
| cat /proc/cpuinfo | |||
| elif [ "$RUNNER_OS" == "macOS" ]; then | |||
| sysctl -a | grep machdep.cpu | |||
| else | |||
| echo "$RUNNER_OS not supported" | |||
| exit 1 | |||
| fi | |||
| # Install the toolchain packages used by the build steps and cap the ccache size. | |||
| - name: Install Dependencies | |||
| run: | | |||
| if [ "$RUNNER_OS" == "Linux" ]; then | |||
| # Refresh the package index first so the install does not fail on stale package lists | |||
| sudo apt-get update | |||
| sudo apt-get install -y gfortran cmake ccache | |||
| elif [ "$RUNNER_OS" == "macOS" ]; then | |||
| brew install coreutils cmake ccache | |||
| else | |||
| echo "$RUNNER_OS not supported" | |||
| exit 1 | |||
| fi | |||
| ccache -M 300M # Limit the ccache size; Github's overall cache limit is 5GB | |||
| - name: Build | |||
| if: matrix.build == 'make' | |||
| run: | | |||
| if [ "$RUNNER_OS" == "Linux" ]; then | |||
| export PATH="/usr/lib/ccache:${PATH}" | |||
| elif [ "$RUNNER_OS" == "macOS" ]; then | |||
| export PATH="$(brew --prefix)/opt/ccache/libexec:${PATH}" | |||
| else | |||
| echo "$RUNNER_OS not supported" | |||
| exit 1 | |||
| fi | |||
| make -j$(nproc) DYNAMIC_ARCH=1 USE_OPENMP=0 | |||
| - name: CMake build | |||
| if: matrix.build == 'cmake' | |||
| run: | | |||
| if [ "$RUNNER_OS" == "Linux" ]; then | |||
| export PATH="/usr/lib/ccache:${PATH}" | |||
| elif [ "$RUNNER_OS" == "macOS" ]; then | |||
| export PATH="$(brew --prefix)/opt/ccache/libexec:${PATH}" | |||
| else | |||
| echo "$RUNNER_OS not supported" | |||
| exit 1 | |||
| fi | |||
| mkdir build | |||
| cd build | |||
| cmake -DDYNAMIC_ARCH=1 -DNOFORTRAN=0 -DBUILD_WITHOUT_LAPACK=0 -DCMAKE_VERBOSE_MAKEFILE=ON -DCMAKE_BUILD_TYPE=Release .. | |||
| make -j$(nproc) | |||
| @@ -21,6 +21,7 @@ jobs: | |||
| build-OpenBLAS-with-Homebrew: | |||
| runs-on: macos-latest | |||
| env: | |||
| DEVELOPER_DIR: /Applications/Xcode_11.4.1.app/Contents/Developer | |||
| HOMEBREW_DEVELOPER: "ON" | |||
| HOMEBREW_DISPLAY_INSTALL_TIMES: "ON" | |||
| HOMEBREW_NO_ANALYTICS: "ON" | |||
| @@ -70,6 +70,7 @@ test/SBLAT2.SUMM | |||
| test/SBLAT3.SUMM | |||
| test/ZBLAT2.SUMM | |||
| test/ZBLAT3.SUMM | |||
| test/SHBLAT3.SUMM | |||
| test/cblat1 | |||
| test/cblat2 | |||
| test/cblat3 | |||
| @@ -79,6 +80,7 @@ test/dblat3 | |||
| test/sblat1 | |||
| test/sblat2 | |||
| test/sblat3 | |||
| test/test_shgemm | |||
| test/zblat1 | |||
| test/zblat2 | |||
| test/zblat3 | |||
| @@ -16,7 +16,6 @@ matrix: | |||
| before_script: &common-before | |||
| - COMMON_FLAGS="DYNAMIC_ARCH=1 TARGET=NEHALEM NUM_THREADS=32" | |||
| script: | |||
| - set -e | |||
| - make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| - make -C test $COMMON_FLAGS $BTYPE | |||
| - make -C ctest $COMMON_FLAGS $BTYPE | |||
| @@ -108,7 +107,6 @@ matrix: | |||
| - sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers' | |||
| before_script: *common-before | |||
| script: | |||
| - set -e | |||
| # XXX: Disable some warnings for now to avoid exceeding Travis limit for log size. | |||
| - alpine make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE | |||
| CFLAGS="-Wno-misleading-indentation -Wno-sign-conversion -Wno-incompatible-pointer-types" | |||
| @@ -151,7 +149,6 @@ matrix: | |||
| before_script: | |||
| - COMMON_ARGS="-DTARGET=NEHALEM -DNUM_THREADS=32" | |||
| script: | |||
| - set -e | |||
| - mkdir build | |||
| - CONFIG=Release | |||
| - cmake -Bbuild -H. $CMAKE_ARGS $COMMON_ARGS -DCMAKE_BUILD_TYPE=$CONFIG | |||
| @@ -6,7 +6,8 @@ cmake_minimum_required(VERSION 2.8.5) | |||
| project(OpenBLAS C ASM) | |||
| set(OpenBLAS_MAJOR_VERSION 0) | |||
| set(OpenBLAS_MINOR_VERSION 3) | |||
| set(OpenBLAS_PATCH_VERSION 9.dev) | |||
| set(OpenBLAS_PATCH_VERSION 10.dev) | |||
| set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") | |||
| # Adhere to GNU filesystem layout conventions | |||
| @@ -23,6 +24,7 @@ option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS fun | |||
| option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF) | |||
| option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) | |||
| option(BUILD_RELAPACK "Build with ReLAPACK (recursive implementation of several LAPACK functions on top of standard LAPACK)" OFF) | |||
| option(USE_LOCKING "Use locks even in single-threaded builds to make them callable from multiple threads" OFF) | |||
| if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") | |||
| option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding processes from e.g. R or numpy/scipy to a single core" ON) | |||
| else() | |||
| @@ -86,9 +88,13 @@ if (NOT NO_LAPACK) | |||
| list(APPEND SUBDIRS lapack) | |||
| endif () | |||
| # Half-precision support is opt-in: default BUILD_HALF to false unless it has already been defined. | |||
| if (NOT DEFINED BUILD_HALF) | |||
| set (BUILD_HALF false) | |||
| endif () | |||
| # set which float types we want to build for | |||
| if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16) | |||
| # if none are defined, build for all | |||
| # set(BUILD_HALF true) | |||
| set(BUILD_SINGLE true) | |||
| set(BUILD_DOUBLE true) | |||
| set(BUILD_COMPLEX true) | |||
| @@ -120,6 +126,11 @@ if (BUILD_COMPLEX16) | |||
| list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE | |||
| endif () | |||
| # When BUILD_HALF is enabled, report it and add HALF to the list of float types to build. | |||
| if (BUILD_HALF) | |||
| message(STATUS "Building Half Precision") | |||
| list(APPEND FLOAT_TYPES "HALF") # defines nothing | |||
| endif () | |||
| if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN") | |||
| message(FATAL_ERROR "Detecting CPU failed. Please set TARGET explicitly, e.g. make TARGET=your_cpu_target. Please read README for details.") | |||
| endif () | |||
| @@ -234,7 +245,7 @@ if (BUILD_SHARED_LIBS AND BUILD_RELAPACK) | |||
| if (NOT MSVC) | |||
| target_link_libraries(${OpenBLAS_LIBNAME} "-Wl,-allow-multiple-definition") | |||
| else() | |||
| target_link_libraries(${OpenBLAS_LIBNAME} "/FORCE:MULTIPLE") | |||
| set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /FORCE:MULTIPLE") | |||
| endif() | |||
| endif() | |||
| @@ -180,3 +180,13 @@ In chronological order: | |||
| * [2019-12-23] optimize AVX2 CGEMM and ZGEMM | |||
| * [2019-12-30] AVX2 CGEMM3M & ZGEMM3M kernels | |||
| * [2020-01-07] optimize AVX2 SGEMM and STRMM | |||
| * Rajalakshmi Srinivasaraghavan <https://github.com/RajalakshmiSR> | |||
| * [2020-04-15] Half-precision GEMM for bfloat16 | |||
| * Marius Hillenbrand <https://github.com/mhillenibm> | |||
| * [2020-05-12] Revise dynamic architecture detection for IBM z | |||
| * [2020-05-12] Add new sgemm and strmm kernel for IBM z14 | |||
| * Danfeng Zhang <https://github.com/craft-zhang> | |||
| * [2020-05-20] Improve performance of SGEMM and STRMM on Arm Cortex-A53 | |||
| @@ -1,4 +1,77 @@ | |||
| OpenBLAS ChangeLog | |||
| ==================================================================== | |||
| Version 0.3.10 | |||
| 14-Jun-2020 | |||
| common: | |||
| * Improved thread locking behaviour in blas_server and parallel getrf | |||
| * Imported bugfix 394 from LAPACK (spurious reference to "XERBL" | |||
| due to overlong lines) | |||
| * Imported bugfix 403 from LAPACK (compile option "recursive" required | |||
| for correctness with Intel and PGI) | |||
| * Imported bugfix 408 from LAPACK (wrong scaling in ZHEEQUB) | |||
| * Imported bugfix 411 from LAPACK (infinite loop in LARGV/LARTG/LARTGP) | |||
| * Fixed mismatches between BUFFERSIZE and GEMM_UNROLL parameters that | |||
| could lead to crashes at large matrix sizes | |||
| * Restored internal soname in dynamic libraries on FreeBSD and Dragonfly | |||
| * Added API (openblas_setaffinity) to set the thread affinity on Linux | |||
| * Added initial infrastructure for half-precision floating point | |||
| (bfloat16) support with a generic implementation of SHGEMM | |||
| * Added CMAKE build system support for building the cblas_Xgemm3m | |||
| functions | |||
| * Fixed CMAKE support for building in a path with embedded spaces | |||
| * Fixed CMAKE (non)handling of NO_EXPRECISION and MAX_STACK_ALLOC | |||
| * Fixed GCC version detection in the Makefiles | |||
| * Allowed overriding the names of AR, AS and LD in Makefile builds | |||
| POWER: | |||
| * Fixed big-endian POWER8 ELFv2 builds on FreeBSD | |||
| * Fixed GCC version checks and DYNAMIC_ARCH builds on POWER9 | |||
| * Fixed CMAKE build support for POWER9 | |||
| * Fixed a potential race condition in the thread buffer allocation | |||
| * Worked around LAPACK test failures on PPC G4 | |||
| MIPS: | |||
| * Fixed a potential race condition in the thread buffer allocation | |||
| * Added support for MIPS 24K/24KE family based on P5600 kernels | |||
| MIPS64: | |||
| * Fixed a potential race condition in the thread buffer allocation | |||
| * Added TARGET=GENERIC | |||
| ARMV7: | |||
| * Fixed a race condition in the thread buffer allocation | |||
| ARMV8: | |||
| * Fixed a race condition in the thread buffer allocation | |||
| * Fixed zero initialisation in the assembly for SGEMM and DGEMM BETA | |||
| * Improved performance of the ThunderX2 DAXPY kernel | |||
| * Added an optimized SGEMM kernel for Cortex A53 | |||
| * Fixed Makefile support for INTERFACE64 (8-byte integer) | |||
| x86_64: | |||
| * Fixed a syntax error in the CMAKE setup for SkylakeX | |||
| * Improved performance of STRSM on Haswell, SkylakeX and Ryzen | |||
| * Improved SGEMM performance for workloads with ldc a | |||
| multiple of 1024 | |||
| * Improved DGEMM performance on Skylake X | |||
| * Fixed unwanted AVX512-dependency of SGEMM in DYNAMIC_ARCH | |||
| builds created on SkylakeX | |||
| * Removed data alignment requirement in the SSE2 copy kernels | |||
| that could cause spurious crashes | |||
| * Added a workaround for an optimizer bug in AppleClang 11.0.3 | |||
| * Fixed LAPACK test failures due to wrong options for Intel Fortran | |||
| * Fixed compilation and LAPACK test results with recent Flang | |||
| and AMD AOCC | |||
| * Fixed DYNAMIC_ARCH builds with CMAKE on OS X | |||
| * Fixed missing exports of cblas_i?amin, cblas_i?min, cblas_i?max, | |||
| cblas_?sum, cblas_?gemm3m in the shared library on OS X | |||
| * Fixed reporting of cpu name in DYNAMIC_ARCH builds (would sometimes | |||
| show the name of an older generation chip supported by the same kernels) | |||
| IBM Z: | |||
| * Improved performance of SGEMM/STRMM and DGEMM/DTRMM on Z14 | |||
| ==================================================================== | |||
| Version 0.3.9 | |||
| 1-Mar-2020 | |||
| @@ -264,6 +264,7 @@ lapack_prebuild : | |||
| ifeq ($(NOFORTRAN), $(filter 0,$(NOFORTRAN))) | |||
| -@echo "FC = $(FC)" > $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "FFLAGS = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "FFLAGS_DRV = $(LAPACK_FFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "POPTS = $(LAPACK_FPFLAGS)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "FFLAGS_NOOPT = -O0 $(LAPACK_NOOPT)" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| -@echo "PNOOPT = $(LAPACK_FPFLAGS) -O0" >> $(NETLIB_LAPACK_DIR)/make.inc | |||
| @@ -9,6 +9,16 @@ else | |||
| USE_OPENMP = 1 | |||
| endif | |||
| # Compiler flags for CORE=POWER10. The plain flag set is used unless USE_OPENMP is 1, | |||
| # in which case the same flags are used with -DUSE_OPENMP and -fopenmp added. | |||
| ifeq ($(CORE), POWER10) | |||
| ifneq ($(USE_OPENMP), 1) | |||
| COMMON_OPT += -Ofast -mcpu=future -mtune=future -mvsx -malign-power -fno-fast-math | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=future -mtune=future -malign-power -fno-fast-math | |||
| else | |||
| COMMON_OPT += -Ofast -mcpu=future -mtune=future -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| FCOMMON_OPT += -O2 -frecursive -mcpu=future -mtune=future -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| endif | |||
| endif | |||
| ifeq ($(CORE), POWER9) | |||
| ifeq ($(USE_OPENMP), 1) | |||
| COMMON_OPT += -Ofast -mcpu=power9 -mtune=power9 -mvsx -malign-power -DUSE_OPENMP -fno-fast-math -fopenmp | |||
| @@ -17,7 +17,11 @@ ifdef CPUIDEMU | |||
| EXFLAGS = -DCPUIDEMU -DVENDOR=99 | |||
| endif | |||
| ifeq ($(TARGET), 1004K) | |||
| ifeq ($(TARGET), MIPS24K) | |||
| TARGET_FLAGS = -mips32r2 | |||
| endif | |||
| ifeq ($(TARGET), MIPS1004K) | |||
| TARGET_FLAGS = -mips32r2 | |||
| endif | |||
| @@ -3,7 +3,7 @@ | |||
| # | |||
| # This library's version | |||
| VERSION = 0.3.9.dev | |||
| VERSION = 0.3.10.dev | |||
| # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a | |||
| # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library | |||
| @@ -273,6 +273,9 @@ COMMON_PROF = -pg | |||
| # | |||
| # CPP_THREAD_SAFETY_TEST = 1 | |||
| # If you want to enable the experimental BFLOAT16 support | |||
| # BUILD_HALF = 1 | |||
| # | |||
| # End of user configuration | |||
| # | |||
| @@ -21,6 +21,8 @@ ifeq ($(ARCH), amd64) | |||
| override ARCH=x86_64 | |||
| else ifeq ($(ARCH), powerpc64) | |||
| override ARCH=power | |||
| else ifeq ($(ARCH), powerpc) | |||
| override ARCH=power | |||
| else ifeq ($(ARCH), i386) | |||
| override ARCH=x86 | |||
| else ifeq ($(ARCH), aarch64) | |||
| @@ -261,10 +263,10 @@ endif | |||
| ARFLAGS = | |||
| CPP = $(COMPILER) -E | |||
| AR = $(CROSS_SUFFIX)ar | |||
| AS = $(CROSS_SUFFIX)as | |||
| LD = $(CROSS_SUFFIX)ld | |||
| RANLIB = $(CROSS_SUFFIX)ranlib | |||
| AR ?= $(CROSS_SUFFIX)ar | |||
| AS ?= $(CROSS_SUFFIX)as | |||
| LD ?= $(CROSS_SUFFIX)ld | |||
| RANLIB ?= $(CROSS_SUFFIX)ranlib | |||
| NM = $(CROSS_SUFFIX)nm | |||
| DLLWRAP = $(CROSS_SUFFIX)dllwrap | |||
| OBJCOPY = $(CROSS_SUFFIX)objcopy | |||
| @@ -277,6 +279,17 @@ NO_LAPACK = 1 | |||
| override FEXTRALIB = | |||
| endif | |||
| # GCC version probes. Each variable holds the output of `expr` (1 for true, 0 for false) | |||
| # comparing one dot-separated field of `$(CC) -dumpversion` against a constant: | |||
| # field 1 for the GCCVERSION* variables, field 2 for the GCCMINORVERSION* variables. | |||
| ifeq ($(C_COMPILER), GCC) | |||
| GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) | |||
| GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) | |||
| GCCVERSIONEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` = 5) | |||
| GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) | |||
| GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) | |||
| GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) | |||
| GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2) | |||
| GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) | |||
| endif | |||
| # | |||
| # OS dependent settings | |||
| # | |||
| @@ -323,13 +336,7 @@ ifeq ($(C_COMPILER), CLANG) | |||
| CCOMMON_OPT += -DMS_ABI | |||
| endif | |||
| ifeq ($(C_COMPILER), GCC) | |||
| #Version tests for supporting specific features (MS_ABI, POWER9 intrinsics) | |||
| GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) | |||
| GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) | |||
| GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) | |||
| GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) | |||
| GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) | |||
| ifeq ($(GCCVERSIONGT4), 1) | |||
| # GCC Major version > 4 | |||
| # It is compatible with MSVC ABI. | |||
| @@ -343,7 +350,6 @@ ifeq ($(GCCMINORVERSIONGTEQ7), 1) | |||
| CCOMMON_OPT += -DMS_ABI | |||
| endif | |||
| endif | |||
| endif | |||
| # Ensure the correct stack alignment on Win32 | |||
| # http://permalink.gmane.org/gmane.comp.lib.openblas.general/97 | |||
| @@ -563,8 +569,34 @@ DYNAMIC_CORE += EMAG8180 | |||
| endif | |||
| ifeq ($(ARCH), zarch) | |||
| DYNAMIC_CORE = Z13 | |||
| DYNAMIC_CORE = ZARCH_GENERIC | |||
| # Z13 is supported since gcc-5.2, gcc-6, and in RHEL 7.3 and newer | |||
| ifeq ($(GCCVERSIONGT5), 1) | |||
| ZARCH_SUPPORT_Z13 := 1 | |||
| else ifeq ($(GCCVERSIONEQ5), 1) | |||
| ifeq ($(GCCMINORVERSIONGTEQ2), 1) | |||
| ZARCH_SUPPORT_Z13 := 1 | |||
| endif | |||
| endif | |||
| ifeq ($(wildcard /etc/redhat-release), /etc/redhat-release) | |||
| ifeq ($(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3"), 1) | |||
| ZARCH_SUPPORT_Z13 := 1 | |||
| endif | |||
| endif | |||
| ifeq ($(ZARCH_SUPPORT_Z13), 1) | |||
| DYNAMIC_CORE += Z13 | |||
| else | |||
| $(info OpenBLAS: Not building Z13 kernels because gcc is older than 5.2 or 6.x) | |||
| endif | |||
| ifeq ($(GCCVERSIONGTEQ7), 1) | |||
| DYNAMIC_CORE += Z14 | |||
| else | |||
| $(info OpenBLAS: Not building Z14 kernels because gcc is older than 7.x) | |||
| endif | |||
| endif | |||
| ifeq ($(ARCH), power) | |||
| @@ -572,14 +604,20 @@ DYNAMIC_CORE = POWER6 | |||
| DYNAMIC_CORE += POWER8 | |||
| ifneq ($(C_COMPILER), GCC) | |||
| DYNAMIC_CORE += POWER9 | |||
| DYNAMIC_CORE += POWER10 | |||
| endif | |||
| # GCC builds: add the POWER9 / POWER10 kernels to DYNAMIC_CORE only when the compiler's | |||
| # major version (first field of `$(CC) -dumpversion`) is > 5 / >= 11 respectively; | |||
| # otherwise print a note explaining why the kernels are left out. | |||
| ifeq ($(C_COMPILER), GCC) | |||
| GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) | |||
| ifeq ($(GCCVERSIONGT5), 1) | |||
| DYNAMIC_CORE += POWER9 | |||
| else | |||
| # The function name must be followed by whitespace; with "info," make sees an | |||
| # (undefined) variable reference and the message is silently dropped. | |||
| $(info OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) | |||
| endif | |||
| GCCVERSIONGTEQ11 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 11) | |||
| ifeq ($(GCCVERSIONGTEQ11), 1) | |||
| DYNAMIC_CORE += POWER10 | |||
| else | |||
| $(info OpenBLAS: Your gcc version is too old to build the POWER10 kernels.) | |||
| endif | |||
| endif | |||
| endif | |||
| @@ -690,7 +728,12 @@ CCOMMON_OPT += -march=mips64 | |||
| FCOMMON_OPT += -march=mips64 | |||
| endif | |||
| ifeq ($(CORE), 1004K) | |||
| ifeq ($(CORE), MIPS24K) | |||
| CCOMMON_OPT += -mips32r2 -mtune=24kc $(MSA_FLAGS) | |||
| FCOMMON_OPT += -mips32r2 -mtune=24kc $(MSA_FLAGS) | |||
| endif | |||
| ifeq ($(CORE), MIPS1004K) | |||
| CCOMMON_OPT += -mips32r2 $(MSA_FLAGS) | |||
| FCOMMON_OPT += -mips32r2 $(MSA_FLAGS) | |||
| endif | |||
| @@ -755,6 +798,15 @@ endif | |||
| ifeq ($(F_COMPILER), FLANG) | |||
| CCOMMON_OPT += -DF_INTERFACE_FLANG | |||
| FCOMMON_OPT += -Mrecursive -Kieee | |||
| # On Linux/x86_64 only: take the text before the first "." on the first line of | |||
| # `$(FC) --version`, and add -fno-unroll-loops to the Fortran flags when it equals AOCC. | |||
| # NOTE(review): the text is passed through `expr`, which presumably fails (leaving | |||
| # FLANG_VENDOR empty) when the version banner contains several words - confirm this is intended. | |||
| ifeq ($(OSNAME), Linux) | |||
| ifeq ($(ARCH), x86_64) | |||
| FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`) | |||
| ifeq ($(FLANG_VENDOR),AOCC) | |||
| FCOMMON_OPT += -fno-unroll-loops | |||
| endif | |||
| endif | |||
| endif | |||
| ifdef BINARY64 | |||
| ifdef INTERFACE64 | |||
| ifneq ($(INTERFACE64), 0) | |||
| @@ -850,7 +902,7 @@ ifneq ($(INTERFACE64), 0) | |||
| FCOMMON_OPT += -i8 | |||
| endif | |||
| endif | |||
| FCOMMON_OPT += -recursive | |||
| FCOMMON_OPT += -recursive -fp-model strict -assume protect-parens | |||
| ifeq ($(USE_OPENMP), 1) | |||
| FCOMMON_OPT += -fopenmp | |||
| endif | |||
| @@ -1119,6 +1171,10 @@ ifeq ($(USE_TLS), 1) | |||
| CCOMMON_OPT += -DUSE_TLS | |||
| endif | |||
| ifeq ($(BUILD_HALF), 1) | |||
| CCOMMON_OPT += -DBUILD_HALF | |||
| endif | |||
| CCOMMON_OPT += -DVERSION=\"$(VERSION)\" | |||
| ifndef SYMBOLPREFIX | |||
| @@ -1145,6 +1201,7 @@ KERNELDIR = $(TOPDIR)/kernel/$(ARCH) | |||
| include $(TOPDIR)/Makefile.$(ARCH) | |||
| CCOMMON_OPT += -UASMNAME -UASMFNAME -UNAME -UCNAME -UCHAR_NAME -UCHAR_CNAME | |||
| CCOMMON_OPT += -DASMNAME=$(FU)$(*F) -DASMFNAME=$(FU)$(*F)$(BU) -DNAME=$(*F)$(BU) -DCNAME=$(*F) -DCHAR_NAME=\"$(*F)$(BU)\" -DCHAR_CNAME=\"$(*F)\" | |||
| ifeq ($(CORE), PPC440) | |||
| @@ -1237,7 +1294,6 @@ endif | |||
| override CFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) | |||
| override PFLAGS += $(COMMON_OPT) $(CCOMMON_OPT) -I$(TOPDIR) -DPROFILE $(COMMON_PROF) | |||
| override FFLAGS += $(COMMON_OPT) $(FCOMMON_OPT) | |||
| override FPFLAGS += $(FCOMMON_OPT) $(COMMON_PROF) | |||
| #MAKEOVERRIDES = | |||
| @@ -1344,6 +1400,7 @@ export ARCH | |||
| export CORE | |||
| export LIBCORE | |||
| export __BYTE_ORDER__ | |||
| export ELF_VERSION | |||
| export PGCPATH | |||
| export CONFIG | |||
| export CC | |||
| @@ -1389,7 +1446,10 @@ export KERNELDIR | |||
| export FUNCTION_PROFILE | |||
| export TARGET_CORE | |||
| export NO_AVX512 | |||
| export BUILD_HALF | |||
| export SHGEMM_UNROLL_M | |||
| export SHGEMM_UNROLL_N | |||
| export SGEMM_UNROLL_M | |||
| export SGEMM_UNROLL_N | |||
| export DGEMM_UNROLL_M | |||
| @@ -1,3 +1,4 @@ | |||
| SHBLASOBJS_P = $(SHBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) | |||
| SBLASOBJS_P = $(SBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) | |||
| DBLASOBJS_P = $(DBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) | |||
| QBLASOBJS_P = $(QBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) | |||
| @@ -9,8 +10,8 @@ COMMONOBJS_P = $(COMMONOBJS:.$(SUFFIX)=.$(PSUFFIX)) | |||
| HPLOBJS_P = $(HPLOBJS:.$(SUFFIX)=.$(PSUFFIX)) | |||
| BLASOBJS = $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) | |||
| BLASOBJS_P = $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) | |||
| BLASOBJS = $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) | |||
| BLASOBJS_P = $(SHBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) | |||
| ifdef EXPRECISION | |||
| BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) | |||
| @@ -22,6 +23,7 @@ BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) | |||
| BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P) | |||
| endif | |||
| $(SHBLASOBJS) $(SHBLASOBJS_P) : override CFLAGS += -DHALF -UDOUBLE -UCOMPLEX | |||
| $(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX | |||
| $(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX | |||
| $(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX | |||
| @@ -29,6 +31,7 @@ $(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX | |||
| $(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX | |||
| $(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX | |||
| $(SHBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) | |||
| $(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) | |||
| $(DBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) | |||
| $(QBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) | |||
| @@ -5,6 +5,6 @@ FCOMMON_OPT += -march=z13 -mzvector | |||
| endif | |||
| ifeq ($(CORE), Z14) | |||
| CCOMMON_OPT += -march=z14 -mzvector | |||
| CCOMMON_OPT += -march=z14 -mzvector -O3 | |||
| FCOMMON_OPT += -march=z14 -mzvector | |||
| endif | |||
| @@ -122,6 +122,11 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th | |||
| - **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. | |||
| - **AMD ZEN**: Uses Haswell codes with some optimizations. | |||
| #### MIPS32 | |||
| - **MIPS 1004K**: Uses P5600 codes. | |||
| - **MIPS 24K**: Uses P5600 codes. | |||
| #### MIPS64 | |||
| - **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. | |||
| @@ -49,6 +49,7 @@ POWER6 | |||
| POWER7 | |||
| POWER8 | |||
| POWER9 | |||
| POWER10 | |||
| PPCG4 | |||
| PPC970 | |||
| PPC970MP | |||
| @@ -58,7 +59,8 @@ CELL | |||
| 3.MIPS CPU: | |||
| P5600 | |||
| 1004K | |||
| MIPS1004K | |||
| MIPS24K | |||
| 4.MIPS64 CPU: | |||
| SICORTEX | |||
| @@ -49,3 +49,23 @@ jobs: | |||
| # we need a privileged docker run for sde process attachment | |||
| docker run --privileged intel_sde | |||
| displayName: 'Run AVX512 SkylakeX docker build / test' | |||
| - job: Windows_cl | |||
| pool: | |||
| vmImage: 'windows-latest' | |||
| steps: | |||
| - task: CMake@1 | |||
| inputs: | |||
| workingDirectory: 'build' # Optional | |||
| cmakeArgs: '-G "Visual Studio 16 2019" ..' | |||
| - task: CMake@1 | |||
| inputs: | |||
| cmakeArgs: '--build . --config Release' | |||
| workingDirectory: 'build' | |||
| - script: | | |||
| cd build | |||
| cd utest | |||
| dir | |||
| openblas_utest.exe | |||
| @@ -49,6 +49,12 @@ else | |||
| GOTO_LAPACK_TARGETS= | |||
| endif | |||
| # Extra benchmark target appended to the "goto" target lists: shgemm.goto when | |||
| # BUILD_HALF is 1, nothing otherwise. | |||
| ifeq ($(BUILD_HALF),1) | |||
| GOTO_HALF_TARGETS=shgemm.goto | |||
| else | |||
| GOTO_HALF_TARGETS= | |||
| endif | |||
| ifeq ($(OSNAME), WINNT) | |||
| goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | |||
| @@ -91,7 +97,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ | |||
| sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ | |||
| spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ | |||
| ssymm.goto dsymm.goto csymm.goto zsymm.goto \ | |||
| saxpby.goto daxpby.goto caxpby.goto zaxpby.goto | |||
| saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_HALF_TARGETS) | |||
| acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ | |||
| scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ | |||
| @@ -264,7 +270,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ | |||
| samin.goto damin.goto camin.goto zamin.goto \ | |||
| smin.goto dmin.goto \ | |||
| saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ | |||
| snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) | |||
| snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS) | |||
| acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ | |||
| scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ | |||
| @@ -614,6 +620,11 @@ zcholesky.essl : zcholesky.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| ##################################### Sgemm #################################################### | |||
| ifeq ($(BUILD_HALF),1) | |||
| shgemm.goto : shgemm.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| endif | |||
| sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| @@ -1814,7 +1825,7 @@ zsymv.veclib : zsymv.$(SUFFIX) | |||
| ##################################### Sgeev #################################################### | |||
| sgeev.goto : sgeev.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| sgeev.acml : sgeev.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| @@ -1830,7 +1841,7 @@ sgeev.veclib : sgeev.$(SUFFIX) | |||
| ##################################### Dgeev #################################################### | |||
| dgeev.goto : dgeev.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| dgeev.acml : dgeev.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| @@ -1847,7 +1858,7 @@ dgeev.veclib : dgeev.$(SUFFIX) | |||
| ##################################### Cgeev #################################################### | |||
| cgeev.goto : cgeev.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| cgeev.acml : cgeev.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| @@ -1864,7 +1875,7 @@ cgeev.veclib : cgeev.$(SUFFIX) | |||
| ##################################### Zgeev #################################################### | |||
| zgeev.goto : zgeev.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| zgeev.acml : zgeev.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| @@ -1880,7 +1891,7 @@ zgeev.veclib : zgeev.$(SUFFIX) | |||
| ##################################### Sgetri #################################################### | |||
| sgetri.goto : sgetri.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| sgetri.acml : sgetri.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| @@ -1896,7 +1907,7 @@ sgetri.veclib : sgetri.$(SUFFIX) | |||
| ##################################### Dgetri #################################################### | |||
| dgetri.goto : dgetri.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| dgetri.acml : dgetri.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| @@ -1913,7 +1924,7 @@ dgetri.veclib : dgetri.$(SUFFIX) | |||
| ##################################### Cgetri #################################################### | |||
| cgetri.goto : cgetri.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| cgetri.acml : cgetri.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| @@ -1930,7 +1941,7 @@ cgetri.veclib : cgetri.$(SUFFIX) | |||
| ##################################### Zgetri #################################################### | |||
| zgetri.goto : zgetri.$(SUFFIX) ../$(LIBNAME) | |||
| $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| $(FC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm | |||
| zgetri.acml : zgetri.$(SUFFIX) | |||
| -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBACML) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) | |||
| @@ -2916,6 +2927,11 @@ ccholesky.$(SUFFIX) : cholesky.c | |||
| zcholesky.$(SUFFIX) : cholesky.c | |||
| $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ | |||
| ifeq ($(BUILD_HALF),1) | |||
| shgemm.$(SUFFIX) : gemm.c | |||
| $(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^ | |||
| endif | |||
| sgemm.$(SUFFIX) : gemm.c | |||
| $(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^ | |||
| @@ -39,6 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifdef DOUBLE | |||
| #define GEMM BLASFUNC(dgemm) | |||
| #elif defined(HALF) | |||
| #define GEMM BLASFUNC(shgemm) | |||
| #else | |||
| #define GEMM BLASFUNC(sgemm) | |||
| #endif | |||
| @@ -120,7 +122,8 @@ static void *huge_malloc(BLASLONG size){ | |||
| int main(int argc, char *argv[]){ | |||
| FLOAT *a, *b, *c; | |||
| IFLOAT *a, *b; | |||
| FLOAT *c; | |||
| FLOAT alpha[] = {1.0, 0.0}; | |||
| FLOAT beta [] = {0.0, 0.0}; | |||
| char transa = 'N'; | |||
| @@ -184,10 +187,10 @@ int main(int argc, char *argv[]){ | |||
| k = to; | |||
| } | |||
| if (( a = (FLOAT *)malloc(sizeof(FLOAT) * m * k * COMPSIZE)) == NULL) { | |||
| if (( a = (IFLOAT *)malloc(sizeof(IFLOAT) * m * k * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| if (( b = (FLOAT *)malloc(sizeof(FLOAT) * k * n * COMPSIZE)) == NULL) { | |||
| if (( b = (IFLOAT *)malloc(sizeof(IFLOAT) * k * n * COMPSIZE)) == NULL) { | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| if (( c = (FLOAT *)malloc(sizeof(FLOAT) * m * n * COMPSIZE)) == NULL) { | |||
| @@ -199,10 +202,10 @@ int main(int argc, char *argv[]){ | |||
| #endif | |||
| for (i = 0; i < m * k * COMPSIZE; i++) { | |||
| a[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| a[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| for (i = 0; i < k * n * COMPSIZE; i++) { | |||
| b[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| b[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| for (i = 0; i < m * n * COMPSIZE; i++) { | |||
| c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| @@ -170,9 +170,11 @@ int main(int argc, char *argv[]){ | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| gettimeofday( &start, (struct timezone *)0); | |||
| #ifdef RETURN_BY_STACK | |||
| DOT (&result , &m, x, &inc_x, y, &inc_y ); | |||
| #else | |||
| result = DOT (&m, x, &inc_x, y, &inc_y ); | |||
| #endif | |||
| gettimeofday( &stop, (struct timezone *)0); | |||
| time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
| @@ -310,6 +310,7 @@ $linker_a = ""; | |||
| && ($flags !~ /advapi32/) | |||
| && ($flags !~ /shell32/) | |||
| && ($flags !~ /omp/) | |||
| && ($flags !~ /[0-9]+/) | |||
| ) { | |||
| $linker_l .= $flags . " " | |||
| } | |||
| @@ -49,7 +49,7 @@ if (DYNAMIC_ARCH) | |||
| endif () | |||
| if (POWER) | |||
| set(DYNAMIC_CORE POWER6 POWER8 POWER9) | |||
| set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10) | |||
| endif () | |||
| if (X86) | |||
| @@ -16,6 +16,7 @@ if (${F_COMPILER} STREQUAL "FLANG") | |||
| if (USE_OPENMP) | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -fopenmp") | |||
| endif () | |||
| set(FCOMMON_OPT "${FCOMMON_OPT} -Mrecursive -Kieee") | |||
| endif () | |||
| if (${F_COMPILER} STREQUAL "G77") | |||
| @@ -113,11 +113,31 @@ macro(SetDefaultL1) | |||
| set(ZSUMKERNEL zsum.S) | |||
| set(QSUMKERNEL sum.S) | |||
| set(XSUMKERNEL zsum.S) | |||
| # When BUILD_HALF is enabled, point each SH* level-1 kernel variable at the C source of the same operation under ../arm. | |||
| if (BUILD_HALF) | |||
| set(SHAMINKERNEL ../arm/amin.c) | |||
| set(SHAMAXKERNEL ../arm/amax.c) | |||
| set(SHMAXKERNEL ../arm/max.c) | |||
| set(SHMINKERNEL ../arm/min.c) | |||
| set(ISHAMAXKERNEL ../arm/iamax.c) | |||
| set(ISHAMINKERNEL ../arm/iamin.c) | |||
| set(ISHMAXKERNEL ../arm/imax.c) | |||
| set(ISHMINKERNEL ../arm/imin.c) | |||
| set(SHASUMKERNEL ../arm/asum.c) | |||
| set(SHAXPYKERNEL ../arm/axpy.c) | |||
| set(SHAXPBYKERNEL ../arm/axpby.c) | |||
| set(SHCOPYKERNEL ../arm/copy.c) | |||
| set(SHDOTKERNEL ../arm/dot.c) | |||
| set(SHROTKERNEL ../arm/rot.c) | |||
| set(SHSCALKERNEL ../arm/scal.c) | |||
| set(SHNRM2KERNEL ../arm/nrm2.c) | |||
| set(SHSUMKERNEL ../arm/sum.c) | |||
| set(SHSWAPKERNEL ../arm/swap.c) | |||
| endif () | |||
| endmacro () | |||
| macro(SetDefaultL2) | |||
| set(SGEMVNKERNEL gemv_n.S) | |||
| set(SGEMVTKERNEL gemv_t.S) | |||
| set(SGEMVNKERNEL ../arm/gemv_n.c) | |||
| set(SGEMVTKERNEL ../arm/gemv_t.c) | |||
| set(DGEMVNKERNEL gemv_n.S) | |||
| set(DGEMVTKERNEL gemv_t.S) | |||
| set(CGEMVNKERNEL zgemv_n.S) | |||
| @@ -161,6 +181,11 @@ macro(SetDefaultL2) | |||
| set(XHEMV_L_KERNEL ../generic/zhemv_k.c) | |||
| set(XHEMV_V_KERNEL ../generic/zhemv_k.c) | |||
| set(XHEMV_M_KERNEL ../generic/zhemv_k.c) | |||
| if (BUILD_HALF) | |||
| set(SHGEMVNKERNEL ../arm/gemv_n.c) | |||
| set(SHGEMVTKERNEL ../arm/gemv_t.c) | |||
| set(SHGERKERNEL ../generic/ger.c) | |||
| endif () | |||
| endmacro () | |||
| macro(SetDefaultL3) | |||
| @@ -168,4 +193,18 @@ macro(SetDefaultL3) | |||
| set(DGEADD_KERNEL ../generic/geadd.c) | |||
| set(CGEADD_KERNEL ../generic/zgeadd.c) | |||
| set(ZGEADD_KERNEL ../generic/zgeadd.c) | |||
| # When BUILD_HALF is enabled, default the SHGEADD/SHGEMM kernel, beta and copy sources to the files under ../generic | |||
| # and set the object file names used for the four SHGEMM copy routines. | |||
| if (BUILD_HALF) | |||
| set(SHGEADD_KERNEL ../generic/geadd.c) | |||
| set(SHGEMMKERNEL ../generic/gemmkernel_2x2.c) | |||
| set(SHGEMM_BETA ../generic/gemm_beta.c) | |||
| set(SHGEMMINCOPY ../generic/gemm_ncopy_2.c) | |||
| set(SHGEMMITCOPY ../generic/gemm_tcopy_2.c) | |||
| set(SHGEMMONCOPY ../generic/gemm_ncopy_2.c) | |||
| set(SHGEMMOTCOPY ../generic/gemm_tcopy_2.c) | |||
| set(SHGEMMINCOPYOBJ shgemm_incopy.o) | |||
| set(SHGEMMITCOPYOBJ shgemm_itcopy.o) | |||
| set(SHGEMMONCOPYOBJ shgemm_oncopy.o) | |||
| set(SHGEMMOTCOPYOBJ shgemm_otcopy.o) | |||
| endif () | |||
| endmacro () | |||
| @@ -8,7 +8,7 @@ if (${CMAKE_SYSTEM_NAME} STREQUAL "Linux") | |||
| set(NO_EXPRECISION 1) | |||
| endif () | |||
| if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly") | |||
| if (${CMAKE_SYSTEM_NAME} MATCHES "FreeBSD|OpenBSD|NetBSD|DragonFly|Darwin") | |||
| set(EXTRALIB "${EXTRALIB} -lm") | |||
| set(NO_EXPRECISION 1) | |||
| endif () | |||
| @@ -16,6 +16,8 @@ | |||
| # HAVE_SSE2 | |||
| # HAVE_SSE3 | |||
| # MAKE | |||
| # SHGEMM_UNROLL_M | |||
| # SHGEMM_UNROLL_N | |||
| # SGEMM_UNROLL_M | |||
| # SGEMM_UNROLL_N | |||
| # DGEMM_UNROLL_M | |||
| @@ -418,7 +420,7 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
| set(ZGEMM_UNROLL_M 8) | |||
| set(ZGEMM_UNROLL_N 2) | |||
| set(SYMV_P 8) | |||
| elseif ("${TCORE}" STREQUAL "POWER9") | |||
| elseif ("${TCORE}" STREQUAL "POWER9" OR "${TCORE}" STREQUAL "POWER10") | |||
| file(APPEND ${TARGET_CONF_TEMP} | |||
| "#define L1_DATA_SIZE 32768\n" | |||
| "#define L1_DATA_LINESIZE 128\n" | |||
| @@ -437,6 +439,8 @@ if (DEFINED CORE AND CMAKE_CROSSCOMPILING AND NOT (${HOST_OS} STREQUAL "WINDOWSS | |||
| set(ZGEMM_UNROLL_N 2) | |||
| set(SYMV_P 8) | |||
| endif() | |||
| set(SHGEMM_UNROLL_M 8) | |||
| set(SHGEMM_UNROLL_N 4) | |||
| # Or should this actually be NUM_CORES? | |||
| if (${NUM_THREADS} GREATER 0) | |||
| @@ -488,7 +492,7 @@ else(NOT CMAKE_CROSSCOMPILING) | |||
| if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") | |||
| try_compile(GETARCH_RESULT ${GETARCH_DIR} | |||
| SOURCES ${GETARCH_SRC} | |||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I${GETARCH_DIR} -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" | |||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} -I"${GETARCH_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" | |||
| OUTPUT_VARIABLE GETARCH_LOG | |||
| COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH_BIN} | |||
| ) | |||
| @@ -516,7 +520,7 @@ execute_process(COMMAND "${PROJECT_BINARY_DIR}/${GETARCH_BIN}" 1 OUTPUT_VARIABLE | |||
| if (NOT "${CMAKE_SYSTEM_NAME}" STREQUAL "WindowsStore") | |||
| try_compile(GETARCH2_RESULT ${GETARCH2_DIR} | |||
| SOURCES ${PROJECT_SOURCE_DIR}/getarch_2nd.c | |||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I${GETARCH2_DIR} -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" | |||
| COMPILE_DEFINITIONS ${EXFLAGS} ${GETARCH_FLAGS} ${GETARCH2_FLAGS} -I"${GETARCH2_DIR}" -I"${PROJECT_SOURCE_DIR}" -I"${PROJECT_BINARY_DIR}" | |||
| OUTPUT_VARIABLE GETARCH2_LOG | |||
| COPY_FILE ${PROJECT_BINARY_DIR}/${GETARCH2_BIN} | |||
| ) | |||
| @@ -297,6 +297,16 @@ if (USE_SIMPLE_THREADED_LEVEL3) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DUSE_SIMPLE_THREADED_LEVEL3") | |||
| endif () | |||
| # Forward MAX_STACK_ALLOC to the C compiler on every system except Windows: | |||
| # no user value -> -DMAX_STACK_ALLOC=2048; a non-zero user value is passed through; | |||
| # an explicit 0 adds no definition at all. | |||
| if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows") | |||
| if (NOT DEFINED MAX_STACK_ALLOC) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=2048") | |||
| elseif (NOT ${MAX_STACK_ALLOC} EQUAL 0) | |||
| set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=${MAX_STACK_ALLOC}") | |||
| endif () | |||
| endif () | |||
| if (DEFINED LIBNAMESUFFIX) | |||
| set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}") | |||
| else () | |||
| @@ -407,6 +417,14 @@ if (${CMAKE_C_COMPILER} STREQUAL "LSB" OR ${CMAKE_SYSTEM_NAME} STREQUAL "Windows | |||
| set(LAPACK_CFLAGS "${LAPACK_CFLAGS} -DLAPACK_COMPLEX_STRUCTURE") | |||
| endif () | |||
| # Release builds with Flang up to version 3 get -fno-unroll-loops appended to the Release Fortran flags. | |||
| # Every expansion is quoted: an empty CMAKE_Fortran_COMPILER_VERSION would otherwise vanish from the | |||
| # argument list and leave a malformed if() expression. | |||
| if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release" AND "${F_COMPILER}" STREQUAL "FLANG") | |||
| if ("${CMAKE_Fortran_COMPILER_VERSION}" VERSION_LESS_EQUAL 3) | |||
| set(CMAKE_Fortran_FLAGS_RELEASE "${CMAKE_Fortran_FLAGS_RELEASE} -fno-unroll-loops") | |||
| endif () | |||
| endif () | |||
| if (NOT DEFINED SUFFIX) | |||
| set(SUFFIX o) | |||
| endif () | |||
| @@ -530,6 +548,8 @@ endif () | |||
| #export FUNCTION_PROFILE | |||
| #export TARGET_CORE | |||
| # | |||
| #export SHGEMM_UNROLL_M | |||
| #export SHGEMM_UNROLL_N | |||
| #export SGEMM_UNROLL_M | |||
| #export SGEMM_UNROLL_N | |||
| #export DGEMM_UNROLL_M | |||
| @@ -15,12 +15,36 @@ endfunction () | |||
| # Reads a Makefile into CMake vars. | |||
| macro(ParseMakefileVars MAKEFILE_IN) | |||
| message(STATUS "Reading vars from ${MAKEFILE_IN}...") | |||
| set (IfElse 0) | |||
| set (ElseSeen 0) | |||
| file(STRINGS ${MAKEFILE_IN} makefile_contents) | |||
| foreach (makefile_line ${makefile_contents}) | |||
| #message(STATUS "parsing ${makefile_line}") | |||
| # Conditional-block handling. IfElse is 0 outside an ifeq/ifneq, 1 inside one whose condition held, | |||
| # 2 inside one whose condition failed; ElseSeen turns 1 once the else line of that block has been read. | |||
| # Only these two scalars are kept, so a single nesting level is tracked. | |||
| if (${IfElse} GREATER 0) | |||
| # Any line containing "endif" ends the conditional and resets both flags. | |||
| string(REGEX MATCH "endif[ \t]*" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| # message(STATUS "ENDIF ${makefile_line}") | |||
| set (IfElse 0) | |||
| set (ElseSeen 0) | |||
| continue () | |||
| endif () | |||
| # Any line containing "else" switches to the other branch of the conditional. | |||
| string(REGEX MATCH "else[ \t]*" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| # message(STATUS "ELSE ${makefile_line}") | |||
| set (ElseSeen 1) | |||
| continue () | |||
| endif() | |||
| # Ignore lines of the inactive branch: the if-part of a failed condition, or the else-part of a satisfied one. | |||
| if ( (${IfElse} EQUAL 2 AND ${ElseSeen} EQUAL 0) OR ( ${IfElse} EQUAL 1 AND ${ElseSeen} EQUAL 1)) | |||
| # message(STATUS "skipping ${makefile_line}") | |||
| continue () | |||
| endif () | |||
| endif () | |||
| string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| #message(STATUS "match on ${line_match}") | |||
| set(var_name ${CMAKE_MATCH_1}) | |||
| set(var_value ${CMAKE_MATCH_2}) | |||
| # set(var_value ${CMAKE_MATCH_2}) | |||
| string(STRIP ${CMAKE_MATCH_2} var_value) | |||
| # check for Makefile variables in the string, e.g. $(TSUFFIX) | |||
| string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value}) | |||
| foreach (make_var ${make_var_matches}) | |||
| @@ -33,7 +57,31 @@ macro(ParseMakefileVars MAKEFILE_IN) | |||
| else () | |||
| string(REGEX MATCH "include \\$\\(KERNELDIR\\)/(.+)$" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| #message(STATUS "match on include ${line_match}") | |||
| ParseMakefileVars(${KERNELDIR}/${CMAKE_MATCH_1}) | |||
| else () | |||
| # message(STATUS "unmatched line ${line_match}") | |||
| # Makefile conditional "ifeq ($(VAR), VALUE)": IfElse becomes 1 when VAR is set and equal to VALUE, 2 otherwise. | |||
| string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| # DEFINED expects the variable *name*, which is what CMAKE_MATCH_1 holds; expanding it twice would | |||
| # instead ask whether a variable named after VAR's value (e.g. "1") exists, which is never the case. | |||
| if (DEFINED ${CMAKE_MATCH_1} AND "${${CMAKE_MATCH_1}}" STREQUAL "${CMAKE_MATCH_2}") | |||
| set (IfElse 1) | |||
| else () | |||
| set (IfElse 2) | |||
| endif () | |||
| else () | |||
| # Makefile conditional "ifneq ($(VAR), VALUE)": IfElse becomes 1 when VAR differs from VALUE, 2 otherwise. | |||
| string(REGEX MATCH "ifneq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}") | |||
| if (NOT "${line_match}" STREQUAL "") | |||
| # Operands are quoted so an unset VAR compares as the empty string (and therefore as "different") | |||
| # instead of disappearing from the expression. | |||
| if (NOT ("${${CMAKE_MATCH_1}}" STREQUAL "${CMAKE_MATCH_2}")) | |||
| set (IfElse 1) | |||
| else () | |||
| set (IfElse 2) | |||
| endif () | |||
| endif () | |||
| endif () | |||
| endif () | |||
| endif () | |||
| endforeach () | |||
| @@ -163,6 +211,7 @@ function(GenerateNamedObjects sources_in) | |||
| if (complex_only) | |||
| list(REMOVE_ITEM float_list "SINGLE") | |||
| list(REMOVE_ITEM float_list "DOUBLE") | |||
| list(REMOVE_ITEM float_list "HALF") | |||
| elseif (real_only) | |||
| list(REMOVE_ITEM float_list "COMPLEX") | |||
| list(REMOVE_ITEM float_list "ZCOMPLEX") | |||
| @@ -176,6 +225,9 @@ function(GenerateNamedObjects sources_in) | |||
| if (NOT no_float_type) | |||
| string(SUBSTRING ${float_type} 0 1 float_char) | |||
| string(TOLOWER ${float_char} float_char) | |||
| if (${float_type} STREQUAL "HALF") | |||
| set (float_char "sh") | |||
| endif () | |||
| endif () | |||
| if (NOT name_in) | |||
| @@ -210,6 +262,9 @@ function(GenerateNamedObjects sources_in) | |||
| if (${float_type} STREQUAL "DOUBLE" OR ${float_type} STREQUAL "ZCOMPLEX") | |||
| list(APPEND obj_defines "DOUBLE") | |||
| endif () | |||
| if (${float_type} STREQUAL "HALF") | |||
| list(APPEND obj_defines "HALF") | |||
| endif () | |||
| if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") | |||
| list(APPEND obj_defines "COMPLEX") | |||
| if (mangle_complex_sources) | |||
| @@ -257,6 +257,11 @@ typedef long BLASLONG; | |||
| typedef unsigned long BLASULONG; | |||
| #endif | |||
| /* Unless a BFLOAT16 macro is already defined, declare bfloat16 as an unsigned short and define HALFCONVERSION. */ | |||
| /* NOTE(review): HALFCONVERSION presumably tells other code that no native bfloat16 type is in use - confirm against its users. */ | |||
| #ifndef BFLOAT16 | |||
| typedef unsigned short bfloat16; | |||
| #define HALFCONVERSION 1 | |||
| #endif | |||
| #ifdef USE64BITINT | |||
| typedef BLASLONG blasint; | |||
| #if defined(OS_WINDOWS) && defined(__64BIT__) | |||
| @@ -297,6 +302,13 @@ typedef int blasint; | |||
| #define SIZE 8 | |||
| #define BASE_SHIFT 3 | |||
| #define ZBASE_SHIFT 4 | |||
| #elif defined(HALF) | |||
| #define IFLOAT bfloat16 | |||
| #define XFLOAT IFLOAT | |||
| #define FLOAT float | |||
| #define SIZE 2 | |||
| #define BASE_SHIFT 1 | |||
| #define ZBASE_SHIFT 2 | |||
| #else | |||
| #define FLOAT float | |||
| #define SIZE 4 | |||
| @@ -308,6 +320,10 @@ typedef int blasint; | |||
| #define XFLOAT FLOAT | |||
| #endif | |||
| /* IFLOAT falls back to FLOAT whenever it has not been defined earlier (the HALF case defines it as bfloat16). */ | |||
| #ifndef IFLOAT | |||
| #define IFLOAT FLOAT | |||
| #endif | |||
| #ifndef COMPLEX | |||
| #define COMPSIZE 1 | |||
| #else | |||
| @@ -344,13 +360,8 @@ typedef int blasint; | |||
| #endif | |||
| #endif | |||
| #ifdef POWER8 | |||
| #ifndef YIELDING | |||
| #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); | |||
| #endif | |||
| #endif | |||
| #ifdef POWER9 | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #ifndef YIELDING | |||
| #define YIELDING __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop;\n"); | |||
| #endif | |||
| @@ -469,6 +469,8 @@ void BLASFUNC(xhbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint | |||
| /* Level 3 routines */ | |||
| void BLASFUNC(shgemm)(char *, char *, blasint *, blasint *, blasint *, float *, | |||
| bfloat16 *, blasint *, bfloat16 *, blasint *, float *, float *, blasint *); | |||
| void BLASFUNC(sgemm)(char *, char *, blasint *, blasint *, blasint *, float *, | |||
| float *, blasint *, float *, blasint *, float *, float *, blasint *); | |||
| void BLASFUNC(dgemm)(char *, char *, blasint *, blasint *, blasint *, double *, | |||
| @@ -55,6 +55,8 @@ extern void sgemm_kernel_direct(BLASLONG M, BLASLONG N, BLASLONG K, | |||
| extern int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); | |||
| int shgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, | |||
| bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); | |||
| int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, | |||
| float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| int dgemm_beta(BLASLONG, BLASLONG, BLASLONG, double, | |||
| @@ -76,6 +78,10 @@ int xgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble *, | |||
| xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); | |||
| #endif | |||
| int shgemm_incopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); | |||
| int shgemm_itcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); | |||
| int shgemm_oncopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); | |||
| int shgemm_otcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); | |||
| int sgemm_incopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); | |||
| int sgemm_itcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); | |||
| int sgemm_oncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); | |||
| @@ -499,6 +505,7 @@ int xher2k_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdoubl | |||
| int xher2k_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); | |||
| int xher2k_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); | |||
| int shgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); | |||
| int sgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); | |||
| int dgemm_kernel(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); | |||
| @@ -527,6 +534,11 @@ int cgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float | |||
| int zgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); | |||
| int xgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); | |||
| int shgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); | |||
| int shgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); | |||
| int shgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); | |||
| int shgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); | |||
| int sgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); | |||
| int sgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); | |||
| int sgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); | |||
| @@ -619,6 +631,11 @@ int xgemm_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLON | |||
| int xgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); | |||
| #endif | |||
| int shgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); | |||
| int shgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); | |||
| int shgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); | |||
| int shgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); | |||
| int sgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); | |||
| int sgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); | |||
| int sgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); | |||
| @@ -39,6 +39,7 @@ | |||
| #ifndef COMMON_MACRO | |||
| #define COMMON_MACRO | |||
| #include "common_sh.h" | |||
| #include "common_s.h" | |||
| #include "common_d.h" | |||
| #include "common_q.h" | |||
| @@ -642,6 +643,288 @@ | |||
| #define IMATCOPY_K_RT DIMATCOPY_K_RT | |||
| #define GEADD_K DGEADD_K | |||
| #elif defined(HALF) | |||
| #define AMAX_K SAMAX_K | |||
| #define AMIN_K SAMIN_K | |||
| #define MAX_K SMAX_K | |||
| #define MIN_K SMIN_K | |||
| #define IAMAX_K ISAMAX_K | |||
| #define IAMIN_K ISAMIN_K | |||
| #define IMAX_K ISMAX_K | |||
| #define IMIN_K ISMIN_K | |||
| #define ASUM_K SASUM_K | |||
| #define DOTU_K SDOTU_K | |||
| #define DOTC_K SDOTC_K | |||
| #define AXPYU_K SAXPYU_K | |||
| #define AXPYC_K SAXPYC_K | |||
| #define AXPBY_K SAXPBY_K | |||
| #define SCAL_K SSCAL_K | |||
| #define GEMV_N SGEMV_N | |||
| #define GEMV_T SGEMV_T | |||
| #define SYMV_U SSYMV_U | |||
| #define SYMV_L SSYMV_L | |||
| #define GERU_K SGERU_K | |||
| #define GERC_K SGERC_K | |||
| #define GERV_K SGERV_K | |||
| #define GERD_K SGERD_K | |||
| #define SUM_K SSUM_K | |||
| #define SWAP_K SSWAP_K | |||
| #define ROT_K SROT_K | |||
| #define COPY_K SCOPY_K | |||
| #define NRM2_K SNRM2_K | |||
| #define SYMV_THREAD_U SSYMV_THREAD_U | |||
| #define SYMV_THREAD_L SSYMV_THREAD_L | |||
| #define GEMM_BETA SHGEMM_BETA | |||
| #define GEMM_KERNEL_N SHGEMM_KERNEL | |||
| #define GEMM_KERNEL_L SHGEMM_KERNEL | |||
| #define GEMM_KERNEL_R SHGEMM_KERNEL | |||
| #define GEMM_KERNEL_B SHGEMM_KERNEL | |||
| #define GEMM_NN SHGEMM_NN | |||
| #define GEMM_CN SHGEMM_TN | |||
| #define GEMM_TN SHGEMM_TN | |||
| #define GEMM_NC SHGEMM_NT | |||
| #define GEMM_NT SHGEMM_NT | |||
| #define GEMM_CC SHGEMM_TT | |||
| #define GEMM_CT SHGEMM_TT | |||
| #define GEMM_TC SHGEMM_TT | |||
| #define GEMM_TT SHGEMM_TT | |||
| #define GEMM_NR SHGEMM_NN | |||
| #define GEMM_TR SHGEMM_TN | |||
| #define GEMM_CR SHGEMM_TN | |||
| #define GEMM_RN SHGEMM_NN | |||
| #define GEMM_RT SHGEMM_NT | |||
| #define GEMM_RC SHGEMM_NT | |||
| #define GEMM_RR SHGEMM_NN | |||
| #define GEMM_ONCOPY SHGEMM_ONCOPY | |||
| #define GEMM_OTCOPY SHGEMM_OTCOPY | |||
| #define GEMM_INCOPY SHGEMM_INCOPY | |||
| #define GEMM_ITCOPY SHGEMM_ITCOPY | |||
| #define SYMM_THREAD_LU SSYMM_THREAD_LU | |||
| #define SYMM_THREAD_LL SSYMM_THREAD_LL | |||
| #define SYMM_THREAD_RU SSYMM_THREAD_RU | |||
| #define SYMM_THREAD_RL SSYMM_THREAD_RL | |||
| #define SYMM_LU SSYMM_LU | |||
| #define SYMM_LL SSYMM_LL | |||
| #define SYMM_RU SSYMM_RU | |||
| #define SYMM_RL SSYMM_RL | |||
| #define HEMM_THREAD_LU SHEMM_THREAD_LU | |||
| #define HEMM_THREAD_LL SHEMM_THREAD_LL | |||
| #define HEMM_THREAD_RU SHEMM_THREAD_RU | |||
| #define HEMM_THREAD_RL SHEMM_THREAD_RL | |||
| #define GEMM_THREAD_NN SHGEMM_THREAD_NN | |||
| #define GEMM_THREAD_CN SHGEMM_THREAD_TN | |||
| #define GEMM_THREAD_TN SHGEMM_THREAD_TN | |||
| #define GEMM_THREAD_NC SHGEMM_THREAD_NT | |||
| #define GEMM_THREAD_NT SHGEMM_THREAD_NT | |||
| #define GEMM_THREAD_CC SHGEMM_THREAD_TT | |||
| #define GEMM_THREAD_CT SHGEMM_THREAD_TT | |||
| #define GEMM_THREAD_TC SHGEMM_THREAD_TT | |||
| #define GEMM_THREAD_TT SHGEMM_THREAD_TT | |||
| #define GEMM_THREAD_NR SHGEMM_THREAD_NN | |||
| #define GEMM_THREAD_TR SHGEMM_THREAD_TN | |||
| #define GEMM_THREAD_CR SHGEMM_THREAD_TN | |||
| #define GEMM_THREAD_RN SHGEMM_THREAD_NN | |||
| #define GEMM_THREAD_RT SHGEMM_THREAD_NT | |||
| #define GEMM_THREAD_RC SHGEMM_THREAD_NT | |||
| #define GEMM_THREAD_RR SHGEMM_THREAD_NN | |||
| #ifdef UNIT | |||
| #define TRMM_OUNCOPY STRMM_OUNUCOPY | |||
| #define TRMM_OUTCOPY STRMM_OUTUCOPY | |||
| #define TRMM_OLNCOPY STRMM_OLNUCOPY | |||
| #define TRMM_OLTCOPY STRMM_OLTUCOPY | |||
| #define TRSM_OUNCOPY STRSM_OUNUCOPY | |||
| #define TRSM_OUTCOPY STRSM_OUTUCOPY | |||
| #define TRSM_OLNCOPY STRSM_OLNUCOPY | |||
| #define TRSM_OLTCOPY STRSM_OLTUCOPY | |||
| #define TRMM_IUNCOPY STRMM_IUNUCOPY | |||
| #define TRMM_IUTCOPY STRMM_IUTUCOPY | |||
| #define TRMM_ILNCOPY STRMM_ILNUCOPY | |||
| #define TRMM_ILTCOPY STRMM_ILTUCOPY | |||
| #define TRSM_IUNCOPY STRSM_IUNUCOPY | |||
| #define TRSM_IUTCOPY STRSM_IUTUCOPY | |||
| #define TRSM_ILNCOPY STRSM_ILNUCOPY | |||
| #define TRSM_ILTCOPY STRSM_ILTUCOPY | |||
| #else | |||
| #define TRMM_OUNCOPY STRMM_OUNNCOPY | |||
| #define TRMM_OUTCOPY STRMM_OUTNCOPY | |||
| #define TRMM_OLNCOPY STRMM_OLNNCOPY | |||
| #define TRMM_OLTCOPY STRMM_OLTNCOPY | |||
| #define TRSM_OUNCOPY STRSM_OUNNCOPY | |||
| #define TRSM_OUTCOPY STRSM_OUTNCOPY | |||
| #define TRSM_OLNCOPY STRSM_OLNNCOPY | |||
| #define TRSM_OLTCOPY STRSM_OLTNCOPY | |||
| #define TRMM_IUNCOPY STRMM_IUNNCOPY | |||
| #define TRMM_IUTCOPY STRMM_IUTNCOPY | |||
| #define TRMM_ILNCOPY STRMM_ILNNCOPY | |||
| #define TRMM_ILTCOPY STRMM_ILTNCOPY | |||
| #define TRSM_IUNCOPY STRSM_IUNNCOPY | |||
| #define TRSM_IUTCOPY STRSM_IUTNCOPY | |||
| #define TRSM_ILNCOPY STRSM_ILNNCOPY | |||
| #define TRSM_ILTCOPY STRSM_ILTNCOPY | |||
| #define TRMM_KERNEL_LN STRMM_KERNEL_LN | |||
| #define TRMM_KERNEL_LT STRMM_KERNEL_LT | |||
| #define TRMM_KERNEL_LR STRMM_KERNEL_LN | |||
| #define TRMM_KERNEL_LC STRMM_KERNEL_LT | |||
| #define TRMM_KERNEL_RN STRMM_KERNEL_RN | |||
| #define TRMM_KERNEL_RT STRMM_KERNEL_RT | |||
| #define TRMM_KERNEL_RR STRMM_KERNEL_RN | |||
| #define TRMM_KERNEL_RC STRMM_KERNEL_RT | |||
| #define TRSM_KERNEL_LN STRSM_KERNEL_LN | |||
| #define TRSM_KERNEL_LT STRSM_KERNEL_LT | |||
| #define TRSM_KERNEL_LR STRSM_KERNEL_LN | |||
| #define TRSM_KERNEL_LC STRSM_KERNEL_LT | |||
| #define TRSM_KERNEL_RN STRSM_KERNEL_RN | |||
| #define TRSM_KERNEL_RT STRSM_KERNEL_RT | |||
| #define TRSM_KERNEL_RR STRSM_KERNEL_RN | |||
| #define TRSM_KERNEL_RC STRSM_KERNEL_RT | |||
| #define SYMM_IUTCOPY SSYMM_IUTCOPY | |||
| #define SYMM_ILTCOPY SSYMM_ILTCOPY | |||
| #define SYMM_OUTCOPY SSYMM_OUTCOPY | |||
| #define SYMM_OLTCOPY SSYMM_OLTCOPY | |||
| #define TRMM_LNUU STRMM_LNUU | |||
| #define TRMM_LNUN STRMM_LNUN | |||
| #define TRMM_LNLU STRMM_LNLU | |||
| #define TRMM_LNLN STRMM_LNLN | |||
| #define TRMM_LTUU STRMM_LTUU | |||
| #define TRMM_LTUN STRMM_LTUN | |||
| #define TRMM_LTLU STRMM_LTLU | |||
| #define TRMM_LTLN STRMM_LTLN | |||
| #define TRMM_LRUU STRMM_LNUU | |||
| #define TRMM_LRUN STRMM_LNUN | |||
| #define TRMM_LRLU STRMM_LNLU | |||
| #define TRMM_LRLN STRMM_LNLN | |||
| #define TRMM_LCUU STRMM_LTUU | |||
| #define TRMM_LCUN STRMM_LTUN | |||
| #define TRMM_LCLU STRMM_LTLU | |||
| #define TRMM_LCLN STRMM_LTLN | |||
| #define TRMM_RNUU STRMM_RNUU | |||
| #define TRMM_RNUN STRMM_RNUN | |||
| #define TRMM_RNLU STRMM_RNLU | |||
| #define TRMM_RNLN STRMM_RNLN | |||
| #define TRMM_RTUU STRMM_RTUU | |||
| #define TRMM_RTUN STRMM_RTUN | |||
| #define TRMM_RTLU STRMM_RTLU | |||
| #define TRMM_RTLN STRMM_RTLN | |||
| #define TRMM_RRUU STRMM_RNUU | |||
| #define TRMM_RRUN STRMM_RNUN | |||
| #define TRMM_RRLU STRMM_RNLU | |||
| #define TRMM_RRLN STRMM_RNLN | |||
| #define TRMM_RCUU STRMM_RTUU | |||
| #define TRMM_RCUN STRMM_RTUN | |||
| #define TRMM_RCLU STRMM_RTLU | |||
| #define TRMM_RCLN STRMM_RTLN | |||
| #define TRSM_LNUU STRSM_LNUU | |||
| #define TRSM_LNUN STRSM_LNUN | |||
| #define TRSM_LNLU STRSM_LNLU | |||
| #define TRSM_LNLN STRSM_LNLN | |||
| #define TRSM_LTUU STRSM_LTUU | |||
| #define TRSM_LTUN STRSM_LTUN | |||
| #define TRSM_LTLU STRSM_LTLU | |||
| #define TRSM_LTLN STRSM_LTLN | |||
| #define TRSM_LRUU STRSM_LNUU | |||
| #define TRSM_LRUN STRSM_LNUN | |||
| #define TRSM_LRLU STRSM_LNLU | |||
| #define TRSM_LRLN STRSM_LNLN | |||
| #define TRSM_LCUU STRSM_LTUU | |||
| #define TRSM_LCUN STRSM_LTUN | |||
| #define TRSM_LCLU STRSM_LTLU | |||
| #define TRSM_LCLN STRSM_LTLN | |||
| #define TRSM_RNUU STRSM_RNUU | |||
| #define TRSM_RNUN STRSM_RNUN | |||
| #define TRSM_RNLU STRSM_RNLU | |||
| #define TRSM_RNLN STRSM_RNLN | |||
| #define TRSM_RTUU STRSM_RTUU | |||
| #define TRSM_RTUN STRSM_RTUN | |||
| #define TRSM_RTLU STRSM_RTLU | |||
| #define TRSM_RTLN STRSM_RTLN | |||
| #define TRSM_RRUU STRSM_RNUU | |||
| #define TRSM_RRUN STRSM_RNUN | |||
| #define TRSM_RRLU STRSM_RNLU | |||
| #define TRSM_RRLN STRSM_RNLN | |||
| #define TRSM_RCUU STRSM_RTUU | |||
| #define TRSM_RCUN STRSM_RTUN | |||
| #define TRSM_RCLU STRSM_RTLU | |||
| #define TRSM_RCLN STRSM_RTLN | |||
| #define SYRK_UN SSYRK_UN | |||
| #define SYRK_UT SSYRK_UT | |||
| #define SYRK_LN SSYRK_LN | |||
| #define SYRK_LT SSYRK_LT | |||
| #define SYRK_UR SSYRK_UN | |||
| #define SYRK_UC SSYRK_UT | |||
| #define SYRK_LR SSYRK_LN | |||
| #define SYRK_LC SSYRK_LT | |||
| #define SYRK_KERNEL_U SSYRK_KERNEL_U | |||
| #define SYRK_KERNEL_L SSYRK_KERNEL_L | |||
| #define HERK_UN SSYRK_UN | |||
| #define HERK_LN SSYRK_LN | |||
| #define HERK_UC SSYRK_UT | |||
| #define HERK_LC SSYRK_LT | |||
| #define HER2K_UN SSYR2K_UN | |||
| #define HER2K_LN SSYR2K_LN | |||
| #define HER2K_UC SSYR2K_UT | |||
| #define HER2K_LC SSYR2K_LT | |||
| #define SYR2K_UN SSYR2K_UN | |||
| #define SYR2K_UT SSYR2K_UT | |||
| #define SYR2K_LN SSYR2K_LN | |||
| #define SYR2K_LT SSYR2K_LT | |||
| #define SYR2K_UR SSYR2K_UN | |||
| #define SYR2K_UC SSYR2K_UT | |||
| #define SYR2K_LR SSYR2K_LN | |||
| #define SYR2K_LC SSYR2K_LT | |||
| #define SYR2K_KERNEL_U SSYR2K_KERNEL_U | |||
| #define SYR2K_KERNEL_L SSYR2K_KERNEL_L | |||
| #define SYRK_THREAD_UN SSYRK_THREAD_UN | |||
| #define SYRK_THREAD_UT SSYRK_THREAD_UT | |||
| #define SYRK_THREAD_LN SSYRK_THREAD_LN | |||
| #define SYRK_THREAD_LT SSYRK_THREAD_LT | |||
| #define SYRK_THREAD_UR SSYRK_THREAD_UR | |||
| #define SYRK_THREAD_UC SSYRK_THREAD_UC | |||
| #define SYRK_THREAD_LR SSYRK_THREAD_LN | |||
| #define SYRK_THREAD_LC SSYRK_THREAD_LT | |||
| #define HERK_THREAD_UN SSYRK_THREAD_UN | |||
| #define HERK_THREAD_UT SSYRK_THREAD_UT | |||
| #define HERK_THREAD_LN SSYRK_THREAD_LN | |||
| #define HERK_THREAD_LT SSYRK_THREAD_LT | |||
| #define HERK_THREAD_UR SSYRK_THREAD_UR | |||
| #define HERK_THREAD_UC SSYRK_THREAD_UC | |||
| #define HERK_THREAD_LR SSYRK_THREAD_LN | |||
| #define HERK_THREAD_LC SSYRK_THREAD_LT | |||
| #define OMATCOPY_K_CN SOMATCOPY_K_CN | |||
| #define OMATCOPY_K_RN SOMATCOPY_K_RN | |||
| #define OMATCOPY_K_CT SOMATCOPY_K_CT | |||
| #define OMATCOPY_K_RT SOMATCOPY_K_RT | |||
| #define IMATCOPY_K_CN SIMATCOPY_K_CN | |||
| #define IMATCOPY_K_RN SIMATCOPY_K_RN | |||
| #define IMATCOPY_K_CT SIMATCOPY_K_CT | |||
| #define IMATCOPY_K_RT SIMATCOPY_K_RT | |||
| #define GEADD_K SGEADD_K | |||
| #endif | |||
| #else | |||
| #define AMAX_K SAMAX_K | |||
| @@ -673,14 +956,14 @@ | |||
| #define GEMV_S SGEMV_S | |||
| #define GEMV_D SGEMV_D | |||
| #define SYMV_U SSYMV_U | |||
| #define SYMV_L SSYMV_L | |||
| #define GERU_K SGERU_K | |||
| #define GERC_K SGERC_K | |||
| #define GERV_K SGERV_K | |||
| #define GERD_K SGERD_K | |||
| #define SYMV_U SSYMV_U | |||
| #define SYMV_L SSYMV_L | |||
| #define SYMV_THREAD_U SSYMV_THREAD_U | |||
| #define SYMV_THREAD_L SSYMV_THREAD_L | |||
| @@ -2202,6 +2485,9 @@ | |||
| #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) | |||
| extern BLASLONG gemm_offset_a; | |||
| extern BLASLONG gemm_offset_b; | |||
| extern BLASLONG shgemm_p; | |||
| extern BLASLONG shgemm_q; | |||
| extern BLASLONG shgemm_r; | |||
| extern BLASLONG sgemm_p; | |||
| extern BLASLONG sgemm_q; | |||
| extern BLASLONG sgemm_r; | |||
| @@ -43,6 +43,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #ifndef ASSEMBLER | |||
| #if !defined(MIPS24K) | |||
| static inline unsigned int rpcc(void){ | |||
| unsigned long ret; | |||
| @@ -53,6 +54,7 @@ static inline unsigned int rpcc(void){ | |||
| return ret; | |||
| } | |||
| #define RPCC_DEFINED | |||
| #endif | |||
| static inline int blas_quickdivide(blasint x, blasint y){ | |||
| return x / y; | |||
| @@ -92,7 +94,7 @@ REALNAME: | |||
| #endif | |||
| #define HUGE_PAGESIZE ( 4 << 20) | |||
| #define BUFFER_SIZE (16 << 20) | |||
| #define BUFFER_SIZE (16 << 21) | |||
| #define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER) | |||
| @@ -227,7 +227,7 @@ REALNAME: ;\ | |||
| #define SEEK_ADDRESS | |||
| #define BUFFER_SIZE ( 32 << 20) | |||
| #define BUFFER_SIZE ( 32 << 21) | |||
| #if defined(LOONGSON3A) | |||
| #define PAGESIZE (16UL << 10) | |||
| @@ -47,6 +47,100 @@ typedef struct { | |||
| int dtb_entries; | |||
| int offsetA, offsetB, align; | |||
| #ifdef BUILD_HALF | |||
| int shgemm_p, shgemm_q, shgemm_r; | |||
| int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn; | |||
| float (*shamax_k) (BLASLONG, float *, BLASLONG); | |||
| float (*shamin_k) (BLASLONG, float *, BLASLONG); | |||
| float (*shmax_k) (BLASLONG, float *, BLASLONG); | |||
| float (*shmin_k) (BLASLONG, float *, BLASLONG); | |||
| BLASLONG (*ishamax_k)(BLASLONG, float *, BLASLONG); | |||
| BLASLONG (*ishamin_k)(BLASLONG, float *, BLASLONG); | |||
| BLASLONG (*ishmax_k) (BLASLONG, float *, BLASLONG); | |||
| BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); | |||
| float (*shnrm2_k) (BLASLONG, float *, BLASLONG); | |||
| float (*shasum_k) (BLASLONG, float *, BLASLONG); | |||
| float (*shsum_k) (BLASLONG, float *, BLASLONG); | |||
| int (*shcopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| float (*shdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| double (*dshdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| int (*shrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); | |||
| int (*shaxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| int (*shscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| int (*shswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| int (*shgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int (*shgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int (*shger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int (*shsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int (*shsymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); | |||
| int (*shgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); | |||
| int (*shgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); | |||
| int (*shgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); | |||
| int (*shgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); | |||
| int (*shgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); | |||
| int (*shgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); | |||
| int (*shtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*shtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*shtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*shtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*shtrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*shtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*shtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*shtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); | |||
| int (*shtrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shtrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shsymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shsymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shsymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shsymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); | |||
| int (*shneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); | |||
| int (*shlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); | |||
| #endif | |||
| int sgemm_p, sgemm_q, sgemm_r; | |||
| int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn; | |||
| @@ -84,6 +178,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); | |||
| int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); | |||
| int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); | |||
| int (*sgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); | |||
| int (*sgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); | |||
| int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); | |||
| @@ -907,6 +1002,15 @@ extern gotoblas_t *gotoblas; | |||
| #define HAVE_EX_L2 gotoblas -> exclusive_cache | |||
| #ifdef BUILD_HALF | |||
| #define SHGEMM_P gotoblas -> shgemm_p | |||
| #define SHGEMM_Q gotoblas -> shgemm_q | |||
| #define SHGEMM_R gotoblas -> shgemm_r | |||
| #define SHGEMM_UNROLL_M gotoblas -> shgemm_unroll_m | |||
| #define SHGEMM_UNROLL_N gotoblas -> shgemm_unroll_n | |||
| #define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn | |||
| #endif | |||
| #define SGEMM_P gotoblas -> sgemm_p | |||
| #define SGEMM_Q gotoblas -> sgemm_q | |||
| #define SGEMM_R gotoblas -> sgemm_r | |||
| @@ -984,6 +1088,19 @@ extern gotoblas_t *gotoblas; | |||
| #define HAVE_EX_L2 0 | |||
| #endif | |||
| #ifdef BUILD_HALF | |||
| #define SHGEMM_P SHGEMM_DEFAULT_P | |||
| #define SHGEMM_Q SHGEMM_DEFAULT_Q | |||
| #define SHGEMM_R SHGEMM_DEFAULT_R | |||
| #define SHGEMM_UNROLL_M SHGEMM_DEFAULT_UNROLL_M | |||
| #define SHGEMM_UNROLL_N SHGEMM_DEFAULT_UNROLL_N | |||
| #ifdef SHGEMM_DEFAULT_UNROLL_MN | |||
| #define SHGEMM_UNROLL_MN SHGEMM_DEFAULT_UNROLL_MN | |||
| #else | |||
| #define SHGEMM_UNROLL_MN MAX((SHGEMM_UNROLL_M), (SHGEMM_UNROLL_N)) | |||
| #endif | |||
| #endif | |||
| #define SGEMM_P SGEMM_DEFAULT_P | |||
| #define SGEMM_Q SGEMM_DEFAULT_Q | |||
| #define SGEMM_R SGEMM_DEFAULT_R | |||
| @@ -1119,6 +1236,18 @@ extern gotoblas_t *gotoblas; | |||
| #define GEMM_DEFAULT_R DGEMM_DEFAULT_R | |||
| #define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M | |||
| #define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N | |||
| #elif defined(HALF) | |||
| #define GEMM_P SHGEMM_P | |||
| #define GEMM_Q SHGEMM_Q | |||
| #define GEMM_R SHGEMM_R | |||
| #define GEMM_UNROLL_M SHGEMM_UNROLL_M | |||
| #define GEMM_UNROLL_N SHGEMM_UNROLL_N | |||
| #define GEMM_UNROLL_MN SHGEMM_UNROLL_MN | |||
| #define GEMM_DEFAULT_P SHGEMM_DEFAULT_P | |||
| #define GEMM_DEFAULT_Q SHGEMM_DEFAULT_Q | |||
| #define GEMM_DEFAULT_R SHGEMM_DEFAULT_R | |||
| #define GEMM_DEFAULT_UNROLL_M SHGEMM_DEFAULT_UNROLL_M | |||
| #define GEMM_DEFAULT_UNROLL_N SHGEMM_DEFAULT_UNROLL_N | |||
| #else | |||
| #define GEMM_P SGEMM_P | |||
| #define GEMM_Q SGEMM_Q | |||
| @@ -1204,28 +1333,32 @@ extern gotoblas_t *gotoblas; | |||
| #define GEMM_THREAD gemm_thread_n | |||
| #endif | |||
| #ifndef SHGEMM_DEFAULT_R | |||
| #define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15UL) | |||
| #endif | |||
| #ifndef SGEMM_DEFAULT_R | |||
| #define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15) | |||
| #define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15UL) | |||
| #endif | |||
| #ifndef DGEMM_DEFAULT_R | |||
| #define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15) | |||
| #define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15UL) | |||
| #endif | |||
| #ifndef QGEMM_DEFAULT_R | |||
| #define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15) | |||
| #define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15UL) | |||
| #endif | |||
| #ifndef CGEMM_DEFAULT_R | |||
| #define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15) | |||
| #define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15UL) | |||
| #endif | |||
| #ifndef ZGEMM_DEFAULT_R | |||
| #define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15) | |||
| #define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15UL) | |||
| #endif | |||
| #ifndef XGEMM_DEFAULT_R | |||
| #define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15) | |||
| #define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15UL) | |||
| #endif | |||
| #ifndef SNUMOPT | |||
| @@ -68,7 +68,7 @@ | |||
| #endif | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #define MB __asm__ __volatile__ ("eieio":::"memory") | |||
| #define WMB __asm__ __volatile__ ("eieio":::"memory") | |||
| #define RMB __asm__ __volatile__ ("eieio":::"memory") | |||
| @@ -272,7 +272,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| #define HAVE_PREFETCH | |||
| #endif | |||
| #if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(PPC970) | |||
| #if defined(POWER3) || defined(POWER6) || defined(PPCG4) || defined(CELL) || defined(POWER8) || defined(POWER9) || defined(POWER10) || defined(PPC970) | |||
| #define DCBT_ARG 0 | |||
| #else | |||
| #define DCBT_ARG 8 | |||
| @@ -294,7 +294,7 @@ static inline int blas_quickdivide(blasint x, blasint y){ | |||
| #define L1_PREFETCH dcbtst | |||
| #endif | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #define L1_DUALFETCH | |||
| #define L1_PREFETCHSIZE (16 + 128 * 100) | |||
| #define L1_PREFETCH dcbtst | |||
| @@ -843,7 +843,7 @@ Lmcount$lazy_ptr: | |||
| #define BUFFER_SIZE ( 2 << 20) | |||
| #elif defined(PPC440FP2) | |||
| #define BUFFER_SIZE ( 16 << 20) | |||
| #elif defined(POWER8) || defined(POWER9) | |||
| #elif defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #define BUFFER_SIZE ( 64 << 20) | |||
| #else | |||
| #define BUFFER_SIZE ( 16 << 20) | |||
| @@ -0,0 +1,65 @@ | |||
| #ifndef COMMON_SH_H | |||
| #define COMMON_SH_H | |||
| #ifndef DYNAMIC_ARCH | |||
| #define SHGEMM_ONCOPY shgemm_oncopy | |||
| #define SHGEMM_OTCOPY shgemm_otcopy | |||
| #if SHGEMM_DEFAULT_UNROLL_M == SHGEMM_DEFAULT_UNROLL_N | |||
| #define SHGEMM_INCOPY shgemm_oncopy | |||
| #define SHGEMM_ITCOPY shgemm_otcopy | |||
| #else | |||
| #define SHGEMM_INCOPY shgemm_incopy | |||
| #define SHGEMM_ITCOPY shgemm_itcopy | |||
| #endif | |||
| #define SHGEMM_BETA shgemm_beta | |||
| #define SHGEMM_KERNEL shgemm_kernel | |||
| #else | |||
| #define SHGEMM_ONCOPY gotoblas -> shgemm_oncopy | |||
| #define SHGEMM_OTCOPY gotoblas -> shgemm_otcopy | |||
| #define SHGEMM_INCOPY gotoblas -> shgemm_incopy | |||
| #define SHGEMM_ITCOPY gotoblas -> shgemm_itcopy | |||
| #define SHGEMM_BETA gotoblas -> shgemm_beta | |||
| #define SHGEMM_KERNEL gotoblas -> shgemm_kernel | |||
| #endif | |||
| #define SHGEMM_NN shgemm_nn | |||
| #define SHGEMM_CN shgemm_tn | |||
| #define SHGEMM_TN shgemm_tn | |||
| #define SHGEMM_NC shgemm_nt | |||
| #define SHGEMM_NT shgemm_nt | |||
| #define SHGEMM_CC shgemm_tt | |||
| #define SHGEMM_CT shgemm_tt | |||
| #define SHGEMM_TC shgemm_tt | |||
| #define SHGEMM_TT shgemm_tt | |||
| #define SHGEMM_NR shgemm_nn | |||
| #define SHGEMM_TR shgemm_tn | |||
| #define SHGEMM_CR shgemm_tn | |||
| #define SHGEMM_RN shgemm_nn | |||
| #define SHGEMM_RT shgemm_nt | |||
| #define SHGEMM_RC shgemm_nt | |||
| #define SHGEMM_RR shgemm_nn | |||
| #define SHGEMM_THREAD_NN shgemm_thread_nn | |||
| #define SHGEMM_THREAD_CN shgemm_thread_tn | |||
| #define SHGEMM_THREAD_TN shgemm_thread_tn | |||
| #define SHGEMM_THREAD_NC shgemm_thread_nt | |||
| #define SHGEMM_THREAD_NT shgemm_thread_nt | |||
| #define SHGEMM_THREAD_CC shgemm_thread_tt | |||
| #define SHGEMM_THREAD_CT shgemm_thread_tt | |||
| #define SHGEMM_THREAD_TC shgemm_thread_tt | |||
| #define SHGEMM_THREAD_TT shgemm_thread_tt | |||
| #define SHGEMM_THREAD_NR shgemm_thread_nn | |||
| #define SHGEMM_THREAD_TR shgemm_thread_tn | |||
| #define SHGEMM_THREAD_CR shgemm_thread_tn | |||
| #define SHGEMM_THREAD_RN shgemm_thread_nn | |||
| #define SHGEMM_THREAD_RT shgemm_thread_nt | |||
| #define SHGEMM_THREAD_RC shgemm_thread_nt | |||
| #define SHGEMM_THREAD_RR shgemm_thread_nn | |||
| #endif | |||
| @@ -80,7 +80,7 @@ static void __inline blas_lock(volatile BLASULONG *address){ | |||
| #endif | |||
| do { | |||
| while (*address) {YIELDING;}; | |||
| while (*address) {YIELDING;} | |||
| #ifndef C_MSVC | |||
| __asm__ __volatile__( | |||
| @@ -199,9 +199,9 @@ static __inline BLASLONG blas_quickdivide(BLASLONG x, BLASLONG y){ | |||
| #else | |||
| extern unsigned int blas_quick_divide_table[]; | |||
| static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||
| static __inline unsigned int blas_quickdivide(unsigned int x, unsigned int y){ | |||
| unsigned int result; | |||
| volatile unsigned int result; | |||
| if (y <= 1) return x; | |||
| @@ -215,7 +215,6 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ | |||
| y = blas_quick_divide_table[y]; | |||
| __asm__ __volatile__ ("mull %0" :"=d" (result), "+a"(x) : "0" (y)); | |||
| return result; | |||
| } | |||
| #endif | |||
| @@ -5,6 +5,14 @@ inline void pauser(){ | |||
| std::getline(std::cin, dummy); | |||
| } | |||
| void FailIfThreadsAreZero(uint32_t numConcurrentThreads) { | |||
| if(numConcurrentThreads == 0) { | |||
| std::cout<<"ERROR: Invalid parameter 0 for number of concurrent calls into OpenBLAS!"<<std::endl; | |||
| std::cout<<"CBLAS DGEMV thread safety test FAILED!"<<std::endl; | |||
| exit(-1); | |||
| } | |||
| } | |||
| void FillMatrices(std::vector<std::vector<double>>& matBlock, std::mt19937_64& PRNG, std::uniform_real_distribution<double>& rngdist, const blasint randomMatSize, const uint32_t numConcurrentThreads, const uint32_t numMat){ | |||
| for(uint32_t i=0; i<numMat; i++){ | |||
| for(uint32_t j = 0; j < static_cast<uint32_t>(randomMatSize*randomMatSize); j++){ | |||
| @@ -46,6 +46,8 @@ int main(int argc, char* argv[]){ | |||
| std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n'; | |||
| std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n'; | |||
| std::cout<<"This test will need "<<(static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*3*8)/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl; | |||
| FailIfThreadsAreZero(numConcurrentThreads); | |||
| std::cout<<"Initializing random number generator..."<<std::flush; | |||
| std::mt19937_64 PRNG = InitPRNG(); | |||
| @@ -18,7 +18,7 @@ int main(int argc, char* argv[]){ | |||
| uint32_t maxHwThreads = omp_get_max_threads(); | |||
| if (maxHwThreads < 52) | |||
| numConcurrentThreads = maxHwThreads -4; | |||
| numConcurrentThreads = maxHwThreads; | |||
| if (argc > 4){ | |||
| std::cout<<"ERROR: too many arguments for thread safety tester"<<std::endl; | |||
| @@ -47,6 +47,8 @@ int main(int argc, char* argv[]){ | |||
| std::cout<<"Number of concurrent calls into OpenBLAS : "<<numConcurrentThreads<<'\n'; | |||
| std::cout<<"Number of testing rounds : "<<numTestRounds<<'\n'; | |||
| std::cout<<"This test will need "<<((static_cast<uint64_t>(randomMatSize*randomMatSize)*numConcurrentThreads*8)+(static_cast<uint64_t>(randomMatSize)*numConcurrentThreads*8*2))/static_cast<double>(1024*1024)<<" MiB of RAM\n"<<std::endl; | |||
| FailIfThreadsAreZero(numConcurrentThreads); | |||
| std::cout<<"Initializing random number generator..."<<std::flush; | |||
| std::mt19937_64 PRNG = InitPRNG(); | |||
| @@ -73,11 +73,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CPU_UNKNOWN 0 | |||
| #define CPU_P5600 1 | |||
| #define CPU_1004K 2 | |||
| #define CPU_24K 3 | |||
| static char *cpuname[] = { | |||
| "UNKNOWN", | |||
| "P5600", | |||
| "1004K" | |||
| "MIPS1004K", | |||
| "MIPS24K" | |||
| }; | |||
| int detect(void){ | |||
| @@ -105,6 +107,8 @@ int detect(void){ | |||
| return CPU_P5600; | |||
| } else if (strstr(p, "1004K")) { | |||
| return CPU_1004K; | |||
| } else if (strstr(p, " 24K")) { | |||
| return CPU_24K; | |||
| } else | |||
| return CPU_UNKNOWN; | |||
| } | |||
| @@ -121,7 +125,7 @@ void get_architecture(void){ | |||
| } | |||
| void get_subarchitecture(void){ | |||
| if(detect()==CPU_P5600|| detect()==CPU_1004K){ | |||
| if(detect()==CPU_P5600|| detect()==CPU_1004K|| detect()==CPU_24K){ | |||
| printf("P5600"); | |||
| }else{ | |||
| printf("UNKNOWN"); | |||
| @@ -146,7 +150,15 @@ void get_cpuconfig(void){ | |||
| printf("#define MIPS1004K\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 32\n"); | |||
| printf("#define L2_SIZE 26144\n"); | |||
| printf("#define L2_SIZE 262144\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 8\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||
| } else if (detect()==CPU_24K) { | |||
| printf("#define MIPS24K\n"); | |||
| printf("#define L1_DATA_SIZE 32768\n"); | |||
| printf("#define L1_DATA_LINESIZE 32\n"); | |||
| printf("#define L2_SIZE 32768\n"); | |||
| printf("#define DTB_DEFAULT_ENTRIES 8\n"); | |||
| printf("#define DTB_SIZE 4096\n"); | |||
| printf("#define L2_ASSOCIATIVE 4\n"); | |||
| @@ -159,7 +171,9 @@ void get_libname(void){ | |||
| if(detect()==CPU_P5600) { | |||
| printf("p5600\n"); | |||
| } else if (detect()==CPU_1004K) { | |||
| printf("1004K\n"); | |||
| printf("mips1004K\n"); | |||
| } else if (detect()==CPU_24K) { | |||
| printf("mips24K\n"); | |||
| }else{ | |||
| printf("mips\n"); | |||
| } | |||
| @@ -57,6 +57,7 @@ | |||
| #define CPUTYPE_PPCG4 7 | |||
| #define CPUTYPE_POWER8 8 | |||
| #define CPUTYPE_POWER9 9 | |||
| #define CPUTYPE_POWER10 10 | |||
| char *cpuname[] = { | |||
| "UNKNOWN", | |||
| @@ -68,7 +69,8 @@ char *cpuname[] = { | |||
| "CELL", | |||
| "PPCG4", | |||
| "POWER8", | |||
| "POWER9" | |||
| "POWER9", | |||
| "POWER10" | |||
| }; | |||
| char *lowercpuname[] = { | |||
| @@ -81,7 +83,8 @@ char *lowercpuname[] = { | |||
| "cell", | |||
| "ppcg4", | |||
| "power8", | |||
| "power9" | |||
| "power9", | |||
| "power10" | |||
| }; | |||
| char *corename[] = { | |||
| @@ -94,7 +97,8 @@ char *corename[] = { | |||
| "CELL", | |||
| "PPCG4", | |||
| "POWER8", | |||
| "POWER9" | |||
| "POWER9", | |||
| "POWER10" | |||
| }; | |||
| int detect(void){ | |||
| @@ -125,6 +129,7 @@ int detect(void){ | |||
| if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; | |||
| if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; | |||
| if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; | |||
| if (!strncasecmp(p, "POWER10", 7)) return CPUTYPE_POWER10; | |||
| if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; | |||
| if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; | |||
| @@ -157,6 +162,7 @@ int detect(void){ | |||
| if (!strncasecmp(p, "POWER7", 6)) return CPUTYPE_POWER6; | |||
| if (!strncasecmp(p, "POWER8", 6)) return CPUTYPE_POWER8; | |||
| if (!strncasecmp(p, "POWER9", 6)) return CPUTYPE_POWER9; | |||
| if (!strncasecmp(p, "POWER10", 7)) return CPUTYPE_POWER10; | |||
| if (!strncasecmp(p, "Cell", 4)) return CPUTYPE_CELL; | |||
| if (!strncasecmp(p, "7447", 4)) return CPUTYPE_PPCG4; | |||
| return CPUTYPE_POWER5; | |||
| @@ -179,6 +185,9 @@ int detect(void){ | |||
| int id; | |||
| __asm __volatile("mfpvr %0" : "=r"(id)); | |||
| switch ( id >> 16 ) { | |||
| case 0x80: // POWER10 | |||
| return CPUTYPE_POWER10; | |||
| break; | |||
| case 0x4e: // POWER9 | |||
| return CPUTYPE_POWER9; | |||
| break; | |||
| @@ -1406,6 +1406,17 @@ int get_cpuname(void){ | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| case 10: //family 6 exmodel 10 | |||
| switch (model) { | |||
| case 5: // Comet Lake H and S | |||
| case 6: // Comet Lake U | |||
| if(support_avx2()) | |||
| return CPUTYPE_HASWELL; | |||
| if(support_avx()) | |||
| return CPUTYPE_SANDYBRIDGE; | |||
| else | |||
| return CPUTYPE_NEHALEM; | |||
| } | |||
| break; | |||
| } | |||
| @@ -1955,6 +1966,19 @@ int get_coretype(void){ | |||
| return CORE_NEHALEM; | |||
| } | |||
| break; | |||
| case 10: | |||
| switch (model) { | |||
| case 5: // Comet Lake H and S | |||
| case 6: // Comet Lake U | |||
| if(support_avx()) | |||
| #ifndef NO_AVX2 | |||
| return CORE_HASWELL; | |||
| #else | |||
| return CORE_SANDYBRIDGE; | |||
| #endif | |||
| else | |||
| return CORE_NEHALEM; | |||
| } | |||
| case 5: | |||
| switch (model) { | |||
| case 6: | |||
| @@ -12,6 +12,9 @@ FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/test_cblas_helper.sh | |||
| foreach(float_type ${FLOAT_TYPES}) | |||
| string(SUBSTRING ${float_type} 0 1 float_char_upper) | |||
| string(TOLOWER ${float_char_upper} float_char) | |||
| if (${float_char} STREQUAL "h") | |||
| continue() | |||
| endif() | |||
| #level1 | |||
| add_executable(x${float_char}cblat1 | |||
| c_${float_char}blat1.f | |||
| @@ -19,6 +19,10 @@ ifeq ($(ARCH), MIPS) | |||
| USE_GEMM3M = 1 | |||
| endif | |||
| ifeq ($(BUILD_HALF),1) | |||
| SHBLASOBJS += shgemm_nn.$(SUFFIX) shgemm_nt.$(SUFFIX) shgemm_tn.$(SUFFIX) shgemm_tt.$(SUFFIX) | |||
| endif | |||
| SBLASOBJS += \ | |||
| sgemm_nn.$(SUFFIX) sgemm_nt.$(SUFFIX) sgemm_tn.$(SUFFIX) sgemm_tt.$(SUFFIX) \ | |||
| strmm_LNUU.$(SUFFIX) strmm_LNUN.$(SUFFIX) strmm_LNLU.$(SUFFIX) strmm_LNLN.$(SUFFIX) \ | |||
| @@ -203,7 +207,9 @@ COMMONOBJS += gemm_thread_m.$(SUFFIX) gemm_thread_n.$(SUFFIX) gemm_thread_mn.$( | |||
| COMMONOBJS += syrk_thread.$(SUFFIX) | |||
| ifndef USE_SIMPLE_THREADED_LEVEL3 | |||
| ifeq ($(BUILD_HALF),1) | |||
| SHBLASOBJS += shgemm_thread_nn.$(SUFFIX) shgemm_thread_nt.$(SUFFIX) shgemm_thread_tn.$(SUFFIX) shgemm_thread_tt.$(SUFFIX) | |||
| endif | |||
| SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX) | |||
| DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) | |||
| QBLASOBJS += qgemm_thread_nn.$(SUFFIX) qgemm_thread_nt.$(SUFFIX) qgemm_thread_tn.$(SUFFIX) qgemm_thread_tt.$(SUFFIX) | |||
| @@ -283,6 +289,18 @@ endif | |||
| all :: | |||
| shgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) | |||
| shgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) | |||
| shgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) | |||
| shgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) | |||
| sgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) | |||
| @@ -478,6 +496,17 @@ gemm_thread_variable.$(SUFFIX) : gemm_thread_variable.c ../../common.h | |||
| beta_thread.$(SUFFIX) : beta_thread.c ../../common.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| shgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) | |||
| shgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) | |||
| shgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) | |||
| shgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) | |||
| sgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) | |||
| @@ -2652,6 +2681,18 @@ xtrsm_RCLU.$(SUFFIX) : trsm_R.c | |||
| xtrsm_RCLN.$(SUFFIX) : trsm_R.c | |||
| $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) | |||
| shgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) | |||
| shgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) | |||
| shgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) | |||
| shgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) | |||
| sgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) | |||
| @@ -2848,6 +2889,18 @@ beta_thread.$(PSUFFIX) : beta_thread.c ../../common.h | |||
| $(CC) -c $(PFLAGS) $< -o $(@F) | |||
| shgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) | |||
| shgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) | |||
| shgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) | |||
| shgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) | |||
| sgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h | |||
| $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) | |||
| @@ -62,18 +62,18 @@ | |||
| #ifndef ICOPY_OPERATION | |||
| #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ | |||
| defined(RN) || defined(RT) || defined(RC) || defined(RR) | |||
| #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||
| #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||
| #else | |||
| #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||
| #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||
| #endif | |||
| #endif | |||
| #ifndef OCOPY_OPERATION | |||
| #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ | |||
| defined(NR) || defined(TR) || defined(CR) || defined(RR) | |||
| #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||
| #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||
| #else | |||
| #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||
| #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||
| #endif | |||
| #endif | |||
| @@ -173,7 +173,8 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| XFLOAT *sa, XFLOAT *sb, BLASLONG dummy){ | |||
| BLASLONG k, lda, ldb, ldc; | |||
| FLOAT *alpha, *beta; | |||
| FLOAT *a, *b, *c; | |||
| IFLOAT *a, *b; | |||
| FLOAT *c; | |||
| BLASLONG m_from, m_to, n_from, n_to; | |||
| BLASLONG ls, is, js; | |||
| @@ -198,8 +199,8 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| k = K; | |||
| a = (FLOAT *)A; | |||
| b = (FLOAT *)B; | |||
| a = (IFLOAT *)A; | |||
| b = (IFLOAT *)B; | |||
| c = (FLOAT *)C; | |||
| lda = LDA; | |||
| @@ -117,18 +117,18 @@ typedef struct { | |||
| #ifndef ICOPY_OPERATION | |||
| #if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ | |||
| defined(RN) || defined(RT) || defined(RC) || defined(RR) | |||
| #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||
| #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ITCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||
| #else | |||
| #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||
| #define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_INCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||
| #endif | |||
| #endif | |||
| #ifndef OCOPY_OPERATION | |||
| #if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \ | |||
| defined(NR) || defined(TR) || defined(CR) || defined(RR) | |||
| #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (FLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||
| #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_ONCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||
| #else | |||
| #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (FLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||
| #define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) GEMM_OTCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); | |||
| #endif | |||
| #endif | |||
| @@ -219,15 +219,16 @@ typedef struct { | |||
| #define STOP_RPCC(COUNTER) | |||
| #endif | |||
| static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ | |||
| static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IFLOAT *sb, BLASLONG mypos){ | |||
| FLOAT *buffer[DIVIDE_RATE]; | |||
| IFLOAT *buffer[DIVIDE_RATE]; | |||
| BLASLONG k, lda, ldb, ldc; | |||
| BLASLONG m_from, m_to, n_from, n_to; | |||
| FLOAT *alpha, *beta; | |||
| FLOAT *a, *b, *c; | |||
| IFLOAT *a, *b; | |||
| FLOAT *c; | |||
| job_t *job = (job_t *)args -> common; | |||
| BLASLONG nthreads_m; | |||
| @@ -255,8 +256,8 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| k = K; | |||
| a = (FLOAT *)A; | |||
| b = (FLOAT *)B; | |||
| a = (IFLOAT *)A; | |||
| b = (IFLOAT *)B; | |||
| c = (FLOAT *)C; | |||
| lda = LDA; | |||
| @@ -425,7 +426,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| /* Apply kernel with local region of A and part of other region of B */ | |||
| START_RPCC(); | |||
| KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - js, div_n), min_l, alpha, | |||
| sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], | |||
| sa, (IFLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], | |||
| c, ldc, m_from, js); | |||
| STOP_RPCC(kernel); | |||
| @@ -469,7 +470,7 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | |||
| /* Apply kernel with local region of A and part of region of B */ | |||
| START_RPCC(); | |||
| KERNEL_OPERATION(min_i, MIN(range_n[current + 1] - js, div_n), min_l, alpha, | |||
| sa, (FLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], | |||
| sa, (IFLOAT *)job[current].working[mypos][CACHE_LINE_SIZE * bufferside], | |||
| c, ldc, is, js); | |||
| STOP_RPCC(kernel); | |||
| @@ -532,7 +533,7 @@ static int round_up(int remainder, int width, int multiple) | |||
| static int gemm_driver(blas_arg_t *args, BLASLONG *range_m, BLASLONG | |||
| *range_n, FLOAT *sa, FLOAT *sb, | |||
| *range_n, IFLOAT *sa, IFLOAT *sb, | |||
| BLASLONG nthreads_m, BLASLONG nthreads_n) { | |||
| #ifndef USE_OPENMP | |||
| @@ -728,7 +729,7 @@ EnterCriticalSection((PCRITICAL_SECTION)&level3_lock); | |||
| return 0; | |||
| } | |||
| int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLOAT *sb, BLASLONG mypos){ | |||
| int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, IFLOAT *sa, IFLOAT *sb, BLASLONG mypos){ | |||
| BLASLONG m = args -> m; | |||
| BLASLONG n = args -> n; | |||
| @@ -272,7 +272,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ | |||
| } | |||
| } | |||
| #if defined(OS_LINUX) && !defined(NO_AFFINITY) | |||
| #if defined(OS_LINUX) && !defined(NO_AFFINITY) | |||
| int gotoblas_set_affinity(int); | |||
| int gotoblas_set_affinity2(int); | |||
| int get_node(void); | |||
| @@ -281,6 +281,8 @@ int get_node(void); | |||
| static int increased_threads = 0; | |||
| #ifdef OS_LINUX | |||
| extern int openblas_get_num_threads(void); | |||
| int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) { | |||
| const int active_threads = openblas_get_num_threads(); | |||
| @@ -602,7 +604,7 @@ int blas_thread_init(void){ | |||
| if(ret!=0){ | |||
| struct rlimit rlim; | |||
| const char *msg = strerror(ret); | |||
| fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %ld: %s\n", i+1,blas_num_threads,msg); | |||
| fprintf(STDERR, "OpenBLAS blas_thread_init: pthread_create failed for thread %ld of %d: %s\n", i+1,blas_num_threads,msg); | |||
| #ifdef RLIMIT_NPROC | |||
| if(0 == getrlimit(RLIMIT_NPROC, &rlim)) { | |||
| fprintf(STDERR, "OpenBLAS blas_thread_init: RLIMIT_NPROC " | |||
| @@ -332,7 +332,7 @@ int support_avx512(){ | |||
| if((ebx & (1<<7)) == 0){ | |||
| ret=0; //OS does not even support AVX2 | |||
| } | |||
| if((ebx & (1<<31)) != 0){ | |||
| if((ebx & (1u<<31)) != 0){ | |||
| xgetbv(0, &eax, &edx); | |||
| if((eax & 0xe0) == 0xe0) | |||
| ret=1; //OS supports AVX512VL | |||
| @@ -618,6 +618,18 @@ static gotoblas_t *get_coretype(void){ | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| case 10: | |||
| if (model == 5 || model == 6) { | |||
| if(support_avx2()) | |||
| return &gotoblas_HASWELL; | |||
| if(support_avx()) { | |||
| openblas_warning(FALLBACK_VERBOSE, SANDYBRIDGE_FALLBACK); | |||
| return &gotoblas_SANDYBRIDGE; | |||
| } else { | |||
| openblas_warning(FALLBACK_VERBOSE, NEHALEM_FALLBACK); | |||
| return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. | |||
| } | |||
| } | |||
| return NULL; | |||
| } | |||
| case 0xf: | |||
| @@ -632,7 +644,7 @@ static gotoblas_t *get_coretype(void){ | |||
| cpuid(0x80000000, &eax, &ebx, &ecx, &edx); | |||
| if ( (eax & 0xffff) >= 0x01) { | |||
| cpuid(0x80000001, &eax, &ebx, &ecx, &edx); | |||
| if ((edx & (1 << 30)) == 0 || (edx & (1 << 31)) == 0) | |||
| if ((edx & (1 << 30)) == 0 || (edx & (1u << 31)) == 0) | |||
| return NULL; | |||
| } | |||
| else | |||
| @@ -764,18 +776,53 @@ char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_NORTHWOOD) return corename[ 3]; | |||
| if (gotoblas == &gotoblas_PRESCOTT) return corename[ 4]; | |||
| if (gotoblas == &gotoblas_BANIAS) return corename[ 5]; | |||
| if (gotoblas == &gotoblas_ATOM) return corename[ 6]; | |||
| if (gotoblas == &gotoblas_ATOM) | |||
| #ifdef DYNAMIC_OLDER | |||
| return corename[ 6]; | |||
| #else | |||
| return corename[10]; | |||
| #endif | |||
| if (gotoblas == &gotoblas_CORE2) return corename[ 7]; | |||
| if (gotoblas == &gotoblas_PENRYN) return corename[ 8]; | |||
| if (gotoblas == &gotoblas_DUNNINGTON) return corename[ 9]; | |||
| if (gotoblas == &gotoblas_PENRYN) | |||
| #ifdef DYNAMIC_OLDER | |||
| return corename[ 8]; | |||
| #else | |||
| return corename[7]; | |||
| #endif | |||
| if (gotoblas == &gotoblas_DUNNINGTON) | |||
| #ifdef DYNAMIC_OLDER | |||
| return corename[ 9]; | |||
| #else | |||
| return corename[7]; | |||
| #endif | |||
| if (gotoblas == &gotoblas_NEHALEM) return corename[10]; | |||
| if (gotoblas == &gotoblas_ATHLON) return corename[11]; | |||
| if (gotoblas == &gotoblas_OPTERON_SSE3) return corename[12]; | |||
| if (gotoblas == &gotoblas_OPTERON) return corename[13]; | |||
| if (gotoblas == &gotoblas_OPTERON_SSE3) | |||
| #ifdef DYNAMIC_OLDER | |||
| return corename[12]; | |||
| #else | |||
| return corename[7]; | |||
| #endif | |||
| if (gotoblas == &gotoblas_OPTERON) | |||
| #ifdef DYNAMIC_OLDER | |||
| return corename[13]; | |||
| #else | |||
| return corename[7]; | |||
| #endif | |||
| if (gotoblas == &gotoblas_BARCELONA) return corename[14]; | |||
| if (gotoblas == &gotoblas_NANO) return corename[15]; | |||
| if (gotoblas == &gotoblas_NANO) | |||
| #ifdef DYNAMIC_OLDER | |||
| return corename[15]; | |||
| #else | |||
| return corename[10]; | |||
| #endif | |||
| if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; | |||
| if (gotoblas == &gotoblas_BOBCAT) return corename[17]; | |||
| if (gotoblas == &gotoblas_BOBCAT) | |||
| #ifdef DYNAMIC_OLDER | |||
| return corename[17]; | |||
| #else | |||
| return corename[7]; | |||
| #endif | |||
| if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; | |||
| if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; | |||
| if (gotoblas == &gotoblas_HASWELL) return corename[20]; | |||
| @@ -787,6 +834,7 @@ char *gotoblas_corename(void) { | |||
| } | |||
| static gotoblas_t *force_coretype(char *coretype){ | |||
| int i ; | |||
| @@ -6,6 +6,9 @@ extern gotoblas_t gotoblas_POWER8; | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | |||
| extern gotoblas_t gotoblas_POWER9; | |||
| #endif | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 11) | |||
| extern gotoblas_t gotoblas_POWER10; | |||
| #endif | |||
| extern void openblas_warning(int verbose, const char *msg); | |||
| @@ -13,7 +16,8 @@ static char *corename[] = { | |||
| "unknown", | |||
| "POWER6", | |||
| "POWER8", | |||
| "POWER9" | |||
| "POWER9", | |||
| "POWER10" | |||
| }; | |||
| // Must equal the number of entries in corename[] above: | |||
| // "unknown", "POWER6", "POWER8", "POWER9", "POWER10". | |||
| // With the old value of 4, index 4 ("POWER10") lies outside the counted range. | |||
| #define NUM_CORETYPES 5 | |||
| @@ -23,6 +27,9 @@ char *gotoblas_corename(void) { | |||
| if (gotoblas == &gotoblas_POWER8) return corename[2]; | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | |||
| if (gotoblas == &gotoblas_POWER9) return corename[3]; | |||
| #endif | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 11) | |||
| if (gotoblas == &gotoblas_POWER10) return corename[4]; | |||
| #endif | |||
| return corename[0]; | |||
| } | |||
| @@ -36,6 +43,10 @@ static gotoblas_t *get_coretype(void) { | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | |||
| if (__builtin_cpu_is("power9")) | |||
| return &gotoblas_POWER9; | |||
| #endif | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 11) | |||
| // POWER10 is selected by capability rather than by CPU name: both the | |||
| // ISA 3.1 feature bit and MMA must be reported. These are feature names, | |||
| // so they have to be queried with __builtin_cpu_supports(); | |||
| // __builtin_cpu_is() only accepts CPU names such as "power9". | |||
| if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")) | |||
| return &gotoblas_POWER10; | |||
| #endif | |||
| return NULL; | |||
| } | |||
| @@ -61,6 +72,9 @@ static gotoblas_t *force_coretype(char * coretype) { | |||
| case 2: return (&gotoblas_POWER8); | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 6) | |||
| case 3: return (&gotoblas_POWER9); | |||
| #endif | |||
| #if (!defined __GNUC__) || ( __GNUC__ >= 11) | |||
| case 4: return (&gotoblas_POWER10); | |||
| #endif | |||
| default: return NULL; | |||
| } | |||
| @@ -1,12 +1,58 @@ | |||
| #include "common.h" | |||
| #include <stdbool.h> | |||
| // Gate kernels for z13 and z14 on gcc version | |||
| // HAVE_Z13_SUPPORT: gcc >= 5.2, or the gcc 4.8.5 build identified by | |||
| // __GNUC_RH_RELEASE__ >= 11. An undefined __GNUC_RH_RELEASE__ evaluates to 0 | |||
| // inside #if, so that clause is simply false on other compilers. | |||
| // HAVE_Z14_SUPPORT: gcc >= 7. | |||
| #if (__GNUC__ == 5 && __GNUC_MINOR__ >= 2) || __GNUC__ >= 6 || \ | |||
| /* RHEL 7 since 7.3: */ \ | |||
| (__GNUC__ == 4 && __GNUC_MINOR__ == 8 && __GNUC_PATCHLEVEL__ == 5 && \ | |||
| __GNUC_RH_RELEASE__ >= 11) | |||
| #define HAVE_Z13_SUPPORT | |||
| #endif | |||
| #if __GNUC__ >= 7 | |||
| #define HAVE_Z14_SUPPORT | |||
| #endif | |||
| // Guard the use of getauxval() on glibc version >= 2.16 | |||
| // NOTE(review): when __GLIBC__ is not defined at all, this block defines no | |||
| // get_hwcap(), yet get_coretype() calls it - confirm that configuration is | |||
| // never built, or add a fallback for it. | |||
| #ifdef __GLIBC__ | |||
| #include <features.h> | |||
| #if __GLIBC_PREREQ(2, 16) | |||
| #include <sys/auxv.h> | |||
| #define HAVE_GETAUXVAL 1 | |||
| /** | |||
| * Return the hardware-capability bits from the auxiliary value AT_HWCAP, | |||
| * ANDed with the numeric value of the LD_HWCAP_MASK environment variable | |||
| * when that variable is set. | |||
| */ | |||
| static unsigned long get_hwcap(void) | |||
| { | |||
| // note that a missing auxval is interpreted as no capabilities | |||
| // available, which is safe. | |||
| unsigned long hwcap = getauxval(AT_HWCAP); | |||
| char *maskenv; | |||
| // honor requests for not using specific CPU features in LD_HWCAP_MASK | |||
| // (base 0: the mask may be written in decimal, octal or hexadecimal) | |||
| maskenv = getenv("LD_HWCAP_MASK"); | |||
| if (maskenv) | |||
| hwcap &= strtoul(maskenv, NULL, 0); | |||
| return hwcap; | |||
| } | |||
| #else // __GLIBC_PREREQ(2, 16) | |||
| // "#warning" is the diagnostic directive; "#warn" is not a valid directive | |||
| // and makes this branch fail to preprocess instead of emitting a warning. | |||
| #warning "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16" | |||
| /** | |||
| * Fallback for glibc without getauxval(): report no capabilities. | |||
| */ | |||
| static unsigned long get_hwcap(void) { | |||
| // treat missing support for getauxval() as no capabilities available, | |||
| // which is safe. | |||
| return 0; | |||
| } | |||
| #endif // __GLIBC_PREREQ(2, 16) | |||
| #endif // __GLIBC | |||
| extern gotoblas_t gotoblas_ZARCH_GENERIC; | |||
| #ifdef HAVE_Z13_SUPPORT | |||
| extern gotoblas_t gotoblas_Z13; | |||
| #endif | |||
| #ifdef HAVE_Z14_SUPPORT | |||
| extern gotoblas_t gotoblas_Z14; | |||
| //extern gotoblas_t gotoblas_Z15; | |||
| //#if (!defined C_GCC) || (GCC_VERSION >= 60000) | |||
| //extern gotoblas_t gotoblas_Z14; | |||
| //#endif | |||
| #endif | |||
| #define NUM_CORETYPES 4 | |||
| @@ -16,47 +62,50 @@ static char* corename[] = { | |||
| "unknown", | |||
| "Z13", | |||
| "Z14", | |||
| // "Z15", | |||
| "ZARCH_GENERIC", | |||
| }; | |||
| char* gotoblas_corename(void) { | |||
| #ifdef HAVE_Z13_SUPPORT | |||
| if (gotoblas == &gotoblas_Z13) return corename[1]; | |||
| #endif | |||
| #ifdef HAVE_Z14_SUPPORT | |||
| if (gotoblas == &gotoblas_Z14) return corename[2]; | |||
| // if (gotoblas == &gotoblas_Z15) return corename[3]; | |||
| //#if (!defined C_GCC) || (GCC_VERSION >= 60000) | |||
| // if (gotoblas == &gotoblas_POWER9) return corename[3]; | |||
| //#endif | |||
| return corename[0]; // try generic? | |||
| #endif | |||
| if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3]; | |||
| return corename[0]; | |||
| } | |||
| // __builtin_cpu_is is not supported by zarch | |||
| /** | |||
| * Detect the fitting set of kernels by retrieving the CPU features supported by | |||
| * OS from the auxiliary value AT_HWCAP and choosing the set of kernels | |||
| * ("coretype") that exploits most of the features and can be compiled with the | |||
| * available gcc version. | |||
| * Note that we cannot use vector registers on a z13 or newer unless supported | |||
| * by the OS kernel (which needs to handle them properly during context switch). | |||
| */ | |||
| static gotoblas_t* get_coretype(void) { | |||
| FILE* infile; | |||
| char buffer[512], * p; | |||
| p = (char*)NULL; | |||
| infile = fopen("/proc/sysinfo", "r"); | |||
| while (fgets(buffer, sizeof(buffer), infile)) { | |||
| if (!strncmp("Type", buffer, 4)) { | |||
| p = strchr(buffer, ':') + 2; | |||
| #if 0 | |||
| fprintf(stderr, "%s\n", p); | |||
| #endif | |||
| break; | |||
| } | |||
| } | |||
| fclose(infile); | |||
| unsigned long hwcap __attribute__((unused)) = get_hwcap(); | |||
| if (strstr(p, "2964")) return &gotoblas_Z13; | |||
| if (strstr(p, "2965")) return &gotoblas_Z13; | |||
| if (strstr(p, "3906")) return &gotoblas_Z14; | |||
| if (strstr(p, "3907")) return &gotoblas_Z14; | |||
| if (strstr(p, "8561")) return &gotoblas_Z14; // fallback z15 to z14 | |||
| if (strstr(p, "8562")) return &gotoblas_Z14; // fallback z15 to z14 | |||
| // z14 and z15 systems: exploit Vector Facility (SIMD) and | |||
| // Vector-Enhancements Facility 1 (float SIMD instructions), if present. | |||
| #ifdef HAVE_Z14_SUPPORT | |||
| if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE)) | |||
| return &gotoblas_Z14; | |||
| #endif | |||
| // z13: Vector Facility (SIMD for double) | |||
| #ifdef HAVE_Z13_SUPPORT | |||
| if (hwcap & HWCAP_S390_VX) | |||
| return &gotoblas_Z13; | |||
| #endif | |||
| return NULL; // should be ZARCH_GENERIC | |||
| // fallback in case of missing compiler support, systems before z13, or | |||
| // when the OS does not advertise support for the Vector Facility (e.g., | |||
| // missing support in the OS kernel) | |||
| return &gotoblas_ZARCH_GENERIC; | |||
| } | |||
| static gotoblas_t* force_coretype(char* coretype) { | |||
| @@ -76,12 +125,13 @@ static gotoblas_t* force_coretype(char* coretype) { | |||
| switch (found) | |||
| { | |||
| #ifdef HAVE_Z13_SUPPORT | |||
| case 1: return (&gotoblas_Z13); | |||
| #endif | |||
| #ifdef HAVE_Z14_SUPPORT | |||
| case 2: return (&gotoblas_Z14); | |||
| // case 3: return (&gotoblas_Z15); | |||
| //#if (!defined C_GCC) || (GCC_VERSION >= 60000) | |||
| // case 3: return (&gotoblas_POWER9); | |||
| //#endif | |||
| #endif | |||
| case 3: return (&gotoblas_ZARCH_GENERIC); | |||
| default: return NULL; | |||
| } | |||
| snprintf(message, 128, "Core not found: %s\n", coretype); | |||
| @@ -109,9 +159,9 @@ void gotoblas_dynamic_init(void) { | |||
| if (gotoblas == NULL) | |||
| { | |||
| snprintf(coremsg, 128, "Falling back to Z14 core\n"); | |||
| snprintf(coremsg, 128, "Failed to detect system, falling back to generic z support.\n"); | |||
| openblas_warning(1, coremsg); | |||
| gotoblas = &gotoblas_Z14; | |||
| gotoblas = &gotoblas_ZARCH_GENERIC; | |||
| } | |||
| if (gotoblas && gotoblas->init) { | |||
| @@ -2070,7 +2070,7 @@ if (!release->address) return; | |||
| if (munmap(release -> address, BUFFER_SIZE)) { | |||
| int errsv=errno; | |||
| perror("OpenBLAS : munmap failed:"); | |||
| printf("error code=%d,\trelease->address=%lx\n",errsv,release->address); | |||
| printf("error code=%d,\trelease->address=%p\n",errsv,release->address); | |||
| } | |||
| } | |||
| @@ -62,6 +62,11 @@ BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B; | |||
| BLASLONG gemm_offset_b = GEMM_OFFSET_B; | |||
| #endif | |||
| #if SHGEMM_P == shgemm_p | |||
| BLASLONG shgemm_p = DEFAULT_GEMM_P; | |||
| #else | |||
| BLASLONG shgemm_p = SHGEMM_P; | |||
| #endif | |||
| #if SGEMM_P == sgemm_p | |||
| BLASLONG sgemm_p = DEFAULT_GEMM_P; | |||
| #else | |||
| @@ -83,6 +88,11 @@ BLASLONG zgemm_p = DEFAULT_GEMM_P; | |||
| BLASLONG zgemm_p = ZGEMM_P; | |||
| #endif | |||
| #if SHGEMM_Q == shgemm_q | |||
| BLASLONG shgemm_q = DEFAULT_GEMM_Q; | |||
| #else | |||
| BLASLONG shgemm_q = SHGEMM_Q; | |||
| #endif | |||
| #if SGEMM_Q == sgemm_q | |||
| BLASLONG sgemm_q = DEFAULT_GEMM_Q; | |||
| #else | |||
| @@ -104,6 +114,11 @@ BLASLONG zgemm_q = DEFAULT_GEMM_Q; | |||
| BLASLONG zgemm_q = ZGEMM_Q; | |||
| #endif | |||
| #if SHGEMM_R == shgemm_r | |||
| BLASLONG shgemm_r = DEFAULT_GEMM_R; | |||
| #else | |||
| BLASLONG shgemm_r = SHGEMM_R; | |||
| #endif | |||
| #if SGEMM_R == sgemm_r | |||
| BLASLONG sgemm_r = DEFAULT_GEMM_R; | |||
| #else | |||
| @@ -597,6 +612,7 @@ void blas_set_parameter(void){ | |||
| size = BITMASK(cpuid3, 16, 0xff); | |||
| shgemm_p = 192 * (size + 1); | |||
| sgemm_p = 192 * (size + 1); | |||
| dgemm_p = 96 * (size + 1); | |||
| cgemm_p = 96 * (size + 1); | |||
| @@ -610,6 +626,7 @@ void blas_set_parameter(void){ | |||
| xgemm_p = 16 * (size + 1); | |||
| #endif | |||
| shgemm_r = (((BUFFER_SIZE - ((SHGEMM_P * SHGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SHGEMM_Q * 4)) - 15) & ~15; | |||
| sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; | |||
| dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; | |||
| cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; | |||
| @@ -30,6 +30,10 @@ ifndef BUILD_LAPACK_DEPRECATED | |||
| BUILD_LAPACK_DEPRECATED = 0 | |||
| endif | |||
| ifndef BUILD_HALF | |||
| BUILD_HALF = 0 | |||
| endif | |||
| ifeq ($(OSNAME), WINNT) | |||
| ifeq ($(F_COMPILER), GFORTRAN) | |||
| ifndef ONLY_CBLAS | |||
| @@ -151,8 +155,12 @@ ifeq ($(F_COMPILER), INTEL) | |||
| -Wl,--whole-archive $< -Wl,--no-whole-archive \ | |||
| -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) | |||
| $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | |||
| else ifeq ($(F_COMPILER), FLANG) | |||
| $(FC) $(FFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | |||
| -Wl,--whole-archive $< -Wl,--no-whole-archive \ | |||
| -Wl,-soname,$(INTERNALNAME) $(EXTRALIB) | |||
| $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. | |||
| else | |||
| ifneq ($(C_COMPILER), LSB) | |||
| $(CC) $(CFLAGS) $(LDFLAGS) -shared -o ../$(LIBSONAME) \ | |||
| -Wl,--whole-archive $< -Wl,--no-whole-archive \ | |||
| @@ -234,23 +242,23 @@ static : ../$(LIBNAME) | |||
| rm -f goto.$(SUFFIX) | |||
| osx.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) | |||
| perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) | |||
| aix.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) | |||
| perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) | |||
| objcopy.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) | |||
| perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) | |||
| objconv.def : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) | |||
| perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) | |||
| test : linktest.c | |||
| $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. | |||
| rm -f linktest | |||
| linktest.c : gensymbol ../Makefile.system ../getarch.c | |||
| perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > linktest.c | |||
| perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > linktest.c | |||
| clean :: | |||
| @rm -f *.def *.dylib __.SYMDEF* *.renamed | |||
| @@ -40,17 +40,13 @@ | |||
| ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, | |||
| xerbla, | |||
| saxpby,daxpby,caxpby,zaxpby, | |||
| somatcopy, domatcopy, comatcopy, zomatcopy, | |||
| simatcopy, dimatcopy, cimatcopy, zimatcopy, | |||
| sgeadd,dgeadd,cgeadd,zgeadd, | |||
| somatcopy, | |||
| simatcopy, | |||
| domatcopy, | |||
| dimatcopy, | |||
| comatcopy, | |||
| cimatcopy, | |||
| zomatcopy, | |||
| zimatcopy, | |||
| ssum, dsum, scsum, dzsum | |||
| ); | |||
| @halfblasobjs = (shgemm); | |||
| @cblasobjs = ( | |||
| cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, | |||
| cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, | |||
| @@ -80,9 +76,16 @@ | |||
| cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby, | |||
| cblas_somatcopy, cblas_domatcopy, cblas_comatcopy, cblas_zomatcopy, | |||
| cblas_simatcopy, cblas_dimatcopy, cblas_cimatcopy, cblas_zimatcopy, | |||
| cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd | |||
| cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd, | |||
| cblas_isamin, cblas_idamin, cblas_icamin, cblas_izamin, | |||
| cblas_ismin, cblas_idmin, cblas_icmin, cblas_izmin, | |||
| cblas_ismax, cblas_idmax, cblas_icmax, cblas_izmax, | |||
| cblas_ssum, cblas_dsum, cblas_scsum, cblas_dzsum, | |||
| cblas_xerbla | |||
| ); | |||
| @halfcblasobjs = (cblas_shgemm); | |||
| @exblasobjs = ( | |||
| qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, | |||
| qgemv,qger,qmax,qmin, | |||
| @@ -3454,6 +3457,10 @@ use File::Spec; | |||
| use File::Basename; | |||
| my $dirname = File::Spec->catfile(dirname(dirname(File::Spec->rel2abs(__FILE__))), "lapack-netlib"); | |||
| if ($ARGV[12] == 1) { | |||
| @blasobjs = (@blasobjs, @halfblasobjs); | |||
| @cblasobjs = (@cblasobjs, @halfcblasobjs); | |||
| } | |||
| if ($ARGV[8] == 1) { | |||
| #ONLY_CBLAS=1 | |||
| @underscore_objs = (@misc_underscore_objs); | |||
| @@ -3494,9 +3501,12 @@ if ($ARGV[1] eq "x86") { @underscore_objs = (@underscore_objs, @gemm3mobjs); | |||
| if ($ARGV[1] eq "ia64") { @underscore_objs = (@underscore_objs, @gemm3mobjs); }; | |||
| if ($ARGV[1] eq "MIPS") { @underscore_objs = (@underscore_objs, @gemm3mobjs); }; | |||
| if ($ARGV[4] == 0) { | |||
| @no_underscore_objs = (@cblasobjs, @misc_no_underscore_objs); | |||
| if ($ARGV[1] eq "x86_64") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); }; | |||
| if ($ARGV[1] eq "x86") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); }; | |||
| if ($ARGV[1] eq "ia64") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); }; | |||
| if ($ARGV[1] eq "MIPS") { @no_underscore_objs = (@no_underscore_objs, @cblasgemm3mobjs); }; | |||
| }else{ | |||
| #NO_CBLAS=1 | |||
| @no_underscore_objs = (@misc_no_underscore_objs); | |||
| @@ -334,7 +334,8 @@ if ($link ne "") { | |||
| && ($flags !~ /kernel32/) | |||
| && ($flags !~ /advapi32/) | |||
| && ($flags !~ /shell32/) | |||
| && ($flags !~ /omp/) | |||
| && ($flags !~ /omp/ || ($vendor !~ /PGI/ && $flags =~ /omp/)) | |||
| && ($flags !~ /[0-9]+/) | |||
| && ($flags !~ /^\-l$/) | |||
| ) { | |||
| $linker_l .= $flags . " "; | |||
| @@ -650,6 +650,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #define CORENAME "POWER9" | |||
| #endif | |||
| #if defined(FORCE_POWER10) | |||
| #define FORCE | |||
| #define ARCHITECTURE "POWER" | |||
| #define SUBARCHITECTURE "POWER10" | |||
| #define SUBDIRNAME "power" | |||
| #define ARCHCONFIG "-DPOWER10 " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=128 " \ | |||
| "-DL2_SIZE=4194304 -DL2_LINESIZE=128 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||
| #define LIBNAME "power10" | |||
| #define CORENAME "POWER10" | |||
| #endif | |||
| #ifdef FORCE_PPCG4 | |||
| #define FORCE | |||
| #define ARCHITECTURE "POWER" | |||
| @@ -812,6 +825,34 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_MIPS1004K | |||
| #define FORCE | |||
| #define ARCHITECTURE "MIPS" | |||
| #define SUBARCHITECTURE "MIPS1004K" | |||
| #define SUBDIRNAME "mips" | |||
| #define ARCHCONFIG "-DMIPS1004K " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||
| #define LIBNAME "mips1004K" | |||
| #define CORENAME "MIPS1004K" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_MIPS24K | |||
| #define FORCE | |||
| #define ARCHITECTURE "MIPS" | |||
| #define SUBARCHITECTURE "MIPS24K" | |||
| #define SUBDIRNAME "mips" | |||
| #define ARCHCONFIG "-DMIPS24K " \ | |||
| "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ | |||
| "-DL2_SIZE=32768 -DL2_LINESIZE=32 " \ | |||
| "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 " | |||
| #define LIBNAME "mips24K" | |||
| #define CORENAME "MIPS24K" | |||
| #else | |||
| #endif | |||
| #ifdef FORCE_I6500 | |||
| #define FORCE | |||
| #define ARCHITECTURE "MIPS" | |||
| @@ -1334,10 +1375,12 @@ int main(int argc, char *argv[]){ | |||
| #if defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ | |||
| printf("__BYTE_ORDER__=__ORDER_BIG_ENDIAN__\n"); | |||
| #endif | |||
| #if defined(__BIG_ENDIAN__) && __BIG_ENDIAN__ > 0 | |||
| #elif defined(__BIG_ENDIAN__) && __BIG_ENDIAN__ > 0 | |||
| printf("__BYTE_ORDER__=__ORDER_BIG_ENDIAN__\n"); | |||
| #endif | |||
| #if defined(_CALL_ELF) && (_CALL_ELF == 2) | |||
| printf("ELF_VERSION=2\n"); | |||
| #endif | |||
| #ifdef MAKE_NB_JOBS | |||
| #if MAKE_NB_JOBS > 0 | |||
| @@ -9,6 +9,8 @@ | |||
| int main(int argc, char **argv) { | |||
| if ( (argc <= 1) || ((argc >= 2) && (*argv[1] == '0'))) { | |||
| printf("SHGEMM_UNROLL_M=%d\n", SHGEMM_DEFAULT_UNROLL_M); | |||
| printf("SHGEMM_UNROLL_N=%d\n", SHGEMM_DEFAULT_UNROLL_N); | |||
| printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M); | |||
| printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N); | |||
| printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M); | |||
| @@ -115,7 +115,7 @@ foreach (float_type ${FLOAT_TYPES}) | |||
| GenerateNamedObjects("syr2k.c" "HEMM" "her2k" ${CBLAS_FLAG} "" "" false ${float_type}) | |||
| if (USE_GEMM3M) | |||
| GenerateNamedObjects("gemm.c" "GEMM3M" "gemm3m" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("gemm.c" "GEMM3M" "gemm3m" ${CBLAS_FLAG} "" "" false ${float_type}) | |||
| endif() | |||
| endif () | |||
| if (${float_type} STREQUAL "COMPLEX") | |||
| @@ -46,6 +46,9 @@ SBLAS3OBJS = \ | |||
| somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ | |||
| sgeadd.$(SUFFIX) | |||
| ifeq ($(BUILD_HALF),1) | |||
| SHBLAS3OBJS = shgemm.$(SUFFIX) | |||
| endif | |||
| DBLAS1OBJS = \ | |||
| daxpy.$(SUFFIX) dswap.$(SUFFIX) \ | |||
| @@ -277,6 +280,10 @@ CSBLAS3OBJS = \ | |||
| cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ | |||
| cblas_sgeadd.$(SUFFIX) | |||
| ifeq ($(BUILD_HALF),1) | |||
| CSHBLAS3OBJS = cblas_shgemm.$(SUFFIX) | |||
| endif | |||
| CDBLAS1OBJS = \ | |||
| cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ | |||
| cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \ | |||
| @@ -367,6 +374,7 @@ override CFLAGS += -I. | |||
| SBLAS1OBJS += $(CSBLAS1OBJS) | |||
| SBLAS2OBJS += $(CSBLAS2OBJS) | |||
| SBLAS3OBJS += $(CSBLAS3OBJS) | |||
| SHBLAS3OBJS += $(CSHBLAS3OBJS) | |||
| DBLAS1OBJS += $(CDBLAS1OBJS) | |||
| DBLAS2OBJS += $(CDBLAS2OBJS) | |||
| DBLAS3OBJS += $(CDBLAS3OBJS) | |||
| @@ -380,6 +388,7 @@ ZBLAS3OBJS += $(CZBLAS3OBJS) | |||
| endif | |||
| SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) | |||
| SHBLASOBJS = $(SHBLAS3OBJS) | |||
| DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) | |||
| QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) | |||
| CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) | |||
| @@ -454,7 +463,7 @@ ZBLASOBJS += $(ZLAPACKOBJS) | |||
| endif | |||
| FUNCOBJS = $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) | |||
| FUNCOBJS = $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) | |||
| ifdef EXPRECISION | |||
| FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) | |||
| @@ -488,10 +497,10 @@ level1 : $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $ | |||
| level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) | |||
| $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ | |||
| level3 : $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) | |||
| level3 : $(SHBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) | |||
| $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ | |||
| $(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \ | |||
| $(CSHBLASOBJS) $(CSHBLASOBJS_P) $(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \ | |||
| $(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) : override CFLAGS += -DCBLAS | |||
| srot.$(SUFFIX) srot.$(PSUFFIX) : rot.c | |||
| @@ -1209,6 +1218,11 @@ zhpr2.$(SUFFIX) zhpr2.$(PSUFFIX) : zhpr2.c | |||
| xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| ifeq ($(BUILD_HALF),1) | |||
| shgemm.$(SUFFIX) shgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| endif | |||
| sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| $(CC) -c $(CFLAGS) $< -o $(@F) | |||
| @@ -1770,6 +1784,11 @@ cblas_zhemv.$(SUFFIX) cblas_zhemv.$(PSUFFIX) : zhemv.c | |||
| cblas_sgemm.$(SUFFIX) cblas_sgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | |||
| ifeq ($(BUILD_HALF),1) | |||
| cblas_shgemm.$(SUFFIX) cblas_shgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | |||
| endif | |||
| cblas_dgemm.$(SUFFIX) cblas_dgemm.$(PSUFFIX) : gemm.c ../param.h | |||
| $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) | |||
| @@ -77,7 +77,7 @@ | |||
| #define GEMM_MULTITHREAD_THRESHOLD 4 | |||
| #endif | |||
| static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { | |||
| static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, IFLOAT *, IFLOAT *, BLASLONG) = { | |||
| #ifndef GEMM3M | |||
| GEMM_NN, GEMM_TN, GEMM_RN, GEMM_CN, | |||
| GEMM_NT, GEMM_TT, GEMM_RT, GEMM_CT, | |||
| @@ -108,8 +108,8 @@ static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLA | |||
| void NAME(char *TRANSA, char *TRANSB, | |||
| blasint *M, blasint *N, blasint *K, | |||
| FLOAT *alpha, | |||
| FLOAT *a, blasint *ldA, | |||
| FLOAT *b, blasint *ldB, | |||
| IFLOAT *a, blasint *ldA, | |||
| IFLOAT *b, blasint *ldB, | |||
| FLOAT *beta, | |||
| FLOAT *c, blasint *ldC){ | |||
| @@ -119,8 +119,8 @@ void NAME(char *TRANSA, char *TRANSB, | |||
| blasint info; | |||
| char transA, transB; | |||
| FLOAT *buffer; | |||
| FLOAT *sa, *sb; | |||
| IFLOAT *buffer; | |||
| IFLOAT *sa, *sb; | |||
| #ifdef SMP | |||
| double MNK; | |||
| @@ -41,6 +41,9 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| foreach (float_type ${FLOAT_TYPES}) | |||
| # a bit of metaprogramming here to pull out the appropriate KERNEL var | |||
| string(SUBSTRING ${float_type} 0 1 float_char) | |||
| if (${float_type} STREQUAL "HALF") | |||
| set (float_char "SH") | |||
| endif () | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}AMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}AMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false ${float_type}) | |||
| if (DEFINED ${float_char}MAXKERNEL) | |||
| @@ -93,6 +96,9 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) | |||
| foreach (float_type ${FLOAT_TYPES}) | |||
| string(SUBSTRING ${float_type} 0 1 float_char) | |||
| if (${float_type} STREQUAL "HALF") | |||
| set (float_char "SH") | |||
| endif () | |||
| if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type}) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GERCKERNEL}" "CONJ" "gerc_k" false "" "" false ${float_type}) | |||
| @@ -124,17 +130,27 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| if (ARM OR ARM64 OR (TARGET_CORE MATCHES LONGSOON3B) OR (TARGET_CORE MATCHES GENERIC) OR (TARGET_CORE MATCHES HASWELL) OR (TARGET_CORE MATCHES ZEN) OR (TARGET_CORE MATCHES SKYLAKEX) ) | |||
| set(USE_TRMM true) | |||
| endif () | |||
| if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9)) | |||
| if (ZARCH OR (TARGET_CORE MATCHES POWER8) OR (TARGET_CORE MATCHES POWER9) OR (TARGET_CORE MATCHES POWER10)) | |||
| set(USE_TRMM true) | |||
| endif () | |||
| foreach (float_type SINGLE DOUBLE) | |||
| foreach (float_type SINGLE DOUBLE HALF) | |||
| string(SUBSTRING ${float_type} 0 1 float_char) | |||
| if (${float_type} STREQUAL "HALF") | |||
| if (NOT ${BUILD_HALF}) | |||
| continue () | |||
| else () | |||
| set (float_char "SH") | |||
| endif () | |||
| endif () | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) | |||
| endforeach() | |||
| foreach (float_type ${FLOAT_TYPES}) | |||
| string(SUBSTRING ${float_type} 0 1 float_char) | |||
| if (${float_type} STREQUAL "HALF") | |||
| set (float_char "SH") | |||
| endif () | |||
| if (${float_char}GEMMINCOPY) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type}) | |||
| endif () | |||
| @@ -470,9 +486,13 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| GenerateNamedObjects("${KERNELDIR}/${${float_char}GEADD_KERNEL}" "" "geadd_k" false "" "" false ${float_type}) | |||
| endforeach () | |||
| # Makefile.LA | |||
| if(NOT NO_LAPACK) | |||
| foreach (float_type ${FLOAT_TYPES}) | |||
| if (${float_type} STREQUAL "HALF") | |||
| set (float_char "SH") | |||
| endif () | |||
| if (NOT DEFINED ${float_char}NEG_TCOPY) | |||
| if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C" OR ${float_char} STREQUAL "X") | |||
| set(${float_char}NEG_TCOPY ../generic/zneg_tcopy.c) | |||
| @@ -516,6 +536,9 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) | |||
| foreach (float_type ${FLOAT_TYPES}) | |||
| # a bit of metaprogramming here to pull out the appropriate KERNEL var | |||
| string(SUBSTRING ${float_type} 0 1 float_char) | |||
| if (${float_type} STREQUAL "HALF") | |||
| set (float_char "SH") | |||
| endif () | |||
| GenerateNamedObjects("generic/neg_tcopy_${${float_char}GEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false ${float_type}) | |||
| GenerateNamedObjects("generic/laswp_ncopy_${${float_char}GEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false ${float_type}) | |||
| endforeach () | |||
| @@ -51,6 +51,10 @@ ifeq ($(CORE), POWER9) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(CORE), POWER10) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(ARCH), zarch) | |||
| USE_TRMM = 1 | |||
| endif | |||
| @@ -59,6 +63,25 @@ ifeq ($(CORE), Z14) | |||
| USE_TRMM = 1 | |||
| endif | |||
| ifeq ($(BUILD_HALF), 1) | |||
| ifndef SHGEMMKERNEL | |||
| SHGEMM_BETA = ../generic/gemm_beta.c | |||
| SHGEMMKERNEL = ../generic/gemmkernel_2x2.c | |||
| SHGEMMINCOPY = ../generic/gemm_ncopy_2.c | |||
| SHGEMMITCOPY = ../generic/gemm_tcopy_2.c | |||
| SHGEMMONCOPY = ../generic/gemm_ncopy_2.c | |||
| SHGEMMOTCOPY = ../generic/gemm_tcopy_2.c | |||
| SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| SHKERNELOBJS += \ | |||
| shgemm_kernel$(TSUFFIX).$(SUFFIX) \ | |||
| $(SHGEMMINCOPYOBJ) $(SHGEMMITCOPYOBJ) \ | |||
| $(SHGEMMONCOPYOBJ) $(SHGEMMOTCOPYOBJ) | |||
| endif | |||
| SKERNELOBJS += \ | |||
| sgemm_kernel$(TSUFFIX).$(SUFFIX) \ | |||
| @@ -93,6 +116,9 @@ XKERNELOBJS += \ | |||
| $(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \ | |||
| $(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ) | |||
| ifeq ($(BUILD_HALF),1) | |||
| SHBLASOBJS += $(SHKERNELOBJS) | |||
| endif | |||
| SBLASOBJS += $(SKERNELOBJS) | |||
| DBLASOBJS += $(DKERNELOBJS) | |||
| QBLASOBJS += $(QKERNELOBJS) | |||
| @@ -100,6 +126,10 @@ CBLASOBJS += $(CKERNELOBJS) | |||
| ZBLASOBJS += $(ZKERNELOBJS) | |||
| XBLASOBJS += $(XKERNELOBJS) | |||
| ifeq ($(BUILD_HALF),1) | |||
| SHBLASOBJS += shgemm_beta$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| SBLASOBJS += \ | |||
| sgemm_beta$(TSUFFIX).$(SUFFIX) \ | |||
| strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ | |||
| @@ -389,6 +419,12 @@ ZBLASOBJS += \ | |||
| zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ | |||
| zgeadd_k$(TSUFFIX).$(SUFFIX) | |||
| ifeq ($(BUILD_HALF), 1) | |||
| SHGEMMINCOPYOBJ_P = $(SHGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| SHGEMMITCOPYOBJ_P = $(SHGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| SHGEMMONCOPYOBJ_P = $(SHGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| SHGEMMOTCOPYOBJ_P = $(SHGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| endif | |||
| SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| @@ -415,6 +451,11 @@ XGEMMITCOPYOBJ_P = $(XGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) | |||
| ifeq ($(BUILD_HALF),1) | |||
| $(KDIR)shgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| $(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
| @@ -433,12 +474,47 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) | |||
| $(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) | |||
| $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ | |||
| ifeq ($(BUILD_HALF), 1) | |||
| $(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY) | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| $(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmotcopy.s | |||
| m4 shgemmotcopy.s > shgemmotcopy_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmotcopy_nomacros.s -o $@ | |||
| rm shgemmotcopy.s shgemmotcopy_nomacros.s | |||
| else | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) | |||
| $(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY) | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| $(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmitcopy.s | |||
| m4 shgemmitcopy.s > shgemmitcopy_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmitcopy_nomacros.s -o $@ | |||
| rm shgemmitcopy.s shgemmitcopy_nomacros.s | |||
| else | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| endif | |||
| endif | |||
| $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
| $(KDIR)$(SGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SGEMMOTCOPY) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmotcopy.s | |||
| $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmotcopy.s | |||
| m4 sgemmotcopy.s > sgemmotcopy_nomacros.s | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmotcopy_nomacros.s -o $@ | |||
| rm sgemmotcopy.s sgemmotcopy_nomacros.s | |||
| @@ -454,7 +530,7 @@ $(KDIR)$(SGEMMINCOPYOBJ) : $(KERNELDIR)/$(SGEMMINCOPY) | |||
| $(KDIR)$(SGEMMITCOPYOBJ) : $(KERNELDIR)/$(SGEMMITCOPY) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemmitcopy.s | |||
| $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemmitcopy.s | |||
| m4 sgemmitcopy.s > sgemmitcopy_nomacros.s | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemmitcopy_nomacros.s -o $@ | |||
| rm sgemmitcopy.s sgemmitcopy_nomacros.s | |||
| @@ -466,7 +542,7 @@ endif | |||
| $(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_ncopy.s | |||
| $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_ncopy.s | |||
| m4 dgemm_ncopy.s > dgemm_ncopy_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_ncopy_nomacros.s -o $@ | |||
| rm dgemm_ncopy.s dgemm_ncopy_nomacros.s | |||
| @@ -484,7 +560,7 @@ $(KDIR)$(DGEMMINCOPYOBJ) : $(KERNELDIR)/$(DGEMMINCOPY) | |||
| $(KDIR)$(DGEMMITCOPYOBJ) : $(KERNELDIR)/$(DGEMMITCOPY) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_itcopy.s | |||
| $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_itcopy.s | |||
| m4 dgemm_itcopy.s > dgemm_itcopy_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_itcopy_nomacros.s -o $@ | |||
| rm dgemm_itcopy.s dgemm_itcopy_nomacros.s | |||
| @@ -527,7 +603,7 @@ $(KDIR)$(CGEMMINCOPYOBJ) : $(KERNELDIR)/$(CGEMMINCOPY) | |||
| $(KDIR)$(CGEMMITCOPYOBJ) : $(KERNELDIR)/$(CGEMMITCOPY) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -E $< -o cgemm_itcopy.s | |||
| $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -S $< -o - > cgemm_itcopy.s | |||
| m4 cgemm_itcopy.s > cgemm_itcopy_nomacros.s | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX cgemm_itcopy_nomacros.s -o $@ | |||
| rm cgemm_itcopy.s cgemm_itcopy_nomacros.s | |||
| @@ -550,7 +626,7 @@ $(KDIR)$(ZGEMMINCOPYOBJ) : $(KERNELDIR)/$(ZGEMMINCOPY) | |||
| $(KDIR)$(ZGEMMITCOPYOBJ) : $(KERNELDIR)/$(ZGEMMITCOPY) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o zgemm_itcopy.s | |||
| $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > zgemm_itcopy.s | |||
| m4 zgemm_itcopy.s > zgemm_itcopy_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zgemm_itcopy_nomacros.s -o $@ | |||
| rm zgemm_itcopy.s zgemm_itcopy_nomacros.s | |||
| @@ -582,7 +658,7 @@ endif | |||
| $(KDIR)sgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -UDOUBLE -UCOMPLEX $< -o sgemm_kernel$(TSUFFIX).s | |||
| $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > sgemm_kernel$(TSUFFIX).s | |||
| m4 sgemm_kernel$(TSUFFIX).s > sgemm_kernel$(TSUFFIX)_nomacros.s | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX sgemm_kernel$(TSUFFIX)_nomacros.s -o $@ | |||
| rm sgemm_kernel$(TSUFFIX).s sgemm_kernel$(TSUFFIX)_nomacros.s | |||
| @@ -590,9 +666,22 @@ else | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| ifeq ($(BUILD_HALF), 1) | |||
| $(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemm_kernel$(TSUFFIX).s | |||
| m4 shgemm_kernel$(TSUFFIX).s > shgemm_kernel$(TSUFFIX)_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemm_kernel$(TSUFFIX)_nomacros.s -o $@ | |||
| rm shgemm_kernel$(TSUFFIX).s shgemm_kernel$(TSUFFIX)_nomacros.s | |||
| else | |||
| $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| endif | |||
| $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DDOUBLE -UCOMPLEX $< -o dgemm_kernel$(TSUFFIX).s | |||
| $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dgemm_kernel$(TSUFFIX).s | |||
| m4 dgemm_kernel$(TSUFFIX).s > dgemm_kernel$(TSUFFIX)_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dgemm_kernel$(TSUFFIX)_nomacros.s -o $@ | |||
| rm dgemm_kernel$(TSUFFIX).s dgemm_kernel$(TSUFFIX)_nomacros.s | |||
| @@ -605,7 +694,7 @@ $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEP | |||
| $(KDIR)cgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNN $< -o cgemm_kernel_n.s | |||
| $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNN $< -o - > cgemm_kernel_n.s | |||
| m4 cgemm_kernel_n.s > cgemm_kernel_n_nomacros.s | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN cgemm_kernel_n_nomacros.s -o $@ | |||
| rm cgemm_kernel_n.s cgemm_kernel_n_nomacros.s | |||
| @@ -615,7 +704,7 @@ endif | |||
| $(KDIR)cgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCN $< -o cgemm_kernel_l.s | |||
| $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCN $< -o - > cgemm_kernel_l.s | |||
| m4 cgemm_kernel_l.s > cgemm_kernel_l_nomacros.s | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN cgemm_kernel_l_nomacros.s -o $@ | |||
| rm cgemm_kernel_l.s cgemm_kernel_l_nomacros.s | |||
| @@ -625,7 +714,7 @@ endif | |||
| $(KDIR)cgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s | |||
| $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s | |||
| m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ | |||
| rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s | |||
| @@ -635,7 +724,7 @@ endif | |||
| $(KDIR)cgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -UDOUBLE -DCOMPLEX -DCC $< -o cgemm_kernel_b.s | |||
| $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCC $< -o - > cgemm_kernel_b.s | |||
| m4 cgemm_kernel_b.s > cgemm_kernel_b_nomacros.s | |||
| $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC cgemm_kernel_b_nomacros.s -o $@ | |||
| rm cgemm_kernel_b.s cgemm_kernel_b_nomacros.s | |||
| @@ -645,7 +734,7 @@ endif | |||
| $(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNN $< -o zgemm_kernel_n.s | |||
| $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNN $< -o - > zgemm_kernel_n.s | |||
| m4 zgemm_kernel_n.s > zgemm_kernel_n_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zgemm_kernel_n_nomacros.s -o $@ | |||
| rm zgemm_kernel_n.s zgemm_kernel_n_nomacros.s | |||
| @@ -655,7 +744,7 @@ endif | |||
| $(KDIR)zgemm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCN $< -o zgemm_kernel_l.s | |||
| $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCN $< -o - > zgemm_kernel_l.s | |||
| m4 zgemm_kernel_l.s > zgemm_kernel_l_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zgemm_kernel_l_nomacros.s -o $@ | |||
| rm zgemm_kernel_l.s zgemm_kernel_l_nomacros.s | |||
| @@ -665,7 +754,7 @@ endif | |||
| $(KDIR)zgemm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DNC $< -o zgemm_kernel_r.s | |||
| $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNC $< -o - > zgemm_kernel_r.s | |||
| m4 zgemm_kernel_r.s > zgemm_kernel_r_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zgemm_kernel_r_nomacros.s -o $@ | |||
| rm zgemm_kernel_r.s zgemm_kernel_r_nomacros.s | |||
| @@ -675,7 +764,7 @@ endif | |||
| $(KDIR)zgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DDOUBLE -DCOMPLEX -DCC $< -o zgemm_kernel_b.s | |||
| $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCC $< -o - > zgemm_kernel_b.s | |||
| m4 zgemm_kernel_b.s > zgemm_kernel_b_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zgemm_kernel_b_nomacros.s -o $@ | |||
| rm zgemm_kernel_b.s zgemm_kernel_b_nomacros.s | |||
| @@ -699,7 +788,7 @@ $(KDIR)xgemm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMD | |||
| ifdef USE_TRMM | |||
| $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o strmmkernel_ln.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > strmmkernel_ln.s | |||
| m4 strmmkernel_ln.s > strmmkernel_ln_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA strmmkernel_ln_nomacros.s -o $@ | |||
| rm strmmkernel_ln.s strmmkernel_ln_nomacros.s | |||
| @@ -709,7 +798,7 @@ endif | |||
| $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o strmmkernel_lt.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > strmmkernel_lt.s | |||
| m4 strmmkernel_lt.s > strmmkernel_lt_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA strmmkernel_lt_nomacros.s -o $@ | |||
| rm strmmkernel_lt.s strmmkernel_lt_nomacros.s | |||
| @@ -719,7 +808,7 @@ endif | |||
| $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o strmmkernel_rn.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > strmmkernel_rn.s | |||
| m4 strmmkernel_rn.s > strmmkernel_rn_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA strmmkernel_rn_nomacros.s -o $@ | |||
| rm strmmkernel_rn.s strmmkernel_rn_nomacros.s | |||
| @@ -729,7 +818,7 @@ endif | |||
| $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s | |||
| m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ | |||
| rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s | |||
| @@ -739,7 +828,7 @@ endif | |||
| $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o dtrmm_kernel_ln.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o - > dtrmm_kernel_ln.s | |||
| m4 dtrmm_kernel_ln.s > dtrmm_kernel_ln_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA dtrmm_kernel_ln_nomacros.s -o $@ | |||
| rm dtrmm_kernel_ln.s dtrmm_kernel_ln_nomacros.s | |||
| @@ -749,7 +838,7 @@ endif | |||
| $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o dtrmm_kernel_lt.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o - > dtrmm_kernel_lt.s | |||
| m4 dtrmm_kernel_lt.s > dtrmm_kernel_lt_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA dtrmm_kernel_lt_nomacros.s -o $@ | |||
| rm dtrmm_kernel_lt.s dtrmm_kernel_lt_nomacros.s | |||
| @@ -759,7 +848,7 @@ endif | |||
| $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o dtrmm_kernel_rn.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o - > dtrmm_kernel_rn.s | |||
| m4 dtrmm_kernel_rn.s > dtrmm_kernel_rn_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA dtrmm_kernel_rn_nomacros.s -o $@ | |||
| rm dtrmm_kernel_rn.s dtrmm_kernel_rn_nomacros.s | |||
| @@ -769,7 +858,7 @@ endif | |||
| $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o dtrmm_kernel_rt.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > dtrmm_kernel_rt.s | |||
| m4 dtrmm_kernel_rt.s > dtrmm_kernel_rt_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA dtrmm_kernel_rt_nomacros.s -o $@ | |||
| rm dtrmm_kernel_rt.s dtrmm_kernel_rt_nomacros.s | |||
| @@ -791,7 +880,7 @@ $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) | |||
| $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_ln.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_ln.s | |||
| m4 ctrmm_kernel_ln.s > ctrmm_kernel_ln_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_ln_nomacros.s -o $@ | |||
| rm ctrmm_kernel_ln.s ctrmm_kernel_ln_nomacros.s | |||
| @@ -801,7 +890,7 @@ endif | |||
| $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_lt.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_lt.s | |||
| m4 ctrmm_kernel_lt.s > ctrmm_kernel_lt_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_lt_nomacros.s -o $@ | |||
| rm ctrmm_kernel_lt.s ctrmm_kernel_lt_nomacros.s | |||
| @@ -811,7 +900,7 @@ endif | |||
| $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lr.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lr.s | |||
| m4 ctrmm_kernel_lr.s > ctrmm_kernel_lr_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ctrmm_kernel_lr_nomacros.s -o $@ | |||
| rm ctrmm_kernel_lr.s ctrmm_kernel_lr_nomacros.s | |||
| @@ -821,7 +910,7 @@ endif | |||
| $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ctrmm_kernel_lc.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ctrmm_kernel_lc.s | |||
| m4 ctrmm_kernel_lc.s > ctrmm_kernel_lc_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ctrmm_kernel_lc_nomacros.s -o $@ | |||
| rm ctrmm_kernel_lc_nomacros.s ctrmm_kernel_lc.s | |||
| @@ -831,7 +920,7 @@ endif | |||
| $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rn.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rn.s | |||
| m4 ctrmm_kernel_rn.s > ctrmm_kernel_rn_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ctrmm_kernel_rn_nomacros.s -o $@ | |||
| rm ctrmm_kernel_rn.s ctrmm_kernel_rn_nomacros.s | |||
| @@ -841,7 +930,7 @@ endif | |||
| $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ctrmm_kernel_rt.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ctrmm_kernel_rt.s | |||
| m4 ctrmm_kernel_rt.s > ctrmm_kernel_rt_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ctrmm_kernel_rt_nomacros.s -o $@ | |||
| rm ctrmm_kernel_rt.s ctrmm_kernel_rt_nomacros.s | |||
| @@ -851,7 +940,7 @@ endif | |||
| $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ctrmm_kernel_rr.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_rr.s | |||
| m4 ctrmm_kernel_rr.s > ctrmm_kernel_rr_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ctrmm_kernel_rr_nomacros.s -o $@ | |||
| rm ctrmm_kernel_rr.s ctrmm_kernel_rr_nomacros.s | |||
| @@ -861,7 +950,7 @@ endif | |||
| $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ctrmm_kernel_RC.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ctrmm_kernel_RC.s | |||
| m4 ctrmm_kernel_RC.s > ctrmm_kernel_RC_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ctrmm_kernel_RC_nomacros.s -o $@ | |||
| rm ctrmm_kernel_RC.s ctrmm_kernel_RC_nomacros.s | |||
| @@ -871,7 +960,7 @@ endif | |||
| $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_ln.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_ln.s | |||
| m4 ztrmm_kernel_ln.s > ztrmm_kernel_ln_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_ln_nomacros.s -o $@ | |||
| rm ztrmm_kernel_ln.s ztrmm_kernel_ln_nomacros.s | |||
| @@ -881,7 +970,7 @@ endif | |||
| $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_lt.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_lt.s | |||
| m4 ztrmm_kernel_lt.s > ztrmm_kernel_lt_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_lt_nomacros.s -o $@ | |||
| rm ztrmm_kernel_lt.s ztrmm_kernel_lt_nomacros.s | |||
| @@ -891,7 +980,7 @@ endif | |||
| $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lr.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lr.s | |||
| m4 ztrmm_kernel_lr.s > ztrmm_kernel_lr_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN ztrmm_kernel_lr_nomacros.s -o $@ | |||
| rm ztrmm_kernel_lr.s ztrmm_kernel_lr_nomacros.s | |||
| @@ -901,7 +990,7 @@ endif | |||
| $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o ztrmm_kernel_lc.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o - > ztrmm_kernel_lc.s | |||
| m4 ztrmm_kernel_lc.s >ztrmm_kernel_lc_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN ztrmm_kernel_lc_nomacros.s -o $@ | |||
| rm ztrmm_kernel_lc.s ztrmm_kernel_lc_nomacros.s | |||
| @@ -911,7 +1000,7 @@ endif | |||
| $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rn.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rn.s | |||
| m4 ztrmm_kernel_rn.s > ztrmm_kernel_rn_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN ztrmm_kernel_rn_nomacros.s -o $@ | |||
| rm ztrmm_kernel_rn.s ztrmm_kernel_rn_nomacros.s | |||
| @@ -921,7 +1010,7 @@ endif | |||
| $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o ztrmm_kernel_rt.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o - > ztrmm_kernel_rt.s | |||
| m4 ztrmm_kernel_rt.s > ztrmm_kernel_rt_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN ztrmm_kernel_rt_nomacros.s -o $@ | |||
| rm ztrmm_kernel_rt.s ztrmm_kernel_rt_nomacros.s | |||
| @@ -931,7 +1020,7 @@ endif | |||
| $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rr.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rr.s | |||
| m4 ztrmm_kernel_rr.s > ztrmm_kernel_rr_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC ztrmm_kernel_rr_nomacros.s -o $@ | |||
| rm ztrmm_kernel_rr.s ztrmm_kernel_rr_nomacros.s | |||
| @@ -941,7 +1030,7 @@ endif | |||
| $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o ztrmm_kernel_rc.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o - > ztrmm_kernel_rc.s | |||
| m4 ztrmm_kernel_rc.s > ztrmm_kernel_rc_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC ztrmm_kernel_rc_nomacros.s -o $@ | |||
| rm ztrmm_kernel_rc.s ztrmm_kernel_rc_nomacros.s | |||
| @@ -961,7 +1050,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) | |||
| $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s | |||
| m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s | |||
| $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ | |||
| rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s | |||
| @@ -1095,7 +1184,7 @@ $(KDIR)dtrsm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LN) $(DT | |||
| $(KDIR)dtrsm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRSMKERNEL_LT) $(DTRSMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o dtrsm_kernel_lt.s | |||
| $(CC) $(CFLAGS) -S -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ $< -o - > dtrsm_kernel_lt.s | |||
| m4 dtrsm_kernel_lt.s > dtrsm_kernel_lt_nomacros.s | |||
| $(CC) -c $(CFLAGS) -DTRSMKERNEL -UCOMPLEX -DDOUBLE -UUPPER -DLT -UCONJ dtrsm_kernel_lt_nomacros.s -o $@ | |||
| rm dtrsm_kernel_lt.s dtrsm_kernel_lt_nomacros.s | |||
| @@ -2206,6 +2295,11 @@ $(KDIR)xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_ | |||
| # sgemm_beta / shgemm_beta / dgemm_beta objects for the $(PSUFFIX) build. | |||
| # Each rule compiles its source with $(PFLAGS); the precision is selected | |||
| # only through the -D/-U switches on the command line. | |||
| $(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) | |||
| $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
| # The -DHALF variant is only defined when BUILD_HALF is 1. | |||
| ifeq ($(BUILD_HALF),1) | |||
| $(KDIR)shgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) | |||
| $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| $(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) | |||
| $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ | |||
| @@ -2221,6 +2315,24 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) | |||
| # xgemm_beta object for the $(PSUFFIX) build (-DXDOUBLE -DCOMPLEX). | |||
| $(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) | |||
| $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ | |||
| # SHGEMM copy objects (-DHALF); only defined when BUILD_HALF is 1. | |||
| ifeq ($(BUILD_HALF), 1) | |||
| $(SHGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMONCOPY) | |||
| $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| $(SHGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMOTCOPY) | |||
| $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| # The IN/IT copy objects get rules only when the M and N unroll factors differ. | |||
| ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) | |||
| $(SHGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMINCOPY) | |||
| $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| $(SHGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMITCOPY) | |||
| $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| endif | |||
| # SGEMM ON copy object for the $(PSUFFIX) build. | |||
| $(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY) | |||
| $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
| @@ -2325,6 +2437,12 @@ endif | |||
| endif | |||
| # shgemm_kernel object for the $(PSUFFIX) build; only defined when BUILD_HALF | |||
| # is 1. It is rebuilt when either the kernel source or $(SHGEMMDEPEND) changes. | |||
| ifeq ($(BUILD_HALF), 1) | |||
| $(KDIR)shgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) | |||
| $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ | |||
| endif | |||
| # sgemm_kernel object for the $(PSUFFIX) build (neither DOUBLE nor COMPLEX). | |||
| $(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) | |||
| $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ | |||
| @@ -2342,7 +2460,7 @@ $(KDIR)cgemm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMM | |||
| $(KDIR)cgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(PFLAGS) -E -UDOUBLE -DCOMPLEX -DNC $< -o cgemm_kernel_r.s | |||
| $(CC) $(PFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > cgemm_kernel_r.s | |||
| m4 cgemm_kernel_r.s > cgemm_kernel_r_nomacros.s | |||
| $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC cgemm_kernel_r_nomacros.s -o $@ | |||
| rm cgemm_kernel_r.s cgemm_kernel_r_nomacros.s | |||
| @@ -2388,7 +2506,7 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) | |||
| # strmm_kernel_RT object for the $(PSUFFIX) build (-ULEFT -DTRANSA). | |||
| # On AIX the kernel source is first turned into strmm_kernel_rt.s, that file | |||
| # is run through m4, and the m4 output is what finally gets assembled. The m4 | |||
| # input must therefore be the strmm_kernel_rt.s produced by the step above it. | |||
| $(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) | |||
| ifeq ($(OS), AIX) | |||
| $(CC) $(CFLAGS) -E -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o strmm_kernel_rt.s | |||
| $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s | |||
| m4 strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s | |||
| $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA strmm_kernel_rt_nomacros.s -o $@ | |||
| rm strmm_kernel_rt.s strmm_kernel_rt_nomacros.s | |||
| @@ -1,3 +1,187 @@ | |||
| # Start from the KERNEL.ARMV8 settings; the assignments below override them. | |||
| include $(KERNELDIR)/KERNEL.ARMV8 | |||
| # Kernels built from the C sources under ../arm: | |||
| # amin, max, min and their index-returning (I*) counterparts. | |||
| SAMINKERNEL = ../arm/amin.c | |||
| DAMINKERNEL = ../arm/amin.c | |||
| CAMINKERNEL = ../arm/zamin.c | |||
| ZAMINKERNEL = ../arm/zamin.c | |||
| SMAXKERNEL = ../arm/max.c | |||
| DMAXKERNEL = ../arm/max.c | |||
| SMINKERNEL = ../arm/min.c | |||
| DMINKERNEL = ../arm/min.c | |||
| ISAMINKERNEL = ../arm/iamin.c | |||
| IDAMINKERNEL = ../arm/iamin.c | |||
| ICAMINKERNEL = ../arm/izamin.c | |||
| IZAMINKERNEL = ../arm/izamin.c | |||
| ISMAXKERNEL = ../arm/imax.c | |||
| IDMAXKERNEL = ../arm/imax.c | |||
| ISMINKERNEL = ../arm/imin.c | |||
| IDMINKERNEL = ../arm/imin.c | |||
| # All TRSM kernels (S/D/C/Z, LN/LT/RN/RT) share the ../generic C sources. | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| # Kernels built from assembly (.S) sources named without a directory prefix. | |||
| # Real single/double variants share one source per operation; complex variants | |||
| # use the z-prefixed source, except casum, copy and swap as listed below. | |||
| SAMAXKERNEL = amax.S | |||
| DAMAXKERNEL = amax.S | |||
| CAMAXKERNEL = zamax.S | |||
| ZAMAXKERNEL = zamax.S | |||
| SAXPYKERNEL = axpy.S | |||
| DAXPYKERNEL = axpy.S | |||
| CAXPYKERNEL = zaxpy.S | |||
| ZAXPYKERNEL = zaxpy.S | |||
| SROTKERNEL = rot.S | |||
| DROTKERNEL = rot.S | |||
| CROTKERNEL = zrot.S | |||
| ZROTKERNEL = zrot.S | |||
| SSCALKERNEL = scal.S | |||
| DSCALKERNEL = scal.S | |||
| CSCALKERNEL = zscal.S | |||
| ZSCALKERNEL = zscal.S | |||
| SGEMVNKERNEL = gemv_n.S | |||
| DGEMVNKERNEL = gemv_n.S | |||
| CGEMVNKERNEL = zgemv_n.S | |||
| ZGEMVNKERNEL = zgemv_n.S | |||
| SGEMVTKERNEL = gemv_t.S | |||
| DGEMVTKERNEL = gemv_t.S | |||
| CGEMVTKERNEL = zgemv_t.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| # Note: the complex single-precision asum has its own source (casum.S). | |||
| SASUMKERNEL = asum.S | |||
| DASUMKERNEL = asum.S | |||
| CASUMKERNEL = casum.S | |||
| ZASUMKERNEL = zasum.S | |||
| # copy and swap use the same source for all four precisions. | |||
| SCOPYKERNEL = copy.S | |||
| DCOPYKERNEL = copy.S | |||
| CCOPYKERNEL = copy.S | |||
| ZCOPYKERNEL = copy.S | |||
| SSWAPKERNEL = swap.S | |||
| DSWAPKERNEL = swap.S | |||
| CSWAPKERNEL = swap.S | |||
| ZSWAPKERNEL = swap.S | |||
| ISAMAXKERNEL = iamax.S | |||
| IDAMAXKERNEL = iamax.S | |||
| ICAMAXKERNEL = izamax.S | |||
| IZAMAXKERNEL = izamax.S | |||
| SNRM2KERNEL = nrm2.S | |||
| DNRM2KERNEL = nrm2.S | |||
| CNRM2KERNEL = znrm2.S | |||
| ZNRM2KERNEL = znrm2.S | |||
| DDOTKERNEL = dot.S | |||
| SDOTKERNEL = dot.S | |||
| CDOTKERNEL = zdot.S | |||
| ZDOTKERNEL = zdot.S | |||
| DSDOTKERNEL = dot.S | |||
| DGEMM_BETA = dgemm_beta.S | |||
| SGEMM_BETA = sgemm_beta.S | |||
| # SGEMM/STRMM kernel selection. An 8x8 unrolling picks the sources carrying | |||
| # the _cortexa53 suffix; any other unrolling uses the plain MxN-named sources. | |||
| ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8) | |||
| SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S | |||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S | |||
| else | |||
| SGEMMKERNEL = sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
| STRMMKERNEL = strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S | |||
| endif | |||
| # The IN/IT copy routines are only defined when the M and N unroll factors | |||
| # differ. An assembly tcopy is used for UNROLL_M == 16 and an assembly ncopy | |||
| # for UNROLL_M == 4; every other size uses the ../generic C source. | |||
| ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N)) | |||
| ifeq ($(SGEMM_UNROLL_M), 16) | |||
| SGEMMITCOPY = sgemm_tcopy_$(SGEMM_UNROLL_M).S | |||
| else | |||
| SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c | |||
| endif | |||
| ifeq ($(SGEMM_UNROLL_M), 4) | |||
| SGEMMINCOPY = sgemm_ncopy_$(SGEMM_UNROLL_M).S | |||
| else | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c | |||
| endif | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| # The ON/OT copy routines always use the assembly source named after UNROLL_N. | |||
| SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S | |||
| SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # DGEMM/DTRMM kernels are assembly sources named after the MxN unrolling. | |||
| DGEMMKERNEL = dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| DTRMMKERNEL = dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S | |||
| # IN/IT copy routines exist only when the unroll factors differ; assembly is | |||
| # used for UNROLL_M == 8, the ../generic C sources otherwise. | |||
| ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N)) | |||
| ifeq ($(DGEMM_UNROLL_M), 8) | |||
| DGEMMINCOPY = dgemm_ncopy_$(DGEMM_UNROLL_M).S | |||
| DGEMMITCOPY = dgemm_tcopy_$(DGEMM_UNROLL_M).S | |||
| else | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c | |||
| DGEMMITCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c | |||
| endif | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| # ON/OT copy routines: assembly for UNROLL_N == 4, ../generic C otherwise. | |||
| ifeq ($(DGEMM_UNROLL_N), 4) | |||
| DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S | |||
| DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S | |||
| else | |||
| DGEMMONCOPY = ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c | |||
| endif | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # CGEMM/CTRMM kernels are assembly; all CGEMM copy routines use the | |||
| # ../generic zgemm_{n,t}copy C sources. IN/IT copies are only defined when | |||
| # the M and N unroll factors differ. | |||
| CGEMMKERNEL = cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| CTRMMKERNEL = ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S | |||
| ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N)) | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # ZGEMM/ZTRMM kernels are assembly; all ZGEMM copy routines use the | |||
| # ../generic zgemm_{n,t}copy C sources. IN/IT copies are only defined when | |||
| # the M and N unroll factors differ. | |||
| ZGEMMKERNEL = zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ZTRMMKERNEL = ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S | |||
| ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N)) | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMITCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| endif | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| @@ -98,11 +98,58 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| add X, X, #128 | |||
| .endm | |||
| /* | |||
| * KERNEL_F16_L1CACHE: one 128-byte step of Y += v0.d[0] * X, i.e. sixteen | |||
| * 64-bit lanes are read from [X] and [Y], multiply-accumulated and written | |||
| * back to [Y]. The macro issues no software prefetches, which are not | |||
| * needed if the vector fits into L1 cache. X and Y are both advanced by | |||
| * 128 bytes on exit. | |||
| */ | |||
| .macro KERNEL_F16_L1CACHE | |||
| // bytes 0..63: load X and Y, accumulate into the Y registers, store | |||
| ldp q4, q5, [X] | |||
| ldp q16, q17, [Y] | |||
| ldp q6, q7, [X, #32] | |||
| ldp q18, q19, [Y, #32] | |||
| fmla v16.2d, v4.2d, v0.d[0] | |||
| fmla v17.2d, v5.2d, v0.d[0] | |||
| stp q16, q17, [Y] | |||
| // loads for bytes 64..95 are issued before the bytes 32..63 arithmetic | |||
| ldp q20, q21, [X, #64] | |||
| ldp q24, q25, [Y, #64] | |||
| fmla v18.2d, v6.2d, v0.d[0] | |||
| fmla v19.2d, v7.2d, v0.d[0] | |||
| stp q18, q19, [Y, #32] | |||
| // loads for bytes 96..127, then finish bytes 64..127 | |||
| ldp q22, q23, [X, #96] | |||
| ldp q26, q27, [Y, #96] | |||
| fmla v24.2d, v20.2d, v0.d[0] | |||
| fmla v25.2d, v21.2d, v0.d[0] | |||
| stp q24, q25, [Y, #64] | |||
| fmla v26.2d, v22.2d, v0.d[0] | |||
| fmla v27.2d, v23.2d, v0.d[0] | |||
| stp q26, q27, [Y, #96] | |||
| // step both pointers past the 128 bytes just processed | |||
| add Y, Y, #128 | |||
| add X, X, #128 | |||
| .endm | |||
| // KERNEL_F32: expands to two consecutive KERNEL_F16 invocations. | |||
| .macro KERNEL_F32 | |||
| KERNEL_F16 | |||
| KERNEL_F16 | |||
| .endm | |||
| // KERNEL_F32_L1CACHE: same shape as KERNEL_F32, built from the | |||
| // KERNEL_F16_L1CACHE variant instead of KERNEL_F16. | |||
| .macro KERNEL_F32_L1CACHE | |||
| KERNEL_F16_L1CACHE | |||
| KERNEL_F16_L1CACHE | |||
| .endm | |||
| .macro INIT_S | |||
| lsl INC_X, INC_X, #3 | |||
| lsl INC_Y, INC_Y, #3 | |||
| @@ -138,6 +185,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| cmp I, xzr | |||
| beq .Ldaxpy_kernel_F1 | |||
| cmp N, #2048 | |||
| ble .Ldaxpy_kernel_F32_L1CACHE | |||
| .align 5 | |||
| .Ldaxpy_kernel_F32: | |||
| @@ -145,6 +195,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| subs I, I, #1 | |||
| bne .Ldaxpy_kernel_F32 | |||
| b .Ldaxpy_kernel_F1 | |||
| .align 5 | |||
| .Ldaxpy_kernel_F32_L1CACHE: | |||
| KERNEL_F32_L1CACHE | |||
| subs I, I, #1 | |||
| bne .Ldaxpy_kernel_F32_L1CACHE | |||
| .Ldaxpy_kernel_F1: | |||
| @@ -0,0 +1,562 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| // Register aliases used throughout this file. | |||
| // x0..x4 are read on entry without being initialised here, i.e. they are | |||
| // the routine's inputs: row count, column count, source pointer, leading | |||
| // dimension (in elements; scaled to bytes at entry) and destination pointer. | |||
| #define M x0 | |||
| #define N x1 | |||
| #define A00 x2 | |||
| #define LDA x3 | |||
| #define B00 x4 | |||
| // A01..A08: per-column read pointers for the panel currently being packed. | |||
| #define A01 x5 | |||
| #define A02 x6 | |||
| #define A03 x7 | |||
| #define A04 x8 | |||
| #define A05 x9 | |||
| #define A06 x10 | |||
| #define A07 x11 | |||
| #define A08 x12 | |||
| // I: row-block counter, J: column-panel counter, K: warm-up loop counter. | |||
| #define I x13 | |||
| #define J x14 | |||
| #define K x15 | |||
| // TEMP1: scratch pointer for the warm-up loops. TEMP2 is not referenced | |||
| // anywhere else in this file. | |||
| #define TEMP1 x16 | |||
| #define TEMP2 x17 | |||
| /************************************************************************************** | |||
| * Macro definitions | |||
| **************************************************************************************/ | |||
| // SAVE_REGS: reserve 11*16 = 176 bytes of stack and spill d8..d17 and | |||
| // x18..x28 into it. The layout must stay in sync with RESTORE_REGS. | |||
| // Only the low 64 bits (d view) of the vector registers are stored. | |||
| // The rest of this file only touches x0..x17 and v0..v15, so the x18..x28 | |||
| // and d16/d17 slots are saved without being modified in between. | |||
| .macro SAVE_REGS | |||
| add sp, sp, #-(11 * 16) | |||
| stp d8, d9, [sp, #(0 * 16)] | |||
| stp d10, d11, [sp, #(1 * 16)] | |||
| stp d12, d13, [sp, #(2 * 16)] | |||
| stp d14, d15, [sp, #(3 * 16)] | |||
| stp d16, d17, [sp, #(4 * 16)] | |||
| stp x18, x19, [sp, #(5 * 16)] | |||
| stp x20, x21, [sp, #(6 * 16)] | |||
| stp x22, x23, [sp, #(7 * 16)] | |||
| stp x24, x25, [sp, #(8 * 16)] | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| .endm | |||
| // RESTORE_REGS: reload d8..d17 and x18..x28 from the 176-byte frame written | |||
| // by SAVE_REGS (same slot order) and release the frame. | |||
| .macro RESTORE_REGS | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| ldp d12, d13, [sp, #(2 * 16)] | |||
| ldp d14, d15, [sp, #(3 * 16)] | |||
| ldp d16, d17, [sp, #(4 * 16)] | |||
| ldp x18, x19, [sp, #(5 * 16)] | |||
| ldp x20, x21, [sp, #(6 * 16)] | |||
| ldp x22, x23, [sp, #(7 * 16)] | |||
| ldp x24, x25, [sp, #(8 * 16)] | |||
| ldp x26, x27, [sp, #(9 * 16)] | |||
| ldr x28, [sp, #(10 * 16)] | |||
| add sp, sp, #(11*16) | |||
| .endm | |||
| // COPY4x8: pack a tile of 4 rows x 8 columns of 32-bit elements. | |||
| // Reads 4 consecutive elements from each column pointer A01..A08 (each is | |||
| // post-incremented by 16 bytes), interleaves them so that every output row | |||
| // holds the 8 column values of one source row, and appends the 32 elements | |||
| // (128 bytes) to [B00], advancing B00. | |||
| // Register use: v8/v9 = row 0, v10/v11 = row 1, v12/v13 = row 2, | |||
| // v14/v15 = row 3; the even register holds columns 1..4, the odd one 5..8. | |||
| .macro COPY4x8 | |||
| // columns 1 and 2 -> lanes 0 and 1 of the even registers | |||
| ldr q0, [A01], #16 | |||
| ldr q1, [A02], #16 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v10.s[0], v0.s[1] | |||
| ins v12.s[0], v0.s[2] | |||
| ins v14.s[0], v0.s[3] | |||
| ins v8.s[1], v1.s[0] | |||
| ins v10.s[1], v1.s[1] | |||
| ins v12.s[1], v1.s[2] | |||
| ins v14.s[1], v1.s[3] | |||
| // columns 3 and 4 -> lanes 2 and 3 of the even registers | |||
| ldr q2, [A03], #16 | |||
| ldr q3, [A04], #16 | |||
| ins v8.s[2], v2.s[0] | |||
| ins v10.s[2], v2.s[1] | |||
| ins v12.s[2], v2.s[2] | |||
| ins v14.s[2], v2.s[3] | |||
| ins v8.s[3], v3.s[0] | |||
| ins v10.s[3], v3.s[1] | |||
| ins v12.s[3], v3.s[2] | |||
| ins v14.s[3], v3.s[3] | |||
| // columns 5 and 6 -> lanes 0 and 1 of the odd registers | |||
| ldr q4, [A05], #16 | |||
| ldr q5, [A06], #16 | |||
| ins v9.s[0], v4.s[0] | |||
| ins v11.s[0], v4.s[1] | |||
| ins v13.s[0], v4.s[2] | |||
| ins v15.s[0], v4.s[3] | |||
| ins v9.s[1], v5.s[0] | |||
| ins v11.s[1], v5.s[1] | |||
| ins v13.s[1], v5.s[2] | |||
| ins v15.s[1], v5.s[3] | |||
| // columns 7 and 8 -> lanes 2 and 3 of the odd registers | |||
| ldr q6, [A07], #16 | |||
| ldr q7, [A08], #16 | |||
| ins v9.s[2], v6.s[0] | |||
| ins v11.s[2], v6.s[1] | |||
| ins v13.s[2], v6.s[2] | |||
| ins v15.s[2], v6.s[3] | |||
| ins v9.s[3], v7.s[0] | |||
| ins v11.s[3], v7.s[1] | |||
| ins v13.s[3], v7.s[2] | |||
| ins v15.s[3], v7.s[3] | |||
| // write rows 0..1, then rows 2..3 | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 | |||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [B00], #64 | |||
| .endm | |||
| // COPY2x8: pack a tile of 2 rows x 8 columns of 32-bit elements. | |||
| // Reads 2 elements from each of A01..A08 (post-incremented by 8 bytes) and | |||
| // appends 16 elements (64 bytes) to [B00]: v8/v9 hold row 0, v10/v11 row 1. | |||
| .macro COPY2x8 | |||
| ldr d0, [A01], #8 | |||
| ldr d1, [A02], #8 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v10.s[0], v0.s[1] | |||
| ins v8.s[1], v1.s[0] | |||
| ins v10.s[1], v1.s[1] | |||
| ldr d2, [A03], #8 | |||
| ldr d3, [A04], #8 | |||
| ins v8.s[2], v2.s[0] | |||
| ins v10.s[2], v2.s[1] | |||
| ins v8.s[3], v3.s[0] | |||
| ins v10.s[3], v3.s[1] | |||
| ldr d4, [A05], #8 | |||
| ldr d5, [A06], #8 | |||
| ins v9.s[0], v4.s[0] | |||
| ins v11.s[0], v4.s[1] | |||
| ins v9.s[1], v5.s[0] | |||
| ins v11.s[1], v5.s[1] | |||
| ldr d6, [A07], #8 | |||
| ldr d7, [A08], #8 | |||
| ins v9.s[2], v6.s[0] | |||
| ins v11.s[2], v6.s[1] | |||
| ins v9.s[3], v7.s[0] | |||
| ins v11.s[3], v7.s[1] | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 | |||
| .endm | |||
| // COPY1x8: pack a single row across 8 columns. | |||
| // Reads one 32-bit element from each of A01..A08 (post-incremented by | |||
| // 4 bytes) and appends the 8 elements (32 bytes) to [B00]. | |||
| .macro COPY1x8 | |||
| ldr s0, [A01], #4 | |||
| ldr s1, [A02], #4 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v8.s[1], v1.s[0] | |||
| ldr s2, [A03], #4 | |||
| ldr s3, [A04], #4 | |||
| ins v8.s[2], v2.s[0] | |||
| ins v8.s[3], v3.s[0] | |||
| ldr s4, [A05], #4 | |||
| ldr s5, [A06], #4 | |||
| ins v9.s[0], v4.s[0] | |||
| ins v9.s[1], v5.s[0] | |||
| ldr s6, [A07], #4 | |||
| ldr s7, [A08], #4 | |||
| ins v9.s[2], v6.s[0] | |||
| ins v9.s[3], v7.s[0] | |||
| st1 {v8.4s, v9.4s}, [B00], #32 | |||
| .endm | |||
| // COPY4x4: pack a tile of 4 rows x 4 columns of 32-bit elements. | |||
| // Reads 4 elements from each of A01..A04 (post-incremented by 16 bytes), | |||
| // transposes them so v8..v11 each hold one source row, and appends the | |||
| // 16 elements (64 bytes) to [B00]. | |||
| .macro COPY4x4 | |||
| ldr q0, [A01], #16 | |||
| ldr q1, [A02], #16 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v9.s[0], v0.s[1] | |||
| ins v10.s[0], v0.s[2] | |||
| ins v11.s[0], v0.s[3] | |||
| ins v8.s[1], v1.s[0] | |||
| ins v9.s[1], v1.s[1] | |||
| ins v10.s[1], v1.s[2] | |||
| ins v11.s[1], v1.s[3] | |||
| ldr q2, [A03], #16 | |||
| ldr q3, [A04], #16 | |||
| ins v8.s[2], v2.s[0] | |||
| ins v9.s[2], v2.s[1] | |||
| ins v10.s[2], v2.s[2] | |||
| ins v11.s[2], v2.s[3] | |||
| ins v8.s[3], v3.s[0] | |||
| ins v9.s[3], v3.s[1] | |||
| ins v10.s[3], v3.s[2] | |||
| ins v11.s[3], v3.s[3] | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [B00], #64 | |||
| .endm | |||
| // COPY2x4: pack 2 rows x 4 columns. Reads 2 elements from each of A01..A04 | |||
| // (post-incremented by 8 bytes); v8 receives row 0, v9 row 1; 32 bytes are | |||
| // appended to [B00]. | |||
| .macro COPY2x4 | |||
| ldr d0, [A01], #8 | |||
| ldr d1, [A02], #8 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v9.s[0], v0.s[1] | |||
| ins v8.s[1], v1.s[0] | |||
| ins v9.s[1], v1.s[1] | |||
| ldr d2, [A03], #8 | |||
| ldr d3, [A04], #8 | |||
| ins v8.s[2], v2.s[0] | |||
| ins v9.s[2], v2.s[1] | |||
| ins v8.s[3], v3.s[0] | |||
| ins v9.s[3], v3.s[1] | |||
| st1 {v8.4s, v9.4s}, [B00], #32 | |||
| .endm | |||
| // COPY1x4: pack one row across 4 columns. Reads one element from each of | |||
| // A01..A04 (post-incremented by 4 bytes) and appends 16 bytes to [B00]. | |||
| .macro COPY1x4 | |||
| ldr s0, [A01], #4 | |||
| ldr s1, [A02], #4 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v8.s[1], v1.s[0] | |||
| ldr s2, [A03], #4 | |||
| ldr s3, [A04], #4 | |||
| ins v8.s[2], v2.s[0] | |||
| ins v8.s[3], v3.s[0] | |||
| st1 {v8.4s}, [B00], #16 | |||
| .endm | |||
| // COPY4x2: pack 4 rows x 2 columns. Reads 4 elements from A01 and A02 | |||
| // (post-incremented by 16 bytes); the low two lanes of v8..v11 each receive | |||
| // one source row, and only those low halves (.2s, 32 bytes in total) are | |||
| // appended to [B00]. | |||
| .macro COPY4x2 | |||
| ldr q0, [A01], #16 | |||
| ldr q1, [A02], #16 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v9.s[0], v0.s[1] | |||
| ins v10.s[0], v0.s[2] | |||
| ins v11.s[0], v0.s[3] | |||
| ins v8.s[1], v1.s[0] | |||
| ins v9.s[1], v1.s[1] | |||
| ins v10.s[1], v1.s[2] | |||
| ins v11.s[1], v1.s[3] | |||
| st1 {v8.2s, v9.2s, v10.2s, v11.2s}, [B00], #32 | |||
| .endm | |||
| // COPY2x2: pack 2 rows x 2 columns. Reads 2 elements from A01 and A02 | |||
| // (post-incremented by 8 bytes) and appends 16 bytes to [B00]. | |||
| .macro COPY2x2 | |||
| ldr d0, [A01], #8 | |||
| ldr d1, [A02], #8 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v9.s[0], v0.s[1] | |||
| ins v8.s[1], v1.s[0] | |||
| ins v9.s[1], v1.s[1] | |||
| st1 {v8.2s, v9.2s}, [B00], #16 | |||
| .endm | |||
| // COPY1x2: pack one row across 2 columns. Reads one element from A01 and | |||
| // A02 (post-incremented by 4 bytes) and appends 8 bytes to [B00]. | |||
| .macro COPY1x2 | |||
| ldr s0, [A01], #4 | |||
| ldr s1, [A02], #4 | |||
| ins v8.s[0], v0.s[0] | |||
| ins v8.s[1], v1.s[0] | |||
| st1 {v8.2s}, [B00], #8 | |||
| .endm | |||
| // COPY1x1: copy a single 32-bit element from [A01] to [B00], advancing | |||
| // both pointers by 4 bytes. | |||
| .macro COPY1x1 | |||
| ldr s0, [A01], #4 | |||
| str s0, [B00], #4 | |||
| .endm | |||
| /************************************************************************************** | |||
| * End of macro definitions | |||
| **************************************************************************************/ | |||
| // Entry point. Spills registers, converts LDA from elements to bytes, then | |||
| // packs the source in panels of 8 columns (this section), followed by the | |||
| // 4-, 2- and 1-column remainders handled further down. | |||
| PROLOGUE | |||
| .align 5 | |||
| SAVE_REGS | |||
| lsl LDA, LDA, #2 // LDA = LDA * SIZE | |||
| .Lsgemm_ncopy_L8_BEGIN: | |||
| asr J, N, #3 // J = N / 8 | |||
| cmp J, #0 | |||
| ble .Lsgemm_ncopy_L4_BEGIN | |||
| .align 5 | |||
| .Lsgemm_ncopy_L8_M4_BEGIN: | |||
| // Set up the eight column pointers of this panel and move A00 to the | |||
| // first column of the next panel. | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A03, A02, LDA | |||
| add A04, A03, LDA | |||
| add A05, A04, LDA | |||
| add A06, A05, LDA | |||
| add A07, A06, LDA | |||
| add A08, A07, LDA | |||
| add A00, A08, LDA | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble .Lsgemm_ncopy_L8_M4_40 | |||
| // Warm-up loops, one per column: load one word every 64 bytes along the | |||
| // column. The loaded s0 is never consumed (COPY4x8 overwrites it), so the | |||
| // loads presumably only serve to pull the column into cache. Each loop | |||
| // body runs at least once, even when M / 16 is 0. | |||
| asr K, M, #4 // K = M / 16(cacheline) | |||
| mov TEMP1, A01 | |||
| .align 5 | |||
| .Lsgemm_tcopy_L8_warnup_1: | |||
| ldr s0, [TEMP1], #64 | |||
| subs K, K, #1 | |||
| bgt .Lsgemm_tcopy_L8_warnup_1 | |||
| asr K, M, #4 // K = M / 16(cacheline) | |||
| mov TEMP1, A02 | |||
| .align 5 | |||
| .Lsgemm_tcopy_L8_warnup_2: | |||
| ldr s0, [TEMP1], #64 | |||
| subs K, K, #1 | |||
| bgt .Lsgemm_tcopy_L8_warnup_2 | |||
| asr K, M, #4 // K = M / 16(cacheline) | |||
| mov TEMP1, A03 | |||
| .align 5 | |||
| .Lsgemm_tcopy_L8_warnup_3: | |||
| ldr s0, [TEMP1], #64 | |||
| subs K, K, #1 | |||
| bgt .Lsgemm_tcopy_L8_warnup_3 | |||
| asr K, M, #4 // K = M / 16(cacheline) | |||
| mov TEMP1, A04 | |||
| .align 5 | |||
| .Lsgemm_tcopy_L8_warnup_4: | |||
| ldr s0, [TEMP1], #64 | |||
| subs K, K, #1 | |||
| bgt .Lsgemm_tcopy_L8_warnup_4 | |||
| asr K, M, #4 // K = M / 16(cacheline) | |||
| mov TEMP1, A05 | |||
| .align 5 | |||
| .Lsgemm_tcopy_L8_warnup_5: | |||
| ldr s0, [TEMP1], #64 | |||
| subs K, K, #1 | |||
| bgt .Lsgemm_tcopy_L8_warnup_5 | |||
| asr K, M, #4 // K = M / 16(cacheline) | |||
| mov TEMP1, A06 | |||
| .align 5 | |||
| .Lsgemm_tcopy_L8_warnup_6: | |||
| ldr s0, [TEMP1], #64 | |||
| subs K, K, #1 | |||
| bgt .Lsgemm_tcopy_L8_warnup_6 | |||
| asr K, M, #4 // K = M / 16(cacheline) | |||
| mov TEMP1, A07 | |||
| .align 5 | |||
| .Lsgemm_tcopy_L8_warnup_7: | |||
| ldr s0, [TEMP1], #64 | |||
| subs K, K, #1 | |||
| bgt .Lsgemm_tcopy_L8_warnup_7 | |||
| asr K, M, #4 // K = M / 16(cacheline) | |||
| mov TEMP1, A08 | |||
| .align 5 | |||
| .Lsgemm_tcopy_L8_warnup_8: | |||
| ldr s0, [TEMP1], #64 | |||
| subs K, K, #1 | |||
| bgt .Lsgemm_tcopy_L8_warnup_8 | |||
| .align 5 | |||
| // Main loop: M / 4 tiles of 4 rows x 8 columns. | |||
| .Lsgemm_ncopy_L8_M4_20: | |||
| COPY4x8 | |||
| subs I, I, #1 | |||
| bne .Lsgemm_ncopy_L8_M4_20 | |||
| // Row remainder: 2 rows if bit 1 of M is set ... | |||
| .Lsgemm_ncopy_L8_M4_40: | |||
| and I, M, #2 | |||
| cmp I, #0 | |||
| ble .Lsgemm_ncopy_L8_M4_60 | |||
| COPY2x8 | |||
| // ... and 1 row if bit 0 of M is set. | |||
| .Lsgemm_ncopy_L8_M4_60: | |||
| and I, M, #1 | |||
| cmp I, #0 | |||
| ble .Lsgemm_ncopy_L8_M4_END | |||
| COPY1x8 | |||
| .Lsgemm_ncopy_L8_M4_END: | |||
| subs J , J, #1 // j-- | |||
| bne .Lsgemm_ncopy_L8_M4_BEGIN | |||
| /*********************************************************************************************/ | |||
| // 4-column remainder. Executed at most once (there is no J loop here). | |||
| // tst clears the V flag and the masked result is never negative, so the | |||
| // ble branches below are taken exactly when the tested bits are all zero. | |||
| .Lsgemm_ncopy_L4_BEGIN: | |||
| tst N, #7 | |||
| ble .Lsgemm_ncopy_L999 | |||
| tst N, #4 | |||
| ble .Lsgemm_ncopy_L2_BEGIN | |||
| .Lsgemm_ncopy_L4_M4_BEGIN: | |||
| // Four column pointers; A00 moves on to the column after this panel. | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A03, A02, LDA | |||
| add A04, A03, LDA | |||
| add A00, A04, LDA | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble .Lsgemm_ncopy_L4_M4_40 | |||
| .align 5 | |||
| .Lsgemm_ncopy_L4_M4_20: | |||
| COPY4x4 | |||
| subs I, I, #1 | |||
| bne .Lsgemm_ncopy_L4_M4_20 | |||
| // Row remainder: 2 rows, then 1 row, depending on the low bits of M. | |||
| .Lsgemm_ncopy_L4_M4_40: | |||
| and I, M, #2 | |||
| cmp I, #0 | |||
| ble .Lsgemm_ncopy_L4_M4_60 | |||
| COPY2x4 | |||
| .Lsgemm_ncopy_L4_M4_60: | |||
| and I, M, #1 | |||
| cmp I, #0 | |||
| ble .Lsgemm_ncopy_L4_M4_END | |||
| COPY1x4 | |||
| .Lsgemm_ncopy_L4_M4_END: | |||
| /*********************************************************************************************/ | |||
| // 2-column remainder: runs once when bit 1 of N is set. | |||
| .Lsgemm_ncopy_L2_BEGIN: | |||
| tst N, #2 | |||
| ble .Lsgemm_ncopy_L1_BEGIN | |||
| .Lsgemm_ncopy_L2_M4_BEGIN: | |||
| // Two column pointers; A00 moves on to the column after this panel. | |||
| mov A01, A00 | |||
| add A02, A01, LDA | |||
| add A00, A02, LDA | |||
| asr I, M, #2 // I = M / 4 | |||
| cmp I, #0 | |||
| ble .Lsgemm_ncopy_L2_M4_40 | |||
| .align 5 | |||
| .Lsgemm_ncopy_L2_M4_20: | |||
| COPY4x2 | |||
| subs I , I , #1 | |||
| bne .Lsgemm_ncopy_L2_M4_20 | |||
| // Row remainder: 2 rows, then 1 row, depending on the low bits of M. | |||
| .Lsgemm_ncopy_L2_M4_40: | |||
| and I, M, #2 | |||
| cmp I, #0 | |||
| ble .Lsgemm_ncopy_L2_M4_60 | |||
| COPY2x2 | |||
| .Lsgemm_ncopy_L2_M4_60: | |||
| and I, M, #1 | |||
| cmp I, #0 | |||
| ble .Lsgemm_ncopy_L2_M4_END | |||
| COPY1x2 | |||
| .Lsgemm_ncopy_L2_M4_END: | |||
| // Last single column (bit 0 of N): copied element by element, M times. | |||
| .Lsgemm_ncopy_L1_BEGIN: | |||
| tst N, #1 | |||
| ble .Lsgemm_ncopy_L999 | |||
| .Lsgemm_ncopy_L1_M1_BEGIN: | |||
| mov A01, A00 | |||
| mov I, M | |||
| cmp I, #0 | |||
| ble .Lsgemm_ncopy_L1_M1_END | |||
| .align 5 | |||
| .Lsgemm_ncopy_L1_M1_20: | |||
| COPY1x1 | |||
| subs I, I, #1 | |||
| bne .Lsgemm_ncopy_L1_M1_20 | |||
| .Lsgemm_ncopy_L1_M1_END: | |||
| // Common exit: return 0 in x0 after restoring the spilled registers. | |||
| .Lsgemm_ncopy_L999: | |||
| mov x0, #0 | |||
| RESTORE_REGS | |||
| ret | |||
| EPILOGUE | |||
| @@ -0,0 +1,707 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2016, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| // Register aliases used throughout this file. | |||
| // x0..x4 carry the names M, N, A, LDA and B; presumably these are the | |||
| // routine's inputs (row count, column count, source, leading dimension, | |||
| // destination) - confirm against the entry code. | |||
| #define M x0 | |||
| #define N x1 | |||
| #define A x2 | |||
| #define LDA x3 | |||
| #define B x4 | |||
| // M8: byte stride added to B00 after each COPY8x8 tile. | |||
| #define M8 x5 | |||
| // A01..A08: the eight source pointers read by the COPY*x8 macros. | |||
| #define A01 x6 | |||
| #define A02 x7 | |||
| #define A03 x8 | |||
| #define A04 x9 | |||
| #define A05 x10 | |||
| #define A06 x11 | |||
| #define A07 x12 | |||
| #define A08 x13 | |||
| // B00..B04: destination write pointers used by the copy macros. | |||
| #define B01 x14 | |||
| #define B02 x15 | |||
| #define B03 x16 | |||
| #define B04 x17 | |||
| #define B00 x22 | |||
| // Loop counters. I deliberately avoids x18: AAPCS64 designates x18 as the | |||
| // platform register and some platforms reserve it, so it must not be used | |||
| // as a general-purpose temporary. x21 is otherwise unused by these aliases | |||
| // and is already spilled/reloaded by SAVE_REGS/RESTORE_REGS. | |||
| #define I x21 | |||
| #define J x19 | |||
| #define TEMP1 x20 | |||
| // Prefetch distance in bytes used by the prfm instructions in COPY8x8. | |||
| #define A_PREFETCH 256 | |||
| /************************************************************************************** | |||
| * Macro definitions | |||
| **************************************************************************************/ | |||
| // SAVE_REGS: reserve 11*16 = 176 bytes of stack and spill d8..d17 and | |||
| // x18..x28 into it. The layout must stay in sync with RESTORE_REGS. | |||
| // Only the low 64 bits (d view) of the vector registers are stored. | |||
| // The register aliases of this file include x19, x20 and x22, and the copy | |||
| // macros write v8..v15, so those registers have to be preserved here. | |||
| .macro SAVE_REGS | |||
| add sp, sp, #-(11 * 16) | |||
| stp d8, d9, [sp, #(0 * 16)] | |||
| stp d10, d11, [sp, #(1 * 16)] | |||
| stp d12, d13, [sp, #(2 * 16)] | |||
| stp d14, d15, [sp, #(3 * 16)] | |||
| stp d16, d17, [sp, #(4 * 16)] | |||
| stp x18, x19, [sp, #(5 * 16)] | |||
| stp x20, x21, [sp, #(6 * 16)] | |||
| stp x22, x23, [sp, #(7 * 16)] | |||
| stp x24, x25, [sp, #(8 * 16)] | |||
| stp x26, x27, [sp, #(9 * 16)] | |||
| str x28, [sp, #(10 * 16)] | |||
| .endm | |||
| // RESTORE_REGS: reload d8..d17 and x18..x28 from the 176-byte frame written | |||
| // by SAVE_REGS (same slot order) and release the frame. | |||
| .macro RESTORE_REGS | |||
| ldp d8, d9, [sp, #(0 * 16)] | |||
| ldp d10, d11, [sp, #(1 * 16)] | |||
| ldp d12, d13, [sp, #(2 * 16)] | |||
| ldp d14, d15, [sp, #(3 * 16)] | |||
| ldp d16, d17, [sp, #(4 * 16)] | |||
| ldp x18, x19, [sp, #(5 * 16)] | |||
| ldp x20, x21, [sp, #(6 * 16)] | |||
| ldp x22, x23, [sp, #(7 * 16)] | |||
| ldp x24, x25, [sp, #(8 * 16)] | |||
| ldp x26, x27, [sp, #(9 * 16)] | |||
| ldr x28, [sp, #(10 * 16)] | |||
| add sp, sp, #(11*16) | |||
| .endm | |||
| /*************************************************************************************************************************/ | |||
| // COPY8x8: copy 32 bytes (8 x 32-bit elements) from each of the eight | |||
| // source pointers A01..A08 into one contiguous 256-byte block at [B00]. | |||
| // No transposition is done: the data of A01 comes first, then A02, and so | |||
| // on. Each source pointer is advanced by 32 bytes; B00 is advanced by M8. | |||
| // Every source pointer is prefetched A_PREFETCH bytes ahead first. | |||
| .macro COPY8x8 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A05, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A06, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A07, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A08, #A_PREFETCH] | |||
| // A01/A02 -> bytes 0..63 of the block | |||
| ldp q0, q1, [A01] | |||
| ldp q2, q3, [A02] | |||
| add A01, A01, #32 | |||
| add A02, A02, #32 | |||
| st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] | |||
| // TEMP1 walks through the rest of the block so B00 itself stays put | |||
| add TEMP1, B00, #64 | |||
| // A03/A04 -> bytes 64..127 | |||
| ldp q4, q5, [A03] | |||
| ldp q6, q7, [A04] | |||
| add A03, A03, #32 | |||
| add A04, A04, #32 | |||
| st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] | |||
| add TEMP1, TEMP1, #64 | |||
| // A05/A06 -> bytes 128..191 | |||
| ldp q8, q9, [A05] | |||
| ldp q10, q11, [A06] | |||
| add A05, A05, #32 | |||
| add A06, A06, #32 | |||
| st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [TEMP1] | |||
| add TEMP1, TEMP1, #64 | |||
| // A07/A08 -> bytes 192..255 | |||
| ldp q12, q13, [A07] | |||
| ldp q14, q15, [A08] | |||
| add A07, A07, #32 | |||
| add A08, A08, #32 | |||
| st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [TEMP1] | |||
| // this last TEMP1 update is not read again inside the macro | |||
| add TEMP1, TEMP1, #64 | |||
| // step B00 to the next destination block | |||
| add B00, B00, M8 | |||
| .endm | |||
| // COPY4x8: copy 16 bytes (4 x 32-bit elements) from each of A01..A08 into | |||
| // 128 contiguous bytes at [B01], in A01..A08 order. Every source pointer | |||
| // is advanced by 16 bytes and B01 by 128 bytes. | |||
| // The prefetch instructions below are disabled (left commented out). | |||
| .macro COPY4x8 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A05, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A06, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A07, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A08, #A_PREFETCH] | |||
| // A01..A04 -> first 64 bytes | |||
| ldr q0, [A01] | |||
| ldr q1, [A02] | |||
| ldr q2, [A03] | |||
| ldr q3, [A04] | |||
| add A01, A01, #16 | |||
| add A02, A02, #16 | |||
| add A03, A03, #16 | |||
| add A04, A04, #16 | |||
| st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] | |||
| add B01, B01, #64 | |||
| // A05..A08 -> second 64 bytes | |||
| ldr q4, [A05] | |||
| ldr q5, [A06] | |||
| ldr q6, [A07] | |||
| ldr q7, [A08] | |||
| add A05, A05, #16 | |||
| add A06, A06, #16 | |||
| add A07, A07, #16 | |||
| add A08, A08, #16 | |||
| st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [B01] | |||
| add B01, B01, #64 | |||
| .endm | |||
| // COPY2x8: read 8 bytes (two 32-bit elements) from each of A01..A08, | |||
| // advance each pointer by 8, and write the 64 bytes contiguously at B02, | |||
| // which ends up advanced by 64. The prefetch hints below are disabled. | |||
| .macro COPY2x8 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A05, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A06, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A07, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A08, #A_PREFETCH] | |||
| // Rows 1-4 | |||
| ldr d0, [A01] | |||
| ldr d1, [A02] | |||
| ldr d2, [A03] | |||
| ldr d3, [A04] | |||
| add A01, A01, #8 | |||
| add A02, A02, #8 | |||
| add A03, A03, #8 | |||
| add A04, A04, #8 | |||
| stp d0, d1, [B02] | |||
| add B02, B02, #16 | |||
| stp d2, d3, [B02] | |||
| add B02, B02, #16 | |||
| // Rows 5-8 | |||
| ldr d4, [A05] | |||
| ldr d5, [A06] | |||
| ldr d6, [A07] | |||
| ldr d7, [A08] | |||
| add A05, A05, #8 | |||
| add A06, A06, #8 | |||
| add A07, A07, #8 | |||
| add A08, A08, #8 | |||
| stp d4, d5, [B02] | |||
| add B02, B02, #16 | |||
| stp d6, d7, [B02] | |||
| add B02, B02, #16 | |||
| .endm | |||
| // COPY1x8: read 4 bytes (one 32-bit element) from each of A01..A08, | |||
| // advance each pointer by 4, and write the 32 bytes contiguously at B03, | |||
| // which ends up advanced by 32. The prefetch hints below are disabled. | |||
| .macro COPY1x8 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A05, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A06, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A07, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A08, #A_PREFETCH] | |||
| // Rows 1-4 | |||
| ldr s0, [A01] | |||
| ldr s1, [A02] | |||
| ldr s2, [A03] | |||
| ldr s3, [A04] | |||
| add A01, A01, #4 | |||
| add A02, A02, #4 | |||
| add A03, A03, #4 | |||
| add A04, A04, #4 | |||
| stp s0, s1, [B03] | |||
| add B03, B03, #8 | |||
| stp s2, s3, [B03] | |||
| add B03, B03, #8 | |||
| // Rows 5-8 | |||
| ldr s4, [A05] | |||
| ldr s5, [A06] | |||
| ldr s6, [A07] | |||
| ldr s7, [A08] | |||
| // Advance by one element, as for rows 1-4. A 4-byte load is all that is | |||
| // needed here: an 8-byte post-indexed reload would read past the element | |||
| // and step the pointers by two elements instead of one. | |||
| add A05, A05, #4 | |||
| add A06, A06, #4 | |||
| add A07, A07, #4 | |||
| add A08, A08, #4 | |||
| stp s4, s5, [B03] | |||
| add B03, B03, #8 | |||
| stp s6, s7, [B03] | |||
| add B03, B03, #8 | |||
| .endm | |||
| /*************************************************************************************************************************/ | |||
| // COPY8x4: read 32 bytes (eight 32-bit elements) from each of A01..A04, | |||
| // advance each pointer by 32, and write the 128 bytes contiguously starting | |||
| // at B00. B00 is then advanced by M8. TEMP1 holds the second store address. | |||
| .macro COPY8x4 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| // Rows 1-2 -> B00 + 0 | |||
| ldp q0, q1, [A01] | |||
| ldp q2, q3, [A02] | |||
| add A01, A01, #32 | |||
| add A02, A02, #32 | |||
| st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] | |||
| add TEMP1, B00, #64 | |||
| // Rows 3-4 -> B00 + 64 | |||
| ldp q4, q5, [A03] | |||
| ldp q6, q7, [A04] | |||
| add A03, A03, #32 | |||
| add A04, A04, #32 | |||
| st1 {v4.4s, v5.4s, v6.4s, v7.4s}, [TEMP1] | |||
| // TEMP1 is not read again inside this macro after this increment. | |||
| add TEMP1, TEMP1, #64 | |||
| add B00, B00, M8 | |||
| .endm | |||
| // COPY4x4: read 16 bytes (four 32-bit elements) from each of A01..A04, | |||
| // advance each pointer by 16, store the 64 bytes at B01 and advance B01 by 64. | |||
| // The prefetch hints below are disabled. | |||
| .macro COPY4x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr q0, [A01] | |||
| ldr q1, [A02] | |||
| ldr q2, [A03] | |||
| ldr q3, [A04] | |||
| add A01, A01, #16 | |||
| add A02, A02, #16 | |||
| add A03, A03, #16 | |||
| add A04, A04, #16 | |||
| st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B01] | |||
| add B01, B01, #64 | |||
| .endm | |||
| // COPY2x4: read 8 bytes (two 32-bit elements) from each of A01..A04, | |||
| // advance each pointer by 8, store the 32 bytes at B02 and advance B02 by 32. | |||
| // The prefetch hints below are disabled. | |||
| .macro COPY2x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr d0, [A01] | |||
| ldr d1, [A02] | |||
| ldr d2, [A03] | |||
| ldr d3, [A04] | |||
| add A01, A01, #8 | |||
| add A02, A02, #8 | |||
| add A03, A03, #8 | |||
| add A04, A04, #8 | |||
| stp d0, d1, [B02] | |||
| add B02, B02, #16 | |||
| stp d2, d3, [B02] | |||
| add B02, B02, #16 | |||
| .endm | |||
| // COPY1x4: read 4 bytes (one 32-bit element) from each of A01..A04, | |||
| // advance each pointer by 4, store the 16 bytes at B03 and advance B03 by 16. | |||
| // The prefetch hints below are disabled. | |||
| .macro COPY1x4 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A03, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A04, #A_PREFETCH] | |||
| ldr s0, [A01] | |||
| ldr s1, [A02] | |||
| ldr s2, [A03] | |||
| ldr s3, [A04] | |||
| add A01, A01, #4 | |||
| add A02, A02, #4 | |||
| add A03, A03, #4 | |||
| add A04, A04, #4 | |||
| stp s0, s1, [B03] | |||
| add B03, B03, #8 | |||
| stp s2, s3, [B03] | |||
| add B03, B03, #8 | |||
| .endm | |||
| /*************************************************************************************************************************/ | |||
| // COPY8x2: read 32 bytes (eight 32-bit elements) from each of A01 and A02, | |||
| // advance both pointers by 32, store the 64 bytes at B00 and advance B00 by M8. | |||
| .macro COPY8x2 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ld1 {v0.4s, v1.4s}, [A01] | |||
| ld1 {v2.4s, v3.4s}, [A02] | |||
| add A01, A01, #32 | |||
| add A02, A02, #32 | |||
| st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [B00] | |||
| add B00, B00, M8 | |||
| .endm | |||
| // COPY4x2: read 16 bytes (four 32-bit elements) from each of A01 and A02, | |||
| // advance both pointers by 16, store the 32 bytes at B01 and advance B01 by 32. | |||
| // The prefetch hints below are disabled. | |||
| .macro COPY4x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr q0, [A01] | |||
| ldr q1, [A02] | |||
| add A01, A01, #16 | |||
| add A02, A02, #16 | |||
| stp q0, q1, [B01] | |||
| add B01, B01, #32 | |||
| .endm | |||
| // COPY2x2: read 8 bytes (two 32-bit elements) from each of A01 and A02, | |||
| // advance both pointers by 8, store the 16 bytes at B02 and advance B02 by 16. | |||
| // The prefetch hints below are disabled. | |||
| .macro COPY2x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr d0, [A01] | |||
| ldr d1, [A02] | |||
| add A01, A01, #8 | |||
| add A02, A02, #8 | |||
| stp d0, d1, [B02] | |||
| add B02, B02, #16 | |||
| .endm | |||
| // COPY1x2: read 4 bytes (one 32-bit element) from each of A01 and A02, | |||
| // advance both pointers by 4, store the 8 bytes at B03 and advance B03 by 8. | |||
| // The prefetch hints below are disabled. | |||
| .macro COPY1x2 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| //prfm PLDL1KEEP, [A02, #A_PREFETCH] | |||
| ldr s0, [A01] | |||
| ldr s1, [A02] | |||
| add A01, A01, #4 | |||
| add A02, A02, #4 | |||
| stp s0, s1, [B03] | |||
| add B03, B03, #8 | |||
| .endm | |||
| /*************************************************************************************************************************/ | |||
| // COPY8x1: copy 32 bytes (eight 32-bit elements) from A01 to B00; | |||
| // A01 advances by 32 and B00 advances by M8. | |||
| .macro COPY8x1 | |||
| prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldp q0, q1, [A01] | |||
| add A01, A01, #32 | |||
| stp q0, q1, [B00] | |||
| add B00, B00, M8 | |||
| .endm | |||
| // COPY4x1: copy 16 bytes (four 32-bit elements) from A01 to B01; | |||
| // both pointers advance by 16. The prefetch hint below is disabled. | |||
| .macro COPY4x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr q0, [A01] | |||
| add A01, A01, #16 | |||
| str q0, [B01] | |||
| add B01, B01, #16 | |||
| .endm | |||
| // COPY2x1: copy 8 bytes (two 32-bit elements) from A01 to B02; | |||
| // both pointers advance by 8. The prefetch hint below is disabled. | |||
| .macro COPY2x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr d0, [A01] | |||
| add A01, A01, #8 | |||
| str d0, [B02] | |||
| add B02, B02, #8 | |||
| .endm | |||
| // COPY1x1: copy 4 bytes (one 32-bit element) from A01 to B03; | |||
| // both pointers advance by 4. The prefetch hint below is disabled. | |||
| .macro COPY1x1 | |||
| //prfm PLDL1KEEP, [A01, #A_PREFETCH] | |||
| ldr s0, [A01] | |||
| add A01, A01, #4 | |||
| str s0, [B03] | |||
| add B03, B03, #4 | |||
| .endm | |||
| /************************************************************************************** | |||
| * End of macro definitions | |||
| **************************************************************************************/ | |||
| // Kernel body. Walks the source in groups of 8, 4, 2 and 1 rows (row count M, | |||
| // rows LDA elements apart) and, for each group, copies N elements per row | |||
| // into B in chunks of 8, 4, 2 and 1 elements using the COPYnxm macros. | |||
| // Four destination cursors are used: | |||
| //   B00 - chunks of 8 (starts at B for each row group, stepped by M8) | |||
| //   B01 - chunks of 4, starting at B + (N & ~7) * M * SIZE | |||
| //   B02 - chunks of 2, starting at B + (N & ~3) * M * SIZE | |||
| //   B03 - single elements, starting at B + (N & ~1) * M * SIZE | |||
| // Throughout, "tst X, #bit" followed by "ble" skips the following COPY when | |||
| // the tested bit(s) of X are clear (the AND result is zero). | |||
| // Returns 0 in x0. | |||
| PROLOGUE | |||
| .align 5 | |||
| SAVE_REGS | |||
| lsl LDA, LDA, #2 // LDA = LDA * SIZE | |||
| lsl TEMP1, M, #2 // TEMP1 = M * SIZE | |||
| // Element offsets of the 4-, 2- and 1-wide tail regions (N rounded down). | |||
| and B01 , N , #-8 | |||
| and B02 , N , #-4 | |||
| and B03 , N , #-2 | |||
| // Scale the offsets to bytes: offset * M * SIZE. | |||
| mul B01, B01, TEMP1 | |||
| mul B02, B02, TEMP1 | |||
| mul B03, B03, TEMP1 | |||
| // Turn the offsets into absolute destination pointers. | |||
| add B01 , B01, B | |||
| add B02 , B02, B | |||
| add B03 , B03, B | |||
| lsl M8, M, #5 // M8 = M * 8 * SIZE | |||
| // ---- Groups of 8 rows ---- | |||
| .Lsgemm_tcopy_L8_BEGIN: | |||
| asr J, M, #3 // J = M / 8 | |||
| cmp J, #0 | |||
| ble .Lsgemm_tcopy_L4_BEGIN | |||
| .align 5 | |||
| .Lsgemm_tcopy_L8_M8_BEGIN: | |||
| // Set up eight row pointers LDA bytes apart; A moves past the group. | |||
| mov A01, A | |||
| add A02, A01, LDA | |||
| add A03, A02, LDA | |||
| add A04, A03, LDA | |||
| add A05, A04, LDA | |||
| add A06, A05, LDA | |||
| add A07, A06, LDA | |||
| add A08, A07, LDA | |||
| add A, A08, LDA | |||
| mov B00, B | |||
| add B, B00, #256 // B = B + 8 * 8 * SIZE | |||
| asr I, N, #3 // I = N / 8 | |||
| cmp I, #0 | |||
| ble .Lsgemm_tcopy_L8_M8_40 | |||
| .align 5 | |||
| .Lsgemm_tcopy_L8_M8_20: | |||
| COPY8x8 | |||
| subs I , I , #1 | |||
| bne .Lsgemm_tcopy_L8_M8_20 | |||
| // Tail of N: 4, then 2, then 1 remaining element(s) per row. | |||
| .Lsgemm_tcopy_L8_M8_40: | |||
| tst N , #4 | |||
| ble .Lsgemm_tcopy_L8_M8_60 | |||
| COPY4x8 | |||
| .Lsgemm_tcopy_L8_M8_60: | |||
| tst N , #2 | |||
| ble .Lsgemm_tcopy_L8_M8_80 | |||
| COPY2x8 | |||
| .Lsgemm_tcopy_L8_M8_80: | |||
| tst N, #1 | |||
| ble .Lsgemm_tcopy_L8_M8_END | |||
| COPY1x8 | |||
| .Lsgemm_tcopy_L8_M8_END: | |||
| subs J, J, #1 // j-- | |||
| bne .Lsgemm_tcopy_L8_M8_BEGIN | |||
| /*********************************************************************************************/ | |||
| // ---- One group of 4 rows (taken when bit 2 of M is set) ---- | |||
| .Lsgemm_tcopy_L4_BEGIN: | |||
| // Nothing left if M is a multiple of 8. | |||
| tst M, #7 | |||
| ble .Lsgemm_tcopy_L999 | |||
| tst M, #4 | |||
| ble .Lsgemm_tcopy_L2_BEGIN | |||
| .Lsgemm_tcopy_L4_M8_BEGIN: | |||
| mov A01, A | |||
| add A02, A01, LDA | |||
| add A03, A02, LDA | |||
| add A04, A03, LDA | |||
| add A, A04, LDA | |||
| mov B00, B | |||
| add B, B00, #128 // B = B + 4 * 8 * SIZE | |||
| asr I, N, #3 // I = N / 8 | |||
| cmp I, #0 | |||
| ble .Lsgemm_tcopy_L4_M8_40 | |||
| .align 5 | |||
| .Lsgemm_tcopy_L4_M8_20: | |||
| COPY8x4 | |||
| subs I , I , #1 | |||
| bne .Lsgemm_tcopy_L4_M8_20 | |||
| .Lsgemm_tcopy_L4_M8_40: | |||
| tst N , #4 | |||
| ble .Lsgemm_tcopy_L4_M8_60 | |||
| COPY4x4 | |||
| .Lsgemm_tcopy_L4_M8_60: | |||
| tst N , #2 | |||
| ble .Lsgemm_tcopy_L4_M8_80 | |||
| COPY2x4 | |||
| .Lsgemm_tcopy_L4_M8_80: | |||
| tst N , #1 | |||
| ble .Lsgemm_tcopy_L4_M8_END | |||
| COPY1x4 | |||
| .Lsgemm_tcopy_L4_M8_END: | |||
| /*********************************************************************************************/ | |||
| // ---- One group of 2 rows (taken when bit 1 of M is set) ---- | |||
| .Lsgemm_tcopy_L2_BEGIN: | |||
| // Nothing left if the low two bits of M are clear. | |||
| tst M, #3 | |||
| ble .Lsgemm_tcopy_L999 | |||
| tst M, #2 | |||
| ble .Lsgemm_tcopy_L1_BEGIN | |||
| .Lsgemm_tcopy_L2_M16_BEGIN: | |||
| mov A01, A | |||
| add A02, A01, LDA | |||
| add A, A02, LDA | |||
| mov B00, B | |||
| add B, B00, #64 // B = B + 2 * 8 * SIZE | |||
| asr I, N, #3 // I = N / 8 | |||
| cmp I, #0 | |||
| ble .Lsgemm_tcopy_L2_M8_40 | |||
| .align 5 | |||
| .Lsgemm_tcopy_L2_M8_20: | |||
| COPY8x2 | |||
| subs I , I , #1 | |||
| bne .Lsgemm_tcopy_L2_M8_20 | |||
| .Lsgemm_tcopy_L2_M8_40: | |||
| tst N , #4 | |||
| ble .Lsgemm_tcopy_L2_M8_60 | |||
| COPY4x2 | |||
| .Lsgemm_tcopy_L2_M8_60: | |||
| tst N , #2 | |||
| ble .Lsgemm_tcopy_L2_M8_80 | |||
| COPY2x2 | |||
| .Lsgemm_tcopy_L2_M8_80: | |||
| tst N , #1 | |||
| ble .Lsgemm_tcopy_L2_M8_END | |||
| COPY1x2 | |||
| .Lsgemm_tcopy_L2_M8_END: | |||
| /*********************************************************************************************/ | |||
| // ---- Final single row (taken when bit 0 of M is set) ---- | |||
| .Lsgemm_tcopy_L1_BEGIN: | |||
| tst M, #1 | |||
| ble .Lsgemm_tcopy_L999 | |||
| .Lsgemm_tcopy_L1_M16_BEGIN: | |||
| // A and B are not advanced here; this is the last row group. | |||
| mov A01, A // A01 = A | |||
| mov B00, B | |||
| asr I, N, #3 // I = N / 8 | |||
| cmp I, #0 | |||
| ble .Lsgemm_tcopy_L1_M8_40 | |||
| .align 5 | |||
| .Lsgemm_tcopy_L1_M8_20: | |||
| COPY8x1 | |||
| subs I , I , #1 | |||
| bne .Lsgemm_tcopy_L1_M8_20 | |||
| .Lsgemm_tcopy_L1_M8_40: | |||
| tst N , #4 | |||
| ble .Lsgemm_tcopy_L1_M8_60 | |||
| COPY4x1 | |||
| .Lsgemm_tcopy_L1_M8_60: | |||
| tst N , #2 | |||
| ble .Lsgemm_tcopy_L1_M8_80 | |||
| COPY2x1 | |||
| .Lsgemm_tcopy_L1_M8_80: | |||
| tst N , #1 | |||
| ble .Lsgemm_tcopy_L1_M8_END | |||
| COPY1x1 | |||
| .Lsgemm_tcopy_L1_M8_END: | |||
| // ---- Common exit ---- | |||
| .Lsgemm_tcopy_L999: | |||
| mov x0, #0 // set return value | |||
| RESTORE_REGS | |||
| ret | |||
| EPILOGUE | |||
| @@ -39,7 +39,7 @@ | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, | |||
| FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5, | |||
| IFLOAT *dummy2, BLASLONG dummy3, IFLOAT *dummy4, BLASLONG dummy5, | |||
| FLOAT *c, BLASLONG ldc){ | |||
| @@ -39,24 +39,24 @@ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG i, j; | |||
| FLOAT *aoffset; | |||
| FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; | |||
| FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; | |||
| FLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; | |||
| FLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; | |||
| FLOAT *boffset; | |||
| FLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| FLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| FLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| FLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| FLOAT ctemp17, ctemp18, ctemp19, ctemp20; | |||
| FLOAT ctemp21, ctemp22, ctemp23, ctemp24; | |||
| FLOAT ctemp25, ctemp26, ctemp27, ctemp28; | |||
| FLOAT ctemp29, ctemp30, ctemp31, ctemp32; | |||
| IFLOAT *aoffset; | |||
| IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; | |||
| IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; | |||
| IFLOAT *aoffset9, *aoffset10, *aoffset11, *aoffset12; | |||
| IFLOAT *aoffset13, *aoffset14, *aoffset15, *aoffset16; | |||
| IFLOAT *boffset; | |||
| IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; | |||
| IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; | |||
| IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; | |||
| IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; | |||
| aoffset = a; | |||
| boffset = b; | |||
| @@ -39,10 +39,10 @@ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG i, j; | |||
| FLOAT *a_offset, *a_offset1, *a_offset2; | |||
| FLOAT *b_offset; | |||
| IFLOAT *a_offset, *a_offset1, *a_offset2; | |||
| IFLOAT *b_offset; | |||
| a_offset = a; | |||
| b_offset = b; | |||
| @@ -39,30 +39,30 @@ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG i, j; | |||
| FLOAT *aoffset; | |||
| FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; | |||
| FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; | |||
| FLOAT *boffset; | |||
| FLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| FLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| FLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| FLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| FLOAT ctemp17, ctemp18, ctemp19, ctemp20; | |||
| FLOAT ctemp21, ctemp22, ctemp23, ctemp24; | |||
| FLOAT ctemp25, ctemp26, ctemp27, ctemp28; | |||
| FLOAT ctemp29, ctemp30, ctemp31, ctemp32; | |||
| FLOAT ctemp33, ctemp34, ctemp35, ctemp36; | |||
| FLOAT ctemp37, ctemp38, ctemp39, ctemp40; | |||
| FLOAT ctemp41, ctemp42, ctemp43, ctemp44; | |||
| FLOAT ctemp45, ctemp46, ctemp47, ctemp48; | |||
| FLOAT ctemp49, ctemp50, ctemp51, ctemp52; | |||
| FLOAT ctemp53, ctemp54, ctemp55, ctemp56; | |||
| FLOAT ctemp57, ctemp58, ctemp59, ctemp60; | |||
| FLOAT ctemp61, ctemp62, ctemp63, ctemp64; | |||
| IFLOAT *aoffset; | |||
| IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; | |||
| IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; | |||
| IFLOAT *boffset; | |||
| IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; | |||
| IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; | |||
| IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; | |||
| IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; | |||
| IFLOAT ctemp33, ctemp34, ctemp35, ctemp36; | |||
| IFLOAT ctemp37, ctemp38, ctemp39, ctemp40; | |||
| IFLOAT ctemp41, ctemp42, ctemp43, ctemp44; | |||
| IFLOAT ctemp45, ctemp46, ctemp47, ctemp48; | |||
| IFLOAT ctemp49, ctemp50, ctemp51, ctemp52; | |||
| IFLOAT ctemp53, ctemp54, ctemp55, ctemp56; | |||
| IFLOAT ctemp57, ctemp58, ctemp59, ctemp60; | |||
| IFLOAT ctemp61, ctemp62, ctemp63, ctemp64; | |||
| aoffset = a; | |||
| @@ -39,22 +39,22 @@ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG i, j; | |||
| FLOAT *aoffset; | |||
| FLOAT *aoffset1, *aoffset2; | |||
| FLOAT *boffset; | |||
| FLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| FLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| FLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| FLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| FLOAT ctemp17, ctemp18, ctemp19, ctemp20; | |||
| FLOAT ctemp21, ctemp22, ctemp23, ctemp24; | |||
| FLOAT ctemp25, ctemp26, ctemp27, ctemp28; | |||
| FLOAT ctemp29, ctemp30, ctemp31, ctemp32; | |||
| IFLOAT *aoffset; | |||
| IFLOAT *aoffset1, *aoffset2; | |||
| IFLOAT *boffset; | |||
| IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; | |||
| IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; | |||
| IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; | |||
| IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; | |||
| aoffset = a; | |||
| boffset = b; | |||
| @@ -39,11 +39,11 @@ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG i, j; | |||
| FLOAT *a_offset, *a_offset1, *a_offset2; | |||
| FLOAT *b_offset, *b_offset1, *b_offset2; | |||
| IFLOAT *a_offset, *a_offset1, *a_offset2; | |||
| IFLOAT *b_offset, *b_offset1, *b_offset2; | |||
| a_offset = a; | |||
| b_offset = b; | |||
| @@ -39,32 +39,32 @@ | |||
| #include <stdio.h> | |||
| #include "common.h" | |||
| int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b){ | |||
| int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ | |||
| BLASLONG i, j; | |||
| FLOAT *aoffset; | |||
| FLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; | |||
| FLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; | |||
| FLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; | |||
| FLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| FLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| FLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| FLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| FLOAT ctemp17, ctemp18, ctemp19, ctemp20; | |||
| FLOAT ctemp21, ctemp22, ctemp23, ctemp24; | |||
| FLOAT ctemp25, ctemp26, ctemp27, ctemp28; | |||
| FLOAT ctemp29, ctemp30, ctemp31, ctemp32; | |||
| FLOAT ctemp33, ctemp34, ctemp35, ctemp36; | |||
| FLOAT ctemp37, ctemp38, ctemp39, ctemp40; | |||
| FLOAT ctemp41, ctemp42, ctemp43, ctemp44; | |||
| FLOAT ctemp45, ctemp46, ctemp47, ctemp48; | |||
| FLOAT ctemp49, ctemp50, ctemp51, ctemp52; | |||
| FLOAT ctemp53, ctemp54, ctemp55, ctemp56; | |||
| FLOAT ctemp57, ctemp58, ctemp59, ctemp60; | |||
| FLOAT ctemp61, ctemp62, ctemp63, ctemp64; | |||
| IFLOAT *aoffset; | |||
| IFLOAT *aoffset1, *aoffset2, *aoffset3, *aoffset4; | |||
| IFLOAT *aoffset5, *aoffset6, *aoffset7, *aoffset8; | |||
| IFLOAT *boffset, *boffset1, *boffset2, *boffset3, *boffset4; | |||
| IFLOAT ctemp01, ctemp02, ctemp03, ctemp04; | |||
| IFLOAT ctemp05, ctemp06, ctemp07, ctemp08; | |||
| IFLOAT ctemp09, ctemp10, ctemp11, ctemp12; | |||
| IFLOAT ctemp13, ctemp14, ctemp15, ctemp16; | |||
| IFLOAT ctemp17, ctemp18, ctemp19, ctemp20; | |||
| IFLOAT ctemp21, ctemp22, ctemp23, ctemp24; | |||
| IFLOAT ctemp25, ctemp26, ctemp27, ctemp28; | |||
| IFLOAT ctemp29, ctemp30, ctemp31, ctemp32; | |||
| IFLOAT ctemp33, ctemp34, ctemp35, ctemp36; | |||
| IFLOAT ctemp37, ctemp38, ctemp39, ctemp40; | |||
| IFLOAT ctemp41, ctemp42, ctemp43, ctemp44; | |||
| IFLOAT ctemp45, ctemp46, ctemp47, ctemp48; | |||
| IFLOAT ctemp49, ctemp50, ctemp51, ctemp52; | |||
| IFLOAT ctemp53, ctemp54, ctemp55, ctemp56; | |||
| IFLOAT ctemp57, ctemp58, ctemp59, ctemp60; | |||
| IFLOAT ctemp61, ctemp62, ctemp63, ctemp64; | |||
| aoffset = a; | |||
| boffset = b; | |||
| @@ -1,13 +1,32 @@ | |||
| #include "common.h" | |||
| int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc | |||
| #if defined(HALF) && defined(HALFCONVERSION) | |||
| /* Widen a bfloat16 bit pattern to float: the 16 input bits become the upper | |||
|  * half of the 32-bit float representation and the lower half is zero. | |||
|  * The bits are moved through a union instead of an unsigned short pointer | |||
|  * into the float, which avoids a strict-aliasing violation and makes the | |||
|  * byte-order special case unnecessary. | |||
|  * Assumes unsigned int and float are both 32 bits wide. */ | |||
| static float | |||
| bfloat16tof32 (bfloat16 f16) | |||
| { | |||
| union { float f; unsigned int u; } conv; | |||
| conv.u = (((unsigned int) f16) & 0xFFFFu) << 16; | |||
| return conv.f; | |||
| } | |||
| #define BF16TOF32(x) (bfloat16tof32(x)) | |||
| #else | |||
| /* No conversion: pass the operand through, parenthesized so that an | |||
|  * expression argument keeps its grouping inside a larger expression. */ | |||
| #define BF16TOF32(x) (x) | |||
| #endif | |||
| int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,IFLOAT* ba,IFLOAT* bb,FLOAT* C,BLASLONG ldc | |||
| #ifdef TRMMKERNEL | |||
| ,BLASLONG offset | |||
| #endif | |||
| ) | |||
| { | |||
| BLASLONG i,j,k; | |||
| FLOAT *C0,*C1,*ptrba,*ptrbb; | |||
| FLOAT res0,res1,res2,res3,load0,load1,load2,load3,load4,load5,load6,load7; | |||
| FLOAT *C0,*C1; | |||
| IFLOAT *ptrba,*ptrbb; | |||
| FLOAT res0,res1,res2,res3; | |||
| IFLOAT load0,load1,load2,load3,load4,load5,load6,load7; | |||
| for (j=0; j<bn/2; j+=1) | |||
| { | |||
| C0 = C; | |||
| @@ -24,36 +43,36 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL | |||
| { | |||
| load0 = ptrba[2*0+0]; | |||
| load1 = ptrbb[2*0+0]; | |||
| res0 = res0+load0*load1; | |||
| res0 = res0+BF16TOF32(load0)*BF16TOF32(load1); | |||
| load2 = ptrba[2*0+1]; | |||
| res1 = res1+load2*load1; | |||
| res1 = res1+BF16TOF32(load2)*BF16TOF32(load1); | |||
| load3 = ptrbb[2*0+1]; | |||
| res2 = res2+load0*load3; | |||
| res3 = res3+load2*load3; | |||
| res2 = res2+BF16TOF32(load0)*BF16TOF32(load3); | |||
| res3 = res3+BF16TOF32(load2)*BF16TOF32(load3); | |||
| load4 = ptrba[2*1+0]; | |||
| load5 = ptrbb[2*1+0]; | |||
| res0 = res0+load4*load5; | |||
| res0 = res0+BF16TOF32(load4)*BF16TOF32(load5); | |||
| load6 = ptrba[2*1+1]; | |||
| res1 = res1+load6*load5; | |||
| res1 = res1+BF16TOF32(load6)*BF16TOF32(load5); | |||
| load7 = ptrbb[2*1+1]; | |||
| res2 = res2+load4*load7; | |||
| res3 = res3+load6*load7; | |||
| res2 = res2+BF16TOF32(load4)*BF16TOF32(load7); | |||
| res3 = res3+BF16TOF32(load6)*BF16TOF32(load7); | |||
| load0 = ptrba[2*2+0]; | |||
| load1 = ptrbb[2*2+0]; | |||
| res0 = res0+load0*load1; | |||
| res0 = res0+BF16TOF32(load0)*BF16TOF32(load1); | |||
| load2 = ptrba[2*2+1]; | |||
| res1 = res1+load2*load1; | |||
| res1 = res1+BF16TOF32(load2)*BF16TOF32(load1); | |||
| load3 = ptrbb[2*2+1]; | |||
| res2 = res2+load0*load3; | |||
| res3 = res3+load2*load3; | |||
| res2 = res2+BF16TOF32(load0)*BF16TOF32(load3); | |||
| res3 = res3+BF16TOF32(load2)*BF16TOF32(load3); | |||
| load4 = ptrba[2*3+0]; | |||
| load5 = ptrbb[2*3+0]; | |||
| res0 = res0+load4*load5; | |||
| res0 = res0+BF16TOF32(load4)*BF16TOF32(load5); | |||
| load6 = ptrba[2*3+1]; | |||
| res1 = res1+load6*load5; | |||
| res1 = res1+BF16TOF32(load6)*BF16TOF32(load5); | |||
| load7 = ptrbb[2*3+1]; | |||
| res2 = res2+load4*load7; | |||
| res3 = res3+load6*load7; | |||
| res2 = res2+BF16TOF32(load4)*BF16TOF32(load7); | |||
| res3 = res3+BF16TOF32(load6)*BF16TOF32(load7); | |||
| ptrba = ptrba+8; | |||
| ptrbb = ptrbb+8; | |||
| } | |||
| @@ -61,12 +80,12 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL | |||
| { | |||
| load0 = ptrba[2*0+0]; | |||
| load1 = ptrbb[2*0+0]; | |||
| res0 = res0+load0*load1; | |||
| res0 = res0+BF16TOF32(load0)*BF16TOF32(load1); | |||
| load2 = ptrba[2*0+1]; | |||
| res1 = res1+load2*load1; | |||
| res1 = res1+BF16TOF32(load2)*BF16TOF32(load1); | |||
| load3 = ptrbb[2*0+1]; | |||
| res2 = res2+load0*load3; | |||
| res3 = res3+load2*load3; | |||
| res2 = res2+BF16TOF32(load0)*BF16TOF32(load3); | |||
| res3 = res3+BF16TOF32(load2)*BF16TOF32(load3); | |||
| ptrba = ptrba+2; | |||
| ptrbb = ptrbb+2; | |||
| } | |||
| @@ -90,9 +109,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL | |||
| { | |||
| load0 = ptrba[0+0]; | |||
| load1 = ptrbb[2*0+0]; | |||
| res0 = res0+load0*load1; | |||
| res0 = res0+BF16TOF32(load0)*BF16TOF32(load1); | |||
| load2 = ptrbb[2*0+1]; | |||
| res1 = res1+load0*load2; | |||
| res1 = res1+BF16TOF32(load0)*BF16TOF32(load2); | |||
| ptrba = ptrba+1; | |||
| ptrbb = ptrbb+2; | |||
| } | |||
| @@ -121,9 +140,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL | |||
| { | |||
| load0 = ptrba[2*0+0]; | |||
| load1 = ptrbb[0+0]; | |||
| res0 = res0+load0*load1; | |||
| res0 = res0+BF16TOF32(load0)*BF16TOF32(load1); | |||
| load2 = ptrba[2*0+1]; | |||
| res1 = res1+load2*load1; | |||
| res1 = res1+BF16TOF32(load2)*BF16TOF32(load1); | |||
| ptrba = ptrba+2; | |||
| ptrbb = ptrbb+1; | |||
| } | |||
| @@ -141,7 +160,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL | |||
| { | |||
| load0 = ptrba[0+0]; | |||
| load1 = ptrbb[0+0]; | |||
| res0 = res0+load0*load1; | |||
| res0 = res0+BF16TOF32(load0)*BF16TOF32(load1); | |||
| ptrba = ptrba+1; | |||
| ptrbb = ptrbb+1; | |||
| } | |||
| @@ -0,0 +1 @@ | |||
| include $(KERNELDIR)/KERNEL.P5600 | |||
| @@ -0,0 +1,225 @@ | |||
| # Kernel source selection. Each "<NAME>KERNEL = file" assignment below names | |||
| # the source file used for that routine; "*COPYOBJ" entries name the object | |||
| # files produced for the GEMM packing routines. | |||
| # When __BYTE_ORDER__ is __ORDER_BIG_ENDIAN__ the POWER8 list is included | |||
| # instead and none of the assignments in the else branch apply. | |||
| ifeq ($(__BYTE_ORDER__),__ORDER_BIG_ENDIAN__) | |||
| include $(KERNELDIR)/KERNEL.POWER8 | |||
| else | |||
| #SGEMM_BETA = ../generic/gemm_beta.c | |||
| #DGEMM_BETA = ../generic/gemm_beta.c | |||
| #CGEMM_BETA = ../generic/zgemm_beta.c | |||
| #ZGEMM_BETA = ../generic/zgemm_beta.c | |||
| # SHGEMM: kernel plus generic C packing routines. | |||
| SHGEMM_BETA = ../generic/gemm_beta.c | |||
| SHGEMMKERNEL = shgemm_kernel_power10.c | |||
| SHGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SHGEMMITCOPY = ../generic/gemm_tcopy_16.c | |||
| SHGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
| SHGEMMOTCOPY = ../generic/gemm_tcopy_8.c | |||
| SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # TRMM uses the same source files as the corresponding GEMM kernels. | |||
| STRMMKERNEL = sgemm_kernel_power10.c | |||
| DTRMMKERNEL = dgemm_kernel_power10.c | |||
| CTRMMKERNEL = cgemm_kernel_power10.S | |||
| ZTRMMKERNEL = zgemm_kernel_power10.S | |||
| # SGEMM | |||
| SGEMMKERNEL = sgemm_kernel_power10.c | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SGEMMITCOPY = sgemm_tcopy_16_power8.S | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
| SGEMMOTCOPY = sgemm_tcopy_8_power8.S | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # DGEMM | |||
| DGEMMKERNEL = dgemm_kernel_power10.c | |||
| DGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| DGEMMITCOPY = dgemm_tcopy_16_power8.S | |||
| DGEMMONCOPY = dgemm_ncopy_4_power8.S | |||
| DGEMMOTCOPY = ../generic/gemm_tcopy_4.c | |||
| DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| # CGEMM | |||
| CGEMMKERNEL = cgemm_kernel_power10.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_8.c | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_4.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| # ZGEMM | |||
| ZGEMMKERNEL = zgemm_kernel_power10.S | |||
| ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| ZGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
| ZGEMMITCOPY = zgemm_tcopy_8_power8.S | |||
| ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| # TRSM: generic C kernels except DTRSMKERNEL_LT. | |||
| STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S | |||
| DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c | |||
| ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c | |||
| ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c | |||
| ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c | |||
| #Todo: CGEMM3MKERNEL should be 4x4 blocksizes. | |||
| #CGEMM3MKERNEL = zgemm3m_kernel_8x4_sse3.S | |||
| #ZGEMM3MKERNEL = zgemm3m_kernel_4x4_sse3.S | |||
| #Pure C for other kernels | |||
| #SAMAXKERNEL = ../arm/amax.c | |||
| #DAMAXKERNEL = ../arm/amax.c | |||
| #CAMAXKERNEL = ../arm/zamax.c | |||
| #ZAMAXKERNEL = ../arm/zamax.c | |||
| # | |||
| #SAMINKERNEL = ../arm/amin.c | |||
| #DAMINKERNEL = ../arm/amin.c | |||
| #CAMINKERNEL = ../arm/zamin.c | |||
| #ZAMINKERNEL = ../arm/zamin.c | |||
| # | |||
| #SMAXKERNEL = ../arm/max.c | |||
| #DMAXKERNEL = ../arm/max.c | |||
| # | |||
| #SMINKERNEL = ../arm/min.c | |||
| #DMINKERNEL = ../arm/min.c | |||
| # | |||
| # In the GCCVERSIONGTEQ9 conditionals below, the assembly (.S) file is chosen | |||
| # unless GCCVERSIONGTEQ9 equals 1, in which case the C file is chosen. | |||
| ifneq ($(GCCVERSIONGTEQ9),1) | |||
| ISAMAXKERNEL = isamax_power9.S | |||
| else | |||
| ISAMAXKERNEL = isamax.c | |||
| endif | |||
| IDAMAXKERNEL = idamax.c | |||
| ifneq ($(GCCVERSIONGTEQ9),1) | |||
| ICAMAXKERNEL = icamax_power9.S | |||
| else | |||
| ICAMAXKERNEL = icamax.c | |||
| endif | |||
| IZAMAXKERNEL = izamax.c | |||
| # | |||
| ifneq ($(GCCVERSIONGTEQ9),1) | |||
| ISAMINKERNEL = isamin_power9.S | |||
| else | |||
| ISAMINKERNEL = isamin.c | |||
| endif | |||
| IDAMINKERNEL = idamin.c | |||
| ifneq ($(GCCVERSIONGTEQ9),1) | |||
| ICAMINKERNEL = icamin_power9.S | |||
| else | |||
| ICAMINKERNEL = icamin.c | |||
| endif | |||
| IZAMINKERNEL = izamin.c | |||
| # | |||
| #ISMAXKERNEL = ../arm/imax.c | |||
| #IDMAXKERNEL = ../arm/imax.c | |||
| # | |||
| #ISMINKERNEL = ../arm/imin.c | |||
| #IDMINKERNEL = ../arm/imin.c | |||
| # | |||
| SASUMKERNEL = sasum.c | |||
| DASUMKERNEL = dasum.c | |||
| CASUMKERNEL = casum.c | |||
| ZASUMKERNEL = zasum.c | |||
| # | |||
| SAXPYKERNEL = saxpy.c | |||
| DAXPYKERNEL = daxpy.c | |||
| ifneq ($(GCCVERSIONGTEQ9),1) | |||
| CAXPYKERNEL = caxpy_power9.S | |||
| else | |||
| CAXPYKERNEL = caxpy.c | |||
| endif | |||
| ZAXPYKERNEL = zaxpy.c | |||
| # | |||
| SCOPYKERNEL = scopy.c | |||
| DCOPYKERNEL = dcopy.c | |||
| CCOPYKERNEL = ccopy.c | |||
| ZCOPYKERNEL = zcopy.c | |||
| # | |||
| SDOTKERNEL = sdot.c | |||
| DDOTKERNEL = ddot.c | |||
| DSDOTKERNEL = sdot.c | |||
| ifneq ($(GCCVERSIONGTEQ9),1) | |||
| CDOTKERNEL = cdot_power9.S | |||
| else | |||
| CDOTKERNEL = cdot.c | |||
| endif | |||
| ZDOTKERNEL = zdot.c | |||
| # | |||
| SNRM2KERNEL = ../arm/nrm2.c | |||
| DNRM2KERNEL = ../arm/nrm2.c | |||
| CNRM2KERNEL = ../arm/znrm2.c | |||
| ZNRM2KERNEL = ../arm/znrm2.c | |||
| # | |||
| SROTKERNEL = srot.c | |||
| DROTKERNEL = drot.c | |||
| CROTKERNEL = crot.c | |||
| ZROTKERNEL = zrot.c | |||
| # | |||
| SSCALKERNEL = sscal.c | |||
| DSCALKERNEL = dscal.c | |||
| CSCALKERNEL = zscal.c | |||
| ZSCALKERNEL = zscal.c | |||
| # | |||
| SSWAPKERNEL = sswap.c | |||
| DSWAPKERNEL = dswap.c | |||
| CSWAPKERNEL = cswap.c | |||
| ZSWAPKERNEL = zswap.c | |||
| # | |||
| SGEMVNKERNEL = sgemv_n.c | |||
| DGEMVNKERNEL = dgemv_n.c | |||
| CGEMVNKERNEL = cgemv_n.c | |||
| ZGEMVNKERNEL = zgemv_n_4.c | |||
| # | |||
| SGEMVTKERNEL = sgemv_t.c | |||
| DGEMVTKERNEL = dgemv_t.c | |||
| CGEMVTKERNEL = cgemv_t.c | |||
| ZGEMVTKERNEL = zgemv_t_4.c | |||
| #SSYMV_U_KERNEL = ../generic/symv_k.c | |||
| #SSYMV_L_KERNEL = ../generic/symv_k.c | |||
| #DSYMV_U_KERNEL = ../generic/symv_k.c | |||
| #DSYMV_L_KERNEL = ../generic/symv_k.c | |||
| #QSYMV_U_KERNEL = ../generic/symv_k.c | |||
| #QSYMV_L_KERNEL = ../generic/symv_k.c | |||
| #CSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| #CSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| #ZSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| #ZSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| #XSYMV_U_KERNEL = ../generic/zsymv_k.c | |||
| #XSYMV_L_KERNEL = ../generic/zsymv_k.c | |||
| #ZHEMV_U_KERNEL = ../generic/zhemv_k.c | |||
| #ZHEMV_L_KERNEL = ../generic/zhemv_k.c | |||
| LSAME_KERNEL = ../generic/lsame.c | |||
| SCABS_KERNEL = ../generic/cabs.c | |||
| DCABS_KERNEL = ../generic/cabs.c | |||
| QCABS_KERNEL = ../generic/cabs.c | |||
| #Dump kernel | |||
| CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
| ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
| # End of the non-big-endian branch opened at the top. | |||
| endif | |||
| @@ -12,7 +12,7 @@ SGEMMKERNEL = sgemm_kernel_16x8_power8.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SGEMMITCOPY = sgemm_tcopy_16_power8.S | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
| SGEMMOTCOPY = sgemm_tcopy_8_power8.S | |||
| SGEMMOTCOPY = sgemm_tcopy_8_power8.S | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| @@ -232,3 +232,11 @@ QCABS_KERNEL = ../generic/cabs.c | |||
| #Dump kernel | |||
| CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
| ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c | |||
| ifeq ($(__BYTE_ORDER__)$(ELF_VERSION),__ORDER_BIG_ENDIAN__2) | |||
| IDAMAXKERNEL = ../arm/iamax.c | |||
| IDAMINKERNEL = ../arm/iamin.c | |||
| IZAMAXKERNEL = ../arm/izamax.c | |||
| IZAMINKERNEL = ../arm/izamin.c | |||
| endif | |||
| @@ -16,7 +16,7 @@ SGEMMKERNEL = sgemm_kernel_power9.S | |||
| SGEMMINCOPY = ../generic/gemm_ncopy_16.c | |||
| SGEMMITCOPY = sgemm_tcopy_16_power8.S | |||
| SGEMMONCOPY = ../generic/gemm_ncopy_8.c | |||
| SGEMMOTCOPY = sgemm_tcopy_8_power8.S | |||
| SGEMMOTCOPY = sgemm_tcopy_8_power8.S | |||
| SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| @@ -20,8 +20,10 @@ ZAXPYKERNEL = zaxpy_ppc440.S | |||
| SDOTKERNEL = dot_ppc440.S | |||
| DDOTKERNEL = dot_ppc440.S | |||
| CDOTKERNEL = zdot_ppc440.S | |||
| ZDOTKERNEL = zdot_ppc440.S | |||
| #CDOTKERNEL = zdot_ppc440.S | |||
| #ZDOTKERNEL = zdot_ppc440.S | |||
| CDOTKERNEL = ../arm/zdot.c | |||
| ZDOTKERNEL = ../arm/zdot.c | |||
| ISAMAXKERNEL = iamax_ppc440.S | |||
| IDAMAXKERNEL = iamax_ppc440.S | |||
| @@ -52,8 +54,11 @@ ZNRM2KERNEL = znrm2_ppc440.S | |||
| SROTKERNEL = rot_ppc440.S | |||
| DROTKERNEL = rot_ppc440.S | |||
| CROTKERNEL = zrot_ppc440.S | |||
| ZROTKERNEL = zrot_ppc440.S | |||
| #CROTKERNEL = zrot_ppc440.S | |||
| #ZROTKERNEL = zrot_ppc440.S | |||
| CROTKERNEL = ../arm/zrot.c | |||
| ZROTKERNEL = ../arm/zrot.c | |||
| SSCALKERNEL = scal_ppc440.S | |||
| DSCALKERNEL = scal_ppc440.S | |||
| @@ -78,13 +83,18 @@ DGEMMINCOPYOBJ = | |||
| DGEMMITCOPYOBJ = | |||
| DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMKERNEL = zgemm_kernel_altivec_g4.S | |||
| CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
| CGEMMITCOPY = ../generic/zgemm_tcopy_8.c | |||
| #CGEMMKERNEL = zgemm_kernel_altivec_g4.S | |||
| #CGEMMINCOPY = ../generic/zgemm_ncopy_8.c | |||
| #CGEMMITCOPY = ../generic/zgemm_tcopy_8.c | |||
| CGEMMKERNEL = zgemm_kernel.S | |||
| CGEMMINCOPY = | |||
| CGEMMONCOPY = | |||
| CGEMMONCOPY = ../generic/zgemm_ncopy_2.c | |||
| CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c | |||
| CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMINCOPYOBJ = | |||
| #cgemm_incopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMITCOPYOBJ = | |||
| #cgemm_itcopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) | |||
| CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) | |||
| ZGEMMKERNEL = zgemm_kernel_g4.S | |||
| @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "casum_microk_power8.c" | |||
| #endif | |||
| @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "ccopy_microk_power8.c" | |||
| #endif | |||
| @@ -424,7 +424,7 @@ L999: | |||
| lwz r16, 204(SP) | |||
| lwz r15, 208(SP) | |||
| lwz r14, 212(SP) | |||
| addi r11, 224 | |||
| addi r11, SP, 224 | |||
| #endif | |||
| lvx v20, r11, r0 | |||
| addi r11, r11, 16 | |||
| @@ -459,4 +459,4 @@ L999: | |||
| blr | |||
| EPILOGUE | |||
| #endif^ | |||
| #endif | |||
| @@ -0,0 +1,286 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2013-2020, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| /* Name bindings used by the kernel entry code below: frame layout, | |||
| general-purpose register aliases, VSX register aliases and the 64-bit | |||
| halves of the permute constants. */ | |||
| #define ASSEMBLER | |||
| #include "common.h" | |||
| #include "def_vsx.h" | |||
| #define LOAD ld | |||
| /* Number of bytes the prologue subtracts from SP to create the save area. */ | |||
| #define STACKSIZE (512 ) | |||
| /* Offset from the lowered SP at which the prologue stores the link register. */ | |||
| #define FLINK_SAVE (STACKSIZE+16) /* 16($r12) */ | |||
| /* Kernel operands. NOTE(review): r3..r10 are presumably the incoming | |||
| argument registers - confirm against the ABI. LDC (and OFFSET when | |||
| TRMMKERNEL is defined) are loaded from the stack in the prologue. */ | |||
| #define M r3 | |||
| #define N r4 | |||
| #define K r5 | |||
| #define A r8 | |||
| #define B r9 | |||
| #define C r10 | |||
| #define LDC r6 | |||
| #define OFFSET r7 | |||
| /* VSX registers that the prologue fills with the single-precision, | |||
| splatted real and imaginary parts of alpha. */ | |||
| #define alpha_r vs51 | |||
| #define alpha_i vs55 | |||
| /* VSX registers that the prologue fills with the permute vectors built | |||
| from the .equ constants below. */ | |||
| #define save_permute_1 vs59 | |||
| #define permute_mask vs63 | |||
| #define o0 0 | |||
| /* Register aliases. T1..T4 are used in the prologue to assemble the | |||
| 64-bit permute constants and PRE is set to 512 there; the remaining | |||
| aliases are not referenced in this file outside the included sources. */ | |||
| #define T1 r11 | |||
| #define T2 r12 | |||
| #define T3 r14 | |||
| #define T4 r15 | |||
| #define T5 r16 | |||
| #define T6 r17 | |||
| #define L r18 | |||
| #define T7 r19 | |||
| #define T8 r20 | |||
| #define TEMP_REG r21 | |||
| #define I r22 | |||
| #define J r23 | |||
| #define AO r24 | |||
| #define BO r25 | |||
| #define CO r26 | |||
| #define T9 r27 | |||
| #define T10 r28 | |||
| #define PRE r29 | |||
| #define T12 r30 | |||
| #define T13 r31 | |||
| #include "cgemm_macros_power10.S" | |||
| /* 64-bit halves of the byte-permute vectors; the prologue pairs them up | |||
| with mtvsrdd (perm_const2:perm_const1 and | |||
| save_permute_12:save_permute_11). */ | |||
| .equ perm_const1, 0x0405060700010203 | |||
| .equ perm_const2, 0x0c0d0e0f08090a0b | |||
| .equ save_permute_12, 0x0c0d0e0f1c1d1e1f | |||
| .equ save_permute_11, 0x0405060714151617 | |||
| /* Kernel entry point: saves registers, loads the stack-passed operands, | |||
| prepares alpha and the permute vectors, runs the included compute logic | |||
| and restores everything at .L999. Not assembled when NEEDPARAM is | |||
| defined. */ | |||
| #ifndef NEEDPARAM | |||
| PROLOGUE | |||
| PROFCODE | |||
| /* Allocate the STACKSIZE-byte save area and copy LR into r0. */ | |||
| addi SP, SP, -STACKSIZE | |||
| mflr r0 | |||
| /* Save f14..f31 at offsets 0..136. */ | |||
| stfd f14, 0(SP) | |||
| stfd f15, 8(SP) | |||
| stfd f16, 16(SP) | |||
| stfd f17, 24(SP) | |||
| stfd f18, 32(SP) | |||
| stfd f19, 40(SP) | |||
| stfd f20, 48(SP) | |||
| stfd f21, 56(SP) | |||
| stfd f22, 64(SP) | |||
| stfd f23, 72(SP) | |||
| stfd f24, 80(SP) | |||
| stfd f25, 88(SP) | |||
| stfd f26, 96(SP) | |||
| stfd f27, 104(SP) | |||
| stfd f28, 112(SP) | |||
| stfd f29, 120(SP) | |||
| stfd f30, 128(SP) | |||
| stfd f31, 136(SP) | |||
| /* Save r31..r14 at offsets 144..280. */ | |||
| std r31, 144(SP) | |||
| std r30, 152(SP) | |||
| std r29, 160(SP) | |||
| std r28, 168(SP) | |||
| std r27, 176(SP) | |||
| std r26, 184(SP) | |||
| std r25, 192(SP) | |||
| std r24, 200(SP) | |||
| std r23, 208(SP) | |||
| std r22, 216(SP) | |||
| std r21, 224(SP) | |||
| std r20, 232(SP) | |||
| std r19, 240(SP) | |||
| std r18, 248(SP) | |||
| std r17, 256(SP) | |||
| std r16, 264(SP) | |||
| std r15, 272(SP) | |||
| std r14, 280(SP) | |||
| /* Save vs52..vs63 (16 bytes each) at offsets 288..464. */ | |||
| stxv vs52, 288(SP) | |||
| stxv vs53, 304(SP) | |||
| stxv vs54, 320(SP) | |||
| stxv vs55, 336(SP) | |||
| stxv vs56, 352(SP) | |||
| stxv vs57, 368(SP) | |||
| stxv vs58, 384(SP) | |||
| stxv vs59, 400(SP) | |||
| stxv vs60, 416(SP) | |||
| stxv vs61, 432(SP) | |||
| stxv vs62, 448(SP) | |||
| stxv vs63, 464(SP) | |||
| /* Store the link register (copied into r0 above) at FLINK_SAVE. */ | |||
| std r0, FLINK_SAVE(SP) | |||
| /* Load LDC (and OFFSET for TRMM builds) from above the save area. | |||
| NOTE(review): FRAMESLOT is defined outside this file - presumably it | |||
| indexes the caller's stack-passed arguments; confirm in common.h. */ | |||
| ld LDC, FRAMESLOT(0) + STACKSIZE(SP) | |||
| #ifdef TRMMKERNEL | |||
| ld OFFSET, FRAMESLOT(1) + STACKSIZE(SP) | |||
| #endif | |||
| /* Shift LDC left by ZBASE_SHIFT (presumably elements to bytes - confirm). */ | |||
| slwi LDC, LDC, ZBASE_SHIFT | |||
| /* alpha_r is read from vs1 (f1) and alpha_i from vs2 (f2); each is converted to single precision and splatted */ | |||
| xscvdpspn alpha_r,vs1 | |||
| xscvdpspn alpha_i,vs2 | |||
| xxspltw alpha_r,alpha_r,0 | |||
| xxspltw alpha_i,alpha_i,0 | |||
| /*load reverse permute mask for big endian | |||
| uint128 = 0xc0d0e0f08090a0b0405060700010203 | |||
| */ | |||
| /* Build each 64-bit constant 16 bits at a time: @highest and @higher | |||
| fill the upper word, rldicr moves it into bits 63..32, then @h and @l | |||
| fill the lower word. */ | |||
| lis T2, perm_const2@highest | |||
| lis T1, perm_const1@highest | |||
| lis T3, save_permute_12@highest | |||
| lis T4, save_permute_11@highest | |||
| ori T2, T2, perm_const2@higher | |||
| ori T1, T1, perm_const1@higher | |||
| ori T3, T3, save_permute_12@higher | |||
| ori T4, T4, save_permute_11@higher | |||
| rldicr T2, T2, 32, 31 | |||
| rldicr T1, T1, 32, 31 | |||
| rldicr T3, T3, 32, 31 | |||
| rldicr T4, T4, 32, 31 | |||
| oris T2, T2, perm_const2@h | |||
| oris T1, T1, perm_const1@h | |||
| oris T3, T3, save_permute_12@h | |||
| oris T4, T4, save_permute_11@h | |||
| ori T2, T2, perm_const2@l | |||
| ori T1, T1, perm_const1@l | |||
| ori T3, T3, save_permute_12@l | |||
| ori T4, T4, save_permute_11@l | |||
| /* r0 is cleared (LR has already been stored) and PRE is set to 512. */ | |||
| li r0,0 | |||
| li PRE,512 | |||
| #if defined(CC) || defined(CR) || defined(RC) || defined(RR) | |||
| /*negate for this case as we will use addition -1*(a+b) */ | |||
| xvnegsp alpha_r,alpha_r | |||
| xvnegsp alpha_i,alpha_i | |||
| #endif | |||
| /* Move the assembled GPR pairs into the two permute vectors. */ | |||
| mtvsrdd permute_mask,T2,T1 | |||
| mtvsrdd save_permute_1,T3,T4 | |||
| /*mask is reverse permute so we have to make it inner permute */ | |||
| xxpermdi permute_mask, permute_mask, permute_mask,2 | |||
| #include "cgemm_logic_power10.S" | |||
| /* Exit path: reload everything the prologue saved, from the same offsets. | |||
| NOTE(review): presumably branched to from cgemm_logic_power10.S. */ | |||
| .L999: | |||
| lfd f14, 0(SP) | |||
| lfd f15, 8(SP) | |||
| lfd f16, 16(SP) | |||
| lfd f17, 24(SP) | |||
| lfd f18, 32(SP) | |||
| lfd f19, 40(SP) | |||
| lfd f20, 48(SP) | |||
| lfd f21, 56(SP) | |||
| lfd f22, 64(SP) | |||
| lfd f23, 72(SP) | |||
| lfd f24, 80(SP) | |||
| lfd f25, 88(SP) | |||
| lfd f26, 96(SP) | |||
| lfd f27, 104(SP) | |||
| lfd f28, 112(SP) | |||
| lfd f29, 120(SP) | |||
| lfd f30, 128(SP) | |||
| lfd f31, 136(SP) | |||
| ld r31, 144(SP) | |||
| ld r30, 152(SP) | |||
| ld r29, 160(SP) | |||
| ld r28, 168(SP) | |||
| ld r27, 176(SP) | |||
| ld r26, 184(SP) | |||
| ld r25, 192(SP) | |||
| ld r24, 200(SP) | |||
| ld r23, 208(SP) | |||
| ld r22, 216(SP) | |||
| ld r21, 224(SP) | |||
| ld r20, 232(SP) | |||
| ld r19, 240(SP) | |||
| ld r18, 248(SP) | |||
| ld r17, 256(SP) | |||
| ld r16, 264(SP) | |||
| ld r15, 272(SP) | |||
| ld r14, 280(SP) | |||
| /* Reload the stored link-register value into r0; mtlr below installs it. */ | |||
| ld r0, FLINK_SAVE(SP) | |||
| lxv vs52, 288(SP) | |||
| lxv vs53, 304(SP) | |||
| lxv vs54, 320(SP) | |||
| lxv vs55, 336(SP) | |||
| lxv vs56, 352(SP) | |||
| lxv vs57, 368(SP) | |||
| lxv vs58, 384(SP) | |||
| lxv vs59, 400(SP) | |||
| mtlr r0 | |||
| lxv vs60, 416(SP) | |||
| lxv vs61, 432(SP) | |||
| lxv vs62, 448(SP) | |||
| lxv vs63, 464(SP) | |||
| /* Release the save area and return. */ | |||
| addi SP, SP, STACKSIZE | |||
| blr | |||
| EPILOGUE | |||
| #endif | |||
| @@ -27,7 +27,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| static void crot_kernel_8 (long n, float *x, float *y, float c, float s) | |||
| { | |||
| @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "cswap_microk_power8.c" | |||
| #endif | |||
| @@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #endif | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "dasum_microk_power8.c" | |||
| #endif | |||
| @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "daxpy_microk_power8.c" | |||
| #endif | |||
| @@ -35,7 +35,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "dcopy_microk_power8.c" | |||
| #endif | |||
| @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "ddot_microk_power8.c" | |||
| #endif | |||
| @@ -0,0 +1,864 @@ | |||
| /********************************************************************************* | |||
| Copyright (c) 2020, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| **********************************************************************************/ | |||
| #include "common.h" | |||
| #include <altivec.h> | |||
| /* 16-byte vector of raw bytes; the type through which rows of A and B | |||
| are handed to the MMA builtins in this file. */ | |||
| typedef unsigned char vec_t __attribute__ ((vector_size (16))); | |||
| /* 16-byte vector of FLOAT. Despite the "4sf" name, this file initialises | |||
| it with two lanes ({ alpha, alpha }). NOTE(review): presumably FLOAT is | |||
| double here - confirm against common.h. */ | |||
| typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); | |||
| /* 8-byte vector of FLOAT. */ | |||
| typedef FLOAT v2sf_t __attribute__ ((vector_size (8))); | |||
| /* | |||
| * Accumulator write-back macros. | |||
| * | |||
| * Each macro disassembles the MMA accumulator ACC into result[0..3] and | |||
| * writes alpha-scaled vectors to C, one vector per ldc-strided line, | |||
| * starting at element offset J within the line: | |||
| * SAVE_ACC lines 0..3 <- result[3], result[2], result[1], result[0] | |||
| * SAVE_ACC1 lines 4..7 <- result[3], result[2], result[1], result[0] | |||
| * SAVE2x4_ACC lines 0..1 <- result[3], result[2] | |||
| * | |||
| * With TRMMKERNEL defined the destination is overwritten (=); otherwise | |||
| * the scaled value is accumulated into it (+=). | |||
| * | |||
| * The expansion site must have result (v4sf_t[4]), rowC (v4sf_t *), CO, | |||
| * ldc and alpha in scope. The macros expand to a bare statement sequence | |||
| * (no do/while wrapper), so they must not be used as the unbraced body | |||
| * of an if/else or loop. | |||
| * | |||
| * J is parenthesised so that an expression argument cannot change the | |||
| * meaning of the index computation. | |||
| */ | |||
| #ifdef TRMMKERNEL | |||
| #define SAVE_ACC(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| rowC = (v4sf_t *) &CO[0 * ldc + (J)]; \ | |||
| rowC[0] = result[3] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[1 * ldc + (J)]; \ | |||
| rowC[0] = result[2] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[2 * ldc + (J)]; \ | |||
| rowC[0] = result[1] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[3 * ldc + (J)]; \ | |||
| rowC[0] = result[0] * alpha; | |||
| #define SAVE_ACC1(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| rowC = (v4sf_t *) &CO[4 * ldc + (J)]; \ | |||
| rowC[0] = result[3] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[5 * ldc + (J)]; \ | |||
| rowC[0] = result[2] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[6 * ldc + (J)]; \ | |||
| rowC[0] = result[1] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[7 * ldc + (J)]; \ | |||
| rowC[0] = result[0] * alpha; | |||
| #define SAVE2x4_ACC(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| rowC = (v4sf_t *) &CO[0 * ldc + (J)]; \ | |||
| rowC[0] = result[3] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[1 * ldc + (J)]; \ | |||
| rowC[0] = result[2] * alpha; | |||
| #else | |||
| #define SAVE_ACC(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| rowC = (v4sf_t *) &CO[0 * ldc + (J)]; \ | |||
| rowC[0] += result[3] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[1 * ldc + (J)]; \ | |||
| rowC[0] += result[2] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[2 * ldc + (J)]; \ | |||
| rowC[0] += result[1] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[3 * ldc + (J)]; \ | |||
| rowC[0] += result[0] * alpha; | |||
| #define SAVE_ACC1(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| rowC = (v4sf_t *) &CO[4 * ldc + (J)]; \ | |||
| rowC[0] += result[3] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[5 * ldc + (J)]; \ | |||
| rowC[0] += result[2] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[6 * ldc + (J)]; \ | |||
| rowC[0] += result[1] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[7 * ldc + (J)]; \ | |||
| rowC[0] += result[0] * alpha; | |||
| #define SAVE2x4_ACC(ACC, J) \ | |||
| __builtin_mma_disassemble_acc (result, ACC); \ | |||
| rowC = (v4sf_t *) &CO[0 * ldc + (J)]; \ | |||
| rowC[0] += result[3] * alpha; \ | |||
| rowC = (v4sf_t *) &CO[1 * ldc + (J)]; \ | |||
| rowC[0] += result[2] * alpha; | |||
| #endif | |||
| /* Zero the four accumulators acc0..acc3, which must be declared at the | |||
| expansion site. */ | |||
| #define SET_ACC_ZERO4() \ | |||
| __builtin_mma_xxsetaccz (&acc0); \ | |||
| __builtin_mma_xxsetaccz (&acc1); \ | |||
| __builtin_mma_xxsetaccz (&acc2); \ | |||
| __builtin_mma_xxsetaccz (&acc3); | |||
| /* Zero all eight accumulators: acc0..acc3 through SET_ACC_ZERO4, then | |||
| acc4..acc7, in the same order as before. */ | |||
| #define SET_ACC_ZERO8() \ | |||
| SET_ACC_ZERO4 () \ | |||
| __builtin_mma_xxsetaccz (&acc4); \ | |||
| __builtin_mma_xxsetaccz (&acc5); \ | |||
| __builtin_mma_xxsetaccz (&acc6); \ | |||
| __builtin_mma_xxsetaccz (&acc7); | |||
| /* Issue a dcbt (data cache block touch) hint for the address x + y. | |||
| dcbt RA,RB forms its address as (RA|0)+(RB): if RA is encoded as r0 the | |||
| literal value 0 is used instead of the register contents. The first | |||
| operand therefore carries the "b" (base register, never r0) constraint; | |||
| the second operand may live in any GPR. The trailing semicolon is kept | |||
| so existing call sites expand exactly as before. */ | |||
| #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "b" (x), "r" (y) : "memory"); | |||
| /* | |||
| * Pointer/offset bookkeeping macros used by the kernel when TRMMKERNEL | |||
| * is defined. x and y are the two tile sizes passed by the caller (the | |||
| * kernel passes the A-side size as x and the B-side size as y). | |||
| * All of them read or write the locals temp, off, k, AO, BO and B of | |||
| * the expansion site and expand to bare statements, so they must not be | |||
| * used as the unbraced body of an if/else or loop. | |||
| * | |||
| * Macro parameters are parenthesised wherever they appear in an | |||
| * expression, so expression arguments keep their intended precedence. | |||
| */ | |||
| /* REFRESH_TEMP_BK: set temp to k - off, off + x or off + y, selected by | |||
| LEFT/TRANSA. */ | |||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | |||
| #define REFRESH_TEMP_BK(x, y) \ | |||
| temp = k - off; | |||
| #elif defined(LEFT) | |||
| #define REFRESH_TEMP_BK(x, y) \ | |||
| temp = off + (x); | |||
| #else | |||
| #define REFRESH_TEMP_BK(x, y) \ | |||
| temp = off + (y); | |||
| #endif | |||
| /* REFRESH_POINTERS: point BO at B (advanced by off * y in the second | |||
| variant, which also advances AO by off * x), then refresh temp. */ | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| #define REFRESH_POINTERS(x, y) \ | |||
| BO = B; \ | |||
| REFRESH_TEMP_BK(x, y) | |||
| #else | |||
| #define REFRESH_POINTERS(x, y) \ | |||
| AO += off * (x); \ | |||
| BO = B + off * (y); \ | |||
| REFRESH_TEMP_BK(x, y) | |||
| #endif | |||
| /* REFRESH_OFF: advance off by x when LEFT is defined; otherwise empty. */ | |||
| #ifdef LEFT | |||
| #define REFRESH_OFF(x) \ | |||
| off += (x); | |||
| #else | |||
| #define REFRESH_OFF(x) | |||
| #endif | |||
| /* UPDATE_TEMP: subtract x (LEFT) or y (otherwise) from temp. */ | |||
| #ifdef LEFT | |||
| #define UPDATE_TEMP(x, y) \ | |||
| temp -= (x); | |||
| #else | |||
| #define UPDATE_TEMP(x, y) \ | |||
| temp -= (y); | |||
| #endif | |||
| /* REFRESH_TMP_AFTER_SAVE: in the first variant, recompute temp from | |||
| k - off, reduce it with UPDATE_TEMP and advance AO/BO by temp * x and | |||
| temp * y; in the other variant it expands to nothing. */ | |||
| #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | |||
| #define REFRESH_TMP_AFTER_SAVE(x, y) \ | |||
| temp = k - off; \ | |||
| UPDATE_TEMP(x, y) \ | |||
| AO += temp * (x); \ | |||
| BO += temp * (y); | |||
| #else | |||
| #define REFRESH_TMP_AFTER_SAVE(x, y) | |||
| #endif | |||
| /* REFRESH_AFTER_SAVE: the post-store step, REFRESH_TMP_AFTER_SAVE | |||
| followed by REFRESH_OFF. */ | |||
| #define REFRESH_AFTER_SAVE(x,y) \ | |||
| REFRESH_TMP_AFTER_SAVE(x, y) \ | |||
| REFRESH_OFF(x) | |||
| /************************************************************************************* | |||
| * GEMM Kernel | |||
| *************************************************************************************/ | |||
| int | |||
| CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, | |||
| FLOAT * C, BLASLONG ldc | |||
| #ifdef TRMMKERNEL | |||
| , BLASLONG offset | |||
| #endif | |||
| ) | |||
| { | |||
| BLASLONG N = n; | |||
| BLASLONG i1; | |||
| #if defined(TRMMKERNEL) | |||
| BLASLONG off; | |||
| #endif | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| off = -offset; | |||
| #endif | |||
| v4sf_t valpha = { alpha, alpha }; | |||
| N = n >> 2; | |||
| for (i1 = 0; i1 < N; i1++) | |||
| { | |||
| BLASLONG i, j, temp; | |||
| FLOAT *CO; | |||
| FLOAT *AO; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| off = offset; | |||
| #endif | |||
| CO = C; | |||
| C += ldc << 2; | |||
| AO = A; | |||
| PREFETCH1 (A, 128); | |||
| PREFETCH1 (A, 256); | |||
| i = m >> 4; | |||
| for (j = 0; j < i; j++) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (16, 4); | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| v4sf_t *rowC; | |||
| v4sf_t result[4]; | |||
| BLASLONG l = 0; | |||
| PREFETCH1 (CO, 0); | |||
| PREFETCH1 (CO + ldc, 0); | |||
| PREFETCH1 (CO + ldc + ldc, 0); | |||
| PREFETCH1 (CO + ldc + ldc + ldc, 0); | |||
| PREFETCH1 (CO, 128); | |||
| PREFETCH1 (CO + ldc, 128); | |||
| PREFETCH1 (CO + ldc + ldc, 128); | |||
| PREFETCH1 (CO + ldc + ldc + ldc, 128); | |||
| __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; | |||
| SET_ACC_ZERO8 (); | |||
| for (l = 0; l < temp; l++) | |||
| { | |||
| vec_t *rowA = (vec_t *) & AO[l << 4]; | |||
| __vector_pair rowB; | |||
| vec_t *rb = (vec_t *) & BO[l << 2]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); | |||
| __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); | |||
| __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); | |||
| __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]); | |||
| __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]); | |||
| __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]); | |||
| __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]); | |||
| } | |||
| SAVE_ACC (&acc0, 0); | |||
| SAVE_ACC (&acc2, 4); | |||
| SAVE_ACC (&acc1, 2); | |||
| SAVE_ACC (&acc3, 6); | |||
| SAVE_ACC (&acc4, 8); | |||
| SAVE_ACC (&acc6, 12); | |||
| SAVE_ACC (&acc5, 10); | |||
| SAVE_ACC (&acc7, 14); | |||
| AO += temp << 4; | |||
| BO += temp << 2; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (16, 4) | |||
| #endif | |||
| CO += 16; | |||
| } | |||
| i = (m & 15) >> 3; | |||
| for (j = 0; j < i; j++) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (8, 4); | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| v4sf_t *rowC; | |||
| v4sf_t result[4]; | |||
| __vector_quad acc0, acc1, acc2, acc3; | |||
| SET_ACC_ZERO4 (); | |||
| BLASLONG l = 0; | |||
| for (l = 0; l < temp; l++) | |||
| { | |||
| vec_t *rowA = (vec_t *) & AO[l << 3]; | |||
| __vector_pair rowB; | |||
| vec_t *rb = (vec_t *) & BO[l << 2]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); | |||
| __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); | |||
| __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); | |||
| } | |||
| SAVE_ACC (&acc0, 0); | |||
| SAVE_ACC (&acc2, 4); | |||
| SAVE_ACC (&acc1, 2); | |||
| SAVE_ACC (&acc3, 6); | |||
| CO += 8; | |||
| AO += temp << 3; | |||
| BO += temp << 2; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (8, 4) | |||
| #endif | |||
| } | |||
| i = (m & 7) >> 2; | |||
| for (j = 0; j < i; j++) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (4, 4); | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| v4sf_t *rowC; | |||
| v4sf_t result[4]; | |||
| __vector_quad acc0, acc1; | |||
| __builtin_mma_xxsetaccz (&acc0); | |||
| __builtin_mma_xxsetaccz (&acc1); | |||
| BLASLONG l = 0; | |||
| for (l = 0; l < temp; l++) | |||
| { | |||
| vec_t *rowA = (vec_t *) & AO[l << 2]; | |||
| __vector_pair rowB; | |||
| vec_t *rb = (vec_t *) & BO[l << 2]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); | |||
| } | |||
| SAVE_ACC (&acc0, 0); | |||
| SAVE_ACC (&acc1, 2); | |||
| CO += 4; | |||
| AO += temp << 2; | |||
| BO += temp << 2; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (4, 4) | |||
| #endif | |||
| } | |||
| i = (m & 3) >> 1; | |||
| for (j = 0; j < i; j++) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (2, 4); | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| v4sf_t *rowC; | |||
| v4sf_t result[4]; | |||
| __vector_quad acc0; | |||
| __builtin_mma_xxsetaccz (&acc0); | |||
| BLASLONG l = 0; | |||
| for (l = 0; l < temp; l++) | |||
| { | |||
| vec_t *rowA = (vec_t *) & AO[l << 1]; | |||
| __vector_pair rowB; | |||
| vec_t *rb = (vec_t *) & BO[l << 2]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | |||
| } | |||
| SAVE_ACC (&acc0, 0); | |||
| CO += 2; | |||
| AO += temp << 1; | |||
| BO += temp << 2; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (2, 4) | |||
| #endif | |||
| } | |||
| i = (m & 1) >> 0; | |||
| for (j = 0; j < i; j++) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (1, 4); | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| BLASLONG l = 0; | |||
| v4sf_t t = { 0, 0 }; | |||
| v4sf_t t1 = { 0, 0 }; | |||
| for (l = 0; l < temp; l++) | |||
| { | |||
| v4sf_t rowA = { AO[l], AO[l] }; | |||
| v4sf_t rowB = { BO[l << 2], BO[(l << 2) + 1] }; | |||
| v4sf_t rowB1 = { BO[(l << 2) + 2], BO[(l << 2) + 3] }; | |||
| t += rowA * rowB; | |||
| t1 += rowA * rowB1; | |||
| } | |||
| t = t * valpha; | |||
| t1 = t1 * valpha; | |||
| #if defined(TRMMKERNEL) | |||
| CO[0 * ldc] = t[0]; | |||
| CO[1 * ldc] = t[1]; | |||
| CO[2 * ldc] = t1[0]; | |||
| CO[3 * ldc] = t1[1]; | |||
| #else | |||
| CO[0 * ldc] += t[0]; | |||
| CO[1 * ldc] += t[1]; | |||
| CO[2 * ldc] += t1[0]; | |||
| CO[3 * ldc] += t1[1]; | |||
| #endif | |||
| CO += 1; | |||
| AO += temp; | |||
| BO += temp << 2; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (1, 4) | |||
| #endif | |||
| } | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| off += 4; // number of values in A | |||
| #endif | |||
| B += k << 2; | |||
| } | |||
| N = (n & 3) >> 1; | |||
| for (i1 = 0; i1 < N; i1++) | |||
| { | |||
| BLASLONG i, j, temp; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| off = offset; | |||
| #endif | |||
| FLOAT *CO; | |||
| FLOAT *AO; | |||
| CO = C; | |||
| C += ldc << 1; | |||
| AO = A; | |||
| i = m >> 4; | |||
| for (j = 0; j < i; j++) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (16, 2); | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| v4sf_t *rowC; | |||
| v4sf_t result[4]; | |||
| __vector_quad acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; | |||
| SET_ACC_ZERO8 (); | |||
| BLASLONG l = 0; | |||
| for (l = 0; l < temp; l++) | |||
| { | |||
| FLOAT t[4] = { 0, 0, 0, 0 }; | |||
| t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; | |||
| __vector_pair rowB; | |||
| vec_t *rb = (vec_t *) & t[0]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| vec_t *rowA = (vec_t *) & AO[l << 4]; | |||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); | |||
| __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); | |||
| __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); | |||
| __builtin_mma_xvf64gerpp (&acc4, rowB, rowA[4]); | |||
| __builtin_mma_xvf64gerpp (&acc5, rowB, rowA[5]); | |||
| __builtin_mma_xvf64gerpp (&acc6, rowB, rowA[6]); | |||
| __builtin_mma_xvf64gerpp (&acc7, rowB, rowA[7]); | |||
| } | |||
| SAVE2x4_ACC (&acc0, 0); | |||
| SAVE2x4_ACC (&acc1, 2); | |||
| SAVE2x4_ACC (&acc2, 4); | |||
| SAVE2x4_ACC (&acc3, 6); | |||
| SAVE2x4_ACC (&acc4, 8); | |||
| SAVE2x4_ACC (&acc5, 10); | |||
| SAVE2x4_ACC (&acc6, 12); | |||
| SAVE2x4_ACC (&acc7, 14); | |||
| CO += 16; | |||
| AO += temp << 4; | |||
| BO += temp << 1; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (16, 2) | |||
| #endif | |||
| } | |||
| i = (m & 15) >> 3; | |||
| for (j = 0; j < i; j++) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (8, 2); | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| v4sf_t *rowC; | |||
| v4sf_t result[4]; | |||
| __vector_quad acc0, acc1, acc2, acc3; | |||
| SET_ACC_ZERO4 (); | |||
| BLASLONG l = 0; | |||
| for (l = 0; l < temp; l++) | |||
| { | |||
| FLOAT t[4] = { 0, 0, 0, 0 }; | |||
| t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; | |||
| __vector_pair rowB; | |||
| vec_t *rb = (vec_t *) & t[0]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| vec_t *rowA = (vec_t *) & AO[l << 3]; | |||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); | |||
| __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); | |||
| __builtin_mma_xvf64gerpp (&acc3, rowB, rowA[3]); | |||
| } | |||
| SAVE2x4_ACC (&acc0, 0); | |||
| SAVE2x4_ACC (&acc1, 2); | |||
| SAVE2x4_ACC (&acc2, 4); | |||
| SAVE2x4_ACC (&acc3, 6); | |||
| CO += 8; | |||
| AO += temp << 3; | |||
| BO += temp << 1; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (8, 2) | |||
| #endif | |||
| } | |||
| i = (m & 7) >> 2; | |||
| for (j = 0; j < i; j++) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (4, 2); | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| v4sf_t *rowC; | |||
| v4sf_t result[4]; | |||
| __vector_quad acc0, acc1; | |||
| __builtin_mma_xxsetaccz (&acc0); | |||
| __builtin_mma_xxsetaccz (&acc1); | |||
| BLASLONG l = 0; | |||
| for (l = 0; l < temp; l++) | |||
| { | |||
| FLOAT t[4] = { 0, 0, 0, 0 }; | |||
| t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; | |||
| __vector_pair rowB; | |||
| vec_t *rb = (vec_t *) & t[0]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| vec_t *rowA = (vec_t *) & AO[l << 2]; | |||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | |||
| __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); | |||
| } | |||
| SAVE2x4_ACC (&acc0, 0); | |||
| SAVE2x4_ACC (&acc1, 2); | |||
| CO += 4; | |||
| AO += temp << 2; | |||
| BO += temp << 1; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (4, 2) | |||
| #endif | |||
| } | |||
| i = (m & 3) >> 1; | |||
| for (j = 0; j < i; j++) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (2, 2); | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| v4sf_t *rowC; | |||
| v4sf_t result[4]; | |||
| __vector_quad acc0; | |||
| __builtin_mma_xxsetaccz (&acc0); | |||
| BLASLONG l = 0; | |||
| for (l = 0; l < temp; l++) | |||
| { | |||
| FLOAT t[4] = { 0, 0, 0, 0 }; | |||
| t[0] = BO[l << 1], t[1] = BO[(l << 1) + 1]; | |||
| __vector_pair rowB; | |||
| vec_t *rb = (vec_t *) & t[0]; | |||
| __builtin_mma_assemble_pair (&rowB, rb[1], rb[0]); | |||
| vec_t *rowA = (vec_t *) & AO[l << 1]; | |||
| __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); | |||
| } | |||
| SAVE2x4_ACC (&acc0, 0); | |||
| CO += 2; | |||
| AO += temp << 1; | |||
| BO += temp << 1; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (2, 2) | |||
| #endif | |||
| } | |||
| i = (m & 1) >> 0; | |||
| for (j = 0; j < i; j++) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (1, 2); | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| BLASLONG l = 0; | |||
| v4sf_t t = { 0, 0 }; | |||
| for (l = 0; l < temp; l++) | |||
| { | |||
| v4sf_t rowA = { AO[l], AO[l] }; | |||
| v4sf_t rowB = { BO[l << 1], BO[(l << 1) + 1] }; | |||
| t += rowA * rowB; | |||
| } | |||
| t = t * valpha; | |||
| #if defined(TRMMKERNEL) | |||
| CO[0 * ldc] = t[0]; | |||
| CO[1 * ldc] = t[1]; | |||
| #else | |||
| CO[0 * ldc] += t[0]; | |||
| CO[1 * ldc] += t[1]; | |||
| #endif | |||
| CO += 1; | |||
| AO += temp; | |||
| BO += temp << 1; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (1, 2) | |||
| #endif | |||
| } | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| off += 2; // number of B values per k step in this pass (N width 2, cf. B += k << 1) | |||
| #endif | |||
| B += k << 1; | |||
| } | |||
| N = (n & 1) >> 0; | |||
| for (i1 = 0; i1 < N; i1++) | |||
| { | |||
| BLASLONG i, temp; | |||
| #if defined(TRMMKERNEL) && defined(LEFT) | |||
| off = offset; | |||
| #endif | |||
| FLOAT *CO; | |||
| FLOAT *AO; | |||
| CO = C; | |||
| C += ldc; | |||
| AO = A; | |||
| i = m; | |||
| while (i >= 16) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (16, 1) | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| BLASLONG l = 0; | |||
| v4sf_t t = { 0, 0 }; | |||
| v4sf_t t1 = { 0, 0 }; | |||
| v4sf_t t2 = { 0, 0 }; | |||
| v4sf_t t3 = { 0, 0 }; | |||
| v4sf_t t4 = { 0, 0 }; | |||
| v4sf_t t5 = { 0, 0 }; | |||
| v4sf_t t6 = { 0, 0 }; | |||
| v4sf_t t7 = { 0, 0 }; | |||
| for (l = 0; l < temp; l++) | |||
| { | |||
| v4sf_t rowB = { BO[l], BO[l] }; | |||
| v4sf_t rowA = { AO[l << 4], AO[(l << 4) + 1] }; | |||
| v4sf_t rowA1 = { AO[(l << 4) + 2], AO[(l << 4) + 3] }; | |||
| v4sf_t rowA2 = { AO[(l << 4) + 4], AO[(l << 4) + 5] }; | |||
| v4sf_t rowA3 = { AO[(l << 4) + 6], AO[(l << 4) + 7] }; | |||
| v4sf_t rowA4 = { AO[(l << 4) + 8], AO[(l << 4) + 9] }; | |||
| v4sf_t rowA5 = { AO[(l << 4) + 10], AO[(l << 4) + 11] }; | |||
| v4sf_t rowA6 = { AO[(l << 4) + 12], AO[(l << 4) + 13] }; | |||
| v4sf_t rowA7 = { AO[(l << 4) + 14], AO[(l << 4) + 15] }; | |||
| t += rowA * rowB; | |||
| t1 += rowA1 * rowB; | |||
| t2 += rowA2 * rowB; | |||
| t3 += rowA3 * rowB; | |||
| t4 += rowA4 * rowB; | |||
| t5 += rowA5 * rowB; | |||
| t6 += rowA6 * rowB; | |||
| t7 += rowA7 * rowB; | |||
| } | |||
| t = t * valpha; | |||
| t1 = t1 * valpha; | |||
| t2 = t2 * valpha; | |||
| t3 = t3 * valpha; | |||
| t4 = t4 * valpha; | |||
| t5 = t5 * valpha; | |||
| t6 = t6 * valpha; | |||
| t7 = t7 * valpha; | |||
| #if defined(TRMMKERNEL) | |||
| CO[0] = t[0]; | |||
| CO[1] = t[1]; | |||
| CO[2] = t1[0]; | |||
| CO[3] = t1[1]; | |||
| CO[4] = t2[0]; | |||
| CO[5] = t2[1]; | |||
| CO[6] = t3[0]; | |||
| CO[7] = t3[1]; | |||
| CO[8] = t4[0]; | |||
| CO[9] = t4[1]; | |||
| CO[10] = t5[0]; | |||
| CO[11] = t5[1]; | |||
| CO[12] = t6[0]; | |||
| CO[13] = t6[1]; | |||
| CO[14] = t7[0]; | |||
| CO[15] = t7[1]; | |||
| #else | |||
| CO[0] += t[0]; | |||
| CO[1] += t[1]; | |||
| CO[2] += t1[0]; | |||
| CO[3] += t1[1]; | |||
| CO[4] += t2[0]; | |||
| CO[5] += t2[1]; | |||
| CO[6] += t3[0]; | |||
| CO[7] += t3[1]; | |||
| CO[8] += t4[0]; | |||
| CO[9] += t4[1]; | |||
| CO[10] += t5[0]; | |||
| CO[11] += t5[1]; | |||
| CO[12] += t6[0]; | |||
| CO[13] += t6[1]; | |||
| CO[14] += t7[0]; | |||
| CO[15] += t7[1]; | |||
| #endif | |||
| AO += temp << 4; | |||
| BO += temp; | |||
| CO += 16; | |||
| i -= 16; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (16, 1) | |||
| #endif | |||
| } | |||
| while (i >= 8) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (8, 1) | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| BLASLONG l = 0; | |||
| v4sf_t t = { 0, 0 }; | |||
| v4sf_t t1 = { 0, 0 }; | |||
| v4sf_t t2 = { 0, 0 }; | |||
| v4sf_t t3 = { 0, 0 }; | |||
| for (l = 0; l < temp; l++) | |||
| { | |||
| v4sf_t rowB = { BO[l], BO[l] }; | |||
| v4sf_t rowA = { AO[l << 3], AO[(l << 3) + 1] }; | |||
| v4sf_t rowA1 = { AO[(l << 3) + 2], AO[(l << 3) + 3] }; | |||
| v4sf_t rowA2 = { AO[(l << 3) + 4], AO[(l << 3) + 5] }; | |||
| v4sf_t rowA3 = { AO[(l << 3) + 6], AO[(l << 3) + 7] }; | |||
| t += rowA * rowB; | |||
| t1 += rowA1 * rowB; | |||
| t2 += rowA2 * rowB; | |||
| t3 += rowA3 * rowB; | |||
| } | |||
| t = t * valpha; | |||
| t1 = t1 * valpha; | |||
| t2 = t2 * valpha; | |||
| t3 = t3 * valpha; | |||
| #if defined(TRMMKERNEL) | |||
| CO[0] = t[0]; | |||
| CO[1] = t[1]; | |||
| CO[2] = t1[0]; | |||
| CO[3] = t1[1]; | |||
| CO[4] = t2[0]; | |||
| CO[5] = t2[1]; | |||
| CO[6] = t3[0]; | |||
| CO[7] = t3[1]; | |||
| #else | |||
| CO[0] += t[0]; | |||
| CO[1] += t[1]; | |||
| CO[2] += t1[0]; | |||
| CO[3] += t1[1]; | |||
| CO[4] += t2[0]; | |||
| CO[5] += t2[1]; | |||
| CO[6] += t3[0]; | |||
| CO[7] += t3[1]; | |||
| #endif | |||
| AO += temp << 3; | |||
| BO += temp; | |||
| CO += 8; | |||
| i -= 8; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (8, 1) | |||
| #endif | |||
| } | |||
| while (i >= 4) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (4, 1) | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| BLASLONG l = 0; | |||
| v4sf_t t = { 0, 0 }; | |||
| v4sf_t t1 = { 0, 0 }; | |||
| for (l = 0; l < temp; l++) | |||
| { | |||
| v4sf_t rowB = { BO[l], BO[l] }; | |||
| v4sf_t rowA = { AO[l << 2], AO[(l << 2) + 1] }; | |||
| v4sf_t rowA1 = { AO[(l << 2) + 2], AO[(l << 2) + 3] }; | |||
| t += rowA * rowB; | |||
| t1 += rowA1 * rowB; | |||
| } | |||
| t = t * valpha; | |||
| t1 = t1 * valpha; | |||
| #if defined(TRMMKERNEL) | |||
| CO[0] = t[0]; | |||
| CO[1] = t[1]; | |||
| CO[2] = t1[0]; | |||
| CO[3] = t1[1]; | |||
| #else | |||
| CO[0] += t[0]; | |||
| CO[1] += t[1]; | |||
| CO[2] += t1[0]; | |||
| CO[3] += t1[1]; | |||
| #endif | |||
| AO += temp << 2; | |||
| BO += temp; | |||
| CO += 4; | |||
| i -= 4; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (4, 1) | |||
| #endif | |||
| } | |||
| while (i >= 2) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (2, 1) | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| BLASLONG l = 0; | |||
| v4sf_t t = { 0, 0 }; | |||
| for (l = 0; l < temp; l++) | |||
| { | |||
| v4sf_t rowB = { BO[l], BO[l] }; | |||
| v4sf_t rowA = { AO[l << 1], AO[(l << 1) + 1] }; | |||
| t += rowA * rowB; | |||
| } | |||
| t = t * valpha; | |||
| #if defined(TRMMKERNEL) | |||
| CO[0] = t[0]; | |||
| CO[1] = t[1]; | |||
| #else | |||
| CO[0] += t[0]; | |||
| CO[1] += t[1]; | |||
| #endif | |||
| AO += temp << 1; | |||
| BO += temp; | |||
| CO += 2; | |||
| i -= 2; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (2, 1) | |||
| #endif | |||
| } | |||
| while (i >= 1) | |||
| { | |||
| FLOAT *BO; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_POINTERS (1, 1) | |||
| #else | |||
| BO = B; | |||
| temp = k; | |||
| #endif | |||
| BLASLONG l = 0; | |||
| FLOAT t = 0; | |||
| for (l = 0; l < temp; l++) | |||
| { | |||
| t += AO[l] * BO[l]; | |||
| } | |||
| AO += temp; | |||
| BO += temp; | |||
| #if defined(TRMMKERNEL) | |||
| CO[0] = t * alpha; | |||
| #else | |||
| CO[0] += t * alpha; | |||
| #endif | |||
| CO += 1; | |||
| i -= 1; | |||
| #if defined(TRMMKERNEL) | |||
| REFRESH_AFTER_SAVE (1, 1) | |||
| #endif | |||
| } | |||
| #if defined(TRMMKERNEL) && !defined(LEFT) | |||
| off += 1; // number of B values per k step in this pass (N width 1, cf. B += k) | |||
| #endif | |||
| B += k; | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #include "common.h" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "dgemv_n_microk_power8.c" | |||
| #endif | |||
| @@ -39,7 +39,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| #pragma GCC optimize "O1" | |||
| #if defined(POWER8) || defined(POWER9) | |||
| #if defined(POWER8) || defined(POWER9) || defined(POWER10) | |||
| #include "drot_microk_power8.c" | |||
| #endif | |||